-rw-r--r--  CMakeLists.txt                                                             110
-rw-r--r--  src/core/common/SourceContextReader.cpp                                      5
-rw-r--r--  src/core/common/Token.cpp (renamed from src/core/parser/utils/Token.cpp)     0
-rw-r--r--  src/core/common/Token.hpp (renamed from src/core/parser/utils/Token.hpp)    45
-rw-r--r--  src/core/common/Utils.cpp                                                    6
-rw-r--r--  src/core/common/Utils.hpp                                                   53
-rw-r--r--  src/core/common/WhitespaceHandler.hpp                                      284
-rw-r--r--  src/core/parser/stack/DocumentHandler.cpp                                   24
-rw-r--r--  src/core/parser/stack/DocumentHandler.hpp                                    4
-rw-r--r--  src/core/parser/stack/Handler.cpp                                           25
-rw-r--r--  src/core/parser/stack/Handler.hpp                                           74
-rw-r--r--  src/core/parser/stack/Stack.cpp                                             55
-rw-r--r--  src/core/parser/stack/Stack.hpp                                             18
-rw-r--r--  src/core/parser/utils/SourceOffsetVector.hpp                                28
-rw-r--r--  src/core/parser/utils/TokenTrie.cpp                                         16
-rw-r--r--  src/core/parser/utils/TokenTrie.hpp                                         11
-rw-r--r--  src/core/parser/utils/TokenizedData.cpp                                    353
-rw-r--r--  src/core/parser/utils/TokenizedData.hpp                                    234
-rw-r--r--  src/core/parser/utils/Tokenizer.cpp                                        264
-rw-r--r--  src/core/parser/utils/Tokenizer.hpp                                        142
-rw-r--r--  src/formats/osml/OsmlStreamParser.cpp                                      157
-rw-r--r--  src/formats/osml/OsmlStreamParser.hpp                                       85
-rw-r--r--  src/formats/osxml/OsxmlEventParser.cpp                                      63
-rw-r--r--  src/formats/osxml/OsxmlEventParser.hpp                                      31
-rw-r--r--  test/core/parser/stack/StackTest.cpp                                        15
-rw-r--r--  test/core/parser/utils/TokenizedDataTest.cpp                               602
-rw-r--r--  test/core/parser/utils/TokenizerTest.cpp                                   248
-rw-r--r--  test/formats/osml/OsmlStreamParserTest.cpp                                  79
-rw-r--r--  test/formats/osxml/OsxmlEventParserTest.cpp                                 47
29 files changed, 1501 insertions, 1577 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea5c3aa..225e63d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,6 +158,7 @@ ADD_LIBRARY(ousia_core
src/core/common/Rtti
src/core/common/RttiBuilder
src/core/common/SourceContextReader
+ src/core/common/Token
src/core/common/Utils
src/core/common/Variant
src/core/common/VariantConverter
@@ -180,16 +181,15 @@ ADD_LIBRARY(ousia_core
src/core/parser/ParserContext
src/core/parser/ParserScope
src/core/parser/stack/Callbacks
- src/core/parser/stack/DocumentHandler
- src/core/parser/stack/DomainHandler
- src/core/parser/stack/GenericParserStates
- src/core/parser/stack/Handler
- src/core/parser/stack/ImportIncludeHandler
+# src/core/parser/stack/DocumentHandler
+# src/core/parser/stack/DomainHandler
+# src/core/parser/stack/GenericParserStates
+# src/core/parser/stack/Handler
+# src/core/parser/stack/ImportIncludeHandler
src/core/parser/stack/State
- src/core/parser/stack/Stack
- src/core/parser/stack/TypesystemHandler
+# src/core/parser/stack/Stack
+# src/core/parser/stack/TypesystemHandler
src/core/parser/utils/SourceOffsetVector
- src/core/parser/utils/Token
src/core/parser/utils/TokenizedData
src/core/parser/utils/Tokenizer
src/core/parser/utils/TokenTrie
@@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core
# ousia_core
#)
-ADD_LIBRARY(ousia_osml
- src/formats/osml/OsmlParser
- src/formats/osml/OsmlStreamParser
-)
+#ADD_LIBRARY(ousia_osml
+# src/formats/osml/OsmlParser
+# src/formats/osml/OsmlStreamParser
+#)
-TARGET_LINK_LIBRARIES(ousia_osml
- ousia_core
-)
+#TARGET_LINK_LIBRARIES(ousia_osml
+# ousia_core
+#)
ADD_LIBRARY(ousia_osxml
src/formats/osxml/OsxmlAttributeLocator
src/formats/osxml/OsxmlEventParser
- src/formats/osxml/OsxmlParser
+# src/formats/osxml/OsxmlParser
)
TARGET_LINK_LIBRARIES(ousia_osxml
@@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml
# Command line interface
-ADD_EXECUTABLE(ousia
- src/cli/Main
-)
+#ADD_EXECUTABLE(ousia
+# src/cli/Main
+#)
-TARGET_LINK_LIBRARIES(ousia
- ousia_core
- ousia_filesystem
- ousia_html
- ousia_xml
- ousia_osml
- ousia_osxml
- ${Boost_LIBRARIES}
-)
+#TARGET_LINK_LIBRARIES(ousia
+# ousia_core
+# ousia_filesystem
+# ousia_html
+# ousia_xml
+# ousia_osml
+# ousia_osxml
+# ${Boost_LIBRARIES}
+#)
# If testing is enabled, build the unit tests
IF(TEST)
@@ -323,11 +323,11 @@ IF(TEST)
test/core/model/StyleTest
test/core/model/TypesystemTest
test/core/parser/ParserScopeTest
- test/core/parser/stack/StackTest
+# test/core/parser/stack/StackTest
test/core/parser/stack/StateTest
test/core/parser/utils/SourceOffsetVectorTest
test/core/parser/utils/TokenizedDataTest
- test/core/parser/utils/TokenizerTest
+# test/core/parser/utils/TokenizerTest
test/core/parser/utils/TokenTrieTest
test/core/resource/ResourceLocatorTest
test/core/resource/ResourceRequestTest
@@ -383,29 +383,29 @@ IF(TEST)
# ousia_mozjs
# )
- ADD_EXECUTABLE(ousia_test_osml
- test/formats/osml/OsmlParserTest
- test/formats/osml/OsmlStreamParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osml
+# test/formats/osml/OsmlParserTest
+# test/formats/osml/OsmlStreamParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osml
+# ousia_filesystem
+# )
- ADD_EXECUTABLE(ousia_test_osxml
- test/formats/osxml/OsxmlEventParserTest
- test/formats/osxml/OsxmlParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osxml
+# test/formats/osxml/OsxmlEventParserTest
+# test/formats/osxml/OsxmlParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osxml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osxml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osxml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osxml
+# ousia_filesystem
+# )
ADD_EXECUTABLE(ousia_test_xml
test/plugins/xml/XmlOutputTest
@@ -423,8 +423,8 @@ IF(TEST)
ADD_TEST(ousia_test_filesystem ousia_test_filesystem)
ADD_TEST(ousia_test_html ousia_test_html)
# ADD_TEST(ousia_test_mozjs ousia_test_mozjs)
- ADD_TEST(ousia_test_osml ousia_test_osml)
- ADD_TEST(ousia_test_osxml ousia_test_osxml)
+# ADD_TEST(ousia_test_osml ousia_test_osml)
+# ADD_TEST(ousia_test_osxml ousia_test_osxml)
ADD_TEST(ousia_test_xml ousia_test_xml)
ENDIF()
@@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION share/ousia
OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE
)
-INSTALL(TARGETS ousia
- RUNTIME DESTINATION bin
-)
+#INSTALL(TARGETS ousia
+# RUNTIME DESTINATION bin
+#)
IF(INSTALL_GEDIT_HIGHLIGHTER)
INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
index d5d379c..f7dbdf3 100644
--- a/src/core/common/SourceContextReader.cpp
+++ b/src/core/common/SourceContextReader.cpp
@@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader,
ctx.relLen = end - start; // end >= start (I2)
// Remove linebreaks at the beginning and the end
- const std::pair<size_t, size_t> b =
- Utils::trim(lineBuf, Utils::isLinebreak);
+ const std::pair<size_t, size_t> b = Utils::trim(
+ lineBuf,
+ [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
ssize_t s = b.first, e = b.second;
s = std::min(s, static_cast<ssize_t>(ctx.relPos));
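
A minimal sketch of the index-based predicate convention introduced here
(the buffer content is illustrative): the predicate now receives an index
into the container rather than the character itself, so callers can consult
parallel per-character data.

    std::string lineBuf = "\n  some context line\n";
    std::pair<size_t, size_t> b = Utils::trim(
        lineBuf, [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
    // b.first and b.second delimit the line without the surrounding linebreaks
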
diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp
index 8bcdbb5..8bcdbb5 100644
--- a/src/core/parser/utils/Token.cpp
+++ b/src/core/common/Token.cpp
diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp
index f907450..0cf56b0 100644
--- a/src/core/parser/utils/Token.hpp
+++ b/src/core/common/Token.hpp
@@ -30,6 +30,7 @@
#include <cstdint>
#include <limits>
#include <string>
+#include <unordered_set>
#include <core/common/Location.hpp>
@@ -46,6 +47,11 @@ using TokenId = uint32_t;
using TokenLength = uint16_t;
/**
+ * Type used for storing token sets.
+ */
+using TokenSet = std::unordered_set<TokenId>;
+
+/**
* Namespace containing constants for TokenId instances with special meaning.
*/
namespace Tokens {
@@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;
/**
* Token which represents a paragraph token -- issued if two consecutive
- * newlines occur with optionally any amout of whitespace between them.
+ * newlines occur, optionally with any amount of whitespace between them. The
+ * paragraph token is not repeated until more text is reached.
*/
constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;
/**
+ * Token which represents a section token -- issued if three or more
+ * consecutive newlines occur, optionally with any amount of whitespace between
+ * them. The section token is not repeated until more text is reached.
+ */
+constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
+
+/**
* Token which represents an indentation token -- issued if the indentation of
- * this line is larget than the indentation of the previous line.
+ * this line is larger than the indentation of the previous line.
*/
-constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4;
+constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
+
+/**
+ * Token which represents a dedentation token -- issued if the indentation of
+ * this line is smaller than the indentation of the previous line.
+ */
+constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
/**
* Maximum token id to be used. Tokens allocated for users should not surpass
@@ -109,6 +129,17 @@ struct Token {
Token() : id(Tokens::Empty) {}
/**
+ * Constructor of a "data" token with no explicit content.
+ *
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ Token(SourceLocation location)
+ : id(Tokens::Data), location(location)
+ {
+ }
+
+ /**
* Constructor of the Token struct.
*
* @param id represents the token id.
@@ -129,6 +160,14 @@ struct Token {
Token(TokenId id) : id(id) {}
/**
+ * Returns true if this token is special.
+ *
+ * @return true if the TokenId indicates that this token is a "special"
+ * token.
+ */
+ bool isSpecial() const { return id > Tokens::MaxTokenId; }
+
+ /**
* The getLocation function allows the tokens to be directly passed as
* parameter to Logger or LoggableException instances.
*
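
A short sketch of how the special token ids above are meant to be consumed
(the concrete token is illustrative): all whitespace-derived tokens lie above
Tokens::MaxTokenId, which is exactly what Token::isSpecial() tests.

    Token token(Tokens::Indent);
    if (token.isSpecial()) {
        // structural token (Newline, Paragraph, Section, Indent, Dedent, ...)
    } else if (token.id == Tokens::Data) {
        // plain character data
    }
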
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index a77951e..85d2c28 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename)
return std::string{};
}
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::startsWith(const std::string &s, const std::string &prefix)
{
return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 7d96562..82a8f8c 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -124,14 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
* Trims the given string or vector of chars by returning the start and end
* index.
*
@@ -153,8 +145,8 @@ public:
*
* @param s is the container that should be trimmed.
* @param len is the number of elements in the container.
- * @param f is a function that returns true for values that should be
- * removed.
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
*/
@@ -163,7 +155,7 @@ public:
{
size_t start = 0;
for (size_t i = 0; i < len; i++) {
- if (!f(s[i])) {
+ if (!f(i)) {
start = i;
break;
}
@@ -171,7 +163,7 @@ public:
size_t end = 0;
for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
+ if (!f(i)) {
end = i + 1;
break;
}
@@ -198,17 +190,33 @@ public:
* the collapsed version of the string ends.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
*/
- template <class T>
- static std::string trim(const T &s, size_t len, size_t &start, size_t &end)
+ template <class T, class Filter>
+ static std::string trim(const T &s, size_t len, size_t &start, size_t &end,
+ Filter f)
{
- auto res = trim(s, len, isWhitespace);
+ auto res = trim(s, len, f);
start = res.first;
end = res.second;
return std::string(&s[start], end - start);
}
/**
+ * Removes whitespace at the beginning and the end of the given string.
+ *
+ * @param s is the string that should be trimmed.
+ * @return a trimmed copy of s.
+ */
+ static std::string trim(const std::string &s)
+ {
+ std::pair<size_t, size_t> bounds =
+ trim(s, [&s](size_t i) { return isWhitespace(s[i]); });
+ return s.substr(bounds.first, bounds.second - bounds.first);
+ }
+
+ /**
* Collapses the whitespaces in the given string (trims the string and
* replaces all whitespace characters by a single one).
*
@@ -219,7 +227,8 @@ public:
{
size_t start;
size_t end;
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -236,7 +245,8 @@ public:
static std::string collapse(const std::string &s, size_t &start,
size_t &end)
{
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -244,6 +254,8 @@ public:
* replaces all whitespace characters by a single one).
*
* @tparam T is the string type that should be used.
+ * @tparam Filter is a filter function used for detecting the character
+ * indices that might be removed.
* @param s is the string in which the whitespace should be collapsed.
* @param len is the length of the input string
* @param start is an output parameter which is set to the offset at which
@@ -252,9 +264,9 @@ public:
* the collapsed version of the string ends.
* @return a copy of s with collapsed whitespace.
*/
- template <class T>
+ template <class T, class Filter>
static std::string collapse(const T &s, size_t len, size_t &start,
- size_t &end)
+ size_t &end, Filter f)
{
// Result vector
std::vector<char> res;
@@ -268,8 +280,7 @@ public:
bool hadWhitespace = false;
for (size_t i = 0; i < len; i++) {
const char c = s[i];
- const bool whitespace = isWhitespace(c);
- if (whitespace) {
+ if (f(i)) {
hadWhitespace = !res.empty();
} else {
// Adapt the start and end position
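
A usage sketch of the generalized collapse() signature (the input string is
illustrative):

    std::string s = "  hello   world  ";
    size_t start, end;
    std::string res = Utils::collapse(
        s, s.size(), start, end,
        [&s](size_t i) { return Utils::isWhitespace(s[i]); });
    // res == "hello world"; start/end delimit the collapsed range within s
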
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
deleted file mode 100644
index ed52ea3..0000000
--- a/src/core/common/WhitespaceHandler.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file WhitespaceHandler.hpp
- *
- * Contains the WhitespaceHandler classes which are used in multiple places to
- * trim, compact or preserve whitespaces while at the same time maintaining the
- * position information associated with the input strings.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
-#define _OUSIA_WHITESPACE_HANDLER_HPP_
-
-#include <string>
-#include <vector>
-
-#include "Utils.hpp"
-
-namespace ousia {
-
-/**
- * WhitespaceHandler is a based class that can be used to collect text on a
- * character-by-character basis. Note that this class and its descendants are
- * hoped to be inlined by the compiler (and used in conjunction with templates),
- * thus they are fully defined inside this header.
- */
-class WhitespaceHandler {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- WhitespaceHandler() : textStart(0), textEnd(0) {}
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-
- /**
- * Returns the content of the WhitespaceHandler as string.
- */
- std::string toString() const
- {
- return std::string(textBuf.data(), textBuf.size());
- }
-};
-
-/**
- * The PreservingWhitespaceHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd);
- }
-
- /**
- * Static version of PreservingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
- }
-
- /**
- * Static version of TrimmingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param whitespaceBuf is a reference at the buffer for storing whitespace
- * characters.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, std::vector<char> &whitespaceBuf)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
- }
-
- /**
- * Static version of CollapsingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param hasWhitespace is a reference at the "hasWhitespace" flag.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, bool &hasWhitespace)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * Function that can be used to append the given buffer (e.g. a string or a
- * vector) to the whitespace handler.
- *
- * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
- * @tparam Buffer is an iterable type.
- * @param handler is the handler to which the characters of the Buffer should be
- * appended.
- * @param buf is the buffer from which the characters should be read.
- * @param start is the start byte offset. Each character is counted as one byte.
- */
-template <typename WhitespaceHandler, typename Buffer>
-inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
- size_t start)
-{
- for (auto elem : buf) {
- handler.append(elem, start, start + 1);
- start++;
- }
-}
-}
-
-#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
-
diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp
index bb04bd3..d44176a 100644
--- a/src/core/parser/stack/DocumentHandler.cpp
+++ b/src/core/parser/stack/DocumentHandler.cpp
@@ -25,6 +25,7 @@
#include <core/model/Domain.hpp>
#include <core/model/Project.hpp>
#include <core/model/Typesystem.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
@@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field,
return valid && scope().resolveValue(data, type, logger);
}
-bool DocumentChildHandler::data(Variant &data)
+bool DocumentChildHandler::data(TokenizedData &data)
{
+ // TODO: Handle this correctly
+ Variant text = data.text(WhitespaceMode::TRIM);
+ if (text == nullptr) {
+ // For now, accept "no data" as success
+ return true;
+ }
+
// We're past the region in which explicit fields can be defined in the
// parent structure element
scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true);
@@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data)
// If it is a primitive field directly, try to parse the content.
if (field->isPrimitive()) {
// Add it as primitive content.
- if (!convertData(field, data, logger())) {
+ if (!convertData(field, text, logger())) {
return false;
}
- parent->createChildDocumentPrimitive(data, fieldIdx);
+ parent->createChildDocumentPrimitive(text, fieldIdx);
return true;
}
@@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data)
for (auto primitiveField : defaultFields) {
// Then try to parse the content using the type specification.
forks.emplace_back(logger().fork());
- if (!convertData(primitiveField, data, forks.back())) {
+ if (!convertData(primitiveField, text, forks.back())) {
continue;
}
@@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data)
createPath(fieldIdx, path, parent);
// Then create the primitive element
- parent->createChildDocumentPrimitive(data);
+ parent->createChildDocumentPrimitive(text);
return true;
}
@@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data)
if (defaultFields.empty()) {
logger().error("Got data, but structure \"" + name() +
"\" does not have any primitive field",
- data);
+ text);
} else {
logger().error("Could not read data with any of the possible fields:",
- data);
+ text);
size_t f = 0;
for (auto field : defaultFields) {
logger().note(std::string("Field ") +
@@ -471,4 +479,4 @@ namespace RttiTypes {
const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>(
"DocumentField").parent(&Node);
}
-} \ No newline at end of file
+}
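
The pattern used in DocumentChildHandler::data above is the general contract
of the new handler interface; a sketch with a hypothetical handler class:

    bool MyHandler::data(TokenizedData &data)
    {
        // Extract a plain string variant; nullptr means the segment
        // contained nothing but (unprotected) whitespace
        Variant text = data.text(WhitespaceMode::TRIM);
        if (text == nullptr) {
            return true;
        }
        // ... work with the string variant "text" ...
        return true;
    }
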
diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp
index 862081c..dda7d8b 100644
--- a/src/core/parser/stack/DocumentHandler.hpp
+++ b/src/core/parser/stack/DocumentHandler.hpp
@@ -167,7 +167,7 @@ public:
bool start(Variant::mapType &args) override;
void end() override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
bool fieldStart(bool &isDefault, size_t fieldIdx) override;
@@ -213,4 +213,4 @@ extern const Rtti DocumentField;
}
}
-#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file
+#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp
index bf5d4ea..3d413e8 100644
--- a/src/core/parser/stack/Handler.cpp
+++ b/src/core/parser/stack/Handler.cpp
@@ -18,6 +18,7 @@
#include <core/common/Exceptions.hpp>
#include <core/common/Logger.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserContext.hpp>
#include "Callbacks.hpp"
@@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className,
return true;
}
-bool EmptyHandler::data(Variant &data)
+bool EmptyHandler::data(TokenizedData &data)
{
// Support any data
return true;
@@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className,
return false;
}
-bool StaticHandler::data(Variant &data)
+bool StaticHandler::data(TokenizedData &data)
{
- logger().error("Did not expect any data here", data);
- return false;
+ if (data.text(WhitespaceMode::TRIM) != nullptr) {
+ logger().error("Did not expect any data here", data);
+ return false;
+ }
+ return true;
}
/* Class StaticFieldHandler */
@@ -227,12 +231,19 @@ void StaticFieldHandler::end()
}
}
-bool StaticFieldHandler::data(Variant &data)
+bool StaticFieldHandler::data(TokenizedData &data)
{
+ Variant text = data.text(WhitespaceMode::TRIM);
+ if (text == nullptr) {
+ // Providing no data here is ok as long as the "doHandle" callback
+ // function has already been called
+ return handled;
+ }
+
// Call the doHandle function if this has not been done before
if (!handled) {
handled = true;
- doHandle(data, args);
+ doHandle(text, args);
return true;
}
@@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data)
logger().error(
std::string("Found data, but the corresponding argument \"") + argName +
std::string("\" was already specified"),
- data);
+ text);
// Print the location at which the attribute was originally specified
auto it = args.find(argName);
diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp
index 7cda7a4..929466d 100644
--- a/src/core/parser/stack/Handler.hpp
+++ b/src/core/parser/stack/Handler.hpp
@@ -31,6 +31,7 @@ namespace ousia {
class ParserScope;
class ParserContext;
class Logger;
+class TokenizedData;
namespace parser_stack {
@@ -158,40 +159,63 @@ protected:
*/
const std::string &name() const;
-public:
- /**
- * Virtual destructor.
- */
- virtual ~Handler();
-
/**
* Calls the corresponding function in the Callbacks instance. Sets the
* whitespace mode that specifies how string data should be processed. The
* calls to this function are placed on a stack by the underlying Stack
- * class.
+ * class. This function should be called from the "fieldStart" callback and
+ * the "start" callback. If no whitespace mode is pushed in the "start"
+ * method, the whitespace mode "TRIM" is implicitly assumed.
*
* @param whitespaceMode specifies one of the three WhitespaceMode constants
* PRESERVE, TRIM or COLLAPSE.
*/
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
+ void pushWhitespaceMode(WhitespaceMode whitespaceMode);
/**
- * Calls the corresponding function in the Callbacks instance.
- * Registers the given token as token that should be reported to the handler
- * using the "token" function.
- *
- * @param token is the token string that should be reported.
+ * Pops a previously pushed whitespace mode. Calls to this function should
+ * occur in the "end" callback and the "fieldEnd" callback. This function
+ * can only undo pushes that were performed by the pushWhitespaceMode()
+ * method of the same handler.
*/
- void registerToken(const std::string &token);
+ void popWhitespaceMode();
/**
- * Calls the corresponding function in the Callbacks instance.
- * Unregisters the given token, it will no longer be reported to the handler
- * using the "token" function.
+ * Calls the corresponding function in the Callbacks instance. Pushes a list
+ * of tokens that should be reported to this handler via the "token" method.
+ * The calls to this function are placed on a stack by the underlying Stack
+ * class. This function should be called from the "fieldStart" callback and
+ * the "start" callback.
*
- * @param token is the token string that should be unregistered.
+ * @param tokens is a list of tokens that should be reported to this handler
+ * instance via the "token" method.
*/
- void unregisterToken(const std::string &token);
+ void pushTokens(const std::vector<std::string> &tokens);
+
+ /**
+ * Pops a previously pushed token list. Calls to this function should
+ * occur in the "end" callback and the "fieldEnd" callback. This function
+ * can only undo pushes that were performed by the pushTokens()
+ * method of the same handler.
+ */
+ void popTokens();
+
+ /**
+ * Calls the corresponding function in the Callbacks instance. This method
+ * registers the given tokens as tokens that are generally available; these
+ * tokens must be explicitly enabled using the "pushTokens" and "popTokens"
+ * methods. Tokens that have not been registered are not guaranteed to be
+ * reported, even if they are enabled.
+ */
+ void registerTokens(const std::vector<std::string> &tokens);
+
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~Handler();
/**
* Returns the command name for which the handler was created.
@@ -299,11 +323,11 @@ public:
* Handler instance. Should return true if the data could be handled, false
* otherwise.
*
- * @param data is a string variant containing the character data and its
- * location.
+ * @param data is an instance of TokenizedData containing the segmented
+ * character data and its location.
* @return true if the data could be handled, false otherwise.
*/
- virtual bool data(Variant &data) = 0;
+ virtual bool data(TokenizedData &data) = 0;
};
/**
@@ -333,7 +357,7 @@ public:
Variant::mapType &args) override;
bool annotationEnd(const Variant &className,
const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
/**
* Creates an instance of the EmptyHandler class.
@@ -359,7 +383,7 @@ public:
Variant::mapType &args) override;
bool annotationEnd(const Variant &className,
const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
};
/**
@@ -412,7 +436,7 @@ protected:
public:
bool start(Variant::mapType &args) override;
void end() override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
};
}
}
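
A sketch of the intended pairing of the new push/pop calls, using a
hypothetical handler (the token strings are illustrative):

    bool MyHandler::fieldStart(bool &isDefault, size_t fieldIdx)
    {
        // Pushed in "start"/"fieldStart" ...
        pushWhitespaceMode(WhitespaceMode::PRESERVE);
        pushTokens({"*", "`"});
        return true;
    }

    void MyHandler::fieldEnd()
    {
        // ... and popped symmetrically in "fieldEnd"/"end"
        popTokens();
        popWhitespaceMode();
    }
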
diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp
index 5b67248..309c9a0 100644
--- a/src/core/parser/stack/Stack.cpp
+++ b/src/core/parser/stack/Stack.cpp
@@ -19,6 +19,7 @@
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
#include <core/common/Exceptions.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
@@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
}
}
-void Stack::data(const Variant &data)
+void Stack::data(TokenizedData data)
{
- // End handlers that already had a default field and are currently not
- // active.
- endOverdueHandlers();
+ // TODO: Rewrite this function for token handling
+ // TODO: This loop needs to be refactored out
+ while (!data.atEnd()) {
+ // End handlers that already had a default field and are currently not
+ // active.
+ endOverdueHandlers();
- while (true) {
- // Check whether there is any command the data can be sent to
+ const bool hasNonWhitespaceText = data.hasNonWhitespaceText();
+
+ // Check whether there is any command the data can be sent to -- if not,
+ // make sure the data actually is data
if (stack.empty()) {
- throw LoggableException("No command here to receive data.", data);
+ if (hasNonWhitespaceText) {
+ throw LoggableException("No command here to receive data.", data);
+ }
+ return;
}
// Fetch the current command handler information
@@ -440,7 +449,10 @@ void Stack::data(const Variant &data)
// If the "hadDefaultField" flag is set, we already issued an error
// message
if (!info.hadDefaultField) {
- logger().error("Did not expect any data here", data);
+ if (hasNonWhitespaceText) {
+ logger().error("Did not expect any data here", data);
+ }
+ return;
}
}
@@ -454,8 +466,16 @@ void Stack::data(const Variant &data)
// Pass the data to the current Handler instance
bool valid = false;
try {
- Variant dataCopy = data;
- valid = info.handler->data(dataCopy);
+ // Create a fork of the TokenizedData and let the handler work
+ // on it
+ TokenizedData dataFork = data;
+ valid = info.handler->data(dataFork);
+
+ // If the data was validly handled by the handler, commit the
+ // change
+ if (valid) {
+ data = dataFork;
+ }
}
catch (LoggableException ex) {
loggerFork.log(ex);
@@ -482,6 +502,19 @@ void Stack::data(const Variant &data)
}
}
+void Stack::data(const Variant &stringData)
+{
+ // Fetch the SourceLocation of the given stringData variant
+ SourceLocation loc = stringData.getLocation();
+
+ // Create a TokenizedData instance and feed the given string data into it
+ TokenizedData tokenizedData(loc.getSourceId());
+ tokenizedData.append(stringData.asString(), loc.getStart());
+
+ // Call the actual "data" method
+ data(tokenizedData);
+}
+
void Stack::fieldStart(bool isDefault)
{
// Make sure the current handler stack is not empty
@@ -584,4 +617,4 @@ void Stack::token(Variant token)
// TODO
}
}
-} \ No newline at end of file
+}
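
For untokenized text the two data() entry points are equivalent; a brief
sketch (sourceId, stack and textVariant are assumed to exist in the caller):

    TokenizedData data(sourceId);
    data.append("some text", 0);
    stack.data(data);              // explicit TokenizedData overload
    // or: stack.data(textVariant) -- string variant, wrapped internally
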
diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp
index b67ce82..cd29b28 100644
--- a/src/core/parser/stack/Stack.hpp
+++ b/src/core/parser/stack/Stack.hpp
@@ -44,6 +44,7 @@ namespace ousia {
// Forward declarations
class ParserContext;
class Logger;
+class TokenizedData;
namespace parser_stack {
@@ -292,13 +293,24 @@ public:
void command(const Variant &name, const Variant::mapType &args);
/**
- * Function that shuold be called whenever character data is found in the
+ * Function that should be called whenever character data is found in the
 * input stream. May only be called if there currently is a command on the
* stack.
*
- * @param data is a string variant containing the data that has been found.
+ * @param data is a TokenizedData instance containing the pre-segmented data
+ * that should be read.
+ */
+ void data(TokenizedData data);
+
+ /**
+ * Function that should be called whenever character data is found in the
+ * input stream. The given string variant is converted into a TokenizedData
+ * instance internally.
+ *
+ * @param stringData is a string variant containing the data that has been
+ * found.
*/
- void data(const Variant &data);
+ void data(const Variant &stringData);
/**
* Function that should be called whenever a new field starts. Fields of the
diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index d15055a..aaebe7d 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -127,7 +127,7 @@ public:
* read.
* @return a pair containing start and end source offset.
*/
- std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx)
+ std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const
{
// Special treatment for the last character
const size_t count = lens.size();
@@ -157,7 +157,31 @@ public:
/**
* Returns the number of characters for which offsets are stored.
*/
- size_t size() { return lens.size(); }
+ size_t size() const { return lens.size(); }
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length.
+ * Removes all token matches that lie within the trimmed region.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length) {
+ if (length < size()) {
+ lens.resize(length);
+ offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
+ }
+ }
+
+ /**
+ * Resets the SourceOffsetVector to the state it had when it was
+ * constructed.
+ */
+ void clear() {
+ lens.clear();
+ offsets.clear();
+ lastEnd = 0;
+ }
};
}
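
A note on the trim() arithmetic above, assuming LOG2_OFFSET_INTERVAL is the
binary logarithm of the interpolation interval (e.g. 6 for one anchor offset
per 64 characters):

    // trim(100) keeps 100 length entries in "lens" and
    // (100 >> 6) + 1 == 2 anchor offsets in "offsets"
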
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 80cc945..a45d3ff 100644
--- a/src/core/parser/utils/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -22,12 +22,12 @@ namespace ousia {
/* Class DynamicTokenTree::Node */
-TokenTrie::Node::Node() : type(Tokens::Empty) {}
+TokenTrie::Node::Node() : id(Tokens::Empty) {}
/* Class DynamicTokenTree */
bool TokenTrie::registerToken(const std::string &token,
- TokenId type) noexcept
+ TokenId id) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token,
}
// If the resulting node already has a type set, we're screwed.
- if (node->type != Tokens::Empty) {
+ if (node->id != Tokens::Empty) {
return false;
}
// Otherwise just set the type to the given type.
- node->type = type;
+ node->id = id;
return true;
}
@@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
// Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->type != Tokens::Empty || node->children.size() > 1) &&
+ if ((node->id != Tokens::Empty || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
@@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
}
// If the node type is already Tokens::Empty, we cannot do anything here
- if (node->type == Tokens::Empty) {
+ if (node->id == Tokens::Empty) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
// type to Tokens::Empty instead
if (!node->children.empty()) {
- node->type = Tokens::Empty;
+ node->id = Tokens::Empty;
return true;
}
@@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept
}
node = it->second.get();
}
- return node->type;
+ return node->id;
}
}
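
A small sketch of the trie API after the "type" to "id" rename (the ids are
illustrative):

    TokenTrie trie;
    trie.registerToken("<<", 1);  // returns true
    trie.registerToken("<", 2);   // prefixes are allowed, returns true
    trie.hasToken("<<");          // == 1
    trie.hasToken("<=");          // == Tokens::Empty (not registered)
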
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index b2d1539..c470acc 100644
--- a/src/core/parser/utils/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -33,7 +33,7 @@
#include <limits>
#include <unordered_map>
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
@@ -75,10 +75,9 @@ public:
ChildMap children;
/**
- * Reference at the corresponding token descriptor. Set to nullptr if
- * no token is attached to this node.
+ * Id of the token represented by this node.
*/
- TokenId type;
+ TokenId id;
/**
 * Default constructor, initializes the id with Tokens::Empty.
@@ -99,10 +98,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param type is the descriptor that should be set for this token.
+ * @param id is the TokenId that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token, TokenId type) noexcept;
+ bool registerToken(const std::string &token, TokenId id) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index fc7bfaf..aeefa26 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -48,6 +48,17 @@ struct TokenMark {
TokenLength len;
/**
+ * Specifies whether the token is special or not.
+ */
+ bool special;
+
+ /**
+ * Maximum token length.
+ */
+ static constexpr TokenLength MaxTokenLength =
+ std::numeric_limits<TokenLength>::max();
+
+ /**
* Constructor of the TokenMark structure, initializes all members with the
* given values.
*
@@ -55,9 +66,10 @@ struct TokenMark {
* @param bufStart is the start position of the TokenMark in the internal
* character buffer.
* @param len is the length of the token.
+ * @param special modifies the sort order, special tokens are preferred.
*/
- TokenMark(TokenId id, size_t bufStart, TokenLength len)
- : bufStart(bufStart), id(id), len(len)
+ TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+ : bufStart(bufStart), id(id), len(len), special(special)
{
}
@@ -72,7 +84,8 @@ struct TokenMark {
TokenMark(size_t bufStart)
: bufStart(bufStart),
id(Tokens::Empty),
- len(std::numeric_limits<TokenLength>::max())
+ len(MaxTokenLength),
+ special(true)
{
}
@@ -86,8 +99,22 @@ struct TokenMark {
*/
friend bool operator<(const TokenMark &m1, const TokenMark &m2)
{
- return (m1.bufStart < m2.bufStart) ||
- (m1.bufStart == m2.bufStart && m1.len > m2.len);
+ // Prefer the mark with the smaller bufStart
+ if (m1.bufStart < m2.bufStart) {
+ return true;
+ }
+
+ // Special handling for marks with the same bufStart
+ if (m1.bufStart == m2.bufStart) {
+ // If exactly one of the two marks is special, return true if this
+ // one is special
+ if (m1.special != m2.special) {
+ return m1.special;
+ }
+ // Otherwise prefer longer marks
+ return m1.len > m2.len;
+ }
+ return false;
}
};
}
@@ -110,9 +137,9 @@ private:
std::vector<char> buf;
/**
- * Vector containing all token marks.
+ * Buffer storing the "protected" flag of the character data.
*/
- std::vector<TokenMark> marks;
+ std::vector<bool> protectedChars;
/**
* Vector storing all the character offsets efficiently.
@@ -120,9 +147,34 @@ private:
SourceOffsetVector offsets;
/**
+ * Vector containing all token marks.
+ */
+ mutable std::vector<TokenMark> marks;
+
+ /**
+ * Position of the first linebreak in a sequence of linebreaks.
+ */
+ size_t firstLinebreak;
+
+ /**
+ * Current indentation level.
+ */
+ uint16_t currentIndentation;
+
+ /**
+ * Last indentation level.
+ */
+ uint16_t lastIndentation;
+
+ /**
+ * Number of linebreaks without any content between them.
+ */
+ uint16_t numLinebreaks;
+
+ /**
* Flag indicating whether the internal "marks" vector is sorted.
*/
- bool sorted;
+ mutable bool sorted;
public:
/**
@@ -132,7 +184,7 @@ public:
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
- TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+ TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }
/**
* Appends a complete string to the internal character buffer and extends
@@ -140,22 +192,22 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart)
- { // Append the data to the internal buffer
- buf.insert(buf.end(), data.begin(), data.end());
-
- // Extend the text regions, interpolate the source position (this may
- // yield incorrect results)
- const size_t size = buf.size();
- for (SourceOffset offs = offsStart; offs < offsStart + data.size();
- offs++) {
- offsets.storeOffset(offs, offs + 1);
+ size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+ {
+ for (size_t i = 0; i < data.size(); i++) {
+ if (offsStart != InvalidSourceOffset) {
+ append(data[i], offsStart + i, offsStart + i + 1, protect);
+ } else {
+ append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+ protect);
+ }
}
-
- return size;
+ return size();
}
/**
@@ -165,16 +217,86 @@ public:
* @param c is the character that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param offsEnd is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect)
{
// Add the character to the list and store the location of the character
// in the source file
buf.push_back(c);
+ protectedChars.push_back(protect);
offsets.storeOffset(offsStart, offsEnd);
- return buf.size();
+
+ // Insert special tokens
+ const size_t size = buf.size();
+ const bool isWhitespace = Utils::isWhitespace(c);
+ const bool isLinebreak = Utils::isLinebreak(c);
+
+ // Handle linebreaks
+ if (isLinebreak) {
+ // Mark linebreaks as linebreak
+ mark(Tokens::Newline, size - 1, 1, false);
+
+ // The linebreak sequence started at the previous character
+ if (numLinebreaks == 0) {
+ firstLinebreak = size - 1;
+ }
+
+ // Reset the indentation
+ currentIndentation = 0;
+
+ // Increment the number of linebreaks
+ numLinebreaks++;
+
+ const size_t markStart = firstLinebreak;
+ const size_t markLength = size - firstLinebreak;
+
+ // Issue two consecutive linebreaks as paragraph token
+ if (numLinebreaks == 2) {
+ mark(Tokens::Paragraph, markStart, markLength, false);
+ }
+
+ // Issue three or more consecutive linebreaks as a section token
+ if (numLinebreaks >= 3) {
+ mark(Tokens::Section, markStart, markLength, false);
+ }
+ } else if (isWhitespace) {
+ // Count the whitespace characters at the beginning of the line
+ if (numLinebreaks > 0) {
+ // Implement the UNIX/Python rule for tabs: tabs extend to the
+ // next multiple of eight.
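+ // E.g. an indentation of 3 becomes (3 + 8) & ~7 = 8, and an
+ // indentation of 8 becomes (8 + 8) & ~7 = 16.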
+ if (c == '\t') {
+ currentIndentation = (currentIndentation + 8) & ~7;
+ } else {
+ currentIndentation++;
+ }
+ }
+ }
+
+ // Issue indent and dedent tokens
+ if (!isWhitespace && numLinebreaks > 0) {
+ // Issue a larger indentation than that in the previous line as
+ // "Indent" token
+ if (currentIndentation > lastIndentation) {
+ mark(Tokens::Indent, size - 1, 0, true);
+ }
+
+ // Issue a smaller indentation than that in the previous line as
+ // "Dedent" token
+ if (currentIndentation < lastIndentation) {
+ mark(Tokens::Dedent, size - 1, 0, true);
+ }
+
+ // Reset the internal state machine
+ lastIndentation = currentIndentation;
+ numLinebreaks = 0;
+ }
+
+ return size;
}
/**
@@ -184,11 +306,12 @@ public:
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
+ * @param special tags the mark as "special", preferring it in the sort order
*/
- void mark(TokenId id, size_t bufStart, TokenLength len)
+ void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
{
// Push the new instance back onto the list
- marks.emplace_back(id, bufStart, len);
+ marks.emplace_back(id, bufStart, len, special);
// Update the sorted flag as soon as more than one element is in the
// list
@@ -212,9 +335,13 @@ public:
* @return true if a token was returned, false if no more tokens are
* available.
*/
- bool next(Token &token, WhitespaceMode mode,
- const std::unordered_set<TokenId> &tokens, size_t &cursor)
+ bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+ TokenizedDataCursor &cursor) const
{
+ // Some variables for convenient access
+ size_t &bufPos = cursor.bufPos;
+ size_t &markPos = cursor.markPos;
+
// Sort the "marks" vector if it has not been sorted yet.
if (!sorted) {
std::sort(marks.begin(), marks.end());
@@ -222,10 +349,11 @@ public:
}
// Fetch the next larger TokenMark instance, make sure the token is in
- // the "enabled" list
- auto it =
- std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
- while (it != marks.end() && tokens.count(it->id) == 0) {
+ // the "enabled" list and within the buffer range
+ auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+ TokenMark(bufPos));
+ while (it != marks.end() && (tokens.count(it->id) == 0 ||
+ it->bufStart + it->len > buf.size())) {
it++;
}
@@ -236,15 +364,15 @@ public:
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
- if (cursor < end && cursor < buf.size()) {
+ if (bufPos < end && bufPos < buf.size()) {
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
- Tokens::Data, std::string(&buf[cursor], end - cursor),
+ Tokens::Data, std::string(&buf[bufPos], end - bufPos),
SourceLocation(sourceId,
- offsets.loadOffset(cursor).first,
+ offsets.loadOffset(bufPos).first,
offsets.loadOffset(end).first));
- cursor = end;
+ bufPos = end;
return true;
}
case WhitespaceMode::TRIM:
@@ -254,30 +382,35 @@ public:
size_t stringStart;
size_t stringEnd;
std::string content;
+ const char *cBuf = &buf[bufPos];
+ auto filter = [cBuf, bufPos, this](size_t i) -> bool {
+ // protectedChars is indexed relative to the whole buffer,
+ // hence the bufPos offset
+ return Utils::isWhitespace(cBuf[i]) &&
+ !protectedChars[bufPos + i];
+ };
if (mode == WhitespaceMode::TRIM) {
- content = Utils::trim(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::trim(cBuf, end - bufPos, stringStart,
+ stringEnd, filter);
} else {
- content = Utils::collapse(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::collapse(
+ cBuf, end - bufPos, stringStart, stringEnd, filter);
}
// If the resulting string is empty (only whitespaces),
// abort
if (content.empty()) {
- cursor = end;
+ bufPos = end;
break;
}
// Calculate the absolute positions and return the token
- stringStart += cursor;
- stringEnd += cursor;
+ stringStart += bufPos;
+ stringEnd += bufPos;
token = Token(
Tokens::Data, content,
SourceLocation(sourceId,
offsets.loadOffset(stringStart).first,
offsets.loadOffset(stringEnd).first));
- cursor = end;
+ bufPos = end;
return true;
}
}
@@ -286,14 +419,18 @@ public:
// If start equals end, we're currently directly at a token
// instance. Return this token and advance the cursor to the end of
// the token.
- if (cursor == end && it != marks.end()) {
+ if (bufPos == end && it != marks.end()) {
const size_t tokenStart = it->bufStart;
const size_t tokenEnd = it->bufStart + it->len;
token = Token(
it->id, std::string(&buf[tokenStart], it->len),
SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
offsets.loadOffset(tokenEnd).first));
- cursor = tokenEnd;
+
+ // Update the cursor, consume the token by incrementing the marks
+ // pos counter
+ bufPos = tokenEnd;
+ markPos = it - marks.begin() + 1;
return true;
}
@@ -304,11 +441,62 @@ public:
}
/**
+ * Resets the TokenizedDataImpl instance to the state it had when it was
+ * constructed.
+ */
+ void clear()
+ {
+ buf.clear();
+ protectedChars.clear();
+ offsets.clear();
+ marks.clear();
+ currentIndentation = 0;
+ lastIndentation = 0;
+ numLinebreaks = 1; // Assume the stream starts with a linebreak
+ sorted = true;
+ }
+
+ /**
+ * Trims the length of the TokenizedDataImpl instance to the given length.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length)
+ {
+ if (length < size()) {
+ buf.resize(length);
+ offsets.trim(length);
+ }
+ }
+
+ /**
* Returns the current size of the internal buffer.
*
* @return the size of the internal character buffer.
*/
- size_t getSize() { return buf.size(); }
+ size_t size() const { return buf.size(); }
+
+ /**
+ * Returns true if no data is in the data buffer.
+ *
+ * @return true if the "buf" instance has no data.
+ */
+ bool empty() const { return buf.empty(); }
+
+ /**
+ * Returns the current location of all data in the buffer.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const
+ {
+ if (empty()) {
+ return SourceLocation{sourceId};
+ }
+ return SourceLocation{sourceId, offsets.loadOffset(0).first,
+ offsets.loadOffset(size()).second};
+ }
};
/* Class TokenizedData */
@@ -316,50 +504,83 @@ public:
TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
TokenizedData::TokenizedData(SourceId sourceId)
- : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+ : impl(std::make_shared<TokenizedDataImpl>(sourceId))
{
}
TokenizedData::~TokenizedData() {}
-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+ bool protect)
{
- return impl->append(data, offsStart);
+ return impl->append(data, offsStart, protect);
}
size_t TokenizedData::append(char c, SourceOffset offsStart,
- SourceOffset offsEnd)
+ SourceOffset offsEnd, bool protect)
{
- return impl->append(c, offsStart, offsEnd);
+ return impl->append(c, offsStart, offsEnd, protect);
}
void TokenizedData::mark(TokenId id, TokenLength len)
{
- impl->mark(id, impl->getSize() - len, len);
+ impl->mark(id, impl->size() - len, len, false);
}
void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
- impl->mark(id, bufStart, len);
+ impl->mark(id, bufStart, len, false);
}
-bool TokenizedData::next(Token &token, WhitespaceMode mode)
+void TokenizedData::clear() { impl->clear(); }
+
+void TokenizedData::trim(size_t length) { impl->trim(length); }
+
+size_t TokenizedData::size() const { return impl->size(); }
+
+bool TokenizedData::empty() const { return impl->empty(); }
+
+SourceLocation TokenizedData::getLocation() const
{
- return impl->next(token, mode, tokens, cursor);
+ return impl->getLocation();
}
-bool TokenizedData::text(Token &token, WhitespaceMode mode)
+TokenizedDataReader TokenizedData::reader() const
{
- // Copy the current cursor position to not update the actual cursor position
- // if the operation was not successful
- size_t cursorCopy = cursor;
- if (!impl->next(token, mode, tokens, cursorCopy) ||
- token.id != Tokens::Data) {
- return false;
- }
+ return TokenizedDataReader(impl, TokenizedDataCursor(),
+ TokenizedDataCursor());
+}
+
+/* Class TokenizedDataReader */
- // There is indeed a text token, update the internal cursor position
- cursor = cursorCopy;
- return true;
+TokenizedDataReader::TokenizedDataReader(
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
+TokenizedDataReaderFork TokenizedDataReader::fork()
+{
+ return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
+}
+
+bool TokenizedDataReader::atEnd() const
+{
+ return readCursor.bufPos >= impl->size();
+}
+
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ peekCursor = readCursor;
+ return impl->next(token, mode, tokens, readCursor);
+}
+
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ return impl->next(token, mode, tokens, peekCursor);
}
}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..b72ca02 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -37,40 +37,48 @@
#include <core/common/Location.hpp>
#include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
/**
- * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
*/
-class TokenizedData {
-private:
+struct TokenizedDataCursor {
/**
- * Shared pointer pointing at the internal data. This data is shared when
- * copying TokenizedData instances, which corresponds to forking a
- * TokenizedData instance.
+ * Position within the byte buffer.
*/
- std::shared_ptr<TokenizedDataImpl> impl;
+ size_t bufPos;
/**
- * Contains all currently enabled token ids.
+ * Position within the token mark buffer.
*/
- std::unordered_set<TokenId> tokens;
+ size_t markPos;
/**
- * Position from which the last element was read from the internal buffer.
- * This information is not shared with the other instances of TokenizedData
- * pointing at the same location.
+ * Default constructor. The resulting cursor points at the beginning of the
+ * stream.
+ */
+ TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
+ * The TokenizedData class stores data extracted from a user-defined document.
+ * The data stored in a TokenizedData instance can be accessed through the
+ * TokenizedDataReader instances returned by the "reader" method.
+ */
+class TokenizedData {
+private:
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
*/
- size_t cursor;
+ std::shared_ptr<TokenizedDataImpl> impl;
public:
/**
@@ -101,10 +109,13 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling; they will be returned as-is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart = 0);
+ size_t append(const std::string &data, SourceOffset offsStart = 0,
+ bool protect = false);
/**
* Appends a single character to the internal character buffer.
@@ -112,10 +123,13 @@ public:
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling; it will be returned as-is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect = false);
/**
* Stores a token ending at the last character of the current buffer.
@@ -136,54 +150,194 @@ public:
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
- * Enables a single token id. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Resets the TokenizedData instance to the state it had when it was
+ * constructed.
+ */
+ void clear();
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length. Note
+ * that this function does not remove any token matches for performance
+ * reasons; it merely renders them inaccessible. Appending new data after
+ * calling trim will make the token marks accessible again. Thus this method
+ * should be the last function called to modify the data buffer and the
+ * token marks.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length);
+
+ /**
+ * Returns the number of characters currently represented by this
+ * TokenizedData instance.
+ */
+ size_t size() const;
+
+ /**
+ * Returns true if the TokenizedData instance is empty, false otherwise.
*
- * @param id is the TokenId of the token that should be enabled.
+ * @return true if no data is stored inside the TokenizedData instance.
*/
- void enableToken(TokenId id) { tokens.insert(id); }
+ bool empty() const;
/**
- * Enables a set of token ids. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Returns the location of the entire TokenizedData instance.
*
- * @param ids is the TokenId of the token that should be enabled.
+ * @return the location of the entire data represented by this instance.
*/
- void enableToken(const std::unordered_set<TokenId> &ids)
- {
- tokens.insert(ids.begin(), ids.end());
- }
+ SourceLocation getLocation() const;
+
+ /**
+ * Returns a TokenizedDataReader instance that can be used to access the
+ * data.
+ *
+ * @return a new TokenizedDataReader instance pointing at the beginning of
+ * the internal buffer.
+ */
+ TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read and peek access to the tokens
+ * and text stored in a TokenizedData instance without modifying that data.
+ */
+class TokenizedDataReader {
+private:
+ friend TokenizedData;
+
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
+ */
+ std::shared_ptr<const TokenizedDataImpl> impl;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ */
+ TokenizedDataCursor readCursor;
+
+ /**
+ * Position from which the last element was peeked from the internal buffer.
+ */
+ TokenizedDataCursor peekCursor;
+
+protected:
+ /**
+ * Protected constructor of TokenizedDataReader, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader.
+ *
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor);
+
+public:
+ /**
+ * Returns a new TokenizedDataReaderFork from which tokens and text can be
+ * read without advancing this reader instance.
+ */
+ TokenizedDataReaderFork fork();
+
+ /**
+ * Returns true if this reader has reached the end of the underlying data.
+ *
+ * @return true if the end of the TokenizedData instance has been reached.
+ */
+ bool atEnd() const;
/**
* Stores the next token in the given token reference, returns true if the
- * operation was successful, false if there are no more tokens.
+ * operation was successful, false if there are no more tokens. Advances the
+ * internal read cursor and resets the peek cursor to the new read position.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool read(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
- * Stores the next text token in the given token reference, returns true if
- * the operation was successful (there was indeed a text token), false if
- * the next token is not a text token or there were no more tokens.
+ * Stores the next token in the given token reference without advancing the
+ * read cursor; returns true if the operation was successful, false if there
+ * are no more tokens.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
+
+ /**
+ * Consumes the peeked tokens, the read cursor will now be at the position
+ * of the peek cursor.
+ */
+ void consumePeek() { readCursor = peekCursor; }
+
+ /**
+ * Resets the peek cursor to the position of the read cursor.
+ */
+ void resetPeek() { peekCursor = readCursor; }
+};
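A minimal sketch of the peek/consume protocol declared above, assuming a populated TokenizedData instance "data" and an enabled-token set "tokens":

    TokenizedDataReader reader = data.reader();
    Token token;
    while (reader.peek(token, tokens, WhitespaceMode::TRIM)) {
        if (token.id == Tokens::Data) {
            reader.consumePeek();  // accept the peeked text token
        } else {
            reader.resetPeek();    // rewind; read() will see this token again
            break;
        }
    }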
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader. It allows reading ahead without advancing the parent
+ * reader and committing the read progress back to it on demand.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+ friend TokenizedDataReader;
+
+ /**
+ * Reference pointing at the parent TokenizedDataReader to which changes may
+ * be committed.
+ */
+ TokenizedDataReader &parent;
+
+ /**
+ * Private constructor of TokenizedDataReaderFork, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader and a reference at the parent TokenizedDataReader.
+ *
+ * @param parent is the TokenizedDataReader instance to which the current
+ * read/peek progress may be committed.
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReaderFork(TokenizedDataReader &parent,
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+ {
+ }
+
+public:
+ /**
+ * Commits the read/peek progress to the underlying parent.
+ */
+ void commit() { parent = *this; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
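The fork/commit pair makes speculative parsing cheap: only the two cursors are copied while the underlying buffer stays shared. Continuing the sketch above:

    TokenizedDataReaderFork probe = reader.fork();
    Token token;
    if (probe.read(token, tokens) && token.id == Tokens::Data) {
        probe.commit();  // "reader" now continues after the consumed text
    }
    // without commit(), "reader" is left untouched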
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include "TokenizedData.hpp"
#include "Tokenizer.hpp"
namespace ousia {
@@ -42,26 +42,33 @@ struct TokenMatch {
Token token;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
+ size_t dataStartOffset;
/**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
+ * Set to true if the matched token is a primary token.
*/
- size_t textEnd;
+ bool primary;
/**
* Constructor of the TokenMatch class.
*/
- TokenMatch() : textLength(0), textEnd(0) {}
+ TokenMatch() : dataStartOffset(0), primary(false) {}
/**
* Returns true if this TokenMatch instance actually represents a match.
+ *
+ * @return true if the TokenMatch actually has a match.
+ */
+ bool hasMatch() const { return token.id != Tokens::Empty; }
+
+ /**
+ * Returns the length of the matched token.
+ *
+ * @return the length of the token string.
*/
- bool hasMatch() { return token.id != Tokens::Empty; }
+ size_t size() const { return token.content.size(); }
};
/* Internal class TokenLookup */
@@ -83,36 +90,28 @@ private:
size_t start;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
+ size_t dataStartOffset;
public:
/**
* Constructor of the TokenLookup class.
*
* @param node is the current node.
- * @param start is the start position.
- * @param textLength is the text buffer length of the previous text token.
- * @param textEnd is the current end location of the previous text token.
+ * @param start is the start position in the source file.
+ * @param dataStartOffset is the current length of the TokenizedData buffer.
*/
- TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
- size_t textEnd)
- : node(node), start(start), textLength(textLength), textEnd(textEnd)
+ TokenLookup(const TokenTrie::Node *node, size_t start,
+ size_t dataStartOffset)
+ : node(node), start(start), dataStartOffset(dataStartOffset)
{
}
/**
* Tries to extend the current path in the token trie with the given
- * character. If a complete token is matched, stores this match in the
- * tokens list (in case it is longer than any previous token).
+ * character. If a complete token is matched, stores the match in the given
+ * TokenMatch reference and returns true.
*
* @param c is the character that should be appended to the current prefix.
* @param lookups is a list to which new TokenLookup instances are added --
@@ -123,73 +122,49 @@ public:
* Tokenizer.
* @param end is the end byte offset of the current character.
* @param sourceId is the source id of this file.
+ * @return true if a token was matched, false otherwise.
*/
- void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
- const std::vector<std::string> &tokens, SourceOffset end,
- SourceId sourceId)
+ bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+ const std::vector<Tokenizer::TokenDescriptor> &tokens,
+ SourceOffset end, SourceId sourceId)
{
- // Check whether we can continue the current token path with the given
- // character without visiting an already visited node
+ // Set to true once a token has been matched
+ bool res = false;
+
+ // Check whether we can continue the current token path, if not, abort
auto it = node->children.find(c);
if (it == node->children.end()) {
- return;
+ return res;
}
// Check whether the new node represents a complete token and whether it
// is longer than the current token. If yes, replace the current token.
node = it->second.get();
- if (node->type != Tokens::Empty) {
- const std::string &str = tokens[node->type];
- size_t len = str.size();
- if (len > match.token.content.size()) {
- match.token =
- Token{node->type, str, {sourceId, start, end}};
- match.textLength = textLength;
- match.textEnd = textEnd;
- }
+ if (node->id != Tokens::Empty) {
+ const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+ match.token = Token(node->id, descr.string,
+ SourceLocation(sourceId, start, end));
+ match.dataStartOffset = dataStartOffset;
+ match.primary = descr.primary;
+ res = true;
}
// If this state can possibly be advanced, store it in the states list.
if (!node->children.empty()) {
lookups.emplace_back(*this);
}
+ return res;
}
};
-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
- SourceId sourceId)
-{
- if (match.hasMatch()) {
- match.token.content =
- std::string{handler.textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, handler.textStart, match.textEnd};
- } else {
- match.token.content = handler.toString();
- match.token.location =
- SourceLocation{sourceId, handler.textStart, handler.textEnd};
- }
- match.token.id = Tokens::Data;
-}
}
/* Class Tokenizer */
-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}
-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
// If we're in the read mode, reset the char reader peek position to the
// current read position
@@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token)
// Prepare the lookups in the token trie
const TokenTrie::Node *root = trie.getRoot();
- TokenMatch match;
+ TokenMatch bestMatch;
std::vector<TokenLookup> lookups;
std::vector<TokenLookup> nextLookups;
- // Instantiate the text handler
- TextHandler textHandler;
-
// Peek characters from the reader and try to advance the current token tree
// cursor
char c;
+ const size_t initialDataSize = data.size();
size_t charStart = reader.getPeekOffset();
const SourceId sourceId = reader.getSourceId();
while (reader.peek(c)) {
const size_t charEnd = reader.getPeekOffset();
- const size_t textLength = textHandler.textBuf.size();
- const size_t textEnd = textHandler.textEnd;
+ const size_t dataStartOffset = data.size();
// If we do not have a match yet, start a new lookup from the root
- if (!match.hasMatch()) {
- TokenLookup{root, charStart, textLength, textEnd}.advance(
- c, nextLookups, match, tokens, charEnd, sourceId);
+ if (!bestMatch.hasMatch()) {
+ lookups.emplace_back(root, charStart, dataStartOffset);
}
// Try to advance all other lookups with the new character
+ TokenMatch match;
for (TokenLookup &lookup : lookups) {
- lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+ // Continue if the current lookup did not result in a token match
+ if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+ sourceId)) {
+ continue;
+ }
+
+ // If the matched token is primary, check whether it is better than
+ // the current best match, if yes, replace the best match. In any
+ // case just continue
+ if (match.primary) {
+ if (match.size() > bestMatch.size()) {
+ bestMatch = match;
+ }
+ continue;
+ }
+
+ // Otherwise -- if the matched token is a non-primary token (and no
+ // primary token has been found until now) -- mark the match in the
+ // TokenizedData
+ if (!bestMatch.hasMatch()) {
+ data.mark(match.token.id, data.size() - match.size() + 1,
+ match.size());
+ }
}
// We have found a token and there are no more states to advance or the
// data buffer has received content -- abort to return the new token
- if (match.hasMatch()) {
- if ((nextLookups.empty() || textHandler.hasText())) {
+ if (bestMatch.hasMatch()) {
+ if ((nextLookups.empty() || data.size() > initialDataSize)) {
break;
}
} else {
// Record all incoming characters
- textHandler.append(c, charStart, charEnd);
+ data.append(c, charStart, charEnd);
}
// Swap the lookups and the nextLookups list
@@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token)
charStart = charEnd;
}
- // If we found text, emit that text
- if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
- buildDataToken(textHandler, match, sourceId);
+ // If we found data, emit a corresponding data token
+ if (data.size() > initialDataSize &&
+ (!bestMatch.hasMatch() ||
+ bestMatch.dataStartOffset > initialDataSize)) {
+ // If we have a "bestMatch" wich starts after text data has started,
+ // trim the TokenizedData to this offset
+ if (bestMatch.dataStartOffset > initialDataSize) {
+ data.trim(bestMatch.dataStartOffset);
+ }
+
+ // Create a token containing the data location
+ bestMatch.token = Token{data.getLocation()};
}
// Move the read/peek cursor to the end of the token, abort if an error
// happens while doing so
- if (match.hasMatch()) {
+ if (bestMatch.hasMatch()) {
// Make sure we have a valid location
- if (match.token.location.getEnd() == InvalidSourceOffset) {
+ if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
throw OusiaException{"Token end position offset out of range"};
}
// Seek to the end of the current token
- const size_t end = match.token.location.getEnd();
+ const size_t end = bestMatch.token.location.getEnd();
if (read) {
reader.seek(end);
} else {
reader.seekPeekCursor(end);
}
- token = match.token;
+ token = bestMatch.token;
} else {
token = Token{};
}
- return match.hasMatch();
+ return bestMatch.hasMatch();
}
-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, true>(reader, token);
- }
- return false;
+ return next<true>(reader, token, data);
}
-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, false>(reader, token);
- }
- return false;
+ return next<false>(reader, token, data);
}
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
{
// Abort if an empty token should be registered
if (token.empty()) {
@@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
// Search for a new slot in the tokens list
TokenId type = Tokens::Empty;
for (size_t i = nextTokenId; i < tokens.size(); i++) {
- if (tokens[i].empty()) {
- tokens[i] = token;
+ if (!tokens[i].valid()) {
+ tokens[i] = TokenDescriptor(token, primary);
type = i;
break;
}
@@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
// override the special token type handles
if (type == Tokens::Empty) {
type = tokens.size();
- if (type == Tokens::Data || type == Tokens::Empty) {
+ if (type >= Tokens::MaxTokenId) {
throw OusiaException{"Token type ids depleted!"};
}
- tokens.emplace_back(token);
+ tokens.emplace_back(token, primary);
}
nextTokenId = type + 1;
- // Try to register the token in the trie -- if this fails, remove it
- // from the tokens list
+ // Try to register the token in the trie -- if this fails, remove it from
+ // the tokens list
if (!trie.registerToken(token, type)) {
- tokens[type] = std::string{};
+ tokens[type] = TokenDescriptor();
nextTokenId = type;
return Tokens::Empty;
}
return type;
}
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
{
// Unregister the token from the trie, abort if an invalid type is given
- if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
- tokens[type] = std::string{};
- nextTokenId = type;
+ if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+ tokens[id] = TokenDescriptor();
+ nextTokenId = id;
return true;
}
return false;
}
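Since unregisterToken() rewinds nextTokenId, freed token ids are recycled by the next registration. A minimal sketch of the expected behavior (token strings are illustrative):

    Tokenizer tokenizer;
    TokenId a = tokenizer.registerToken("<<");         // primary by default
    TokenId b = tokenizer.registerToken("--", false);  // non-primary
    tokenizer.unregisterToken(a);                      // frees the slot of "a"
    TokenId c = tokenizer.registerToken("::");         // expected to reuse that slot, c == a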
-std::string Tokenizer::getTokenString(TokenId type)
-{
- if (type < tokens.size()) {
- return tokens[type];
- }
- return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
{
- whitespaceMode = mode;
+ if (id < tokens.size()) {
+ return tokens[id];
+ }
+ return EmptyTokenDescriptor;
}
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
/* Explicitly instantiate all possible instantiations of the "next" member
function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
}
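The two explicit instantiations are required because next() is defined in this .cpp file: without them no object code would be emitted for the template and read()/peek() would fail to link. A generic sketch of the idiom (names hypothetical, not this codebase's API):

    // Widget.hpp -- declaration only
    struct Widget {
        template <bool Read>
        bool next();
    };

    // Widget.cpp -- definition plus explicit instantiations
    template <bool Read>
    bool Widget::next() { return Read; }

    // Emit object code for both variants so other translation units can link
    template bool Widget::next<true>();
    template bool Widget::next<false>();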
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
/**
* @file Tokenizer.hpp
*
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
@@ -28,44 +28,80 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#include <set>
+#include <cstdint>
#include <string>
#include <vector>
#include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
-#include "Token.hpp"
#include "TokenTrie.hpp"
namespace ousia {
// Forward declarations
class CharReader;
+class TokenizedData;
/**
* The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows registering and unregistering tokens while parsing.
+ * Note that the Tokenizer always tries to extract the longest possible token
+ * from the input. Tokens can be registered as primary or non-primary tokens.
+ * If a token is registered as a primary token, it is returned as a single
+ * Token instance when it occurs. In the non-primary case the token is
+ * returned as part of a segmented TokenizedData instance.
*/
class Tokenizer {
-private:
+public:
/**
- * Internally used token trie. This object holds all registered tokens.
+ * Internally used structure describing a registered token.
*/
- TokenTrie trie;
+ struct TokenDescriptor {
+ /**
+ * String describing the token.
+ */
+ std::string string;
+
+ /**
+ * Set to true if this token is primary.
+ */
+ bool primary;
+
+ /**
+ * Constructor of the TokenDescriptor class.
+ *
+ * @param string is the string representation of the registered token.
+ * @param primary specifies whether the token is a primary token that
+ * should be returned as a single token, or a secondary token, that
+ * should be returned as part of TokenizedData.
+ */
+ TokenDescriptor(const std::string &string, bool primary)
+ : string(string), primary(primary)
+ {
+ }
+
+ /**
+ * Default constructor.
+ */
+ TokenDescriptor() : primary(false) {}
+
+ /**
+ * Returns true if the TokenDescriptor represents a valid token.
+ */
+ bool valid() const { return !string.empty(); }
+ };
+private:
/**
- * Flag defining whether whitespaces should be preserved or not.
+ * Internally used token trie. This object holds all registered tokens.
*/
- WhitespaceMode whitespaceMode;
+ TokenTrie trie;
/**
* Vector containing all registered token types.
*/
- std::vector<std::string> tokens;
+ std::vector<TokenDescriptor> tokens;
/**
* Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:
/**
* Templated function used internally to read the current token. The
- * function is templated in order to force code generation for all six
- * combiations of whitespace modes and reading/peeking.
+ * function is templated in order to force optimized code generation for
+ * both reading and peeking.
*
- * @tparam TextHandler is the type to be used for the textHandler instance.
- * @tparam read specifies whether the function should start from and advance
- * the read pointer of the char reader.
+ * @tparam read specifies whether the method should read the token or just
+ * peek.
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is the token structure into which the token information
* should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return false if the end of the stream has been reached, true otherwise.
*/
- template <typename TextHandler, bool read>
- bool next(CharReader &reader, Token &token);
+ template <bool read>
+ bool next(CharReader &reader, Token &token, TokenizedData &data);
public:
/**
* Constructor of the Tokenizer class.
- *
- * @param whitespaceMode specifies how whitespace should be handled.
*/
- Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+ Tokenizer();
/**
- * Registers the given string as a token. Returns a const pointer at a
- * TokenDescriptor that will be used to reference the newly created token.
+ * Registers the given string as a token. Returns a unique identifier
+ * describing the registered token.
*
* @param token is the token string that should be registered.
- * @return a unique identifier for the registered token or EmptyToken if
+ * @param primary specifies whether the token is a primary token -- if true,
+ * the token will be returned as a single, standalone token. Otherwise the
+ * token will be returned as part of a "TokenizedData" structure.
+ * @return a unique identifier for the registered token or Tokens::Empty if
* an error occured.
*/
- TokenId registerToken(const std::string &token);
+ TokenId registerToken(const std::string &token, bool primary = true);
/**
* Unregisters the token belonging to the given TokenId.
*
* @param type is the token type that should be unregistered. The
- *TokenId
- * must have been returned by registerToken.
+ * TokenId must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
- * because the given TokenDescriptor was already unregistered).
+ * because the token with the given TokenId was already unregistered).
*/
- bool unregisterToken(TokenId type);
+ bool unregisterToken(TokenId id);
/**
* Returns the token that was registered under the given TokenId id or
- *an
- * empty string if an invalid TokenId id is given.
+ * an empty string if an invalid TokenId id is given.
*
- * @param type is the TokenId id for which the corresponding token
- *string
+ * @param id is the TokenId for which the corresponding TokenDescriptor
* should be returned.
- * @return the registered token string or an empty string if the given type
- * was invalid.
- */
- std::string getTokenString(TokenId type);
-
- /**
- * Sets the whitespace mode.
- *
- * @param whitespaceMode defines how whitespace should be treated in text
- * tokens.
- */
- void setWhitespaceMode(WhitespaceMode mode);
-
- /**
- * Returns the current value of the whitespace mode.
- *
- * @return the whitespace mode.
+ * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+ * the given TokenId is invalid.
*/
- WhitespaceMode getWhitespaceMode();
+ const TokenDescriptor& lookupToken(TokenId id) const;
/**
* Reads a new token from the CharReader and stores it in the given
- * Token instance.
+ * Token instance. If the token has the id Tokens::Data, use the "getData"
+ * method to fetch a reference at the underlying TokenizedData instance
+ * storing the data.
*
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(CharReader &reader, Token &token);
+ bool read(CharReader &reader, Token &token, TokenizedData &data);
/**
* The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool peek(CharReader &reader, Token &token);
+ bool peek(CharReader &reader, Token &token, TokenizedData &data);
};
}
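Putting the pieces together, a hedged usage sketch of the new read() interface (assuming, as in the unit tests, that CharReader is constructible from a string, and that data tokens carry the id Tokens::Data):

    #include <core/common/CharReader.hpp>
    #include <core/parser/utils/TokenizedData.hpp>
    #include <core/parser/utils/Tokenizer.hpp>

    void tokenizeExample()
    {
        CharReader reader("a**b");
        Tokenizer tokenizer;
        TokenId star = tokenizer.registerToken("**", true);  // primary token
        Token token;
        TokenizedData data(reader.getSourceId());
        while (tokenizer.read(reader, token, data)) {
            if (token.id == Tokens::Data) {
                // character data accumulated in "data"; extract it via a
                // TokenizedDataReader with the desired WhitespaceMode
            } else if (token.id == star) {
                // "**" was matched as a standalone token
            }
            data.clear();  // reuse the buffer for the next call
        }
    }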
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..d4cdbf8 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -94,92 +94,11 @@ public:
static const PlainFormatTokens OsmlTokens;
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
- /**
- * Internal character buffer.
- */
- std::vector<char> buf;
-
- /**
- * Start location of the character data.
- */
- SourceOffset start;
-
- /**
- * End location of the character data.
- */
- SourceOffset end;
-
-public:
- /**
- * Default constructor, initializes start and end with zeros.
- */
- DataHandler() : start(0), end(0) {}
-
- /**
- * Returns true if the internal buffer is empty.
- *
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
- */
- bool isEmpty() { return buf.empty(); }
-
- /**
- * Appends a single character to the internal buffer.
- *
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
- */
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
- {
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
- }
-
- /**
- * Appends a string to the internal buffer.
- *
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
- */
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
- {
- if (isEmpty()) {
- start = stringStart;
- }
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
- }
-
- /**
- * Converts the internal buffer to a variant with attached location
- * information.
- *
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
- */
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
-};
-
OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
- : reader(reader), logger(logger), tokenizer(OsmlTokens)
+ : reader(reader),
+ logger(logger),
+ tokenizer(OsmlTokens),
+ data(reader.getSourceId())
{
// Place an intial command representing the complete file on the stack
commands.push(Command{"", Variant::mapType{}, true, true, true, false});
@@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
- bool hasCharSiceNSSep = false;
+ bool hasCharSinceNSSep = false;
std::vector<char> identifier;
size_t end = reader.getPeekOffset();
char c, c2;
@@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+ } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
identifier.push_back(c);
} else {
@@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
// This is no longer the first character
first = false;
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
+ // Advance the hasCharSinceNSSep flag
+ hasCharSinceNSSep = allowNSSep && (c != ':');
end = reader.getPeekOffset();
reader.consumePeek();
@@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()
{
Token token;
size_t depth = 1;
- while (tokenizer.read(reader, token)) {
+ while (tokenizer.read(reader, token, data)) {
+ // Throw the comment data away
+ data.clear();
+
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
@@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()
}
}
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData()
{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
+ if (!data.empty()) {
location = data.getLocation();
reader.resetPeek();
return true;
@@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()
OsmlStreamParser::State OsmlStreamParser::parse()
{
- // Handler for incomming data
- DataHandler handler;
+ // Reset the data handler
+ data.clear();
// Read tokens until the outer loop should be left
Token token;
- while (tokenizer.peek(reader, token)) {
+ while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
@@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// If this was an annotation start token, add the parsed < to the
// output
if (type == OsmlTokens.AnnotationStart) {
- handler.append('<', token.location.getStart(),
- token.location.getStart() + 1);
+ data.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
}
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
+ data.append(c, token.location.getStart(), reader.getPeekOffset());
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
@@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse()
location = token.location;
return State::FIELD_START;
}
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
reader.consumePeek();
continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse()
Command &cmd = commands.top();
if (!cmd.inField) {
cmd.inField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got field start token \"{\", but no command for which to "
"start the field. Write \"\\{\" to insert this sequence as "
"text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.FieldEnd) {
- if (closeField()) {
+ closeField();
+ return State::FIELD_END;
+/* if (closeField()) {
return State::FIELD_END;
}
logger.error(
"Got field end token \"}\", but there is no field to end. "
"Write \"\\}\" to insert this sequence as text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.DefaultFieldStart) {
// Try to start a default field the first time the token is reached
Command &topCmd = commands.top();
if (!topCmd.inField) {
topCmd.inField = true;
topCmd.inDefaultField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got default field start token \"{!\", but no command for "
"which to start the field. Write \"\\{!\" to insert this "
"sequence as text",
- token);
+ token);*/
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event
@@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
}
// Issue available data
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()
return State::END;
}
+Variant OsmlStreamParser::getText(WhitespaceMode mode)
+{
+ TokenizedData dataFork = data;
+ Variant text = dataFork.text(mode);
+ location = text.getLocation();
+ return text;
+}
+
const Variant &OsmlStreamParser::getCommandName() const
{
return commands.top().name;
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..453a2bb 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,17 +29,19 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <memory>
#include <core/common/Variant.hpp>
+#include <core/common/Whitespace.hpp>
#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
@@ -137,26 +139,15 @@ public:
Variant arguments;
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
+ * Vector used as a stack holding one entry per currently open field,
+ * storing the corresponding "isDefaultField" flag.
*/
- bool inRangeField : 1;
+ std::vector<bool> fields;
/**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
+ * Set to true if this is a command with clear begin and end.
*/
- bool inDefaultField : 1;
+ bool hasRange;
/**
* Default constructor.
@@ -164,7 +155,6 @@ public:
Command()
: hasRange(false),
inField(false),
- inRangeField(false),
inDefaultField()
{
}
@@ -178,15 +168,10 @@ public:
* command.
* @param hasRange should be set to true if this is a command with
* explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
* @param inDefaultField is set to true if we currently are in a
* specially marked default field.
*/
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
+ Command(Variant name, Variant arguments, bool hasRange)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange),
@@ -215,25 +200,20 @@ private:
Tokenizer tokenizer;
/**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
+ * TokenizedData instance containing the data that was returned by the
+ * tokenizer.
*/
- Variant data;
+ TokenizedData data;
/**
- * Contains the location of the last token.
+ * Stack containing the current commands.
*/
- SourceLocation location;
+ std::stack<Command> commands;
/**
- * Contains the field index of the current command.
+ * Pointer at
*/
- size_t fieldIdx;
+ std::unique_ptr<OsmlStreamParserImpl> impl;
/**
* Function used internall to parse an identifier.
@@ -291,12 +271,10 @@ private:
/**
* Checks whether there is any data pending to be issued, if yes, issues it.
*
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
* @return true if there was any data and DATA should be returned by the
* parse function, false otherwise.
*/
- bool checkIssueData(DataHandler &handler);
+ bool checkIssueData();
/**
* Called before any data is appended to the internal data handler. Checks
@@ -328,6 +306,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed since OsmlStreamParserImpl is
+ * an incomplete type at this point.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -344,7 +328,19 @@ public:
* @return a reference at the TokenizedData instance containing the data
* parsed by the "parse" function.
*/
- const Variant &getData() const { return data; }
+ const TokenizedData &getData() const { return data; }
+
+ /**
+ * Returns the complete content of the internal TokenizedData instance as
+ * a single string Variant. This method is mainly used in the unit tests for
+ * this class, it simply calls the text() method of TokenizedData.
+ *
+ * @param mode is the WhitespaceMode that should be used for returning the
+ * text.
+ * @return a string variant containing the text content of the internal
+ * TokenizedData instance or a nullptr variant if there is no text.
+ */
+ Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -371,13 +367,6 @@ public:
* syntax).
*/
bool inDefaultField() const;
-
- /**
- * Returns a reference at the char reader.
- *
- * @return the last internal token location.
- */
- const SourceLocation &getLocation() const { return location; }
};
}
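A sketch of the intended event loop around parse() and the new getData()/getText() accessors (the logger type and any states beyond those visible in this diff are assumptions):

    #include <core/common/CharReader.hpp>
    #include <formats/osml/OsmlStreamParser.hpp>

    void parseExample()
    {
        CharReader reader("\\book{...}");
        TerminalLogger logger(std::cerr);  // logger type assumed; any concrete Logger works
        OsmlStreamParser parser(reader, logger);
        for (OsmlStreamParser::State state = parser.parse();
             state != OsmlStreamParser::State::END; state = parser.parse()) {
            if (state == OsmlStreamParser::State::DATA) {
                Variant text = parser.getText(WhitespaceMode::COLLAPSE);
                // ... handle the collapsed text ...
            }
            // ... handle FIELD_START, FIELD_END, ANNOTATION_END, ...
        }
    }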
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..855f80d 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,6 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -57,17 +56,6 @@ public:
std::vector<char> textBuf;
/**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
* Current character data start.
*/
size_t textStart;
@@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
// Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
OsxmlEventParserData &data = parser->getData();
std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
+
+ // Update start and end position
+ if (textBuf.empty()) {
+ data.textStart = loc.getStart();
}
+ data.textEnd = loc.getEnd();
+
+ // Insert the data into the text buffer
+ textBuf.insert(textBuf.end(), &s[0], &s[ulen]);
}
/* Class OsxmlEvents */
@@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+ : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0)
{
}
@@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId)
// Reset the text buffers
textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
textStart = 0;
textEnd = 0;
@@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
data(new OsxmlEventParserData())
{
}
@@ -532,16 +497,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..e3fd5d4 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -99,13 +97,10 @@ public:
virtual void fieldEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a Variant containing the string data that was found in the
+ * XML file.
*/
virtual void data(const Variant &data) = 0;
};
@@ -135,11 +130,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +161,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp
index a93f14a..83966d5 100644
--- a/test/core/parser/stack/StackTest.cpp
+++ b/test/core/parser/stack/StackTest.cpp
@@ -24,6 +24,7 @@
#include <core/parser/stack/Handler.hpp>
#include <core/parser/stack/Stack.hpp>
#include <core/parser/stack/State.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/StandaloneEnvironment.hpp>
@@ -53,7 +54,7 @@ struct Tracker {
Variant::mapType annotationStartArgs;
Variant annotationEndClassName;
Variant annotationEndElementName;
- Variant dataData;
+ TokenizedData dataData;
bool startResult;
bool fieldStartSetIsDefault;
@@ -81,7 +82,7 @@ struct Tracker {
annotationStartArgs = Variant::mapType{};
annotationEndClassName = Variant::fromString(std::string{});
annotationEndElementName = Variant::fromString(std::string{});
- dataData = Variant::fromString(std::string{});
+ dataData = TokenizedData();
startResult = true;
fieldStartSetIsDefault = false;
@@ -157,7 +158,7 @@ public:
return tracker.annotationEndResult;
}
- bool data(Variant &data) override
+ bool data(TokenizedData &data) override
{
tracker.dataCount++;
tracker.dataData = data;
@@ -363,7 +364,7 @@ TEST(Stack, multipleFields)
s.data("test");
tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test", tracker.dataData);
+ EXPECT_EQ("test", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc
@@ -375,7 +376,7 @@ TEST(Stack, multipleFields)
s.data("test2");
tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test2", tracker.dataData);
+ EXPECT_EQ("test2", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc
@@ -387,7 +388,7 @@ TEST(Stack, multipleFields)
s.data("test3");
tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test3", tracker.dataData);
+ EXPECT_EQ("test3", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc
@@ -744,4 +745,4 @@ TEST(Stack, fieldAfterDefaultField)
ASSERT_FALSE(logger.hasError());
}
}
-} \ No newline at end of file
+}
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
index 231bad9..dfe2526 100644
--- a/test/core/parser/utils/TokenizedDataTest.cpp
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -22,6 +22,43 @@
namespace ousia {
+void assertToken(TokenizedDataReader &reader, TokenId id,
+ const std::string &text, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId sourceId = InvalidSourceId)
+{
+ Token token;
+ ASSERT_TRUE(reader.read(token, tokens, mode));
+ EXPECT_EQ(id, token.id);
+ EXPECT_EQ(text, token.content);
+ if (start != InvalidSourceOffset) {
+ EXPECT_EQ(start, token.getLocation().getStart());
+ }
+ if (end != InvalidSourceOffset) {
+ EXPECT_EQ(end, token.getLocation().getEnd());
+ }
+ EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+void assertText(TokenizedDataReader &reader, const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId id = InvalidSourceId)
+{
+ assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id);
+}
+
+void assertEnd(TokenizedDataReader &reader)
+{
+ Token token;
+ ASSERT_TRUE(reader.atEnd());
+ ASSERT_FALSE(reader.read(token));
+}
+
TEST(TokenizedData, dataWhitespacePreserve)
{
TokenizedData data;
@@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test1 test2 ", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(16U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE,
+ 0, 16);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceTrim)
@@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceCollapse)
@@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, singleToken)
@@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, singleDisabledToken)
@@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualToken)
@@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(5);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0,
+ 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenShorterEnabled)
@@ -142,385 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(1U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenLongerEnabled)
{
TokenizedData data;
ASSERT_EQ(2U, data.append("$$"));
+ data.mark(6, 0, 1);
data.mark(5, 0, 2);
+ data.mark(6, 1, 1);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataPreserveWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+ assertText(reader, " test text ", TokenSet{5}, WhitespaceMode::PRESERVE,
+ 2, 16);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataTrimWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+ assertText(reader, "test text", TokenSet{5}, WhitespaceMode::TRIM, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataCollapseWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+ assertText(reader, " ", TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8);
+ assertEnd(reader);
}
-TEST(TokenizedData, textPreserveWhitespace)
+TEST(TokenizedData, appendChars)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
-
- data.enableToken(5);
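+ // Characters can be appended one at a time with explicit start/end source
+ // offsets; the return value apparently is the resulting buffer size.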
+ ASSERT_EQ(1U, data.append('t', 5, 7));
+ ASSERT_EQ(2U, data.append('e', 7, 8));
+ ASSERT_EQ(3U, data.append('s', 8, 10));
+ ASSERT_EQ(4U, data.append('t', 10, 12));
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(4U, token.getLocation().getStart());
- EXPECT_EQ(6U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12);
+ assertEnd(reader);
}
-TEST(TokenizedData, textTrimWhitespace)
+TEST(TokenizedData, protectedWhitespace)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
+ ASSERT_EQ(4U, data.append("test", 10));
+ ASSERT_EQ(11U, data.append("   test", 14, true));
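+ // The third argument presumably marks the appended whitespace as
+ // protected, so the gap survives the COLLAPSE mode used below.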
- data.enableToken(5);
-
- Token token;
- ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10,
+ 21);
+ assertEnd(reader);
+}
- ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+TEST(TokenizedData, specialNewlineToken)
+{
+ TokenizedData data;
+ data.append("a\nb\n \nc\n");
+ // 0 12 3456 78 9
+
+ const TokenSet tokens{Tokens::Newline};
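+ // With Tokens::Newline enabled, every line break is reported as a token of
+ // its own instead of being folded into the surrounding data.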
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 3, 4);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 7, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 9, 10);
+ assertEnd(reader);
}
-TEST(TokenizedData, textCollapseWhitespace)
+TEST(TokenizedData, specialParagraphToken)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
+ data.append("a\nb\n \nc\n");
+ // 0 12 3456 78 9
- data.enableToken(5);
+ const TokenSet tokens{Tokens::Paragraph};
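+ // A Tokens::Paragraph token covers a single blank line, i.e. two line
+ // breaks and any whitespace between them.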
- Token token;
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+ assertToken(reader, Tokens::Paragraph, "\n   \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertEnd(reader);
+}
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+TEST(TokenizedData, specialSectionToken)
+{
+ TokenizedData data;
+ data.append("a\nb\n \n \t \n");
+ // 0 12 3456 789 01 2
+ // 0 1
+
+ const TokenSet tokens{Tokens::Section};
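+ // A Tokens::Section token covers at least two consecutive blank lines.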
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+ assertToken(reader, Tokens::Section, "\n   \n  \t \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 13);
+ assertEnd(reader);
}
-TEST(TokenizedData, appendChars)
+TEST(TokenizedData, specialTokenPrecedence)
{
TokenizedData data;
- ASSERT_EQ(1U, data.append('t', 5, 7));
- ASSERT_EQ(2U, data.append('e', 7, 8));
- ASSERT_EQ(3U, data.append('s', 8, 10));
- ASSERT_EQ(4U, data.append('t', 10, 12));
+ data.append("a\nb\n\nc\n\n\nd");
+ // 0 12 3 45 6 7 89
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
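+ // If several special tokens are enabled at once, the widest applicable one
+ // wins: a single break yields Newline, one blank line Paragraph and two
+ // blank lines Section.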
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 5);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 6, 9);
+ assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(5U, token.getLocation().getStart());
- EXPECT_EQ(12U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+TEST(TokenizedData, specialTokenPrecedence2)
+{
+ TokenizedData data;
+ data.append("\nb\n\nc\n\n\n");
+ // 0 12 3 45 6 7
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 0, 1);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 2, 4);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 5, 8);
+ assertEnd(reader);
}
-TEST(TokenizedData, copy)
+TEST(TokenizedData, specialTokenIndent)
{
TokenizedData data;
- ASSERT_EQ(7U, data.append(" a $ b "));
- // 0123456
- data.mark(6, 3, 1);
- data.enableToken(6);
+ data.append(" test\n\ttest2\n test3 \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent};
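+ // Indent and Dedent are synthetic, zero-length tokens: note that start and
+ // end offset coincide in the assertions below.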
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("a", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
-
- TokenizedData dataCopy = data;
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" b ", token.content);
- EXPECT_EQ(4U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
- ASSERT_FALSE(data.next(token));
-
- ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("b", token.content);
- EXPECT_EQ(5U, token.getLocation().getStart());
- EXPECT_EQ(6U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
- ASSERT_FALSE(dataCopy.next(token));
+TEST(TokenizedData, specialTokenIndentOverlap)
+{
+ TokenizedData data;
+ data.append(" test\n\ttest2\n test3 \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5};
+
+ data.mark(5, 4, 4);
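+ // Token 5 is marked over the same range as the first word, so it replaces
+ // the data token while the zero-length Indent is still emitted before it.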
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
}
+
}
diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 3809a12..0f2bfb7 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -20,6 +20,7 @@
#include <core/common/CharReader.hpp>
#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
namespace ousia {
@@ -31,23 +32,40 @@ TEST(Tokenizer, tokenRegistration)
ASSERT_EQ(0U, tokenizer.registerToken("a"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a"));
- ASSERT_EQ("a", tokenizer.getTokenString(0U));
+ ASSERT_EQ("a", tokenizer.lookupToken(0U).string);
ASSERT_EQ(1U, tokenizer.registerToken("b"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b"));
- ASSERT_EQ("b", tokenizer.getTokenString(1U));
+ ASSERT_EQ("b", tokenizer.lookupToken(1U).string);
ASSERT_EQ(2U, tokenizer.registerToken("c"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("c"));
- ASSERT_EQ("c", tokenizer.getTokenString(2U));
+ ASSERT_EQ("c", tokenizer.lookupToken(2U).string);
ASSERT_TRUE(tokenizer.unregisterToken(1U));
ASSERT_FALSE(tokenizer.unregisterToken(1U));
- ASSERT_EQ("", tokenizer.getTokenString(1U));
+ ASSERT_EQ("", tokenizer.lookupToken(1U).string);
ASSERT_EQ(1U, tokenizer.registerToken("d"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d"));
- ASSERT_EQ("d", tokenizer.getTokenString(1U));
+ ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
+}
+
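+// Checks that the given token is a data token and that the text stored in
+// the TokenizedData instance matches the expected string and source
+// locations under the given whitespace mode.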
+void expectData(const std::string &expected, SourceOffset tokenStart,
+ SourceOffset tokenEnd, SourceOffset textStart,
+ SourceOffset textEnd, const Token &token, TokenizedData &data,
+ WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ Variant text = data.text(mode);
+ ASSERT_TRUE(text.isString());
+
+ EXPECT_EQ(expected, text.asString());
+ EXPECT_EQ(tokenStart, token.location.getStart());
+ EXPECT_EQ(tokenEnd, token.location.getEnd());
+ EXPECT_EQ(textStart, text.getLocation().getStart());
+ EXPECT_EQ(textEnd, text.getLocation().getEnd());
}
TEST(Tokenizer, textTokenPreserveWhitespace)
@@ -56,36 +74,34 @@ TEST(Tokenizer, textTokenPreserveWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::PRESERVE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ(" this \t is only a \n\n test text ", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(36U, loc.getEnd());
+ expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36,
+ token, data, WhitespaceMode::PRESERVE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::PRESERVE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 32, 0, 32,
+ token, data, WhitespaceMode::PRESERVE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -95,36 +111,34 @@ TEST(Tokenizer, textTokenTrimWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::TRIM};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(1U, loc.getStart());
- ASSERT_EQ(33U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token,
+ data, WhitespaceMode::TRIM);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::TRIM};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 32, 0, 32,
+ token, data, WhitespaceMode::TRIM);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -134,36 +148,34 @@ TEST(Tokenizer, textTokenCollapseWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::COLLAPSE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this is only a test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(1U, loc.getStart());
- ASSERT_EQ(33U, loc.getEnd());
+ expectData("this is only a test text", 0, 36, 1, 33, token, data,
+ WhitespaceMode::COLLAPSE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::COLLAPSE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this is only a test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this is only a test text", 0, 32, 0, 32, token, data,
+ WhitespaceMode::COLLAPSE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -177,14 +189,12 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ expectData("test1", 0, 5, 0, 5, token, data);
char c;
ASSERT_TRUE(reader.peek(c));
@@ -193,7 +203,8 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -209,14 +220,10 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ expectData("test2", 6, 11, 6, 11, token, data);
char c;
ASSERT_FALSE(reader.peek(c));
@@ -233,21 +240,17 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
+ expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -261,35 +264,26 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
+ expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(5U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -303,14 +297,9 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(11U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
@@ -320,6 +309,7 @@ TEST(Tokenizer, ambiguousTokens)
{
CharReader reader{"abc"};
Tokenizer tokenizer;
+ TokenizedData data;
TokenId t1 = tokenizer.registerToken("abd");
TokenId t2 = tokenizer.registerToken("bc");
@@ -328,16 +318,17 @@ TEST(Tokenizer, ambiguousTokens)
ASSERT_EQ(1U, t2);
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("a", token.content);
+ expectData("a", 0, 1, 0, 1, token, data);
SourceLocation loc = token.location;
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
- ASSERT_TRUE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(t2, token.id);
ASSERT_EQ("bc", token.content);
@@ -346,7 +337,8 @@ TEST(Tokenizer, ambiguousTokens)
ASSERT_EQ(1U, loc.getStart());
ASSERT_EQ(3U, loc.getEnd());
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
TEST(Tokenizer, commentTestWhitespacePreserve)
@@ -354,7 +346,7 @@ TEST(Tokenizer, commentTestWhitespacePreserve)
CharReader reader{"Test/Test /* Block Comment */", 0};
// 012345678901234567890123456789
// 0 1 2
- Tokenizer tokenizer(WhitespaceMode::PRESERVE);
+ Tokenizer tokenizer;
const TokenId t1 = tokenizer.registerToken("/");
const TokenId t2 = tokenizer.registerToken("/*");
@@ -370,45 +362,23 @@ TEST(Tokenizer, commentTestWhitespacePreserve)
Token t;
for (auto &te : expected) {
- EXPECT_TRUE(tokenizer.read(reader, t));
+ TokenizedData data(0);
+ EXPECT_TRUE(tokenizer.read(reader, t, data));
EXPECT_EQ(te.id, t.id);
- EXPECT_EQ(te.content, t.content);
+ if (te.id != Tokens::Data) {
+ EXPECT_EQ(te.content, t.content);
+ } else {
+ Variant text = data.text(WhitespaceMode::PRESERVE);
+ ASSERT_TRUE(text.isString());
+ EXPECT_EQ(te.content, text.asString());
+ }
EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
EXPECT_EQ(te.location.getStart(), t.location.getStart());
EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
}
- ASSERT_FALSE(tokenizer.read(reader, t));
-}
-
-TEST(Tokenizer, commentTestWhitespaceCollapse)
-{
- CharReader reader{"Test/Test /* Block Comment */", 0};
- // 012345678901234567890123456789
- // 0 1 2
- Tokenizer tokenizer(WhitespaceMode::COLLAPSE);
- const TokenId t1 = tokenizer.registerToken("/");
- const TokenId t2 = tokenizer.registerToken("/*");
- const TokenId t3 = tokenizer.registerToken("*/");
-
- std::vector<Token> expected = {
- {Tokens::Data, "Test", SourceLocation{0, 0, 4}},
- {t1, "/", SourceLocation{0, 4, 5}},
- {Tokens::Data, "Test", SourceLocation{0, 5, 9}},
- {t2, "/*", SourceLocation{0, 10, 12}},
- {Tokens::Data, "Block Comment", SourceLocation{0, 13, 26}},
- {t3, "*/", SourceLocation{0, 27, 29}}};
-
- Token t;
- for (auto &te : expected) {
- EXPECT_TRUE(tokenizer.read(reader, t));
- EXPECT_EQ(te.id, t.id);
- EXPECT_EQ(te.content, t.content);
- EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
- EXPECT_EQ(te.location.getStart(), t.location.getStart());
- EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
- }
- ASSERT_FALSE(tokenizer.read(reader, t));
+ TokenizedData data;
+ ASSERT_FALSE(tokenizer.read(reader, t, data));
}
}
diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp
index d52fa5b..3d01007 100644
--- a/test/formats/osml/OsmlStreamParserTest.cpp
+++ b/test/formats/osml/OsmlStreamParserTest.cpp
@@ -30,11 +30,21 @@ namespace ousia {
static TerminalLogger logger(std::cerr, true);
// static ConcreteLogger logger;
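+// Skips a single whitespace-only DATA event, if present, and returns the
+// state that follows it; the parser now reports inter-command whitespace as
+// data, which most of the assertions below want to ignore.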
+static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader)
+{
+ OsmlStreamParser::State res = reader.parse();
+ if (res == OsmlStreamParser::State::DATA) {
+ EXPECT_FALSE(reader.getData().hasNonWhitespaceText());
+ res = reader.parse();
+ }
+ return res;
+}
+
static void assertCommand(OsmlStreamParser &reader, const std::string &name,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader));
EXPECT_EQ(name, reader.getCommandName().asString());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getCommandName().getLocation().getStart());
@@ -57,16 +67,19 @@ static void assertCommand(OsmlStreamParser &reader, const std::string &name,
static void assertData(OsmlStreamParser &reader, const std::string &data,
SourceOffset start = InvalidSourceOffset,
- SourceOffset end = InvalidSourceOffset)
+ SourceOffset end = InvalidSourceOffset,
+ WhitespaceMode mode = WhitespaceMode::COLLAPSE)
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- EXPECT_EQ(data, reader.getData().asString());
+ Variant text = reader.getText(mode);
+ ASSERT_TRUE(text.isString());
+ EXPECT_EQ(data, text.asString());
if (start != InvalidSourceOffset) {
- EXPECT_EQ(start, reader.getData().getLocation().getStart());
+ EXPECT_EQ(start, text.getLocation().getStart());
EXPECT_EQ(start, reader.getLocation().getStart());
}
if (end != InvalidSourceOffset) {
- EXPECT_EQ(end, reader.getData().getLocation().getEnd());
+ EXPECT_EQ(end, text.getLocation().getEnd());
EXPECT_EQ(end, reader.getLocation().getEnd());
}
}
@@ -75,7 +88,7 @@ static void assertFieldStart(OsmlStreamParser &reader, bool defaultField,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader));
EXPECT_EQ(defaultField, reader.inDefaultField());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
@@ -89,7 +102,7 @@ static void assertFieldEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader));
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
}
@@ -103,7 +116,7 @@ static void assertAnnotationStart(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader));
EXPECT_EQ(name, reader.getCommandName().asString());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getCommandName().getLocation().getStart());
@@ -131,7 +144,7 @@ static void assertAnnotationEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader));
ASSERT_EQ(name, reader.getCommandName().asString());
if (!elementName.empty()) {
ASSERT_EQ(1U, reader.getCommandArguments().asMap().size());
@@ -152,7 +165,7 @@ static void assertEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader));
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
}
@@ -205,26 +218,14 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak)
assertData(reader, "hello world", 1, 14);
}
-TEST(OsmlStreamParser, escapeWhitespace)
-{
- const char *testString = " hello\\ \\ world ";
- // 012345 67 89012345
- // 0 1
- CharReader charReader(testString);
-
- OsmlStreamParser reader(charReader, logger);
-
- assertData(reader, "hello world", 1, 15);
-}
-
static void testEscapeSpecialCharacter(const std::string &c)
{
CharReader charReader(std::string("\\") + c);
OsmlStreamParser reader(charReader, logger);
EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- EXPECT_EQ(c, reader.getData().asString());
+ EXPECT_EQ(c, reader.getText().asString());
- SourceLocation loc = reader.getData().getLocation();
+ SourceLocation loc = reader.getText().getLocation();
EXPECT_EQ(0U, loc.getStart());
EXPECT_EQ(1U + c.size(), loc.getEnd());
}
@@ -253,16 +254,16 @@ TEST(OsmlStreamParser, singleLineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(33U, loc.getStart());
ASSERT_EQ(34U, loc.getEnd());
}
@@ -279,16 +280,16 @@ TEST(OsmlStreamParser, multilineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(40U, loc.getStart());
ASSERT_EQ(41U, loc.getEnd());
}
@@ -305,16 +306,16 @@ TEST(OsmlStreamParser, nestedMultilineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(40U, loc.getStart());
ASSERT_EQ(41U, loc.getEnd());
}
@@ -569,8 +570,11 @@ TEST(OsmlStreamParser, multipleCommands)
OsmlStreamParser reader(charReader, logger);
assertCommand(reader, "a", 0, 2);
+ assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE);
assertCommand(reader, "b", 3, 5);
+ assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE);
assertCommand(reader, "c", 6, 8);
+ assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE);
assertCommand(reader, "d", 9, 11);
assertEnd(reader, 11, 11);
}
@@ -584,10 +588,13 @@ TEST(OsmlStreamParser, fieldsWithSpaces)
OsmlStreamParser reader(charReader, logger);
assertCommand(reader, "a", 0, 2);
+ assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE);
assertFieldStart(reader, false, 3, 4);
assertCommand(reader, "b", 4, 6);
+ assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE);
assertCommand(reader, "c", 7, 9);
assertFieldEnd(reader, 9, 10);
+ assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE);
assertFieldStart(reader, false, 16, 17);
assertCommand(reader, "d", 17, 19);
assertFieldEnd(reader, 19, 20);
diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp
index 3293370..6942166 100644
--- a/test/formats/osxml/OsxmlEventParserTest.cpp
+++ b/test/formats/osxml/OsxmlEventParserTest.cpp
@@ -21,6 +21,7 @@
#include <core/frontend/TerminalLogger.hpp>
#include <core/common/CharReader.hpp>
#include <core/common/Variant.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <formats/osxml/OsxmlEventParser.hpp>
@@ -74,13 +75,11 @@ public:
};
static std::vector<std::pair<OsxmlEvent, Variant>> parseXml(
- const char *testString,
- WhitespaceMode whitespaceMode = WhitespaceMode::TRIM)
+ const char *testString)
{
TestOsxmlEventListener listener;
CharReader reader(testString);
OsxmlEventParser parser(reader, listener, logger);
- parser.setWhitespaceMode(whitespaceMode);
parser.parse();
return listener.events;
}
@@ -157,7 +156,7 @@ TEST(OsxmlEventParser, magicTopLevelTagInside)
ASSERT_EQ(expectedEvents, events);
}
-TEST(OsxmlEventParser, commandWithDataPreserveWhitespace)
+TEST(OsxmlEventParser, commandWithData)
{
const char *testString = "<a> hello \n world </a>";
// 012345678901 234567890123
@@ -168,50 +167,12 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace)
{OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}},
{OsxmlEvent::FIELD_END, Variant::arrayType{}}};
- auto events = parseXml(testString, WhitespaceMode::PRESERVE);
+ auto events = parseXml(testString);
ASSERT_EQ(expectedEvents, events);
// Check the location of the text
ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart());
ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd());
}
-
-TEST(OsxmlEventParser, commandWithDataTrimWhitespace)
-{
- const char *testString = "<a>  hello  \n world </a>";
- // 012345678901 234567890123
- // 0 1 2
-
- std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{
- {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}},
- {OsxmlEvent::DATA, Variant::arrayType{"hello  \n world"}},
- {OsxmlEvent::FIELD_END, Variant::arrayType{}}};
-
- auto events = parseXml(testString, WhitespaceMode::TRIM);
- ASSERT_EQ(expectedEvents, events);
-
- // Check the location of the text
- ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart());
- ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd());
-}
-
-TEST(OsxmlEventParser, commandWithDataCollapseWhitespace)
-{
- const char *testString = "<a>  hello  \n world </a>";
- // 012345678901 234567890123
- // 0 1 2
-
- std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{
- {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}},
- {OsxmlEvent::DATA, Variant::arrayType{"hello world"}},
- {OsxmlEvent::FIELD_END, Variant::arrayType{}}};
-
- auto events = parseXml(testString, WhitespaceMode::COLLAPSE);
- ASSERT_EQ(expectedEvents, events);
-
- // Check the location of the text
- ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart());
- ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd());
-}
}