From 4854509f8add1e2ff167623fb0e8d4216d9d6023 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 8 Feb 2015 17:54:27 +0100
Subject: Implemented DynamicTokenizer and unit tests

---
 src/plugins/plain/DynamicTokenizer.cpp | 514 +++++++++++++++++++++++++++++++--
 1 file changed, 493 insertions(+), 21 deletions(-)

(limited to 'src/plugins/plain/DynamicTokenizer.cpp')

diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index 7690395..a8f2317 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -17,57 +17,529 @@
  */
 
 #include
-#include
-#include
+#include
 #include
+#include
+#include
 
 #include "DynamicTokenizer.hpp"
 
 namespace ousia {
 
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+	/**
+	 * Token that was matched.
+	 */
+	DynamicToken token;
+
+	/**
+	 * Current length of the data within the text handler. The text buffer
+	 * needs to be trimmed to this length if this token matches.
+	 */
+	size_t textLength;
+
+	/**
+	 * End location of the current text handler. This location needs to be
+	 * used for the text token that is emitted before the actual token.
+	 */
+	size_t textEnd;
+
+	/**
+	 * Constructor of the TokenMatch class.
+	 */
+	TokenMatch() : textLength(0), textEnd(0) {}
+
+	/**
+	 * Returns true if this TokenMatch instance actually represents a match.
+	 */
+	bool hasMatch() { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+	/**
+	 * Current node within the token trie.
+	 */
+	TokenTrie::Node const *node;
+
+	/**
+	 * Start offset within the source file.
+	 */
+	size_t start;
+
+	/**
+	 * Current length of the data within the text handler. The text buffer
+	 * needs to be trimmed to this length if this token matches.
+	 */
+	size_t textLength;
+
+	/**
+	 * End location of the current text handler. This location needs to be
+	 * used for the text token that is emitted before the actual token.
+	 */
+	size_t textEnd;
+
+public:
+	/**
+	 * Constructor of the TokenLookup class.
+	 *
+	 * @param node is the current node.
+	 * @param start is the start position.
+	 * @param textLength is the text buffer length of the previous text token.
+	 * @param textEnd is the current end location of the previous text token.
+	 */
+	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+	            size_t textEnd)
+	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
+	{
+	}
+
+	/**
+	 * Tries to extend the current path in the token trie with the given
+	 * character. If a complete token is matched, stores the match in the
+	 * given TokenMatch instance (in case it is longer than any previous
+	 * match).
+	 *
+	 * @param c is the character that should be appended to the current
+	 * prefix.
+	 * @param lookups is a list to which new TokenLookup instances are added
+	 * -- these could potentially be expanded in the next iteration.
+	 * @param match is the TokenMatch instance to which the matching token
+	 * should be written.
+	 * @param tokens is a reference to the internal token list of the
+	 * DynamicTokenizer.
+	 * @param end is the end byte offset of the current character.
+	 * @param sourceId is the source id of this file.
+	 */
+	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+	             const std::vector<std::string> &tokens, SourceOffset end,
+	             SourceId sourceId)
+	{
+		// Check whether we can continue the current token path with the given
+		// character without visiting an already visited node
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			return;
+		}
+
+		// Check whether the new node represents a complete token and whether
+		// it is longer than the current token. If yes, replace the current
+		// token.
+		node = it->second.get();
+		if (node->type != EmptyToken) {
+			const std::string &str = tokens[node->type];
+			size_t len = str.size();
+			if (len > match.token.content.size()) {
+				match.token =
+				    DynamicToken{node->type, str, {sourceId, start, end}};
+				match.textLength = textLength;
+				match.textEnd = textEnd;
+			}
+		}
+
+		// If this state can possibly be advanced, store it in the states list.
+		if (!node->children.empty()) {
+			lookups.emplace_back(*this);
+		}
+	}
+};
+
+/* Internal class TextHandlerBase */
+
+/**
+ * Base class used for those classes that may be used as TextHandler in the
+ * DynamicTokenizer::next function.
+ */
+class TextHandlerBase {
+public:
+	/**
+	 * Start position of the extracted text.
+	 */
+	size_t textStart;
+
+	/**
+	 * End position of the extracted text.
+	 */
+	size_t textEnd;
+
+	/**
+	 * Buffer containing the extracted text.
+	 */
+	std::vector<char> textBuf;
+
+	/**
+	 * Constructor of the TextHandlerBase base class. Initializes the start
+	 * and end position with zeros.
+	 */
+	TextHandlerBase() : textStart(0), textEnd(0) {}
+
+	/**
+	 * Transforms the given token match into a text token containing the
+	 * extracted text.
+	 *
+	 * @param match is the match whose token should be replaced by the
+	 * extracted text token.
+	 * @param sourceId is the source id of the underlying file.
+	 */
+	void buildTextToken(TokenMatch &match, SourceId sourceId)
+	{
+		if (match.hasMatch()) {
+			match.token.content =
+			    std::string{textBuf.data(), match.textLength};
+			match.token.location =
+			    SourceLocation{sourceId, textStart, match.textEnd};
+		} else {
+			match.token.content = std::string{textBuf.data(), textBuf.size()};
+			match.token.location = SourceLocation{sourceId, textStart, textEnd};
+		}
+		match.token.type = TextToken;
+	}
+
+	/**
+	 * Returns true if this text handler has found any text and a text token
+	 * could be emitted.
+	 *
+	 * @return true if the internal data buffer is non-empty.
+	 */
+	bool hasText() { return !textBuf.empty(); }
+};
+
+/* Internal class PreservingTextHandler */
+
+/**
+ * The PreservingTextHandler class preserves all characters unmodified,
+ * including whitespace characters.
+ */
+class PreservingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
+
+	/**
+	 * Appends the given character to the internal text buffer, does not
+	 * eliminate whitespace.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
+	 */
+	void append(char c, size_t start, size_t end)
+	{
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+		textBuf.push_back(c);
+	}
+};
+
+/* Internal class TrimmingTextHandler */
+
 /**
- * The TokenDescriptor class is a simple wrapper around a standard string
- * containing the character sequence of the token.
+ * The TrimmingTextHandler class trims all whitespace characters at the
+ * beginning and the end of a text section but leaves all other characters
+ * unmodified, including interior whitespace characters.
  */
-class TokenDescriptor {
+class TrimmingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
+
 	/**
-	 * The character sequence of the token.
+	 * Buffer used internally to temporarily store all whitespace characters.
+	 * They are only added to the output buffer if another non-whitespace
+	 * character is reached.
 	 */
-	std::string str;
+	std::vector<char> whitespaceBuf;
 
 	/**
-	 * Default constructor of the TokenDescriptor class. Used to describe
-	 * special tokens.
+	 * Appends the given character to the internal text buffer, eliminates
+	 * whitespace characters at the beginning and end of the text.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
 	 */
-	TokenDescriptor();
+	void append(char c, size_t start, size_t end)
+	{
+		// Handle whitespace characters
+		if (Utils::isWhitespace(c)) {
+			if (!textBuf.empty()) {
+				whitespaceBuf.push_back(c);
+			}
+			return;
+		}
+
+		// Set the start and end offset correctly
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+
+		// Store the character
+		if (!whitespaceBuf.empty()) {
+			textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+			               whitespaceBuf.end());
+			whitespaceBuf.clear();
+		}
+		textBuf.push_back(c);
+	}
+};
+
+/* Internal class CollapsingTextHandler */
+
+/**
+ * The CollapsingTextHandler trims whitespace characters at the beginning and
+ * end of the text and reduces multiple whitespace characters to a single
+ * blank.
+ */
+class CollapsingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
 
 	/**
-	 * Constructor initializing the character sequence of the token.
+	 * Flag set to true if a whitespace character was reached.
 	 */
-	TokenDescriptor(const std::string &str) : str(str) {}
+	bool hasWhitespace = false;
+
+	/**
+	 * Appends the given character to the internal text buffer, eliminates
+	 * redundant whitespace characters.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
+	 */
+	void append(char c, size_t start, size_t end)
+	{
+		// Handle whitespace characters
+		if (Utils::isWhitespace(c)) {
+			if (!textBuf.empty()) {
+				hasWhitespace = true;
+			}
+			return;
+		}
+
+		// Set the start and end offset correctly
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+
+		// Store the character
+		if (hasWhitespace) {
+			textBuf.push_back(' ');
+			hasWhitespace = false;
+		}
+		textBuf.push_back(c);
+	}
 };
+}
 
 /* Class DynamicTokenizer */
 
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+DynamicTokenizer::DynamicTokenizer(CharReader &reader,
+                                   WhitespaceMode whitespaceMode)
+    : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
-	whitespaceMode = mode;
 }
 
-WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+template <typename TextHandler, bool read>
+bool DynamicTokenizer::next(DynamicToken &token)
 {
-	return whitespaceMode;
+	// If we're in the read mode, reset the char reader peek position to the
+	// current read position
+	if (read) {
+		reader.resetPeek();
+	}
+
+	// Prepare the lookups in the token trie
+	const TokenTrie::Node *root = trie.getRoot();
+	TokenMatch match;
+	std::vector<TokenLookup> lookups;
+	std::vector<TokenLookup> nextLookups;
+
+	// Instantiate the text handler
+	TextHandler textHandler;
+
+	// Peek characters from the reader and try to advance the current token
+	// tree cursor
+	char c;
+	size_t charStart = reader.getPeekOffset();
+	const SourceId sourceId = reader.getSourceId();
+	while (reader.peek(c)) {
+		const size_t charEnd = reader.getPeekOffset();
+		const size_t textLength = textHandler.textBuf.size();
+		const size_t textEnd = textHandler.textEnd;
+
+		// If we do not have a match yet, start a new lookup from the root
+		if (!match.hasMatch()) {
+			TokenLookup{root, charStart, textLength, textEnd}.advance(
+			    c, nextLookups, match, tokens, charEnd, sourceId);
+		}
+
+		// Try to advance all other lookups with the new character
+		for (TokenLookup &lookup : lookups) {
+			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+		}
+
+		// We have found a token and there are no more states to advance or
+		// the text handler has found something -- abort to return the new
+		// token
+		if (match.hasMatch()) {
+			if ((nextLookups.empty() || textHandler.hasText())) {
+				break;
+			}
+		} else {
+			// Record all incoming characters
+			textHandler.append(c, charStart, charEnd);
+		}
+
+		// Swap the lookups and the nextLookups list
+		lookups = std::move(nextLookups);
+		nextLookups.clear();
+
+		// Advance the offset
+		charStart = charEnd;
+	}
+
+	// If we found text, emit that text
+	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
+		textHandler.buildTextToken(match, sourceId);
+	}
+
+	// Move the read/peek cursor to the end of the token, abort if an error
+	// happens while doing so
+	if (match.hasMatch()) {
+		// Make sure we have a valid location
+		if (match.token.location.getEnd() == InvalidSourceOffset) {
+			throw OusiaException{"Token end position offset out of range"};
+		}
+
+		// Seek to the end of the current token
+		const size_t end = match.token.location.getEnd();
+		if (read) {
+			reader.seek(end);
+		} else {
+			reader.seekPeekCursor(end);
+		}
+		token = match.token;
+	} else {
+		token = DynamicToken{};
+	}
+	return match.hasMatch();
+}
+
+bool DynamicTokenizer::read(DynamicToken &token)
+{
+	switch (whitespaceMode) {
+		case WhitespaceMode::PRESERVE:
+			return next<PreservingTextHandler, true>(token);
+		case WhitespaceMode::TRIM:
+			return next<TrimmingTextHandler, true>(token);
+		case WhitespaceMode::COLLAPSE:
+			return next<CollapsingTextHandler, true>(token);
+	}
+	return false;
+}
+
+bool DynamicTokenizer::peek(DynamicToken &token)
+{
+	switch (whitespaceMode) {
+		case WhitespaceMode::PRESERVE:
+			return next<PreservingTextHandler, false>(token);
+		case WhitespaceMode::TRIM:
+			return next<TrimmingTextHandler, false>(token);
+		case WhitespaceMode::COLLAPSE:
+			return next<CollapsingTextHandler, false>(token);
+	}
+	return false;
 }
 
+TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+{
+	// Abort if an empty token should be registered
+	if (token.empty()) {
+		return EmptyToken;
+	}
+
+	// Search for a new slot in the tokens list
+	TokenTypeId type = EmptyToken;
+	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+		if (tokens[i].empty()) {
+			tokens[i] = token;
+			type = i;
+			break;
+		}
+	}
 
-/* Constant initializations */
+	// No existing slot was found, add a new one -- make sure we do not
+	// override the special token type handles
+	if (type == EmptyToken) {
+		type = tokens.size();
+		if (type == TextToken || type == EmptyToken) {
+			throw OusiaException{"Token type ids depleted!"};
+		}
+		tokens.emplace_back(token);
+	}
+	nextTokenTypeId = type + 1;
 
-static const TokenDescriptor Empty;
-static const TokenDescriptor Text;
-static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
-static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+	// Try to register the token in the trie -- if this fails, remove it
+	// from the tokens list
+	if (!trie.registerToken(token, type)) {
+		tokens[type] = std::string();
+		nextTokenTypeId = type;
+		return EmptyToken;
+	}
+	return type;
+}
+
+bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+{
+	// Unregister the token from the trie, abort if an invalid type is given
+	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+		tokens[type] = std::string{};
+		nextTokenTypeId = type;
+		return true;
+	}
+	return false;
+}
+
+std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+{
+	if (type < tokens.size()) {
+		return tokens[type];
+	}
+	return std::string{};
+}
+
+void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+{
+	whitespaceMode = mode;
+}
+
+WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+
+/* Explicitly instantiate all possible instantiations of the "next" member
+   function */
+template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingTextHandler, true>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
+    DynamicToken &token);
 }
--
cgit v1.2.3
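A minimal usage sketch of the API introduced by this commit (not part of the patch itself): the tokenizer is bound to a CharReader at construction time, custom tokens are registered up front, and read() is called in a loop until the end of the stream. The include path and the helper function name are assumptions for illustration; DynamicToken, TokenTypeId, WhitespaceMode and the member functions are taken from the diff above. Note that the following commit changes these signatures.

    // Sketch only -- the include path is assumed, not taken from the patch
    #include <plugins/plain/DynamicTokenizer.hpp>

    using namespace ousia;

    // Hypothetical helper: print all tokens of the given reader
    void dumpTokens(CharReader &reader)
    {
        DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE};

        // Register ":" as a token; all other characters are collected
        // into TextToken runs by the active text handler
        const TokenTypeId colon = tokenizer.registerToken(":");

        DynamicToken token;
        while (tokenizer.read(token)) {
            if (token.type == colon) {
                // ":" matched via the token trie (longest match wins)
            } else {
                // token.type == TextToken; token.content holds the text,
                // token.location the byte range within the source
            }
        }
    }
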
From f713b1d393230e7083727d457623fdac878eb248 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 8 Feb 2015 18:48:07 +0100
Subject: DynamicTokenizer now gets the reader as a parameter to read and peek
 -- the beauty of this tokenizer is that it has no internal state depending
 on the reader, so it doesn't need to hold a reference to it

---
 src/plugins/plain/DynamicTokenizer.cpp      | 35 ++++++------
 src/plugins/plain/DynamicTokenizer.hpp      | 22 ++++----
 test/plugins/plain/DynamicTokenizerTest.cpp | 81 ++++++++++++++---------------
 3 files changed, 67 insertions(+), 71 deletions(-)

(limited to 'src/plugins/plain/DynamicTokenizer.cpp')

diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index a8f2317..f2cfcd1 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -345,14 +345,13 @@ public:
 
 /* Class DynamicTokenizer */
 
-DynamicTokenizer::DynamicTokenizer(CharReader &reader,
-                                   WhitespaceMode whitespaceMode)
-    : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0)
+DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
 }
 
 template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(DynamicToken &token)
+bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
 {
 	// If we're in the read mode, reset the char reader peek position to the
 	// current read position
@@ -437,28 +436,28 @@ bool DynamicTokenizer::next(DynamicToken &token)
 	return match.hasMatch();
 }
 
-bool DynamicTokenizer::read(DynamicToken &token)
+bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
 {
 	switch (whitespaceMode) {
 		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, true>(token);
+			return next<PreservingTextHandler, true>(reader, token);
 		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, true>(token);
+			return next<TrimmingTextHandler, true>(reader, token);
 		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, true>(token);
+			return next<CollapsingTextHandler, true>(reader, token);
 	}
 	return false;
 }
 
-bool DynamicTokenizer::peek(DynamicToken &token)
+bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
 {
 	switch (whitespaceMode) {
 		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, false>(token);
+			return next<PreservingTextHandler, false>(reader, token);
 		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, false>(token);
+			return next<TrimmingTextHandler, false>(reader, token);
 		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, false>(token);
+			return next<CollapsingTextHandler, false>(reader, token);
 	}
 	return false;
 }
@@ -530,16 +529,16 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
 template bool DynamicTokenizer::next<PreservingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<PreservingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 
 }
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index 760bebf..0b4dd39 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -118,11 +118,6 @@ enum class WhitespaceMode {
  */
 class DynamicTokenizer {
 private:
-	/**
-	 * CharReader instance from which the tokens should be read.
-	 */
-	CharReader &reader;
-
 	/**
 	 * Internally used token trie. This object holds all registered tokens.
 	 */
@@ -151,23 +146,22 @@ private:
 	 * @tparam TextHandler is the type to be used for the textHandler instance.
 	 * @tparam read specifies whether the function should start from and
 	 * advance the read pointer of the char reader.
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is the token structure into which the token information
 	 * should be written.
	 * @return false if the end of the stream has been reached, true otherwise.
 	 */
 	template <typename TextHandler, bool read>
-	bool next(DynamicToken &token);
+	bool next(CharReader &reader, DynamicToken &token);
 
 public:
 	/**
 	 * Constructor of the DynamicTokenizer class.
 	 *
-	 * @param reader is the CharReader that should be used for reading the
-	 * tokens.
 	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	DynamicTokenizer(CharReader &reader,
-	                 WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Registers the given string as a token. Returns a const pointer at a
@@ -222,23 +216,27 @@ public:
 	 * Reads a new token from the CharReader and stores it in the given
 	 * DynamicToken instance.
 	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is a reference to the token instance into which the token
 	 * information should be written.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool read(DynamicToken &token);
+	bool read(CharReader &reader, DynamicToken &token);
 
 	/**
 	 * The peek method does not advance the read position of the char reader,
 	 * but reads the next token from the current char reader peek position.
 	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is a reference to the token instance into which the token
 	 * information should be written.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool peek(DynamicToken &token);
+	bool peek(CharReader &reader, DynamicToken &token);
 };
 
 }
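The read/peek split declared above can be illustrated with a short sketch (a sketch only -- the string-constructed CharReader is borrowed from the tests below). peek() advances only the internal peek cursor, while read() first resets the peek cursor to the read position, so peeked tokens are returned again by later read() calls; this is exactly what the simplePeekToken test in the following diff verifies:

    CharReader reader{"test1:test2"};
    DynamicTokenizer tokenizer;      // WhitespaceMode::COLLAPSE by default
    tokenizer.registerToken(":");

    DynamicToken token;
    tokenizer.peek(reader, token);   // TextToken "test1", peek cursor moves
    tokenizer.peek(reader, token);   // token ":"
    tokenizer.read(reader, token);   // TextToken "test1" again -- the read
                                     // cursor was never advanced by peek()
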
ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -107,17 +106,17 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -125,7 +124,7 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -135,10 +134,10 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -146,17 +145,17 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -164,21 +163,21 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } TEST(DynamicTokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -194,7 +193,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -210,7 +209,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -227,14 +226,14 @@ TEST(DynamicTokenizer, simpleReadToken) TEST(DynamicTokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, 
token.type); ASSERT_EQ("test1", token.content); @@ -248,7 +247,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -262,7 +261,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -276,7 +275,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -290,7 +289,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -304,7 +303,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -320,7 +319,7 @@ TEST(DynamicTokenizer, simplePeekToken) TEST(DynamicTokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer(reader); + DynamicTokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -329,7 +328,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("a", token.content); @@ -338,7 +337,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(t2, token.type); ASSERT_EQ("bc", token.content); @@ -347,7 +346,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } TEST(DynamicTokenizer, commentTestWhitespacePreserve) @@ -355,7 +354,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -371,14 +370,14 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } TEST(DynamicTokenizer, commentTestWhitespaceCollapse) @@ -386,7 +385,7 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = 
tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -402,14 +401,14 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } } -- cgit v1.2.3