From fb0922e57f1a5e1fb8bfbe153dc381d5778e3137 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:53:15 +0100 Subject: Added seekPeekCursor function to CharReader, improved how seeking is handled by adding seekCursor method to Buffer --- src/core/common/CharReader.cpp | 32 ++++++++++++++++++++++++-------- src/core/common/CharReader.hpp | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/core/common/CharReader.cpp b/src/core/common/CharReader.cpp index edcaf76..5b9b1d4 100644 --- a/src/core/common/CharReader.cpp +++ b/src/core/common/CharReader.cpp @@ -329,6 +329,19 @@ ssize_t Buffer::moveCursor(CursorId cursor, ssize_t relativeOffs) } } +size_t Buffer::seekCursor(CursorId cursor, size_t offs) +{ + // Fetch the current offset + const ssize_t currentOffs = offset(cursor); + const ssize_t relativeOffs = offs - currentOffs; + + // Perform the actual seeking, move the peek cursor to the read cursor + const ssize_t reachedOffs = currentOffs + moveCursor(cursor, relativeOffs); + + // Clamp to values larger or equal to zero + return reachedOffs < 0 ? 0 : reachedOffs; +} + bool Buffer::atEnd(Buffer::CursorId cursor) const { const Cursor &c = cursors[cursor]; @@ -504,17 +517,17 @@ size_t CharReader::readRaw(char *buf, size_t size) size_t CharReader::seek(size_t requestedOffset) { - // Fetch the current offset - const ssize_t currentOffs = getOffset(); - const ssize_t relativeOffs = requestedOffset - currentOffs; - - // Perform the actual seeking, move the peek cursor to the read cursor - const ssize_t reachedOffs = currentOffs + buffer->moveCursor(readCursor, relativeOffs); + const size_t res = buffer->seekCursor(readCursor, requestedOffset); buffer->copyCursor(readCursor, peekCursor); coherent = true; + return res; +} - // Clamp to values larger or equal to zero - return reachedOffs < 0 ? 0 : reachedOffs; +size_t CharReader::seekPeekCursor(size_t requestedOffset) +{ + const size_t res = buffer->seekCursor(peekCursor, requestedOffset); + coherent = (res == getOffset()); + return res; } bool CharReader::atEnd() const { return buffer->atEnd(readCursor); } @@ -526,6 +539,9 @@ size_t CharReader::getOffset() const size_t CharReader::getPeekOffset() const { + if (coherent) { + return getOffset(); + } return buffer->offset(peekCursor) + offs; } diff --git a/src/core/common/CharReader.hpp b/src/core/common/CharReader.hpp index cbd7b74..cbfeaf2 100644 --- a/src/core/common/CharReader.hpp +++ b/src/core/common/CharReader.hpp @@ -301,6 +301,15 @@ public: */ ssize_t moveCursor(CursorId cursor, ssize_t relativeOffs); + /** + * Moves the cursor to the given position. + * + * @param cursor is the cursor that should be moved. + * @param offs is the offset to which the cursor should be moved. + * @return the actual location that was reached. + */ + size_t seekCursor(CursorId cursor, size_t offs); + /** * Returns the current byte offset of the given cursor relative to the * beginning of the stream. @@ -532,6 +541,17 @@ public: */ size_t seek(size_t requestedOffset); + /** + * Moves the peek cursor to the requested offset. Returns the offset that + * was actually reached. + * + * @param requestedOffset is the requested offset. This offset may no longer + * be reachable by the CharReader. + * @return the actually reached offset. The operation was successful if + * the requested and reached offsets are equal. + */ + size_t seekPeekCursor(size_t requestedOffset); + /** * Returns true if there are no more characters as the stream was closed.
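To make the new seeking behaviour concrete, here is a minimal usage sketch (hypothetical driver code written for this note, not part of the patch; it builds a CharReader from a string the way the unit tests do):

    CharReader reader{"example input"};
    char c;
    reader.peek(c);            // advances the peek cursor only
    reader.seekPeekCursor(0);  // rewinds the peek cursor; the read offset is untouched
    reader.seek(5);            // moves the read cursor and re-synchronizes the peek cursor
    // After seek(), the cursors are coherent again, so
    // reader.getPeekOffset() == reader.getOffset() holds.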
* -- cgit v1.2.3 From b9681594380333a0a3f0011b40ac6542e7022d98 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:54:09 +0100 Subject: Deleted DynamicTokenTree class, replaced by TokenTrie --- CMakeLists.txt | 4 +- src/plugins/plain/DynamicTokenTree.cpp | 121 ---------------------- src/plugins/plain/DynamicTokenTree.hpp | 136 ------------------------- src/plugins/plain/TokenTrie.cpp | 119 ++++++++++++++++++++++ src/plugins/plain/TokenTrie.hpp | 150 ++++++++++++++++++++++++++++ test/plugins/plain/DynamicTokenTreeTest.cpp | 94 ----------------- test/plugins/plain/TokenTrieTest.cpp | 92 +++++++++++++++++ 7 files changed, 363 insertions(+), 353 deletions(-) delete mode 100644 src/plugins/plain/DynamicTokenTree.cpp delete mode 100644 src/plugins/plain/DynamicTokenTree.hpp create mode 100644 src/plugins/plain/TokenTrie.cpp create mode 100644 src/plugins/plain/TokenTrie.hpp delete mode 100644 test/plugins/plain/DynamicTokenTreeTest.cpp create mode 100644 test/plugins/plain/TokenTrieTest.cpp (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d73248..f9b224d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain - src/plugins/plain/DynamicTokenTree + src/plugins/plain/TokenTrie src/plugins/plain/PlainFormatStreamReader ) @@ -324,7 +324,7 @@ IF(TEST) ) ADD_EXECUTABLE(ousia_test_plain - test/plugins/plain/DynamicTokenTreeTest + test/plugins/plain/TokenTrieTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/DynamicTokenTree.cpp deleted file mode 100644 index 8b7bfc2..0000000 --- a/src/plugins/plain/DynamicTokenTree.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "DynamicTokenTree.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -DynamicTokenTree::Node::Node() : descriptor(nullptr) {} - -/* Class DynamicTokenTree */ - -bool DynamicTokenTree::registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::unique_ptr(new Node{})) - .first; - } - node = it->second.get(); - } - - // If the resulting node already has a descriptor set, we're screwed. - if (node->descriptor != nullptr) { - return false; - } - - // Otherwise just set the descriptor to the given descriptor. 
- node->descriptor = descriptor; - return true; -} - -bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another descriptor - node = it->second.get(); - if ((node->descriptor != nullptr || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node descriptor is already nullptr, we cannot do anything here - if (node->descriptor == nullptr) { - return false; - } - - // If the target node has children, we cannot delete the subtree. Set the - // descriptor to nullptr instead - if (!node->children.empty()) { - node->descriptor = nullptr; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -const TokenDescriptor *DynamicTokenTree::hasToken( - const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return nullptr; - } - node = it->second.get(); - } - return node->descriptor; -} -} - diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp deleted file mode 100644 index c5dc4de..0000000 --- a/src/plugins/plain/DynamicTokenTree.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file DynamicTokenTree.hpp - * - * Class representing a token tree that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ -#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ - -#include -#include - -namespace ousia { - -class TokenDescriptor; - -/** - * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to - * identify the longest consecutive token in the text. This is equivalent to a - * prefix trie. - * - * A token tree is a construct that structures all special tokens a - * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then - * the token tree would look like this: - * - * \code{*.txt} - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * \endcode - * - * Every node in the token tree is a valid end state that has a $ attached to - * it. 
During the search algorithm the Tokenizer goes through the tree and - * stores the last valid position. If a character follows that does not lead to - * a new node in the TokenTree the search ends (and starts again at this - * character). The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - */ -class DynamicTokenTree { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenDescriptor const *descriptor; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param descriptor is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. - * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - const TokenDescriptor* hasToken(const std::string &token) const noexcept; -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ - diff --git a/src/plugins/plain/TokenTrie.cpp b/src/plugins/plain/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/plugins/plain/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class TokenTrie::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class TokenTrie */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, the token has already + // been registered and registration fails. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the first character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree root if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/plugins/plain/TokenTrie.hpp b/src/plugins/plain/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/plugins/plain/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically.
+ * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include <cstdint> +#include <limits> +#include <memory> +#include <unordered_map> + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token type id representing the absence of a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token type id representing a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token trie would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (3) + * \endcode + * + * Where the number indicates the corresponding token type id, with zero + * denoting that no token ends at this node. + */ +class TokenTrie { +public: + /** + * Structure used to build the trie. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters to the corresponding child nodes. + */ + ChildMap children; + + /** + * Id of the token type attached to this node. Set to EmptyToken if + * no token ends at this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the type with EmptyToken. + */ + Node(); + }; + +private: + /** + * Root node of the internal token trie. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the token type id that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token trie. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Checks whether the given token exists within the TokenTrie. This + * function is mostly intended for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token type id or EmptyToken if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a pointer at the root node to be used for traversing the token + * trie. + * + * @return a pointer at the root node.
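+ *
+ * An illustrative lookup loop (a sketch added for this write-up, not part
+ * of the original patch) that walks the trie character by character:
+ *
+ * \code{.cpp}
+ * const TokenTrie::Node *node = trie.getRoot();
+ * for (char c : std::string{"ab"}) {
+ *     auto it = node->children.find(c);
+ *     if (it == node->children.end()) {
+ *         break; // no registered token starts with this prefix
+ *     }
+ *     node = it->second.get(); // descend to the child node
+ * }
+ * // node->type now holds the id registered for "ab", or EmptyToken.
+ * \endcode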
+ */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp deleted file mode 100644 index 5ae414c..0000000 --- a/test/plugins/plain/DynamicTokenTreeTest.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -static const TokenDescriptor *d1 = reinterpret_cast(1); -static const TokenDescriptor *d2 = reinterpret_cast(2); -static const TokenDescriptor *d3 = reinterpret_cast(3); -static const TokenDescriptor *d4 = reinterpret_cast(4); - -TEST(DynamicTokenTree, registerToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_TRUE(tree.registerToken("hello", d4)); - - ASSERT_FALSE(tree.registerToken("", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - ASSERT_FALSE(tree.registerToken("b", d4)); - ASSERT_FALSE(tree.registerToken("hello", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - ASSERT_EQ(d4, tree.hasToken("hello")); - ASSERT_EQ(nullptr, tree.hasToken("")); - ASSERT_EQ(nullptr, tree.hasToken("abc")); -} - -TEST(DynamicTokenTree, unregisterToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_FALSE(tree.registerToken("b", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(nullptr, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); -} - - -} - diff --git a/test/plugins/plain/TokenTrieTest.cpp b/test/plugins/plain/TokenTrieTest.cpp new file mode 100644 index 0000000..d378fdf --- /dev/null +++ b/test/plugins/plain/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free 
Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); +} +} + -- cgit v1.2.3 From 4854509f8add1e2ff167623fb0e8d4216d9d6023 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:54:27 +0100 Subject: Implemented DynamicTokenizer and unit tests --- CMakeLists.txt | 2 + src/plugins/plain/DynamicTokenizer.cpp | 514 ++++++++++++++++++++++++++-- src/plugins/plain/DynamicTokenizer.hpp | 154 ++++++--- test/plugins/plain/DynamicTokenizerTest.cpp | 416 ++++++++++++++++++++++ 4 files changed, 1016 insertions(+), 70 deletions(-) (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index f9b224d..867ca6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,6 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain + src/plugins/plain/DynamicTokenizer src/plugins/plain/TokenTrie src/plugins/plain/PlainFormatStreamReader ) @@ -325,6 +326,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_plain test/plugins/plain/TokenTrieTest + test/plugins/plain/DynamicTokenizerTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp index 
7690395..a8f2317 100644 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -17,57 +17,529 @@ */ #include -#include -#include +#include #include +#include +#include #include "DynamicTokenizer.hpp" namespace ousia { +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t textLength, size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokenLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the TokenMatch instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source id of this file. + */ + void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<std::string> &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token and whether it + // is longer than the current token. If yes, replace the current token.
+ node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/* Internal class TextHandlerBase */ + +/** + * Base class used for those classes that may be used as TextHandler in the + * DynamicTokenizer::next function. + */ +class TextHandlerBase { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the TextHandlerBase base class. Initializes the start and + * end position with zeros. + */ + TextHandlerBase() : textStart(0), textEnd(0) {} + + /** + * Transforms the given token match into a text token containing the + * extracted text. + * + * @param match is the token match to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ + void buildTextToken(TokenMatch &match, SourceId sourceId) + { + if (match.hasMatch()) { + match.token.content = + std::string{textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, textStart, match.textEnd}; + } else { + match.token.content = std::string{textBuf.data(), textBuf.size()}; + match.token.location = SourceLocation{sourceId, textStart, textEnd}; + } + match.token.type = TextToken; + } + + /** + * Returns true if this text handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } +}; + +/* Internal class PreservingTextHandler */ + +/** + * The PreservingTextHandler class preserves all characters unmodified, + * including whitespace characters. + */ +class PreservingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/* Internal class TrimmingTextHandler */ + /** - * The TokenDescriptor class is a simple wrapper around a standard string - * containing the character sequence of the token. + * The TrimmingTextHandler class trims all whitespace characters at the + * beginning and the end of a text section but leaves all other characters + * unmodified, including inner whitespace characters. */ -class TokenDescriptor { +class TrimmingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + /** - * The character sequence of the token. + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. */ - std::string str; + std::vector<char> whitespaceBuf; /** - * Default constructor of the TokenDescriptor class.
Used to describe - special tokens. */ - TokenDescriptor(); + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the beginning and end of the text. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. */ - TokenDescriptor(const std::string &str) : str(str) {} + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/* Internal class CollapsingTextHandler */ + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduces multiple whitespace characters to a single blank. + */ +class CollapsingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; /** + * Flag set to true if a whitespace character was reached. */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; +} /* Class DynamicTokenizer */ -void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode) +DynamicTokenizer::DynamicTokenizer(CharReader &reader, + WhitespaceMode whitespaceMode) + : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0) { - whitespaceMode = mode; } -WhitespaceMode DynamicTokenizer::getWhitespaceMode() +template <typename TextHandler, bool read> +bool DynamicTokenizer::next(DynamicToken &token) { - return whitespaceMode; + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector<TokenLookup> lookups; + std::vector<TokenLookup> nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token trie + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups,
match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incoming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && + (!match.hasMatch() || match.textLength > 0)) { + textHandler.buildTextToken(match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, true>(token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, true>(token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, true>(token); + } + return false; +} + +bool DynamicTokenizer::peek(DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, false>(token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, false>(token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, false>(token); + } + return false; } +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } -/* Constant initializations */ + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; -static const TokenDescriptor Empty; -static const TokenDescriptor Text; -static const TokenDescriptor* DynamicTokenizer::Empty = &Empty; -static const TokenDescriptor* DynamicTokenizer::Token = &Text; + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string(); + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId
type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next<PreservingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<PreservingTextHandler, false>( + DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, false>( + DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, false>( + DynamicToken &token); } diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index f7fef13..760bebf 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -28,34 +28,63 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#include +#include +#include + #include +#include "TokenTrie.hpp" + namespace ousia { // Forward declarations class CharReader; -class TokenDescriptor; /** * The DynamicToken structure describes a token discovered by the Tokenizer. */ struct DynamicToken { /** - * Pointer pointing at the TokenDescriptor instance this token corresponds - * to. May be one of the special TokenDescriptors defined as static members - * of the DynamicTokenizer class. + * Id of the type of this token. */ - TokenDescriptor const *descriptor; + TokenTypeId type; /** * String that was matched. */ - std::string str; + std::string content; /** * Location from which the string was extracted. */ SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param type represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type. + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} }; /** @@ -64,43 +93,70 @@ struct DynamicToken { */ enum class WhitespaceMode { /** - * Preserves all whitespaces as they are found in the source file. - */ + * Preserves all whitespaces as they are found in the source file. + */ PRESERVE, /** - * Trims whitespace at the beginning and the end of the found text. - */ + * Trims whitespace at the beginning and the end of the found text. + */ TRIM, /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ COLLAPSE }; /** * The DynamicTokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * input. */ class DynamicTokenizer { private: /** - * Reference at the char reader.
+ * CharReader instance from which the tokens should be read. */ CharReader &reader; + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + /** * Flag defining whether whitespaces should be preserved or not. */ WhitespaceMode whitespaceMode; /** - * Vector containing all registered token descriptors. + * Vector containing all registered token types. + */ + std::vector<std::string> tokens; + + /** + * Next index in the tokens list where to search for a new token id. */ - std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combinations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(DynamicToken &token); public: /** @@ -108,43 +164,44 @@ public: * * @param reader is the CharReader that should be used for reading the * tokens. - * @param preserveWhitespaces should be set to true if all whitespaces - * should be preserved (for preformated environments). - */ - DynamicTokenizer(CharReader &reader) - : reader(reader), - preserveWhitespaces(preserveWhitespaces), - location(reader.getSourceId()), - empty(true), - hasWhitespace(false) - { - } - - /** - * Destructor of the DynamicTokenizer class. + * @param whitespaceMode specifies how whitespace should be handled. */ - ~DynamicTokenizer(); + DynamicTokenizer(CharReader &reader, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer at a * TokenDescriptor that will be used to reference the newly created token. * * @param token is the token string that should be registered. - * @return a pointer at a TokenDescriptor which is representative for the - * newly registered token. Returns nullptr if a token with this string - * was already registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occurred. */ - const TokenDescriptor* registerToken(const std::string &token); + TokenTypeId registerToken(const std::string &token); /** - * Unregisters the token belonging to the given TokenDescriptor. + * Unregisters the token belonging to the given TokenTypeId. * - * @param descr is a TokenDescriptor that was previously returned by - * registerToken. + * @param type is the token type that should be unregistered. The + * TokenTypeId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. * because the given TokenDescriptor was already unregistered). */ - bool unregisterToken(const TokenDescriptor *descr); + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId or + * an empty string if an invalid TokenTypeId is given. + * + * @param type is the TokenTypeId for which the corresponding token + * string should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); /** * Sets the whitespace mode.
@@ -173,17 +230,16 @@ public: bool read(DynamicToken &token); /** - * TokenDescriptor representing an empty token. - */ - static const *TokenDescriptor Empty; - - /** - * TokenDescriptor representing generic text. + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. */ - static const *TokenDescriptor Text; - + bool peek(DynamicToken &token); }; - } #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp index e69de29..63fa466 100644 --- a/test/plugins/plain/DynamicTokenizerTest.cpp +++ b/test/plugins/plain/DynamicTokenizerTest.cpp @@ -0,0 +1,416 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { + +TEST(DynamicTokenizer, tokenRegistration) +{ + CharReader reader{"test"}; + DynamicTokenizer tokenizer{reader}; + + ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); + + ASSERT_EQ(0U, tokenizer.registerToken("a")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); + ASSERT_EQ("a", tokenizer.getTokenString(0U)); + + ASSERT_EQ(1U, tokenizer.registerToken("b")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); + ASSERT_EQ("b", tokenizer.getTokenString(1U)); + + ASSERT_EQ(2U, tokenizer.registerToken("c")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); + ASSERT_EQ("c", tokenizer.getTokenString(2U)); + + ASSERT_TRUE(tokenizer.unregisterToken(1U)); + ASSERT_FALSE(tokenizer.unregisterToken(1U)); + ASSERT_EQ("", tokenizer.getTokenString(1U)); + + ASSERT_EQ(1U, tokenizer.registerToken("d")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); + ASSERT_EQ("d", tokenizer.getTokenString(1U)); +} + +TEST(DynamicTokenizer, textTokenPreserveWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(36U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + 
ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, textTokenTrimWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, textTokenCollapseWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, simpleReadToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer{reader}; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ(':', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ('t', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + + char c; + ASSERT_FALSE(reader.peek(c)); + } +} + +TEST(DynamicTokenizer, simplePeekToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer{reader}; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + 
ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(5U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(6U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(11U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } +} + +TEST(DynamicTokenizer, ambiguousTokens) +{ + CharReader reader{"abc"}; + DynamicTokenizer tokenizer(reader); + + TokenTypeId t1 = tokenizer.registerToken("abd"); + TokenTypeId t2 = tokenizer.registerToken("bc"); + + ASSERT_EQ(0U, t1); + ASSERT_EQ(1U, t2); + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("a", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(t2, token.type); + ASSERT_EQ("bc", token.content); + + loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(3U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); +} + +TEST(DynamicTokenizer, commentTestWhitespacePreserve) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test ", SourceLocation{0, 5, 10}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + 
EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(t)); +} + +TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test", SourceLocation{0, 5, 9}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(t)); +} + +} + -- cgit v1.2.3 From f713b1d393230e7083727d457623fdac878eb248 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:48:07 +0100 Subject: DynamicTokenizer now gets the reader as a parameter to read and peek -- the beauty of this tokenizer is that it has no internal state depending on the reader, so it doesn't need to hold a reference to it --- src/plugins/plain/DynamicTokenizer.cpp | 35 ++++++------- src/plugins/plain/DynamicTokenizer.hpp | 22 ++++---- test/plugins/plain/DynamicTokenizerTest.cpp | 81 ++++++++++++++--------------- 3 files changed, 67 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp index a8f2317..f2cfcd1 100644 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -345,14 +345,13 @@ public: /* Class DynamicTokenizer */ -DynamicTokenizer::DynamicTokenizer(CharReader &reader, - WhitespaceMode whitespaceMode) - : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0) +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) { } template -bool DynamicTokenizer::next(DynamicToken &token) +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -437,28 +436,28 @@ bool DynamicTokenizer::next(DynamicToken &token) return match.hasMatch(); } -bool DynamicTokenizer::read(DynamicToken &token) +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(token); + return next(reader, token); case WhitespaceMode::COLLAPSE: - return next(token); + return next(reader, token); } return false; } -bool DynamicTokenizer::peek(DynamicToken &token) +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(token); + return next(reader, token); case 
WhitespaceMode::COLLAPSE: - return next(token); + return next(reader, token); } return false; } @@ -530,16 +529,16 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } /* Explicitly instantiate all possible instantiations of the "next" member function */ template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); } diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index 760bebf..0b4dd39 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -118,11 +118,6 @@ enum class WhitespaceMode { */ class DynamicTokenizer { private: - /** - * CharReader instance from which the tokens should be read. - */ - CharReader &reader; - /** * Internally used token trie. This object holds all registered tokens. */ @@ -151,23 +146,22 @@ private: * @tparam TextHandler is the type to be used for the textHandler instance. * @tparam read specifies whether the function should start from and advance * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. * @param token is the token structure into which the token information * should be written. * @return false if the end of the stream has been reached, true otherwise. */ template - bool next(DynamicToken &token); + bool next(CharReader &reader, DynamicToken &token); public: /** * Constructor of the DynamicTokenizer class. * - * @param reader is the CharReader that should be used for reading the - * tokens. * @param whitespaceMode specifies how whitespace should be handled. */ - DynamicTokenizer(CharReader &reader, - WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer to a @@ -222,23 +216,27 @@ public: * Reads a new token from the CharReader and stores it in the given * DynamicToken instance. * + * @param reader is the CharReader instance from which the data should be + * read. * @param token is a reference to the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(DynamicToken &token); + bool read(CharReader &reader, DynamicToken &token); /** * The peek method does not advance the read position of the char reader, * but reads the next token from the current char reader peek position. * + * @param reader is the CharReader instance from which the data should be + * read. * @param token is a reference to the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. 
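+ * The peeked-ahead position can afterwards be committed with + * CharReader::consumePeek() or discarded with CharReader::resetPeek().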
*/ - bool peek(DynamicToken &token); + bool peek(CharReader &reader, DynamicToken &token); }; } diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp index 63fa466..5183fdd 100644 --- a/test/plugins/plain/DynamicTokenizerTest.cpp +++ b/test/plugins/plain/DynamicTokenizerTest.cpp @@ -25,8 +25,7 @@ namespace ousia { TEST(DynamicTokenizer, tokenRegistration) { - CharReader reader{"test"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); @@ -57,10 +56,10 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ(" this \t is only a \n\n test text ", token.content); @@ -68,17 +67,17 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(36U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -86,7 +85,7 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -96,10 +95,10 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -107,17 +106,17 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -125,7 +124,7 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -135,10 +134,10 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer 
tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -146,17 +145,17 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -164,21 +163,21 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } TEST(DynamicTokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -194,7 +193,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -210,7 +209,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -227,14 +226,14 @@ TEST(DynamicTokenizer, simpleReadToken) TEST(DynamicTokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -248,7 +247,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -262,7 +261,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -276,7 +275,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -290,7 +289,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -304,7 +303,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", 
token.content); @@ -320,7 +319,7 @@ TEST(DynamicTokenizer, simplePeekToken) TEST(DynamicTokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer(reader); + DynamicTokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -329,7 +328,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("a", token.content); @@ -338,7 +337,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(t2, token.type); ASSERT_EQ("bc", token.content); @@ -347,7 +346,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } TEST(DynamicTokenizer, commentTestWhitespacePreserve) @@ -355,7 +354,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -371,14 +370,14 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } TEST(DynamicTokenizer, commentTestWhitespaceCollapse) @@ -386,7 +385,7 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -402,14 +401,14 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } } -- cgit v1.2.3 From f066b4887f6f2896fe602f14ede9c02a9f5a7e1a Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:48:27 +0100 Subject: Added isIdentifierStart function --- src/core/common/Utils.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src') diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index fa3788a..457d446 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -57,6 +57,14 @@ public: return isAlphabetic(c) || isNumeric(c); } + /** + * Returns true if the given character is in [A-Za-z_] + */ + static 
bool isIdentifierStart(const char c) + { + return isAlphabetic(c) || (c == '_'); + } + /** * Returns true if the given character is in [A-Za-z_][A-Za-z0-9_-]* */ -- cgit v1.2.3 From 51f09f4faa7cd4b6a0576758881d322e31e896ba Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:49:02 +0100 Subject: Ported PlainFormatStreamReader to DynamicTokenizer --- src/plugins/plain/PlainFormatStreamReader.cpp | 279 ++++++++------------------ src/plugins/plain/PlainFormatStreamReader.hpp | 34 +++- 2 files changed, 116 insertions(+), 197 deletions(-) (limited to 'src') diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index 15ca403..f0721a0 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -16,9 +16,6 @@ along with this program. If not, see . */ -#include -#include - #include #include #include @@ -27,123 +24,40 @@ namespace ousia { -/* Internally used types, protected from spilling the exports by a namespace */ - namespace { -/** - * Enum used to specify the state of the parseBlockComment state machine. - */ -enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT }; - -/** - * Class taking care of recording plain text data found withing the file. - */ -class DataHandler { -private: - /** - * Const reference at the reader, used for reading the current location. - */ - const CharReader &reader; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - const bool preserveWhitespaces; +struct DataHandler { + std::vector buf; - /** - * Current source range of the data in the buffer. - */ - SourceLocation location; + SourceOffset start; + SourceOffset end; - /** - * Current buffer containing all read characters. - */ - std::stringstream buffer; + DataHandler() : start(0), end(0) {} - /** - * Set to false, once a non-whitespace character was reached. - */ - bool empty; + bool isEmpty() { return buf.empty(); } - /** - * Set to true if a whitespace was found -- these are normalized to a single - * space. - */ - bool hasWhitespace; - -public: - /** - * Constructor of the DataHandler class. - * - * @param reader is the CharReader that should be used for reading the data - * location. - * @param preserveWhitespaces should be set to true if all whitespaces - * should be preserved (for preformated environments). - */ - DataHandler(const CharReader &reader, bool preserveWhitespaces = false) - : reader(reader), - preserveWhitespaces(preserveWhitespaces), - location(reader.getSourceId()), - empty(true), - hasWhitespace(false) + void append(char c, SourceOffset charStart, SourceOffset charEnd) { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; } - /** - * Appends the given character to the internal buffer. - * - * @param c is the character that should be appended. - * @param wasEscaped is set to true if the character was escaped (prepended - * with a backslash), this allows whitespace characters to be explicitly - * included. 
- */ - void append(char c, bool wasEscaped = false) + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) { - // Check whether the character is a whitespace - const bool isWhitespace = - !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c); - - // Trim leading and trailing whitespaces - if (isWhitespace) { - if (!empty) { - hasWhitespace = true; - } - } else { - // Compress whitespaces to a single space - if (hasWhitespace) { - buffer << ' '; - hasWhitespace = false; - } - - // Append the character - buffer << c; - - // Update the "empty" flag and set the start and end offset - if (empty) { - location.setStart(reader.getOffset()); - empty = false; - } - location.setEnd(reader.getPeekOffset()); + if (isEmpty()) { + start = stringStart; } + std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); + end = stringEnd; } - /** - * Returns true if no non-whitespace character has been found until now. - * - * @return true if the internal buffer is still empty. - */ - bool isEmpty() { return empty; } - - /** - * Returns a variant containg the read data and its location. - * - * @return a variant with a string value containing the read data and the - * location being set to - */ - Variant getData() + Variant toVariant(SourceId sourceId) { - Variant res = Variant::fromString(buffer.str()); - res.setLocation(location); + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); return res; } }; @@ -153,35 +67,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader, Logger &logger) : reader(reader), logger(logger), fieldIdx(0) { + tokenBackslash = tokenizer.registerToken("\\"); + tokenLinebreak = tokenizer.registerToken("\n"); + tokenLineComment = tokenizer.registerToken("%"); + tokenBlockCommentStart = tokenizer.registerToken("%{"); + tokenBlockCommentEnd = tokenizer.registerToken("}%"); } -/* Comment handling */ - void PlainFormatStreamReader::parseBlockComment() { - char c; - BlockCommentState state = BlockCommentState::DEFAULT; - while (reader.read(c)) { - switch (state) { - case BlockCommentState::DEFAULT: - if (c == '%') { - state = BlockCommentState::HAS_PERCENT; - } else if (c == '}') { - state = BlockCommentState::HAS_CURLY_CLOSE; - } - break; - case BlockCommentState::HAS_PERCENT: - if (c == '{') { - parseBlockComment(); - } - state = BlockCommentState::DEFAULT; - break; - case BlockCommentState::HAS_CURLY_CLOSE: - if (c == '%') { - return; - } - state = BlockCommentState::DEFAULT; - break; + DynamicToken token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == tokenBlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == tokenBlockCommentStart) { + depth++; } } @@ -189,102 +94,84 @@ void PlainFormatStreamReader::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void PlainFormatStreamReader::parseComment() +void PlainFormatStreamReader::parseLineComment() { char c; - bool first = true; reader.consumePeek(); while (reader.read(c)) { - // Continue parsing a block comment if a '{' is found - if (c == '{' && first) { - parseBlockComment(); - return; - } if (c == '\n') { return; } - first = false; } } -/* Top level parse function */ - -static const std::unordered_set EscapeableCharacters{'\\', '<', '>', - '{', '}', '%'}; - PlainFormatStreamReader::State PlainFormatStreamReader::parse() { // Macro (sorry for that) used for checking whether there is data to issue, and // if 
yes, aborting the loop, allowing for a reentry on a later parse call by // resetting the peek cursor -#define CHECK_ISSUE_DATA() \ - { \ - if (!dataHandler.isEmpty()) { \ - reader.resetPeek(); \ - abort = true; \ - break; \ - } \ +#define CHECK_ISSUE_DATA() \ + { \ + if (!dataHandler.isEmpty()) { \ + reader.resetPeek(); \ + abort = true; \ + break; \ + } \ } - // Data handler - DataHandler dataHandler(reader); + // Handler for incoming data + DataHandler dataHandler; // Variable set to true if the parser loop should be left bool abort = false; - // Happily add characters to the dataHandler and handle escaping until a - // special character is reached. Then go to a specialiced parsing routine - char c; - while (!abort && reader.peek(c)) { - switch (c) { - case '\\': - reader.peek(c); - // Check whether this backslash just escaped some special or - // whitespace character or was the beginning of a command - if (EscapeableCharacters.count(c) == 0 && - !Utils::isWhitespace(c)) { - CHECK_ISSUE_DATA(); - // TODO: Parse command (starting from the backslash) - return State::COMMAND; - } - // A character was escaped, add it to the buffer, with the - // wasEscaped flag set to true - dataHandler.append(c, true); - break; - case '<': - // TODO: Annotations - break; - case '>': - // TODO: Annotations - break; - case '{': - // TODO: Issue start of field - break; - case '}': - // TODO: Issue end of field - case '%': - CHECK_ISSUE_DATA(); - parseComment(); - break; - case '\n': + // Read tokens until the outer loop should be left + DynamicToken token; + while (!abort && tokenizer.peek(reader, token)) { + // Check whether this backslash just escaped some special or + // whitespace character or was the beginning of a command + if (token.type == tokenBackslash) { + // Check whether this character could be the start of a command + char c; + reader.consumePeek(); + reader.peek(c); + if (Utils::isIdentifierStart(c)) { CHECK_ISSUE_DATA(); - reader.consumePeek(); - return State::LINEBREAK; - default: - dataHandler.append(c, false); + // TODO: Parse a command + return State::COMMAND; + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + dataHandler.append(c, token.location.getStart(), + reader.getPeekOffset()); + } else if (token.type == tokenLineComment) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + parseLineComment(); + } else if (token.type == tokenBlockCommentStart) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + parseBlockComment(); + } else if (token.type == tokenLinebreak) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + return State::LINEBREAK; + } else if (token.type == TextToken) { + dataHandler.append(token.content, token.location.getStart(), + token.location.getEnd()); } // Consume the peeked character if we did not abort, otherwise abort if (!abort) { reader.consumePeek(); - } else { - break; } } // Send out pending output data, otherwise we are at the end of the stream if (!dataHandler.isEmpty()) { - data = dataHandler.getData(); + data = dataHandler.toVariant(reader.getSourceId()); return State::DATA; } return State::END; diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp index 1a136cd..b2ea378 100644 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ b/src/plugins/plain/PlainFormatStreamReader.hpp @@ -31,6 +31,8 @@ #include +#include "DynamicTokenizer.hpp" + namespace ousia { // Forward 
declarations @@ -122,6 +124,11 @@ private: */ Logger &logger; + /** + * Tokenizer instance used to read individual tokens from the text. + */ + DynamicTokenizer tokenizer; + /** * Variant containing the current command name (is always a string variant, * but additionally contains the correct location of the name). @@ -140,6 +147,31 @@ private: */ Variant data; + /** + * Id of the backslash token. + */ + TokenTypeId tokenBackslash; + + /** + * Id of the linebreak token. + */ + TokenTypeId tokenLinebreak; + + /** + * Id of the line comment token. + */ + TokenTypeId tokenLineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId tokenBlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId tokenBlockCommentEnd; + /** * Contains the field index of the current command. */ @@ -153,7 +185,7 @@ private: /** * Function used internally to parse a generic comment. */ - void parseComment(); + void parseLineComment(); public: /** -- cgit v1.2.3 From f6e7859a835375c25226719a46df99ec11037599 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 19:01:34 +0100 Subject: added some comments --- src/plugins/plain/PlainFormatStreamReader.cpp | 51 ++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index f0721a0..498cd43 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -25,16 +25,49 @@ namespace ousia { namespace { -struct DataHandler { + +/** + * Class used internally to collect data issued via the "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ std::vector buf; + /** + * Start location of the character data. + */ SourceOffset start; + + /** + * End location of the character data. + */ SourceOffset end; +public: + + /** + * Default constructor, initializes start and end with zeros. + */ DataHandler() : start(0), end(0) {} + /** + * Returns true if the internal buffer is empty. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ bool isEmpty() { return buf.empty(); } + /** + * Appends a single character to the internal buffer. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ void append(char c, SourceOffset charStart, SourceOffset charEnd) { if (isEmpty()) { @@ -44,6 +77,13 @@ struct DataHandler { end = charEnd; } + /** + * Appends a string to the internal buffer. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ void append(const std::string &s, SourceOffset stringStart, SourceOffset stringEnd) { @@ -54,6 +94,15 @@ struct DataHandler { end = stringEnd; } + /** + * Converts the internal buffer to a variant with attached location + * information. + * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ Variant toVariant(SourceId sourceId) { Variant res = Variant::fromString(std::string(buf.data(), buf.size())); -- cgit v1.2.3
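
A minimal usage sketch of the reworked DynamicTokenizer API from the commits above, with the reader passed to read() and peek() instead of the constructor. The class, method and type names are taken from the diffs; the include paths and the main() scaffold are assumptions for illustration only:

#include <iostream>

#include "core/common/CharReader.hpp"         // assumed include paths
#include "plugins/plain/DynamicTokenizer.hpp"

using namespace ousia;

int main()
{
	// The tokenizer holds no state that depends on the reader, so a
	// single instance can be reused across any number of CharReaders.
	DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE};
	const TokenTypeId colon = tokenizer.registerToken(":");

	CharReader reader{"test1:test2"};
	DynamicToken token;

	// Peek ahead without advancing the read cursor...
	while (tokenizer.peek(reader, token) && token.type != colon) {
		// ...only the peek position moves here, getOffset() stays at 0.
	}

	// Rewind the peek position and consume the tokens for real.
	reader.resetPeek();
	while (tokenizer.read(reader, token)) {
		std::cout << token.content << " @ " << token.location.getStart()
		          << ".." << token.location.getEnd() << std::endl;
	}
	return 0;
}

Since read() only differs from peek() in advancing the read cursor, the two calls can be interleaved freely on the same reader, as the simplePeekToken test above demonstrates.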