5 files changed, 162 insertions, 152 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d73248..f9b224d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,7 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml
 )
 
 ADD_LIBRARY(ousia_plain
-	src/plugins/plain/DynamicTokenTree
+	src/plugins/plain/TokenTrie
 	src/plugins/plain/PlainFormatStreamReader
 )
 
@@ -324,7 +324,7 @@ IF(TEST)
 	)
 
 	ADD_EXECUTABLE(ousia_test_plain
-		test/plugins/plain/DynamicTokenTreeTest
+		test/plugins/plain/TokenTrieTest
 		test/plugins/plain/PlainFormatStreamReaderTest
 	)
 
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp
index 8b7bfc2..4a0430b 100644
--- a/src/plugins/plain/DynamicTokenTree.cpp
+++ b/src/plugins/plain/TokenTrie.cpp
@@ -16,18 +16,18 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include "DynamicTokenTree.hpp"
+#include "TokenTrie.hpp"
 
 namespace ousia {
 
 /* Class DynamicTokenTree::Node */
 
-DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+TokenTrie::Node::Node() : type(EmptyToken) {}
 
 /* Class DynamicTokenTree */
 
-bool DynamicTokenTree::registerToken(const std::string &token,
-                                     const TokenDescriptor *descriptor) noexcept
+bool TokenTrie::registerToken(const std::string &token,
+                              TokenTypeId type) noexcept
 {
 	// Abort if the token is empty -- this would taint the root node
 	if (token.empty()) {
@@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token,
 		const char c = token[i];
 		auto it = node->children.find(c);
 		if (it == node->children.end()) {
-			it = node->children.emplace(c, std::unique_ptr<Node>(new Node{}))
-			         .first;
+			it = node->children.emplace(c, std::make_shared<Node>()).first;
 		}
 		node = it->second.get();
 	}
 
-	// If the resulting node already has a descriptor set, we're screwed.
-	if (node->descriptor != nullptr) {
+	// If the resulting node already has a type set, we're screwed.
+	if (node->type != EmptyToken) {
 		return false;
 	}
 
-	// Otherwise just set the descriptor to the given descriptor.
-	node->descriptor = descriptor;
+	// Otherwise just set the type to the given type.
+	node->type = type;
 	return true;
 }
 
-bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
 {
 	// We cannot remove empty tokens as we need to access the fist character
 	// upfront
@@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
 			return false;
 		}
 
-		// Reset the subtree handler if this node has another descriptor
+		// Reset the subtree handler if this node has another type
 		node = it->second.get();
-		if ((node->descriptor != nullptr || node->children.size() > 1) &&
+		if ((node->type != EmptyToken || node->children.size() > 1) &&
 		    (i + 1 != token.size())) {
 			subtreeRoot = node;
 			subtreeKey = token[i + 1];
 		}
 	}
 
-	// If the node descriptor is already nullptr, we cannot do anything here
-	if (node->descriptor == nullptr) {
+	// If the node type is already EmptyToken, we cannot do anything here
+	if (node->type == EmptyToken) {
 		return false;
 	}
 
 	// If the target node has children, we cannot delete the subtree. Set the
-	// descriptor to nullptr instead
+	// type to EmptyToken instead
 	if (!node->children.empty()) {
-		node->descriptor = nullptr;
+		node->type = EmptyToken;
 		return true;
 	}
 
@@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
 	return true;
 }
 
-const TokenDescriptor *DynamicTokenTree::hasToken(
-    const std::string &token) const noexcept
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
 {
 	Node const *node = &root;
 	for (size_t i = 0; i < token.size(); i++) {
 		const char c = token[i];
 		auto it = node->children.find(c);
 		if (it == node->children.end()) {
-			return nullptr;
+			return EmptyToken;
 		}
 		node = it->second.get();
 	}
-	return node->descriptor;
+	return node->type;
 }
 }
 
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp
index c5dc4de..36c2ffa 100644
--- a/src/plugins/plain/DynamicTokenTree.hpp
+++ b/src/plugins/plain/TokenTrie.hpp
@@ -17,54 +17,61 @@
 */
 
 /**
- * @file DynamicTokenTree.hpp
+ * @file TokenTrie.hpp
  *
- * Class representing a token tree that can be updated dynamically.
+ * Class representing a token trie that can be updated dynamically.
  *
  * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
  * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
  */
 
-#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
-#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
 
+#include <cstdint>
 #include <memory>
+#include <limits>
 #include <unordered_map>
 
 namespace ousia {
 
-class TokenDescriptor;
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
 
 /**
- * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
- * identify the longest consecutive token in the text. This is equivalent to a
- * prefix trie.
+ * Token which is not a token.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Token which represents a text token.
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
  *
- * A token tree is a construct that structures all special tokens a
- * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
- * the token tree would look like this:
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
+ * three. Then the token tree would look like this:
  *
  * \code{*.txt}
- * a
- * | \
- * a $
- * | \
- * b c
- * | |
- * $ $
+ *        ~ (0)
+ *       /     \
+ *      a (2)  b (0)
+ *      |      |
+ *      a (0)  a (0)
+ *      |      |
+ *      b (1)  c (0)
  * \endcode
  *
- * Every node in the token tree is a valid end state that has a $ attached to
- * it. During the search algorithm the Tokenizer goes through the tree and
- * stores the last valid position. If a character follows that does not lead to
- * a new node in the TokenTree the search ends (and starts again at this
- * character). The token corresponding to the last valid position is returned.
- *
- * This allows us to uniquely identify the matching token given a certain
- * input text. Note that this is a greedy matching approach that does not
- * work if you're using truly ambiguous tokens (that have the same text).
+ * Where the number indicates the corresponding token descriptor identifier.
  */
-class DynamicTokenTree {
+class TokenTrie {
 public:
 	/**
 	 * Structure used to build the node tree.
@@ -73,7 +80,7 @@ public:
 		/**
 		 * Type used for the child map.
 		 */
-		using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
 
 		/**
 		 * Map from single characters at the corresponding child nodes.
@@ -84,7 +91,7 @@ public:
 		 * Reference at the corresponding token descriptor. Set to nullptr if
 		 * no token is attached to this node.
 		 */
-		TokenDescriptor const *descriptor;
+		TokenTypeId type;
 
 		/**
 		 * Default constructor, initializes the descriptor with nullptr.
@@ -105,11 +112,10 @@ public:
 	 *
 	 * @param token is the character sequence that should be registered as
 	 * token.
-	 * @param descriptor is the descriptor that should be set for this token.
+	 * @param type is the descriptor that should be set for this token.
 	 * @return true if the operation is successful, false otherwise.
 	 */
-	bool registerToken(const std::string &token,
-	                   const TokenDescriptor *descriptor) noexcept;
+	bool registerToken(const std::string &token, TokenTypeId type) noexcept;
 
 	/**
 	 * Unregisters the token from the token tree. Returns true if the token was
@@ -128,9 +134,17 @@ public:
 	 * @return the attached token descriptor or nullptr if the given token is
 	 * not found.
 	 */
-	const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+	TokenTypeId hasToken(const std::string &token) const noexcept;
+
+	/**
+	 * Returns a reference at the root node to be used for traversing the token
+	 * tree.
+	 *
+	 * @return a reference at the root node.
+	 */
+	const Node *getRoot() const noexcept { return &root; }
 };
 }
 
-#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
 
diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp
deleted file mode 100644
index 5ae414c..0000000
--- a/test/plugins/plain/DynamicTokenTreeTest.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
-    Ousía
-    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <gtest/gtest.h>
-
-#include <plugins/plain/DynamicTokenTree.hpp>
-
-namespace ousia {
-
-static const TokenDescriptor *d1 = reinterpret_cast<const TokenDescriptor*>(1);
-static const TokenDescriptor *d2 = reinterpret_cast<const TokenDescriptor*>(2);
-static const TokenDescriptor *d3 = reinterpret_cast<const TokenDescriptor*>(3);
-static const TokenDescriptor *d4 = reinterpret_cast<const TokenDescriptor*>(4);
-
-TEST(DynamicTokenTree, registerToken)
-{
-	DynamicTokenTree tree;
-
-	ASSERT_TRUE(tree.registerToken("a", d1));
-	ASSERT_TRUE(tree.registerToken("ab", d2));
-	ASSERT_TRUE(tree.registerToken("b", d3));
-	ASSERT_TRUE(tree.registerToken("hello", d4));
-
-	ASSERT_FALSE(tree.registerToken("", d1));
-	ASSERT_FALSE(tree.registerToken("a", d4));
-	ASSERT_FALSE(tree.registerToken("ab", d4));
-	ASSERT_FALSE(tree.registerToken("b", d4));
-	ASSERT_FALSE(tree.registerToken("hello", d4));
-
-	ASSERT_EQ(d1, tree.hasToken("a"));
-	ASSERT_EQ(d2, tree.hasToken("ab"));
-	ASSERT_EQ(d3, tree.hasToken("b"));
-	ASSERT_EQ(d4, tree.hasToken("hello"));
-	ASSERT_EQ(nullptr, tree.hasToken(""));
-	ASSERT_EQ(nullptr, tree.hasToken("abc"));
-}
-
-TEST(DynamicTokenTree, unregisterToken)
-{
-	DynamicTokenTree tree;
-
-	ASSERT_TRUE(tree.registerToken("a", d1));
-	ASSERT_FALSE(tree.registerToken("a", d4));
-
-	ASSERT_TRUE(tree.registerToken("ab", d2));
-	ASSERT_FALSE(tree.registerToken("ab", d4));
-
-	ASSERT_TRUE(tree.registerToken("b", d3));
-	ASSERT_FALSE(tree.registerToken("b", d4));
-
-	ASSERT_EQ(d1, tree.hasToken("a"));
-	ASSERT_EQ(d2, tree.hasToken("ab"));
-	ASSERT_EQ(d3, tree.hasToken("b"));
-
-	ASSERT_TRUE(tree.unregisterToken("a"));
-	ASSERT_FALSE(tree.unregisterToken("a"));
-
-	ASSERT_EQ(nullptr, tree.hasToken("a"));
-	ASSERT_EQ(d2, tree.hasToken("ab"));
-	ASSERT_EQ(d3, tree.hasToken("b"));
-
-	ASSERT_TRUE(tree.unregisterToken("b"));
-	ASSERT_FALSE(tree.unregisterToken("b"));
-
-	ASSERT_EQ(nullptr, tree.hasToken("a"));
-	ASSERT_EQ(d2, tree.hasToken("ab"));
-	ASSERT_EQ(nullptr, tree.hasToken("b"));
-
-	ASSERT_TRUE(tree.unregisterToken("ab"));
-	ASSERT_FALSE(tree.unregisterToken("ab"));
-
-	ASSERT_EQ(nullptr, tree.hasToken("a"));
-	ASSERT_EQ(nullptr, tree.hasToken("ab"));
-	ASSERT_EQ(nullptr, tree.hasToken("b"));
-}
-
-
-}
-
diff --git a/test/plugins/plain/TokenTrieTest.cpp b/test/plugins/plain/TokenTrieTest.cpp
new file mode 100644
index 0000000..d378fdf
--- /dev/null
+++ b/test/plugins/plain/TokenTrieTest.cpp
@@ -0,0 +1,92 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <plugins/plain/TokenTrie.hpp>
+
+namespace ousia {
+
+static const TokenTypeId t1 = 0;
+static const TokenTypeId t2 = 1;
+static const TokenTypeId t3 = 2;
+static const TokenTypeId t4 = 3;
+
+TEST(TokenTrie, registerToken)
+{
+	TokenTrie tree;
+
+	ASSERT_TRUE(tree.registerToken("a", t1));
+	ASSERT_TRUE(tree.registerToken("ab", t2));
+	ASSERT_TRUE(tree.registerToken("b", t3));
+	ASSERT_TRUE(tree.registerToken("hello", t4));
+
+	ASSERT_FALSE(tree.registerToken("", t1));
+	ASSERT_FALSE(tree.registerToken("a", t4));
+	ASSERT_FALSE(tree.registerToken("ab", t4));
+	ASSERT_FALSE(tree.registerToken("b", t4));
+	ASSERT_FALSE(tree.registerToken("hello", t4));
+
+	ASSERT_EQ(t1, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+	ASSERT_EQ(t4, tree.hasToken("hello"));
+	ASSERT_EQ(EmptyToken, tree.hasToken(""));
+	ASSERT_EQ(EmptyToken, tree.hasToken("abc"));
+}
+
+TEST(TokenTrie, unregisterToken)
+{
+	TokenTrie tree;
+
+	ASSERT_TRUE(tree.registerToken("a", t1));
+	ASSERT_FALSE(tree.registerToken("a", t4));
+
+	ASSERT_TRUE(tree.registerToken("ab", t2));
+	ASSERT_FALSE(tree.registerToken("ab", t4));
+
+	ASSERT_TRUE(tree.registerToken("b", t3));
+	ASSERT_FALSE(tree.registerToken("b", t4));
+
+	ASSERT_EQ(t1, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("a"));
+	ASSERT_FALSE(tree.unregisterToken("a"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("b"));
+	ASSERT_FALSE(tree.unregisterToken("b"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("ab"));
+	ASSERT_FALSE(tree.unregisterToken("ab"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("ab"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+}
+}
+