summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt4
-rw-r--r--src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp)40
-rw-r--r--src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp)84
-rw-r--r--test/plugins/plain/DynamicTokenTreeTest.cpp94
-rw-r--r--test/plugins/plain/TokenTrieTest.cpp92
5 files changed, 162 insertions, 152 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d73248..f9b224d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,7 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml
)
ADD_LIBRARY(ousia_plain
- src/plugins/plain/DynamicTokenTree
+ src/plugins/plain/TokenTrie
src/plugins/plain/PlainFormatStreamReader
)
@@ -324,7 +324,7 @@ IF(TEST)
)
ADD_EXECUTABLE(ousia_test_plain
- test/plugins/plain/DynamicTokenTreeTest
+ test/plugins/plain/TokenTrieTest
test/plugins/plain/PlainFormatStreamReaderTest
)
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp
index 8b7bfc2..4a0430b 100644
--- a/src/plugins/plain/DynamicTokenTree.cpp
+++ b/src/plugins/plain/TokenTrie.cpp
@@ -16,18 +16,18 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include "DynamicTokenTree.hpp"
+#include "TokenTrie.hpp"
namespace ousia {
/* Class DynamicTokenTree::Node */
-DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+TokenTrie::Node::Node() : type(EmptyToken) {}
/* Class DynamicTokenTree */
-bool DynamicTokenTree::registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept
+bool TokenTrie::registerToken(const std::string &token,
+ TokenTypeId type) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token,
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- it = node->children.emplace(c, std::unique_ptr<Node>(new Node{}))
- .first;
+ it = node->children.emplace(c, std::make_shared<Node>()).first;
}
node = it->second.get();
}
- // If the resulting node already has a descriptor set, we're screwed.
- if (node->descriptor != nullptr) {
+ // If the resulting node already has a type set, we're screwed.
+ if (node->type != EmptyToken) {
return false;
}
- // Otherwise just set the descriptor to the given descriptor.
- node->descriptor = descriptor;
+ // Otherwise just set the type to the given type.
+ node->type = type;
return true;
}
-bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
{
// We cannot remove empty tokens as we need to access the fist character
// upfront
@@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return false;
}
- // Reset the subtree handler if this node has another descriptor
+ // Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->descriptor != nullptr || node->children.size() > 1) &&
+ if ((node->type != EmptyToken || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
}
}
- // If the node descriptor is already nullptr, we cannot do anything here
- if (node->descriptor == nullptr) {
+ // If the node type is already EmptyToken, we cannot do anything here
+ if (node->type == EmptyToken) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
- // descriptor to nullptr instead
+ // type to EmptyToken instead
if (!node->children.empty()) {
- node->descriptor = nullptr;
+ node->type = EmptyToken;
return true;
}
@@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return true;
}
-const TokenDescriptor *DynamicTokenTree::hasToken(
- const std::string &token) const noexcept
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
{
Node const *node = &root;
for (size_t i = 0; i < token.size(); i++) {
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- return nullptr;
+ return EmptyToken;
}
node = it->second.get();
}
- return node->descriptor;
+ return node->type;
}
}
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp
index c5dc4de..36c2ffa 100644
--- a/src/plugins/plain/DynamicTokenTree.hpp
+++ b/src/plugins/plain/TokenTrie.hpp
@@ -17,54 +17,61 @@
*/
/**
- * @file DynamicTokenTree.hpp
+ * @file TokenTrie.hpp
*
- * Class representing a token tree that can be updated dynamically.
+ * Class representing a token trie that can be updated dynamically.
*
* @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
-#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
+#include <cstdint>
#include <memory>
+#include <limits>
#include <unordered_map>
namespace ousia {
-class TokenDescriptor;
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
/**
- * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
- * identify the longest consecutive token in the text. This is equivalent to a
- * prefix trie.
+ * Token which is not a token.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Token which represents a text token.
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
*
- * A token tree is a construct that structures all special tokens a
- * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
- * the token tree would look like this:
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
+ * three. Then the token tree would look like this:
*
* \code{*.txt}
- * a
- * | \
- * a $
- * | \
- * b c
- * | |
- * $ $
+ * ~ (0)
+ * / \
+ * a (2) b (0)
+ * | |
+ * a (0) a (0)
+ * | |
+ * b (1) c (0)
* \endcode
*
- * Every node in the token tree is a valid end state that has a $ attached to
- * it. During the search algorithm the Tokenizer goes through the tree and
- * stores the last valid position. If a character follows that does not lead to
- * a new node in the TokenTree the search ends (and starts again at this
- * character). The token corresponding to the last valid position is returned.
- *
- * This allows us to uniquely identify the matching token given a certain
- * input text. Note that this is a greedy matching approach that does not
- * work if you're using truly ambiguous tokens (that have the same text).
+ * Where the number indicates the corresponding token descriptor identifier.
*/
-class DynamicTokenTree {
+class TokenTrie {
public:
/**
* Structure used to build the node tree.
@@ -73,7 +80,7 @@ public:
/**
* Type used for the child map.
*/
- using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+ using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
/**
* Map from single characters at the corresponding child nodes.
@@ -84,7 +91,7 @@ public:
* Reference at the corresponding token descriptor. Set to nullptr if
* no token is attached to this node.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* Default constructor, initializes the descriptor with nullptr.
@@ -105,11 +112,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param descriptor is the descriptor that should be set for this token.
+ * @param type is the descriptor that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept;
+ bool registerToken(const std::string &token, TokenTypeId type) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
@@ -128,9 +134,17 @@ public:
* @return the attached token descriptor or nullptr if the given token is
* not found.
*/
- const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+ TokenTypeId hasToken(const std::string &token) const noexcept;
+
+ /**
+ * Returns a reference at the root node to be used for traversing the token
+ * tree.
+ *
+ * @return a reference at the root node.
+ */
+ const Node *getRoot() const noexcept { return &root; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp
deleted file mode 100644
index 5ae414c..0000000
--- a/test/plugins/plain/DynamicTokenTreeTest.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <gtest/gtest.h>
-
-#include <plugins/plain/DynamicTokenTree.hpp>
-
-namespace ousia {
-
-static const TokenDescriptor *d1 = reinterpret_cast<const TokenDescriptor*>(1);
-static const TokenDescriptor *d2 = reinterpret_cast<const TokenDescriptor*>(2);
-static const TokenDescriptor *d3 = reinterpret_cast<const TokenDescriptor*>(3);
-static const TokenDescriptor *d4 = reinterpret_cast<const TokenDescriptor*>(4);
-
-TEST(DynamicTokenTree, registerToken)
-{
- DynamicTokenTree tree;
-
- ASSERT_TRUE(tree.registerToken("a", d1));
- ASSERT_TRUE(tree.registerToken("ab", d2));
- ASSERT_TRUE(tree.registerToken("b", d3));
- ASSERT_TRUE(tree.registerToken("hello", d4));
-
- ASSERT_FALSE(tree.registerToken("", d1));
- ASSERT_FALSE(tree.registerToken("a", d4));
- ASSERT_FALSE(tree.registerToken("ab", d4));
- ASSERT_FALSE(tree.registerToken("b", d4));
- ASSERT_FALSE(tree.registerToken("hello", d4));
-
- ASSERT_EQ(d1, tree.hasToken("a"));
- ASSERT_EQ(d2, tree.hasToken("ab"));
- ASSERT_EQ(d3, tree.hasToken("b"));
- ASSERT_EQ(d4, tree.hasToken("hello"));
- ASSERT_EQ(nullptr, tree.hasToken(""));
- ASSERT_EQ(nullptr, tree.hasToken("abc"));
-}
-
-TEST(DynamicTokenTree, unregisterToken)
-{
- DynamicTokenTree tree;
-
- ASSERT_TRUE(tree.registerToken("a", d1));
- ASSERT_FALSE(tree.registerToken("a", d4));
-
- ASSERT_TRUE(tree.registerToken("ab", d2));
- ASSERT_FALSE(tree.registerToken("ab", d4));
-
- ASSERT_TRUE(tree.registerToken("b", d3));
- ASSERT_FALSE(tree.registerToken("b", d4));
-
- ASSERT_EQ(d1, tree.hasToken("a"));
- ASSERT_EQ(d2, tree.hasToken("ab"));
- ASSERT_EQ(d3, tree.hasToken("b"));
-
- ASSERT_TRUE(tree.unregisterToken("a"));
- ASSERT_FALSE(tree.unregisterToken("a"));
-
- ASSERT_EQ(nullptr, tree.hasToken("a"));
- ASSERT_EQ(d2, tree.hasToken("ab"));
- ASSERT_EQ(d3, tree.hasToken("b"));
-
- ASSERT_TRUE(tree.unregisterToken("b"));
- ASSERT_FALSE(tree.unregisterToken("b"));
-
- ASSERT_EQ(nullptr, tree.hasToken("a"));
- ASSERT_EQ(d2, tree.hasToken("ab"));
- ASSERT_EQ(nullptr, tree.hasToken("b"));
-
- ASSERT_TRUE(tree.unregisterToken("ab"));
- ASSERT_FALSE(tree.unregisterToken("ab"));
-
- ASSERT_EQ(nullptr, tree.hasToken("a"));
- ASSERT_EQ(nullptr, tree.hasToken("ab"));
- ASSERT_EQ(nullptr, tree.hasToken("b"));
-}
-
-
-}
-
diff --git a/test/plugins/plain/TokenTrieTest.cpp b/test/plugins/plain/TokenTrieTest.cpp
new file mode 100644
index 0000000..d378fdf
--- /dev/null
+++ b/test/plugins/plain/TokenTrieTest.cpp
@@ -0,0 +1,92 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <plugins/plain/TokenTrie.hpp>
+
+namespace ousia {
+
+static const TokenTypeId t1 = 0;
+static const TokenTypeId t2 = 1;
+static const TokenTypeId t3 = 2;
+static const TokenTypeId t4 = 3;
+
+TEST(TokenTrie, registerToken)
+{
+ TokenTrie tree;
+
+ ASSERT_TRUE(tree.registerToken("a", t1));
+ ASSERT_TRUE(tree.registerToken("ab", t2));
+ ASSERT_TRUE(tree.registerToken("b", t3));
+ ASSERT_TRUE(tree.registerToken("hello", t4));
+
+ ASSERT_FALSE(tree.registerToken("", t1));
+ ASSERT_FALSE(tree.registerToken("a", t4));
+ ASSERT_FALSE(tree.registerToken("ab", t4));
+ ASSERT_FALSE(tree.registerToken("b", t4));
+ ASSERT_FALSE(tree.registerToken("hello", t4));
+
+ ASSERT_EQ(t1, tree.hasToken("a"));
+ ASSERT_EQ(t2, tree.hasToken("ab"));
+ ASSERT_EQ(t3, tree.hasToken("b"));
+ ASSERT_EQ(t4, tree.hasToken("hello"));
+ ASSERT_EQ(EmptyToken, tree.hasToken(""));
+ ASSERT_EQ(EmptyToken, tree.hasToken("abc"));
+}
+
+TEST(TokenTrie, unregisterToken)
+{
+ TokenTrie tree;
+
+ ASSERT_TRUE(tree.registerToken("a", t1));
+ ASSERT_FALSE(tree.registerToken("a", t4));
+
+ ASSERT_TRUE(tree.registerToken("ab", t2));
+ ASSERT_FALSE(tree.registerToken("ab", t4));
+
+ ASSERT_TRUE(tree.registerToken("b", t3));
+ ASSERT_FALSE(tree.registerToken("b", t4));
+
+ ASSERT_EQ(t1, tree.hasToken("a"));
+ ASSERT_EQ(t2, tree.hasToken("ab"));
+ ASSERT_EQ(t3, tree.hasToken("b"));
+
+ ASSERT_TRUE(tree.unregisterToken("a"));
+ ASSERT_FALSE(tree.unregisterToken("a"));
+
+ ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+ ASSERT_EQ(t2, tree.hasToken("ab"));
+ ASSERT_EQ(t3, tree.hasToken("b"));
+
+ ASSERT_TRUE(tree.unregisterToken("b"));
+ ASSERT_FALSE(tree.unregisterToken("b"));
+
+ ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+ ASSERT_EQ(t2, tree.hasToken("ab"));
+ ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+
+ ASSERT_TRUE(tree.unregisterToken("ab"));
+ ASSERT_FALSE(tree.unregisterToken("ab"));
+
+ ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+ ASSERT_EQ(EmptyToken, tree.hasToken("ab"));
+ ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+}
+}
+