diff options
-rw-r--r-- | CMakeLists.txt | 4 | ||||
-rw-r--r-- | src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp) | 40 | ||||
-rw-r--r-- | src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp) | 84 | ||||
-rw-r--r-- | test/plugins/plain/DynamicTokenTreeTest.cpp | 94 | ||||
-rw-r--r-- | test/plugins/plain/TokenTrieTest.cpp | 92 |
5 files changed, 162 insertions, 152 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d73248..f9b224d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain - src/plugins/plain/DynamicTokenTree + src/plugins/plain/TokenTrie src/plugins/plain/PlainFormatStreamReader ) @@ -324,7 +324,7 @@ IF(TEST) ) ADD_EXECUTABLE(ousia_test_plain - test/plugins/plain/DynamicTokenTreeTest + test/plugins/plain/TokenTrieTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp index 8b7bfc2..4a0430b 100644 --- a/src/plugins/plain/DynamicTokenTree.cpp +++ b/src/plugins/plain/TokenTrie.cpp @@ -16,18 +16,18 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include "DynamicTokenTree.hpp" +#include "TokenTrie.hpp" namespace ousia { /* Class DynamicTokenTree::Node */ -DynamicTokenTree::Node::Node() : descriptor(nullptr) {} +TokenTrie::Node::Node() : type(EmptyToken) {} /* Class DynamicTokenTree */ -bool DynamicTokenTree::registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token, const char c = token[i]; auto it = node->children.find(c); if (it == node->children.end()) { - it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})) - .first; + it = node->children.emplace(c, std::make_shared<Node>()).first; } node = it->second.get(); } - // If the resulting node already has a descriptor set, we're screwed. - if (node->descriptor != nullptr) { + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { return false; } - // Otherwise just set the descriptor to the given descriptor. - node->descriptor = descriptor; + // Otherwise just set the type to the given type. + node->type = type; return true; } -bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept +bool TokenTrie::unregisterToken(const std::string &token) noexcept { // We cannot remove empty tokens as we need to access the fist character // upfront @@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept return false; } - // Reset the subtree handler if this node has another descriptor + // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->descriptor != nullptr || node->children.size() > 1) && + if ((node->type != EmptyToken || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; } } - // If the node descriptor is already nullptr, we cannot do anything here - if (node->descriptor == nullptr) { + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { return false; } // If the target node has children, we cannot delete the subtree. Set the - // descriptor to nullptr instead + // type to EmptyToken instead if (!node->children.empty()) { - node->descriptor = nullptr; + node->type = EmptyToken; return true; } @@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept return true; } -const TokenDescriptor *DynamicTokenTree::hasToken( - const std::string &token) const noexcept +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept { Node const *node = &root; for (size_t i = 0; i < token.size(); i++) { const char c = token[i]; auto it = node->children.find(c); if (it == node->children.end()) { - return nullptr; + return EmptyToken; } node = it->second.get(); } - return node->descriptor; + return node->type; } } diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp index c5dc4de..36c2ffa 100644 --- a/src/plugins/plain/DynamicTokenTree.hpp +++ b/src/plugins/plain/TokenTrie.hpp @@ -17,54 +17,61 @@ */ /** - * @file DynamicTokenTree.hpp + * @file TokenTrie.hpp * - * Class representing a token tree that can be updated dynamically. + * Class representing a token trie that can be updated dynamically. * * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ -#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ +#include <cstdint> #include <memory> +#include <limits> #include <unordered_map> namespace ousia { -class TokenDescriptor; +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; /** - * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to - * identify the longest consecutive token in the text. This is equivalent to a - * prefix trie. + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. * - * A token tree is a construct that structures all special tokens a - * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then - * the token tree would look like this: + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: * * \code{*.txt} - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) * \endcode * - * Every node in the token tree is a valid end state that has a $ attached to - * it. During the search algorithm the Tokenizer goes through the tree and - * stores the last valid position. If a character follows that does not lead to - * a new node in the TokenTree the search ends (and starts again at this - * character). The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). + * Where the number indicates the corresponding token descriptor identifier. */ -class DynamicTokenTree { +class TokenTrie { public: /** * Structure used to build the node tree. @@ -73,7 +80,7 @@ public: /** * Type used for the child map. */ - using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>; + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; /** * Map from single characters at the corresponding child nodes. @@ -84,7 +91,7 @@ public: * Reference at the corresponding token descriptor. Set to nullptr if * no token is attached to this node. */ - TokenDescriptor const *descriptor; + TokenTypeId type; /** * Default constructor, initializes the descriptor with nullptr. @@ -105,11 +112,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param descriptor is the descriptor that should be set for this token. + * @param type is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept; + bool registerToken(const std::string &token, TokenTypeId type) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was @@ -128,9 +134,17 @@ public: * @return the attached token descriptor or nullptr if the given token is * not found. */ - const TokenDescriptor* hasToken(const std::string &token) const noexcept; + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } }; } -#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp deleted file mode 100644 index 5ae414c..0000000 --- a/test/plugins/plain/DynamicTokenTreeTest.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <gtest/gtest.h> - -#include <plugins/plain/DynamicTokenTree.hpp> - -namespace ousia { - -static const TokenDescriptor *d1 = reinterpret_cast<const TokenDescriptor*>(1); -static const TokenDescriptor *d2 = reinterpret_cast<const TokenDescriptor*>(2); -static const TokenDescriptor *d3 = reinterpret_cast<const TokenDescriptor*>(3); -static const TokenDescriptor *d4 = reinterpret_cast<const TokenDescriptor*>(4); - -TEST(DynamicTokenTree, registerToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_TRUE(tree.registerToken("hello", d4)); - - ASSERT_FALSE(tree.registerToken("", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - ASSERT_FALSE(tree.registerToken("b", d4)); - ASSERT_FALSE(tree.registerToken("hello", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - ASSERT_EQ(d4, tree.hasToken("hello")); - ASSERT_EQ(nullptr, tree.hasToken("")); - ASSERT_EQ(nullptr, tree.hasToken("abc")); -} - -TEST(DynamicTokenTree, unregisterToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_FALSE(tree.registerToken("b", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(nullptr, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); -} - - -} - diff --git a/test/plugins/plain/TokenTrieTest.cpp b/test/plugins/plain/TokenTrieTest.cpp new file mode 100644 index 0000000..d378fdf --- /dev/null +++ b/test/plugins/plain/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <gtest/gtest.h> + +#include <plugins/plain/TokenTrie.hpp> + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); +} +} + |