diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-08 17:54:09 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-08 17:54:09 +0100 |
commit | b9681594380333a0a3f0011b40ac6542e7022d98 (patch) | |
tree | b46f0ea06f5025aeec2dd23abeec378afef25228 /src/plugins | |
parent | fb0922e57f1a5e1fb8bfbe153dc381d5778e3137 (diff) |
Deleted DynamicTokenTree class, replaced by TokenTrie
Diffstat (limited to 'src/plugins')
-rw-r--r-- | src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp) | 40 | ||||
-rw-r--r-- | src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp) | 84 |
2 files changed, 68 insertions, 56 deletions
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp index 8b7bfc2..4a0430b 100644 --- a/src/plugins/plain/DynamicTokenTree.cpp +++ b/src/plugins/plain/TokenTrie.cpp @@ -16,18 +16,18 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include "DynamicTokenTree.hpp" +#include "TokenTrie.hpp" namespace ousia { /* Class DynamicTokenTree::Node */ -DynamicTokenTree::Node::Node() : descriptor(nullptr) {} +TokenTrie::Node::Node() : type(EmptyToken) {} /* Class DynamicTokenTree */ -bool DynamicTokenTree::registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token, const char c = token[i]; auto it = node->children.find(c); if (it == node->children.end()) { - it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})) - .first; + it = node->children.emplace(c, std::make_shared<Node>()).first; } node = it->second.get(); } - // If the resulting node already has a descriptor set, we're screwed. - if (node->descriptor != nullptr) { + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { return false; } - // Otherwise just set the descriptor to the given descriptor. - node->descriptor = descriptor; + // Otherwise just set the type to the given type. + node->type = type; return true; } -bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept +bool TokenTrie::unregisterToken(const std::string &token) noexcept { // We cannot remove empty tokens as we need to access the fist character // upfront @@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept return false; } - // Reset the subtree handler if this node has another descriptor + // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->descriptor != nullptr || node->children.size() > 1) && + if ((node->type != EmptyToken || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; } } - // If the node descriptor is already nullptr, we cannot do anything here - if (node->descriptor == nullptr) { + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { return false; } // If the target node has children, we cannot delete the subtree. Set the - // descriptor to nullptr instead + // type to EmptyToken instead if (!node->children.empty()) { - node->descriptor = nullptr; + node->type = EmptyToken; return true; } @@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept return true; } -const TokenDescriptor *DynamicTokenTree::hasToken( - const std::string &token) const noexcept +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept { Node const *node = &root; for (size_t i = 0; i < token.size(); i++) { const char c = token[i]; auto it = node->children.find(c); if (it == node->children.end()) { - return nullptr; + return EmptyToken; } node = it->second.get(); } - return node->descriptor; + return node->type; } } diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp index c5dc4de..36c2ffa 100644 --- a/src/plugins/plain/DynamicTokenTree.hpp +++ b/src/plugins/plain/TokenTrie.hpp @@ -17,54 +17,61 @@ */ /** - * @file DynamicTokenTree.hpp + * @file TokenTrie.hpp * - * Class representing a token tree that can be updated dynamically. + * Class representing a token trie that can be updated dynamically. * * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ -#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ +#include <cstdint> #include <memory> +#include <limits> #include <unordered_map> namespace ousia { -class TokenDescriptor; +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; /** - * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to - * identify the longest consecutive token in the text. This is equivalent to a - * prefix trie. + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. * - * A token tree is a construct that structures all special tokens a - * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then - * the token tree would look like this: + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: * * \code{*.txt} - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) * \endcode * - * Every node in the token tree is a valid end state that has a $ attached to - * it. During the search algorithm the Tokenizer goes through the tree and - * stores the last valid position. If a character follows that does not lead to - * a new node in the TokenTree the search ends (and starts again at this - * character). The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). + * Where the number indicates the corresponding token descriptor identifier. */ -class DynamicTokenTree { +class TokenTrie { public: /** * Structure used to build the node tree. @@ -73,7 +80,7 @@ public: /** * Type used for the child map. */ - using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>; + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; /** * Map from single characters at the corresponding child nodes. @@ -84,7 +91,7 @@ public: * Reference at the corresponding token descriptor. Set to nullptr if * no token is attached to this node. */ - TokenDescriptor const *descriptor; + TokenTypeId type; /** * Default constructor, initializes the descriptor with nullptr. @@ -105,11 +112,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param descriptor is the descriptor that should be set for this token. + * @param type is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept; + bool registerToken(const std::string &token, TokenTypeId type) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was @@ -128,9 +134,17 @@ public: * @return the attached token descriptor or nullptr if the given token is * not found. */ - const TokenDescriptor* hasToken(const std::string &token) const noexcept; + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } }; } -#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ |