summaryrefslogtreecommitdiff
path: root/src/plugins
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-08 17:54:09 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-08 17:54:09 +0100
commitb9681594380333a0a3f0011b40ac6542e7022d98 (patch)
treeb46f0ea06f5025aeec2dd23abeec378afef25228 /src/plugins
parentfb0922e57f1a5e1fb8bfbe153dc381d5778e3137 (diff)
Deleted DynamicTokenTree class, replaced by TokenTrie
Diffstat (limited to 'src/plugins')
-rw-r--r--src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp)40
-rw-r--r--src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp)84
2 files changed, 68 insertions, 56 deletions
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp
index 8b7bfc2..4a0430b 100644
--- a/src/plugins/plain/DynamicTokenTree.cpp
+++ b/src/plugins/plain/TokenTrie.cpp
@@ -16,18 +16,18 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include "DynamicTokenTree.hpp"
+#include "TokenTrie.hpp"
namespace ousia {
/* Class DynamicTokenTree::Node */
-DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+TokenTrie::Node::Node() : type(EmptyToken) {}
/* Class DynamicTokenTree */
-bool DynamicTokenTree::registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept
+bool TokenTrie::registerToken(const std::string &token,
+ TokenTypeId type) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token,
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- it = node->children.emplace(c, std::unique_ptr<Node>(new Node{}))
- .first;
+ it = node->children.emplace(c, std::make_shared<Node>()).first;
}
node = it->second.get();
}
- // If the resulting node already has a descriptor set, we're screwed.
- if (node->descriptor != nullptr) {
+ // If the resulting node already has a type set, we're screwed.
+ if (node->type != EmptyToken) {
return false;
}
- // Otherwise just set the descriptor to the given descriptor.
- node->descriptor = descriptor;
+ // Otherwise just set the type to the given type.
+ node->type = type;
return true;
}
-bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
{
// We cannot remove empty tokens as we need to access the fist character
// upfront
@@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return false;
}
- // Reset the subtree handler if this node has another descriptor
+ // Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->descriptor != nullptr || node->children.size() > 1) &&
+ if ((node->type != EmptyToken || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
}
}
- // If the node descriptor is already nullptr, we cannot do anything here
- if (node->descriptor == nullptr) {
+ // If the node type is already EmptyToken, we cannot do anything here
+ if (node->type == EmptyToken) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
- // descriptor to nullptr instead
+ // type to EmptyToken instead
if (!node->children.empty()) {
- node->descriptor = nullptr;
+ node->type = EmptyToken;
return true;
}
@@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return true;
}
-const TokenDescriptor *DynamicTokenTree::hasToken(
- const std::string &token) const noexcept
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
{
Node const *node = &root;
for (size_t i = 0; i < token.size(); i++) {
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- return nullptr;
+ return EmptyToken;
}
node = it->second.get();
}
- return node->descriptor;
+ return node->type;
}
}
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp
index c5dc4de..36c2ffa 100644
--- a/src/plugins/plain/DynamicTokenTree.hpp
+++ b/src/plugins/plain/TokenTrie.hpp
@@ -17,54 +17,61 @@
*/
/**
- * @file DynamicTokenTree.hpp
+ * @file TokenTrie.hpp
*
- * Class representing a token tree that can be updated dynamically.
+ * Class representing a token trie that can be updated dynamically.
*
* @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
-#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
+#include <cstdint>
#include <memory>
+#include <limits>
#include <unordered_map>
namespace ousia {
-class TokenDescriptor;
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
/**
- * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
- * identify the longest consecutive token in the text. This is equivalent to a
- * prefix trie.
+ * Token which is not a token.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Token which represents a text token.
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
*
- * A token tree is a construct that structures all special tokens a
- * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
- * the token tree would look like this:
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
+ * three. Then the token tree would look like this:
*
* \code{*.txt}
- * a
- * | \
- * a $
- * | \
- * b c
- * | |
- * $ $
+ * ~ (0)
+ * / \
+ * a (2) b (0)
+ * | |
+ * a (0) a (0)
+ * | |
+ * b (1) c (0)
* \endcode
*
- * Every node in the token tree is a valid end state that has a $ attached to
- * it. During the search algorithm the Tokenizer goes through the tree and
- * stores the last valid position. If a character follows that does not lead to
- * a new node in the TokenTree the search ends (and starts again at this
- * character). The token corresponding to the last valid position is returned.
- *
- * This allows us to uniquely identify the matching token given a certain
- * input text. Note that this is a greedy matching approach that does not
- * work if you're using truly ambiguous tokens (that have the same text).
+ * Where the number indicates the corresponding token descriptor identifier.
*/
-class DynamicTokenTree {
+class TokenTrie {
public:
/**
* Structure used to build the node tree.
@@ -73,7 +80,7 @@ public:
/**
* Type used for the child map.
*/
- using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+ using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
/**
* Map from single characters at the corresponding child nodes.
@@ -84,7 +91,7 @@ public:
* Reference at the corresponding token descriptor. Set to nullptr if
* no token is attached to this node.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* Default constructor, initializes the descriptor with nullptr.
@@ -105,11 +112,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param descriptor is the descriptor that should be set for this token.
+ * @param type is the descriptor that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept;
+ bool registerToken(const std::string &token, TokenTypeId type) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
@@ -128,9 +134,17 @@ public:
* @return the attached token descriptor or nullptr if the given token is
* not found.
*/
- const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+ TokenTypeId hasToken(const std::string &token) const noexcept;
+
+ /**
+ * Returns a reference at the root node to be used for traversing the token
+ * tree.
+ *
+ * @return a reference at the root node.
+ */
+ const Node *getRoot() const noexcept { return &root; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */