From 0376fdb483464ded73a5c1a8bba97b196af23b6d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Fri, 6 Feb 2015 02:24:07 +0100 Subject: Continue writing parser for plain document format --- src/plugins/plain/DynamicTokenTree.hpp | 136 +++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 src/plugins/plain/DynamicTokenTree.hpp (limited to 'src/plugins/plain/DynamicTokenTree.hpp') diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp new file mode 100644 index 0000000..c5dc4de --- /dev/null +++ b/src/plugins/plain/DynamicTokenTree.hpp @@ -0,0 +1,136 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file DynamicTokenTree.hpp + * + * Class representing a token tree that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ +#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ + +#include +#include + +namespace ousia { + +class TokenDescriptor; + +/** + * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to + * identify the longest consecutive token in the text. This is equivalent to a + * prefix trie. + * + * A token tree is a construct that structures all special tokens a + * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then + * the token tree would look like this: + * + * \code{*.txt} + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * \endcode + * + * Every node in the token tree is a valid end state that has a $ attached to + * it. During the search algorithm the Tokenizer goes through the tree and + * stores the last valid position. If a character follows that does not lead to + * a new node in the TokenTree the search ends (and starts again at this + * character). The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + */ +class DynamicTokenTree { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenDescriptor const *descriptor; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param descriptor is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, + const TokenDescriptor *descriptor) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + const TokenDescriptor* hasToken(const std::string &token) const noexcept; +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ + -- cgit v1.2.3