diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/plugins/plain/DynamicTokenTree.cpp | 115 | ||||
-rw-r--r-- | src/plugins/plain/DynamicTokenTree.hpp | 136 | ||||
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.cpp | 73 | ||||
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 190 | ||||
-rw-r--r-- | test/plugins/plain/DynamicTokenTreeTest.cpp | 94 | ||||
-rw-r--r-- | test/plugins/plain/DynamicTokenizerTest.cpp | 0 |
7 files changed, 610 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 97d2e78..1d73248 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,6 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain + src/plugins/plain/DynamicTokenTree src/plugins/plain/PlainFormatStreamReader ) @@ -323,6 +324,7 @@ IF(TEST) ) ADD_EXECUTABLE(ousia_test_plain + test/plugins/plain/DynamicTokenTreeTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/DynamicTokenTree.cpp new file mode 100644 index 0000000..25f6a54 --- /dev/null +++ b/src/plugins/plain/DynamicTokenTree.cpp @@ -0,0 +1,115 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "DynamicTokenTree.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +DynamicTokenTree::Node::Node() : descriptor(nullptr) {} + +/* Class DynamicTokenTree */ + +bool DynamicTokenTree::registerToken(const std::string &token, + const TokenDescriptor *descriptor) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})).first; + } + node = it->second.get(); + } + + // If the resulting node already has a descriptor set, we're screwed. + if (node->descriptor != nullptr) { + return false; + } + + // Otherwise just set the descriptor to the given descriptor. + node->descriptor = descriptor; + return true; +} + +bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size() - 1; i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another descriptor + Node *child = it->second.get(); + if ((child->descriptor != nullptr || child->children.size() > 1)) { + subtreeRoot = child; + subtreeKey = token[i + 1]; + } + node = child; + } + + // If the leaf node, we cannot delete the subtree. Set the + // descriptor to nullptr instead + if (!node->children.empty()) { + node->descriptor = nullptr; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +const TokenDescriptor *DynamicTokenTree::hasToken( + const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return nullptr; + } + node = it->second.get(); + } + return node->descriptor; +} +} + diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp new file mode 100644 index 0000000..c5dc4de --- /dev/null +++ b/src/plugins/plain/DynamicTokenTree.hpp @@ -0,0 +1,136 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file DynamicTokenTree.hpp + * + * Class representing a token tree that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ +#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ + +#include <memory> +#include <unordered_map> + +namespace ousia { + +class TokenDescriptor; + +/** + * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to + * identify the longest consecutive token in the text. This is equivalent to a + * prefix trie. + * + * A token tree is a construct that structures all special tokens a + * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then + * the token tree would look like this: + * + * \code{*.txt} + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * \endcode + * + * Every node in the token tree is a valid end state that has a $ attached to + * it. During the search algorithm the Tokenizer goes through the tree and + * stores the last valid position. If a character follows that does not lead to + * a new node in the TokenTree the search ends (and starts again at this + * character). The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + */ +class DynamicTokenTree { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenDescriptor const *descriptor; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param descriptor is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, + const TokenDescriptor *descriptor) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + const TokenDescriptor* hasToken(const std::string &token) const noexcept; +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ + diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp new file mode 100644 index 0000000..7690395 --- /dev/null +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -0,0 +1,73 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <memory> +#include <string> +#include <unordered_map> + +#include <core/common/CharReader.hpp> + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +/** + * The TokenDescriptor class is a simple wrapper around a standard string + * containing the character sequence of the token. + */ +class TokenDescriptor { + /** + * The character sequence of the token. + */ + std::string str; + + /** + * Default constructor of the TokenDescriptor class. Used to describe + * special tokens. + */ + TokenDescriptor(); + + /** + * Constructor initializing the character sequence of the token. + */ + TokenDescriptor(const std::string &str) : str(str) {} +}; + +/* Class DynamicTokenizer */ + +void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() +{ + return whitespaceMode; +} + + +/* Constant initializations */ + +static const TokenDescriptor Empty; +static const TokenDescriptor Text; +static const TokenDescriptor* DynamicTokenizer::Empty = &Empty; +static const TokenDescriptor* DynamicTokenizer::Token = &Text; + + +} + diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp new file mode 100644 index 0000000..f7fef13 --- /dev/null +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -0,0 +1,190 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <core/common/Location.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class TokenDescriptor; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Pointer pointing at the TokenDescriptor instance this token corresponds + * to. May be one of the special TokenDescriptors defined as static members + * of the DynamicTokenizer class. + */ + TokenDescriptor const *descriptor; + + /** + * String that was matched. + */ + std::string str; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; +}; + +/** + * Enum specifying the whitespace handling of the DynamicTokenizer class when + * reading non-token text. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. + */ +class DynamicTokenizer { +private: + /** + * Reference at the char reader. + */ + CharReader &reader; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token descriptors. + */ + std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param reader is the CharReader that should be used for reading the + * tokens. + * @param preserveWhitespaces should be set to true if all whitespaces + * should be preserved (for preformated environments). + */ + DynamicTokenizer(CharReader &reader) + : reader(reader), + preserveWhitespaces(preserveWhitespaces), + location(reader.getSourceId()), + empty(true), + hasWhitespace(false) + { + } + + /** + * Destructor of the DynamicTokenizer class. + */ + ~DynamicTokenizer(); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a pointer at a TokenDescriptor which is representative for the + * newly registered token. Returns nullptr if a token with this string + * was already registered. + */ + const TokenDescriptor* registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenDescriptor. + * + * @param descr is a TokenDescriptor that was previously returned by + * registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(const TokenDescriptor *descr); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. + * + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(DynamicToken &token); + + /** + * TokenDescriptor representing an empty token. + */ + static const *TokenDescriptor Empty; + + /** + * TokenDescriptor representing generic text. + */ + static const *TokenDescriptor Text; + +}; + +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp new file mode 100644 index 0000000..5ae414c --- /dev/null +++ b/test/plugins/plain/DynamicTokenTreeTest.cpp @@ -0,0 +1,94 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <gtest/gtest.h> + +#include <plugins/plain/DynamicTokenTree.hpp> + +namespace ousia { + +static const TokenDescriptor *d1 = reinterpret_cast<const TokenDescriptor*>(1); +static const TokenDescriptor *d2 = reinterpret_cast<const TokenDescriptor*>(2); +static const TokenDescriptor *d3 = reinterpret_cast<const TokenDescriptor*>(3); +static const TokenDescriptor *d4 = reinterpret_cast<const TokenDescriptor*>(4); + +TEST(DynamicTokenTree, registerToken) +{ + DynamicTokenTree tree; + + ASSERT_TRUE(tree.registerToken("a", d1)); + ASSERT_TRUE(tree.registerToken("ab", d2)); + ASSERT_TRUE(tree.registerToken("b", d3)); + ASSERT_TRUE(tree.registerToken("hello", d4)); + + ASSERT_FALSE(tree.registerToken("", d1)); + ASSERT_FALSE(tree.registerToken("a", d4)); + ASSERT_FALSE(tree.registerToken("ab", d4)); + ASSERT_FALSE(tree.registerToken("b", d4)); + ASSERT_FALSE(tree.registerToken("hello", d4)); + + ASSERT_EQ(d1, tree.hasToken("a")); + ASSERT_EQ(d2, tree.hasToken("ab")); + ASSERT_EQ(d3, tree.hasToken("b")); + ASSERT_EQ(d4, tree.hasToken("hello")); + ASSERT_EQ(nullptr, tree.hasToken("")); + ASSERT_EQ(nullptr, tree.hasToken("abc")); +} + +TEST(DynamicTokenTree, unregisterToken) +{ + DynamicTokenTree tree; + + ASSERT_TRUE(tree.registerToken("a", d1)); + ASSERT_FALSE(tree.registerToken("a", d4)); + + ASSERT_TRUE(tree.registerToken("ab", d2)); + ASSERT_FALSE(tree.registerToken("ab", d4)); + + ASSERT_TRUE(tree.registerToken("b", d3)); + ASSERT_FALSE(tree.registerToken("b", d4)); + + ASSERT_EQ(d1, tree.hasToken("a")); + ASSERT_EQ(d2, tree.hasToken("ab")); + ASSERT_EQ(d3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(nullptr, tree.hasToken("a")); + ASSERT_EQ(d2, tree.hasToken("ab")); + ASSERT_EQ(d3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(nullptr, tree.hasToken("a")); + ASSERT_EQ(d2, tree.hasToken("ab")); + ASSERT_EQ(nullptr, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(nullptr, tree.hasToken("a")); + ASSERT_EQ(nullptr, tree.hasToken("ab")); + ASSERT_EQ(nullptr, tree.hasToken("b")); +} + + +} + diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/plugins/plain/DynamicTokenizerTest.cpp |