From fb0922e57f1a5e1fb8bfbe153dc381d5778e3137 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:53:15 +0100 Subject: Added seekPeekCursor function to CharReader, improved how seeking is handled by adding seekCursor method to Buffer --- src/core/common/CharReader.cpp | 32 ++++++++++++++++++++++++-------- src/core/common/CharReader.hpp | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/core/common/CharReader.cpp b/src/core/common/CharReader.cpp index edcaf76..5b9b1d4 100644 --- a/src/core/common/CharReader.cpp +++ b/src/core/common/CharReader.cpp @@ -329,6 +329,19 @@ ssize_t Buffer::moveCursor(CursorId cursor, ssize_t relativeOffs) } } +size_t Buffer::seekCursor(CursorId cursor, size_t offs) +{ + // Fetch the current offset + const ssize_t currentOffs = offset(cursor); + const ssize_t relativeOffs = offs - currentOffs; + + // Perform the actual seeking, move the peek cursor to the read cursor + const ssize_t reachedOffs = currentOffs + moveCursor(cursor, relativeOffs); + + // Clamp to values larger or equal to zero + return reachedOffs < 0 ? 0 : reachedOffs; +} + bool Buffer::atEnd(Buffer::CursorId cursor) const { const Cursor &c = cursors[cursor]; @@ -504,17 +517,17 @@ size_t CharReader::readRaw(char *buf, size_t size) size_t CharReader::seek(size_t requestedOffset) { - // Fetch the current offset - const ssize_t currentOffs = getOffset(); - const ssize_t relativeOffs = requestedOffset - currentOffs; - - // Perform the actual seeking, move the peek cursor to the read cursor - const ssize_t reachedOffs = currentOffs + buffer->moveCursor(readCursor, relativeOffs); + const size_t res = buffer->seekCursor(readCursor, requestedOffset); buffer->copyCursor(readCursor, peekCursor); coherent = true; + return res; +} - // Clamp to values larger or equal to zero - return reachedOffs < 0 ? 0 : reachedOffs; +size_t CharReader::seekPeekCursor(size_t requestedOffset) +{ + const size_t res = buffer->seekCursor(peekCursor, requestedOffset); + coherent = (res == getOffset()); + return res; } bool CharReader::atEnd() const { return buffer->atEnd(readCursor); } @@ -526,6 +539,9 @@ size_t CharReader::getOffset() const size_t CharReader::getPeekOffset() const { + if (coherent) { + return getOffset(); + } return buffer->offset(peekCursor) + offs; } diff --git a/src/core/common/CharReader.hpp b/src/core/common/CharReader.hpp index cbd7b74..cbfeaf2 100644 --- a/src/core/common/CharReader.hpp +++ b/src/core/common/CharReader.hpp @@ -301,6 +301,15 @@ public: */ ssize_t moveCursor(CursorId cursor, ssize_t relativeOffs); + /** + * Moves the cursor to the given position. + * + * @param cursor is the cursor that should be moved. + * @param offs is the offset to which the cursor should be moved. + * @return the actual location that was reached. + */ + size_t seekCursor(CursorId cursor, size_t offs); + /** * Returns the current byte offset of the given cursor relative to the * beginning of the stream. @@ -532,6 +541,17 @@ public: */ size_t seek(size_t requestedOffset); + /** + * Moves the peek cursor to the requested offset. Returns the offset that + * was actually reached. + * + * @param requestedOffset is the requested offset. This offset may no longer + * be reachable by the CharReader. + * @return the actually reached offset. The operation was successful if + * the requested and reached offsets are equal. + */ + size_t seekPeekCursor(size_t requestedOffset); + /** * Returns true if there are no more characters as the stream was closed.
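To make the new seeking behaviour concrete, here is a minimal usage sketch (hypothetical driver code written for this note, not part of the patch; it builds a CharReader from a string the way the unit tests do):

    CharReader reader{"example input"};
    char c;
    reader.peek(c);            // advances the peek cursor only
    reader.seekPeekCursor(0);  // rewinds the peek cursor; the read offset is untouched
    reader.seek(5);            // moves the read cursor and re-synchronizes the peek cursor
    // After seek(), the cursors are coherent again, so
    // reader.getPeekOffset() == reader.getOffset() holds.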
* -- cgit v1.2.3 From b9681594380333a0a3f0011b40ac6542e7022d98 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:54:09 +0100 Subject: Deleted DynamicTokenTree class, replaced by TokenTrie --- CMakeLists.txt | 4 +- src/plugins/plain/DynamicTokenTree.cpp | 121 ---------------------- src/plugins/plain/DynamicTokenTree.hpp | 136 ------------------------- src/plugins/plain/TokenTrie.cpp | 119 ++++++++++++++++++++++ src/plugins/plain/TokenTrie.hpp | 150 ++++++++++++++++++++++++++++ test/plugins/plain/DynamicTokenTreeTest.cpp | 94 ----------------- test/plugins/plain/TokenTrieTest.cpp | 92 +++++++++++++++++ 7 files changed, 363 insertions(+), 353 deletions(-) delete mode 100644 src/plugins/plain/DynamicTokenTree.cpp delete mode 100644 src/plugins/plain/DynamicTokenTree.hpp create mode 100644 src/plugins/plain/TokenTrie.cpp create mode 100644 src/plugins/plain/TokenTrie.hpp delete mode 100644 test/plugins/plain/DynamicTokenTreeTest.cpp create mode 100644 test/plugins/plain/TokenTrieTest.cpp (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d73248..f9b224d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain - src/plugins/plain/DynamicTokenTree + src/plugins/plain/TokenTrie src/plugins/plain/PlainFormatStreamReader ) @@ -324,7 +324,7 @@ IF(TEST) ) ADD_EXECUTABLE(ousia_test_plain - test/plugins/plain/DynamicTokenTreeTest + test/plugins/plain/TokenTrieTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/DynamicTokenTree.cpp deleted file mode 100644 index 8b7bfc2..0000000 --- a/src/plugins/plain/DynamicTokenTree.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "DynamicTokenTree.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -DynamicTokenTree::Node::Node() : descriptor(nullptr) {} - -/* Class DynamicTokenTree */ - -bool DynamicTokenTree::registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::unique_ptr(new Node{})) - .first; - } - node = it->second.get(); - } - - // If the resulting node already has a descriptor set, we're screwed. - if (node->descriptor != nullptr) { - return false; - } - - // Otherwise just set the descriptor to the given descriptor. 
- node->descriptor = descriptor; - return true; -} - -bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another descriptor - node = it->second.get(); - if ((node->descriptor != nullptr || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node descriptor is already nullptr, we cannot do anything here - if (node->descriptor == nullptr) { - return false; - } - - // If the target node has children, we cannot delete the subtree. Set the - // descriptor to nullptr instead - if (!node->children.empty()) { - node->descriptor = nullptr; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -const TokenDescriptor *DynamicTokenTree::hasToken( - const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return nullptr; - } - node = it->second.get(); - } - return node->descriptor; -} -} - diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp deleted file mode 100644 index c5dc4de..0000000 --- a/src/plugins/plain/DynamicTokenTree.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file DynamicTokenTree.hpp - * - * Class representing a token tree that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ -#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ - -#include -#include - -namespace ousia { - -class TokenDescriptor; - -/** - * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to - * identify the longest consecutive token in the text. This is equivalent to a - * prefix trie. - * - * A token tree is a construct that structures all special tokens a - * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then - * the token tree would look like this: - * - * \code{*.txt} - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * \endcode - * - * Every node in the token tree is a valid end state that has a $ attached to - * it. 
During the search algorithm the Tokenizer goes through the tree and - * stores the last valid position. If a character follows that does not lead to - * a new node in the TokenTree the search ends (and starts again at this - * character). The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - */ -class DynamicTokenTree { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenDescriptor const *descriptor; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param descriptor is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, - const TokenDescriptor *descriptor) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. - * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - const TokenDescriptor* hasToken(const std::string &token) const noexcept; -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ - diff --git a/src/plugins/plain/TokenTrie.cpp b/src/plugins/plain/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/plugins/plain/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class TokenTrie::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class TokenTrie */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, the token has already + // been registered and registration fails. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the first character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree root if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/plugins/plain/TokenTrie.hpp b/src/plugins/plain/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/plugins/plain/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically.
+ * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include <cstdint> +#include <limits> +#include <memory> +#include <unordered_map> + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token type id representing the absence of a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token type id representing a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token trie would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (3) + * \endcode + * + * Where the number indicates the corresponding token type id, with zero + * denoting that no token ends at this node. + */ +class TokenTrie { +public: + /** + * Structure used to build the trie. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters to the corresponding child nodes. + */ + ChildMap children; + + /** + * Id of the token type attached to this node. Set to EmptyToken if + * no token ends at this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the type with EmptyToken. + */ + Node(); + }; + +private: + /** + * Root node of the internal token trie. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the token type id that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token trie. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Checks whether the given token exists within the TokenTrie. This + * function is mostly intended for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token type id or EmptyToken if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a pointer at the root node to be used for traversing the token + * trie. + * + * @return a pointer at the root node.
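+ *
+ * An illustrative lookup loop (a sketch added for this write-up, not part
+ * of the original patch) that walks the trie character by character:
+ *
+ * \code{.cpp}
+ * const TokenTrie::Node *node = trie.getRoot();
+ * for (char c : std::string{"ab"}) {
+ *     auto it = node->children.find(c);
+ *     if (it == node->children.end()) {
+ *         break; // no registered token starts with this prefix
+ *     }
+ *     node = it->second.get(); // descend to the child node
+ * }
+ * // node->type now holds the id registered for "ab", or EmptyToken.
+ * \endcode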
+ */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/test/plugins/plain/DynamicTokenTreeTest.cpp b/test/plugins/plain/DynamicTokenTreeTest.cpp deleted file mode 100644 index 5ae414c..0000000 --- a/test/plugins/plain/DynamicTokenTreeTest.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -static const TokenDescriptor *d1 = reinterpret_cast(1); -static const TokenDescriptor *d2 = reinterpret_cast(2); -static const TokenDescriptor *d3 = reinterpret_cast(3); -static const TokenDescriptor *d4 = reinterpret_cast(4); - -TEST(DynamicTokenTree, registerToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_TRUE(tree.registerToken("hello", d4)); - - ASSERT_FALSE(tree.registerToken("", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - ASSERT_FALSE(tree.registerToken("b", d4)); - ASSERT_FALSE(tree.registerToken("hello", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - ASSERT_EQ(d4, tree.hasToken("hello")); - ASSERT_EQ(nullptr, tree.hasToken("")); - ASSERT_EQ(nullptr, tree.hasToken("abc")); -} - -TEST(DynamicTokenTree, unregisterToken) -{ - DynamicTokenTree tree; - - ASSERT_TRUE(tree.registerToken("a", d1)); - ASSERT_FALSE(tree.registerToken("a", d4)); - - ASSERT_TRUE(tree.registerToken("ab", d2)); - ASSERT_FALSE(tree.registerToken("ab", d4)); - - ASSERT_TRUE(tree.registerToken("b", d3)); - ASSERT_FALSE(tree.registerToken("b", d4)); - - ASSERT_EQ(d1, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(d3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(d2, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(nullptr, tree.hasToken("a")); - ASSERT_EQ(nullptr, tree.hasToken("ab")); - ASSERT_EQ(nullptr, tree.hasToken("b")); -} - - -} - diff --git a/test/plugins/plain/TokenTrieTest.cpp b/test/plugins/plain/TokenTrieTest.cpp new file mode 100644 index 0000000..d378fdf --- /dev/null +++ b/test/plugins/plain/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free 
Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); +} +} + -- cgit v1.2.3 From 4854509f8add1e2ff167623fb0e8d4216d9d6023 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 17:54:27 +0100 Subject: Implemented DynamicTokenizer and unit tests --- CMakeLists.txt | 2 + src/plugins/plain/DynamicTokenizer.cpp | 514 ++++++++++++++++++++++++++-- src/plugins/plain/DynamicTokenizer.hpp | 154 ++++++--- test/plugins/plain/DynamicTokenizerTest.cpp | 416 ++++++++++++++++++++++ 4 files changed, 1016 insertions(+), 70 deletions(-) (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index f9b224d..867ca6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,6 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml ) ADD_LIBRARY(ousia_plain + src/plugins/plain/DynamicTokenizer src/plugins/plain/TokenTrie src/plugins/plain/PlainFormatStreamReader ) @@ -325,6 +326,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_plain test/plugins/plain/TokenTrieTest + test/plugins/plain/DynamicTokenizerTest test/plugins/plain/PlainFormatStreamReaderTest ) diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp index 
7690395..a8f2317 100644 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -17,57 +17,529 @@ */ #include -#include -#include +#include #include +#include +#include #include "DynamicTokenizer.hpp" namespace ousia { +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t textLength, size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokenLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the TokenMatch instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source id of this file. + */ + void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<std::string> &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token and whether it + // is longer than the current token. If yes, replace the current token.
+ node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/* Internal class TextHandlerBase */ + +/** + * Base class used for those classes that may be used as TextHandler in the + * DynamicTokenizer::next function. + */ +class TextHandlerBase { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the TextHandlerBase base class. Initializes the start and + * end position with zeros. + */ + TextHandlerBase() : textStart(0), textEnd(0) {} + + /** + * Transforms the given token match into a text token containing the + * extracted text. + * + * @param match is the token match to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ + void buildTextToken(TokenMatch &match, SourceId sourceId) + { + if (match.hasMatch()) { + match.token.content = + std::string{textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, textStart, match.textEnd}; + } else { + match.token.content = std::string{textBuf.data(), textBuf.size()}; + match.token.location = SourceLocation{sourceId, textStart, textEnd}; + } + match.token.type = TextToken; + } + + /** + * Returns true if this text handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } +}; + +/* Internal class PreservingTextHandler */ + +/** + * The PreservingTextHandler class preserves all characters unmodified, + * including whitespace characters. + */ +class PreservingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/* Internal class TrimmingTextHandler */ + /** - * The TokenDescriptor class is a simple wrapper around a standard string - * containing the character sequence of the token. + * The TrimmingTextHandler class trims all whitespace characters at the + * beginning and the end of a text section but leaves all other characters + * unmodified, including inner whitespace characters. */ -class TokenDescriptor { +class TrimmingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + /** - * The character sequence of the token. + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. */ - std::string str; + std::vector<char> whitespaceBuf; /** - * Default constructor of the TokenDescriptor class.
Used to describe - special tokens. */ - TokenDescriptor(); + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the beginning and end of the text. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. */ - TokenDescriptor(const std::string &str) : str(str) {} + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/* Internal class CollapsingTextHandler */ + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduces multiple whitespace characters to a single blank. + */ +class CollapsingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; /** + * Flag set to true if a whitespace character was reached. */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; +} /* Class DynamicTokenizer */ -void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode) +DynamicTokenizer::DynamicTokenizer(CharReader &reader, + WhitespaceMode whitespaceMode) + : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0) { - whitespaceMode = mode; } -WhitespaceMode DynamicTokenizer::getWhitespaceMode() +template <typename TextHandler, bool read> +bool DynamicTokenizer::next(DynamicToken &token) { - return whitespaceMode; + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector<TokenLookup> lookups; + std::vector<TokenLookup> nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token trie + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups,
match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incoming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && + (!match.hasMatch() || match.textLength > 0)) { + textHandler.buildTextToken(match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, true>(token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, true>(token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, true>(token); + } + return false; +} + +bool DynamicTokenizer::peek(DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, false>(token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, false>(token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, false>(token); + } + return false; } +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } -/* Constant initializations */ + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; -static const TokenDescriptor Empty; -static const TokenDescriptor Text; -static const TokenDescriptor* DynamicTokenizer::Empty = &Empty; -static const TokenDescriptor* DynamicTokenizer::Token = &Text; + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string(); + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId
type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next<PreservingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<PreservingTextHandler, false>( + DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, false>( + DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, true>( + DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, false>( + DynamicToken &token); } diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index f7fef13..760bebf 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -28,34 +28,63 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#include +#include +#include + #include +#include "TokenTrie.hpp" + namespace ousia { // Forward declarations class CharReader; -class TokenDescriptor; /** * The DynamicToken structure describes a token discovered by the Tokenizer. */ struct DynamicToken { /** - * Pointer pointing at the TokenDescriptor instance this token corresponds - * to. May be one of the special TokenDescriptors defined as static members - * of the DynamicTokenizer class. + * Id of the type of this token. */ - TokenDescriptor const *descriptor; + TokenTypeId type; /** * String that was matched. */ - std::string str; + std::string content; /** * Location from which the string was extracted. */ SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param type represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type. + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} }; /** @@ -64,43 +93,70 @@ struct DynamicToken { */ enum class WhitespaceMode { /** - * Preserves all whitespaces as they are found in the source file. - */ + * Preserves all whitespaces as they are found in the source file. + */ PRESERVE, /** - * Trims whitespace at the beginning and the end of the found text. - */ + * Trims whitespace at the beginning and the end of the found text. + */ TRIM, /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ COLLAPSE }; /** * The DynamicTokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * input. */ class DynamicTokenizer { private: /** - * Reference at the char reader.
+ * CharReader instance from which the tokens should be read. */ CharReader &reader; + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + /** * Flag defining whether whitespaces should be preserved or not. */ WhitespaceMode whitespaceMode; /** - * Vector containing all registered token descriptors. + * Vector containing all registered token types. + */ + std::vector<std::string> tokens; + + /** + * Next index in the tokens list where to search for a new token id. */ - std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combinations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(DynamicToken &token); public: /** @@ -108,43 +164,44 @@ public: * * @param reader is the CharReader that should be used for reading the * tokens. - * @param preserveWhitespaces should be set to true if all whitespaces - * should be preserved (for preformated environments). - */ - DynamicTokenizer(CharReader &reader) - : reader(reader), - preserveWhitespaces(preserveWhitespaces), - location(reader.getSourceId()), - empty(true), - hasWhitespace(false) - { - } - - /** - * Destructor of the DynamicTokenizer class. + * @param whitespaceMode specifies how whitespace should be handled. */ - ~DynamicTokenizer(); + DynamicTokenizer(CharReader &reader, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer at a * TokenDescriptor that will be used to reference the newly created token. * * @param token is the token string that should be registered. - * @return a pointer at a TokenDescriptor which is representative for the - * newly registered token. Returns nullptr if a token with this string - * was already registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occurred. */ - const TokenDescriptor* registerToken(const std::string &token); + TokenTypeId registerToken(const std::string &token); /** - * Unregisters the token belonging to the given TokenDescriptor. + * Unregisters the token belonging to the given TokenTypeId. * - * @param descr is a TokenDescriptor that was previously returned by - * registerToken. + * @param type is the token type that should be unregistered. The + * TokenTypeId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. * because the given TokenDescriptor was already unregistered). */ - bool unregisterToken(const TokenDescriptor *descr); + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId or + * an empty string if an invalid TokenTypeId is given. + * + * @param type is the TokenTypeId for which the corresponding token + * string should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); /** * Sets the whitespace mode.
@@ -173,17 +230,16 @@ public: bool read(DynamicToken &token); /** - * TokenDescriptor representing an empty token. - */ - static const *TokenDescriptor Empty; - - /** - * TokenDescriptor representing generic text. + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. */ - static const *TokenDescriptor Text; - + bool peek(DynamicToken &token); }; - } #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp index e69de29..63fa466 100644 --- a/test/plugins/plain/DynamicTokenizerTest.cpp +++ b/test/plugins/plain/DynamicTokenizerTest.cpp @@ -0,0 +1,416 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { + +TEST(DynamicTokenizer, tokenRegistration) +{ + CharReader reader{"test"}; + DynamicTokenizer tokenizer{reader}; + + ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); + + ASSERT_EQ(0U, tokenizer.registerToken("a")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); + ASSERT_EQ("a", tokenizer.getTokenString(0U)); + + ASSERT_EQ(1U, tokenizer.registerToken("b")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); + ASSERT_EQ("b", tokenizer.getTokenString(1U)); + + ASSERT_EQ(2U, tokenizer.registerToken("c")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); + ASSERT_EQ("c", tokenizer.getTokenString(2U)); + + ASSERT_TRUE(tokenizer.unregisterToken(1U)); + ASSERT_FALSE(tokenizer.unregisterToken(1U)); + ASSERT_EQ("", tokenizer.getTokenString(1U)); + + ASSERT_EQ(1U, tokenizer.registerToken("d")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); + ASSERT_EQ("d", tokenizer.getTokenString(1U)); +} + +TEST(DynamicTokenizer, textTokenPreserveWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(36U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + 
ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, textTokenTrimWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, textTokenCollapseWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); + } +} + +TEST(DynamicTokenizer, simpleReadToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer{reader}; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ(':', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ('t', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + + char c; + ASSERT_FALSE(reader.peek(c)); + } +} + +TEST(DynamicTokenizer, simplePeekToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer{reader}; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + 
ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(5U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(6U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(11U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } +} + +TEST(DynamicTokenizer, ambiguousTokens) +{ + CharReader reader{"abc"}; + DynamicTokenizer tokenizer(reader); + + TokenTypeId t1 = tokenizer.registerToken("abd"); + TokenTypeId t2 = tokenizer.registerToken("bc"); + + ASSERT_EQ(0U, t1); + ASSERT_EQ(1U, t2); + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("a", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + + ASSERT_TRUE(tokenizer.read(token)); + + ASSERT_EQ(t2, token.type); + ASSERT_EQ("bc", token.content); + + loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(3U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(token)); +} + +TEST(DynamicTokenizer, commentTestWhitespacePreserve) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test ", SourceLocation{0, 5, 10}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + 
EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(t)); +} + +TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test", SourceLocation{0, 5, 9}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(t)); +} + +} + -- cgit v1.2.3 From f713b1d393230e7083727d457623fdac878eb248 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:48:07 +0100 Subject: DynamicTokenizer now gets the reader as a parameter to read and peek -- the beauty of this tokenizer is that it has no internal state depending on the reader, so it doesn't need to hold a reference to it --- src/plugins/plain/DynamicTokenizer.cpp | 35 ++++++------- src/plugins/plain/DynamicTokenizer.hpp | 22 ++++---- test/plugins/plain/DynamicTokenizerTest.cpp | 81 ++++++++++++++--------------- 3 files changed, 67 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp index a8f2317..f2cfcd1 100644 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -345,14 +345,13 @@ public: /* Class DynamicTokenizer */ -DynamicTokenizer::DynamicTokenizer(CharReader &reader, - WhitespaceMode whitespaceMode) - : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0) +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) { } template -bool DynamicTokenizer::next(DynamicToken &token) +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -437,28 +436,28 @@ bool DynamicTokenizer::next(DynamicToken &token) return match.hasMatch(); } -bool DynamicTokenizer::read(DynamicToken &token) +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(token); + return next(reader, token); case WhitespaceMode::COLLAPSE: - return next(token); + return next(reader, token); } return false; } -bool DynamicTokenizer::peek(DynamicToken &token) +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(token); + return next(reader, token); case 
WhitespaceMode::COLLAPSE: - return next(token); + return next(reader, token); } return false; } @@ -530,16 +529,16 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } /* Explicitly instantiate all possible instantiations of the "next" member function */ template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); template bool DynamicTokenizer::next( - DynamicToken &token); + CharReader &reader, DynamicToken &token); } diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index 760bebf..0b4dd39 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -118,11 +118,6 @@ enum class WhitespaceMode { */ class DynamicTokenizer { private: - /** - * CharReader instance from which the tokens should be read. - */ - CharReader &reader; - /** * Internally used token trie. This object holds all registered tokens. */ @@ -151,23 +146,22 @@ private: * @tparam TextHandler is the type to be used for the textHandler instance. * @tparam read specifies whether the function should start from and advance * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. * @param token is the token structure into which the token information * should be written. * @return false if the end of the stream has been reached, true otherwise. */ template - bool next(DynamicToken &token); + bool next(CharReader &reader, DynamicToken &token); public: /** * Constructor of the DynamicTokenizer class. * - * @param reader is the CharReader that should be used for reading the - * tokens. * @param whitespaceMode specifies how whitespace should be handled. */ - DynamicTokenizer(CharReader &reader, - WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer to a @@ -222,23 +216,27 @@ public: * Reads a new token from the CharReader and stores it in the given * DynamicToken instance. * + * @param reader is the CharReader instance from which the data should be + * read. * @param token is a reference to the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(DynamicToken &token); + bool read(CharReader &reader, DynamicToken &token); /** * The peek method does not advance the read position of the char reader, * but reads the next token from the current char reader peek position. * + * @param reader is the CharReader instance from which the data should be + * read. * @param token is a reference to the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. 
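+ * The peeked-ahead position can afterwards be committed with + * CharReader::consumePeek() or discarded with CharReader::resetPeek().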
*/ - bool peek(DynamicToken &token); + bool peek(CharReader &reader, DynamicToken &token); }; } diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp index 63fa466..5183fdd 100644 --- a/test/plugins/plain/DynamicTokenizerTest.cpp +++ b/test/plugins/plain/DynamicTokenizerTest.cpp @@ -25,8 +25,7 @@ namespace ousia { TEST(DynamicTokenizer, tokenRegistration) { - CharReader reader{"test"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); @@ -57,10 +56,10 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ(" this \t is only a \n\n test text ", token.content); @@ -68,17 +67,17 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(36U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE}; + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -86,7 +85,7 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -96,10 +95,10 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -107,17 +106,17 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -125,7 +124,7 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -135,10 +134,10 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer 
tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -146,17 +145,17 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -164,21 +163,21 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } TEST(DynamicTokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -194,7 +193,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -210,7 +209,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -227,14 +226,14 @@ TEST(DynamicTokenizer, simpleReadToken) TEST(DynamicTokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -248,7 +247,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -262,7 +261,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -276,7 +275,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -290,7 +289,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -304,7 +303,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", 
token.content); @@ -320,7 +319,7 @@ TEST(DynamicTokenizer, simplePeekToken) TEST(DynamicTokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer(reader); + DynamicTokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -329,7 +328,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("a", token.content); @@ -338,7 +337,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(t2, token.type); ASSERT_EQ("bc", token.content); @@ -347,7 +346,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } TEST(DynamicTokenizer, commentTestWhitespacePreserve) @@ -355,7 +354,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -371,14 +370,14 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } TEST(DynamicTokenizer, commentTestWhitespaceCollapse) @@ -386,7 +385,7 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -402,14 +401,14 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } } -- cgit v1.2.3 From f066b4887f6f2896fe602f14ede9c02a9f5a7e1a Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:48:27 +0100 Subject: Added isIdentifierStart function --- src/core/common/Utils.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src') diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index fa3788a..457d446 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -57,6 +57,14 @@ public: return isAlphabetic(c) || isNumeric(c); } + /** + * Returns true if the given character is in [A-Za-z_] + */ + static 
bool isIdentifierStart(const char c) + { + return isAlphabetic(c) || (c == '_'); + } + /** * Returns true if the given character is in [A-Za-z_][A-Za-z0-9_-]* */ -- cgit v1.2.3 From 51f09f4faa7cd4b6a0576758881d322e31e896ba Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 18:49:02 +0100 Subject: Ported PlainFormatStreamReader to DynamicTokenizer --- src/plugins/plain/PlainFormatStreamReader.cpp | 279 ++++++++------------------ src/plugins/plain/PlainFormatStreamReader.hpp | 34 +++- 2 files changed, 116 insertions(+), 197 deletions(-) (limited to 'src') diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index 15ca403..f0721a0 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -16,9 +16,6 @@ along with this program. If not, see . */ -#include -#include - #include #include #include @@ -27,123 +24,40 @@ namespace ousia { -/* Internally used types, protected from spilling the exports by a namespace */ - namespace { -/** - * Enum used to specify the state of the parseBlockComment state machine. - */ -enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT }; - -/** - * Class taking care of recording plain text data found withing the file. - */ -class DataHandler { -private: - /** - * Const reference at the reader, used for reading the current location. - */ - const CharReader &reader; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - const bool preserveWhitespaces; +struct DataHandler { + std::vector buf; - /** - * Current source range of the data in the buffer. - */ - SourceLocation location; + SourceOffset start; + SourceOffset end; - /** - * Current buffer containing all read characters. - */ - std::stringstream buffer; + DataHandler() : start(0), end(0) {} - /** - * Set to false, once a non-whitespace character was reached. - */ - bool empty; + bool isEmpty() { return buf.empty(); } - /** - * Set to true if a whitespace was found -- these are normalized to a single - * space. - */ - bool hasWhitespace; - -public: - /** - * Constructor of the DataHandler class. - * - * @param reader is the CharReader that should be used for reading the data - * location. - * @param preserveWhitespaces should be set to true if all whitespaces - * should be preserved (for preformated environments). - */ - DataHandler(const CharReader &reader, bool preserveWhitespaces = false) - : reader(reader), - preserveWhitespaces(preserveWhitespaces), - location(reader.getSourceId()), - empty(true), - hasWhitespace(false) + void append(char c, SourceOffset charStart, SourceOffset charEnd) { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; } - /** - * Appends the given character to the internal buffer. - * - * @param c is the character that should be appended. - * @param wasEscaped is set to true if the character was escaped (prepended - * with a backslash), this allows whitespace characters to be explicitly - * included. 
- */ - void append(char c, bool wasEscaped = false) + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) { - // Check whether the character is a whitespace - const bool isWhitespace = - !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c); - - // Trim leading and trailing whitespaces - if (isWhitespace) { - if (!empty) { - hasWhitespace = true; - } - } else { - // Compress whitespaces to a single space - if (hasWhitespace) { - buffer << ' '; - hasWhitespace = false; - } - - // Append the character - buffer << c; - - // Update the "empty" flag and set the start and end offset - if (empty) { - location.setStart(reader.getOffset()); - empty = false; - } - location.setEnd(reader.getPeekOffset()); + if (isEmpty()) { + start = stringStart; } + std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); + end = stringEnd; } - /** - * Returns true if no non-whitespace character has been found until now. - * - * @return true if the internal buffer is still empty. - */ - bool isEmpty() { return empty; } - - /** - * Returns a variant containg the read data and its location. - * - * @return a variant with a string value containing the read data and the - * location being set to - */ - Variant getData() + Variant toVariant(SourceId sourceId) { - Variant res = Variant::fromString(buffer.str()); - res.setLocation(location); + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); return res; } }; @@ -153,35 +67,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader, Logger &logger) : reader(reader), logger(logger), fieldIdx(0) { + tokenBackslash = tokenizer.registerToken("\\"); + tokenLinebreak = tokenizer.registerToken("\n"); + tokenLineComment = tokenizer.registerToken("%"); + tokenBlockCommentStart = tokenizer.registerToken("%{"); + tokenBlockCommentEnd = tokenizer.registerToken("}%"); } -/* Comment handling */ - void PlainFormatStreamReader::parseBlockComment() { - char c; - BlockCommentState state = BlockCommentState::DEFAULT; - while (reader.read(c)) { - switch (state) { - case BlockCommentState::DEFAULT: - if (c == '%') { - state = BlockCommentState::HAS_PERCENT; - } else if (c == '}') { - state = BlockCommentState::HAS_CURLY_CLOSE; - } - break; - case BlockCommentState::HAS_PERCENT: - if (c == '{') { - parseBlockComment(); - } - state = BlockCommentState::DEFAULT; - break; - case BlockCommentState::HAS_CURLY_CLOSE: - if (c == '%') { - return; - } - state = BlockCommentState::DEFAULT; - break; + DynamicToken token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == tokenBlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == tokenBlockCommentStart) { + depth++; } } @@ -189,102 +94,84 @@ void PlainFormatStreamReader::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void PlainFormatStreamReader::parseComment() +void PlainFormatStreamReader::parseLineComment() { char c; - bool first = true; reader.consumePeek(); while (reader.read(c)) { - // Continue parsing a block comment if a '{' is found - if (c == '{' && first) { - parseBlockComment(); - return; - } if (c == '\n') { return; } - first = false; } } -/* Top level parse function */ - -static const std::unordered_set EscapeableCharacters{'\\', '<', '>', - '{', '}', '%'}; - PlainFormatStreamReader::State PlainFormatStreamReader::parse() { // Macro (sorry for that) used for checking whether there is data to issue, and // if 
yes, aborting the loop, allowing for a reentry on a later parse call by // resetting the peek cursor -#define CHECK_ISSUE_DATA() \ - { \ - if (!dataHandler.isEmpty()) { \ - reader.resetPeek(); \ - abort = true; \ - break; \ - } \ +#define CHECK_ISSUE_DATA() \ + { \ + if (!dataHandler.isEmpty()) { \ + reader.resetPeek(); \ + abort = true; \ + break; \ + } \ } - // Data handler - DataHandler dataHandler(reader); + // Handler for incoming data + DataHandler dataHandler; // Variable set to true if the parser loop should be left bool abort = false; - // Happily add characters to the dataHandler and handle escaping until a - // special character is reached. Then go to a specialiced parsing routine - char c; - while (!abort && reader.peek(c)) { - switch (c) { - case '\\': - reader.peek(c); - // Check whether this backslash just escaped some special or - // whitespace character or was the beginning of a command - if (EscapeableCharacters.count(c) == 0 && - !Utils::isWhitespace(c)) { - CHECK_ISSUE_DATA(); - // TODO: Parse command (starting from the backslash) - return State::COMMAND; - } - // A character was escaped, add it to the buffer, with the - // wasEscaped flag set to true - dataHandler.append(c, true); - break; - case '<': - // TODO: Annotations - break; - case '>': - // TODO: Annotations - break; - case '{': - // TODO: Issue start of field - break; - case '}': - // TODO: Issue end of field - case '%': - CHECK_ISSUE_DATA(); - parseComment(); - break; - case '\n': + // Read tokens until the outer loop should be left + DynamicToken token; + while (!abort && tokenizer.peek(reader, token)) { + // Check whether this backslash just escaped some special or + // whitespace character or was the beginning of a command + if (token.type == tokenBackslash) { + // Check whether this character could be the start of a command + char c; + reader.consumePeek(); + reader.peek(c); + if (Utils::isIdentifierStart(c)) { CHECK_ISSUE_DATA(); - reader.consumePeek(); - return State::LINEBREAK; - default: - dataHandler.append(c, false); + // TODO: Parse a command + return State::COMMAND; + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + dataHandler.append(c, token.location.getStart(), + reader.getPeekOffset()); + } else if (token.type == tokenLineComment) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + parseLineComment(); + } else if (token.type == tokenBlockCommentStart) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + parseBlockComment(); + } else if (token.type == tokenLinebreak) { + CHECK_ISSUE_DATA(); + reader.consumePeek(); + return State::LINEBREAK; + } else if (token.type == TextToken) { + dataHandler.append(token.content, token.location.getStart(), + token.location.getEnd()); } // Consume the peeked character if we did not abort, otherwise abort if (!abort) { reader.consumePeek(); - } else { - break; } } // Send out pending output data, otherwise we are at the end of the stream if (!dataHandler.isEmpty()) { - data = dataHandler.getData(); + data = dataHandler.toVariant(reader.getSourceId()); return State::DATA; } return State::END; diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp index 1a136cd..b2ea378 100644 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ b/src/plugins/plain/PlainFormatStreamReader.hpp @@ -31,6 +31,8 @@ #include +#include "DynamicTokenizer.hpp" + namespace ousia { // Forward 
declarations @@ -122,6 +124,11 @@ private: */ Logger &logger; + /** + * Tokenizer instance used to read individual tokens from the text. + */ + DynamicTokenizer tokenizer; + /** * Variant containing the current command name (is always a string variant, * but additionally contains the correct location of the name). @@ -140,6 +147,31 @@ private: */ Variant data; + /** + * Id of the backslash token. + */ + TokenTypeId tokenBackslash; + + /** + * Id of the linebreak token. + */ + TokenTypeId tokenLinebreak; + + /** + * Id of the line comment token. + */ + TokenTypeId tokenLineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId tokenBlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId tokenBlockCommentEnd; + /** * Contains the field index of the current command. */ @@ -153,7 +185,7 @@ private: /** * Function used internally to parse a generic comment. */ - void parseComment(); + void parseLineComment(); public: /** -- cgit v1.2.3 From f6e7859a835375c25226719a46df99ec11037599 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 8 Feb 2015 19:01:34 +0100 Subject: added some comments --- src/plugins/plain/PlainFormatStreamReader.cpp | 51 ++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index f0721a0..498cd43 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -25,16 +25,49 @@ namespace ousia { namespace { -struct DataHandler { + +/** + * Class used internally to collect data issued via the "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ std::vector buf; + /** + * Start location of the character data. + */ SourceOffset start; + + /** + * End location of the character data. + */ SourceOffset end; +public: + + /** + * Default constructor, initializes start and end with zeros. + */ DataHandler() : start(0), end(0) {} + /** + * Returns true if the internal buffer is empty. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ bool isEmpty() { return buf.empty(); } + /** + * Appends a single character to the internal buffer. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ void append(char c, SourceOffset charStart, SourceOffset charEnd) { if (isEmpty()) { @@ -44,6 +77,13 @@ struct DataHandler { end = charEnd; } + /** + * Appends a string to the internal buffer. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ void append(const std::string &s, SourceOffset stringStart, SourceOffset stringEnd) { @@ -54,6 +94,15 @@ struct DataHandler { end = stringEnd; } + /** + * Converts the internal buffer to a variant with attached location + * information. + * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ Variant toVariant(SourceId sourceId) { Variant res = Variant::fromString(std::string(buf.data(), buf.size())); -- cgit v1.2.3
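
A minimal usage sketch of the reworked DynamicTokenizer API from the commits above, with the reader passed to read() and peek() instead of the constructor. The class, method and type names are taken from the diffs; the include paths and the main() scaffold are assumptions for illustration only:

#include <iostream>

#include "core/common/CharReader.hpp"         // assumed include paths
#include "plugins/plain/DynamicTokenizer.hpp"

using namespace ousia;

int main()
{
	// The tokenizer holds no state that depends on the reader, so a
	// single instance can be reused across any number of CharReaders.
	DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE};
	const TokenTypeId colon = tokenizer.registerToken(":");

	CharReader reader{"test1:test2"};
	DynamicToken token;

	// Peek ahead without advancing the read cursor...
	while (tokenizer.peek(reader, token) && token.type != colon) {
		// ...only the peek position moves here, getOffset() stays at 0.
	}

	// Rewind the peek position and consume the tokens for real.
	reader.resetPeek();
	while (tokenizer.read(reader, token)) {
		std::cout << token.content << " @ " << token.location.getStart()
		          << ".." << token.location.getEnd() << std::endl;
	}
	return 0;
}

Since read() only differs from peek() in advancing the read cursor, the two calls can be interleaved freely on the same reader, as the simplePeekToken test above demonstrates.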