summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-06 02:24:07 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-06 02:24:07 +0100
commit0376fdb483464ded73a5c1a8bba97b196af23b6d (patch)
tree43d2d5a8092ab4bcf6a991c8eee95414ccfc9f10 /src
parent9a153303908e9511526f916cc4771a91df6635ae (diff)
Continue writing parser for plain document format
Diffstat (limited to 'src')
-rw-r--r--src/plugins/plain/DynamicTokenTree.cpp115
-rw-r--r--src/plugins/plain/DynamicTokenTree.hpp136
-rw-r--r--src/plugins/plain/DynamicTokenizer.cpp73
-rw-r--r--src/plugins/plain/DynamicTokenizer.hpp190
4 files changed, 514 insertions, 0 deletions
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/DynamicTokenTree.cpp
new file mode 100644
index 0000000..25f6a54
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenTree.cpp
@@ -0,0 +1,115 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "DynamicTokenTree.hpp"
+
+namespace ousia {
+
+/* Class DynamicTokenTree::Node */
+
+DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+
+/* Class DynamicTokenTree */
+
+bool DynamicTokenTree::registerToken(const std::string &token,
+ const TokenDescriptor *descriptor) noexcept
+{
+ // Abort if the token is empty -- this would taint the root node
+ if (token.empty()) {
+ return false;
+ }
+
+ // Iterate over each character in the given string and insert them as
+ // (new) nodes
+ Node *node = &root;
+ for (size_t i = 0; i < token.size(); i++) {
+ // Insert a new node if this one does not exist
+ const char c = token[i];
+ auto it = node->children.find(c);
+ if (it == node->children.end()) {
+ it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})).first;
+ }
+ node = it->second.get();
+ }
+
+ // If the resulting node already has a descriptor set, we're screwed.
+ if (node->descriptor != nullptr) {
+ return false;
+ }
+
+ // Otherwise just set the descriptor to the given descriptor.
+ node->descriptor = descriptor;
+ return true;
+}
+
+bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+{
+ // We cannot remove empty tokens as we need to access the fist character
+ // upfront
+ if (token.empty()) {
+ return false;
+ }
+
+ // First pass -- search the node in the path that can be deleted
+ Node *subtreeRoot = &root;
+ char subtreeKey = token[0];
+ Node *node = &root;
+ for (size_t i = 0; i < token.size() - 1; i++) {
+ // Go to the next node, abort if the tree ends unexpectedly
+ auto it = node->children.find(token[i]);
+ if (it == node->children.end()) {
+ return false;
+ }
+
+ // Reset the subtree handler if this node has another descriptor
+ Node *child = it->second.get();
+ if ((child->descriptor != nullptr || child->children.size() > 1)) {
+ subtreeRoot = child;
+ subtreeKey = token[i + 1];
+ }
+ node = child;
+ }
+
+ // If the leaf node, we cannot delete the subtree. Set the
+ // descriptor to nullptr instead
+ if (!node->children.empty()) {
+ node->descriptor = nullptr;
+ return true;
+ }
+
+ // If we end up here, we can safely delete the complete subtree
+ subtreeRoot->children.erase(subtreeKey);
+ return true;
+}
+
+const TokenDescriptor *DynamicTokenTree::hasToken(
+ const std::string &token) const noexcept
+{
+ Node const *node = &root;
+ for (size_t i = 0; i < token.size(); i++) {
+ const char c = token[i];
+ auto it = node->children.find(c);
+ if (it == node->children.end()) {
+ return nullptr;
+ }
+ node = it->second.get();
+ }
+ return node->descriptor;
+}
+}
+
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp
new file mode 100644
index 0000000..c5dc4de
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenTree.hpp
@@ -0,0 +1,136 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenTree.hpp
+ *
+ * Class representing a token tree that can be updated dynamically.
+ *
+ * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+
+#include <memory>
+#include <unordered_map>
+
+namespace ousia {
+
+class TokenDescriptor;
+
+/**
+ * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
+ * identify the longest consecutive token in the text. This is equivalent to a
+ * prefix trie.
+ *
+ * A token tree is a construct that structures all special tokens a
+ * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
+ * the token tree would look like this:
+ *
+ * \code{*.txt}
+ * a
+ * | \
+ * a $
+ * | \
+ * b c
+ * | |
+ * $ $
+ * \endcode
+ *
+ * Every node in the token tree is a valid end state that has a $ attached to
+ * it. During the search algorithm the Tokenizer goes through the tree and
+ * stores the last valid position. If a character follows that does not lead to
+ * a new node in the TokenTree the search ends (and starts again at this
+ * character). The token corresponding to the last valid position is returned.
+ *
+ * This allows us to uniquely identify the matching token given a certain
+ * input text. Note that this is a greedy matching approach that does not
+ * work if you're using truly ambiguous tokens (that have the same text).
+ */
+class DynamicTokenTree {
+public:
+ /**
+ * Structure used to build the node tree.
+ */
+ struct Node {
+ /**
+ * Type used for the child map.
+ */
+ using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+
+ /**
+ * Map from single characters at the corresponding child nodes.
+ */
+ ChildMap children;
+
+ /**
+ * Reference at the corresponding token descriptor. Set to nullptr if
+ * no token is attached to this node.
+ */
+ TokenDescriptor const *descriptor;
+
+ /**
+ * Default constructor, initializes the descriptor with nullptr.
+ */
+ Node();
+ };
+
+private:
+ /**
+ * Root node of the internal token tree.
+ */
+ Node root;
+
+public:
+ /**
+ * Registers a token containing the given string. Returns false if the
+ * token already exists, true otherwise.
+ *
+ * @param token is the character sequence that should be registered as
+ * token.
+ * @param descriptor is the descriptor that should be set for this token.
+ * @return true if the operation is successful, false otherwise.
+ */
+ bool registerToken(const std::string &token,
+ const TokenDescriptor *descriptor) noexcept;
+
+ /**
+ * Unregisters the token from the token tree. Returns true if the token was
+ * unregistered successfully, false otherwise.
+ *
+ * @param token is the character sequence that should be unregistered.
+ * @return true if the operation was successful, false otherwise.
+ */
+ bool unregisterToken(const std::string &token) noexcept;
+
+ /**
+ * Returns true, if the given token exists within the TokenTree. This
+ * function is mostly thought for debugging and unit testing.
+ *
+ * @param token is the character sequence that should be searched.
+ * @return the attached token descriptor or nullptr if the given token is
+ * not found.
+ */
+ const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
new file mode 100644
index 0000000..7690395
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -0,0 +1,73 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include <core/common/CharReader.hpp>
+
+#include "DynamicTokenizer.hpp"
+
+namespace ousia {
+
+/**
+ * The TokenDescriptor class is a simple wrapper around a standard string
+ * containing the character sequence of the token.
+ */
+class TokenDescriptor {
+ /**
+ * The character sequence of the token.
+ */
+ std::string str;
+
+ /**
+ * Default constructor of the TokenDescriptor class. Used to describe
+ * special tokens.
+ */
+ TokenDescriptor();
+
+ /**
+ * Constructor initializing the character sequence of the token.
+ */
+ TokenDescriptor(const std::string &str) : str(str) {}
+};
+
+/* Class DynamicTokenizer */
+
+void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode)
+{
+ whitespaceMode = mode;
+}
+
+WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+{
+ return whitespaceMode;
+}
+
+
+/* Constant initializations */
+
+static const TokenDescriptor Empty;
+static const TokenDescriptor Text;
+static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
+static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+
+
+}
+
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
new file mode 100644
index 0000000..f7fef13
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -0,0 +1,190 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include <core/common/Location.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class TokenDescriptor;
+
+/**
+ * The DynamicToken structure describes a token discovered by the Tokenizer.
+ */
+struct DynamicToken {
+ /**
+ * Pointer pointing at the TokenDescriptor instance this token corresponds
+ * to. May be one of the special TokenDescriptors defined as static members
+ * of the DynamicTokenizer class.
+ */
+ TokenDescriptor const *descriptor;
+
+ /**
+ * String that was matched.
+ */
+ std::string str;
+
+ /**
+ * Location from which the string was extracted.
+ */
+ SourceLocation location;
+};
+
+/**
+ * Enum specifying the whitespace handling of the DynamicTokenizer class when
+ * reading non-token text.
+ */
+enum class WhitespaceMode {
+ /**
+ * Preserves all whitespaces as they are found in the source file.
+ */
+ PRESERVE,
+
+ /**
+ * Trims whitespace at the beginning and the end of the found text.
+ */
+ TRIM,
+
+ /**
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
+ COLLAPSE
+};
+
+/**
+ * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * CharReader. It allows to register and unregister tokens while parsing and
+ * to modify the handling of whitespace characters.
+ */
+class DynamicTokenizer {
+private:
+ /**
+ * Reference at the char reader.
+ */
+ CharReader &reader;
+
+ /**
+ * Flag defining whether whitespaces should be preserved or not.
+ */
+ WhitespaceMode whitespaceMode;
+
+ /**
+ * Vector containing all registered token descriptors.
+ */
+ std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+
+public:
+ /**
+ * Constructor of the DynamicTokenizer class.
+ *
+ * @param reader is the CharReader that should be used for reading the
+ * tokens.
+ * @param preserveWhitespaces should be set to true if all whitespaces
+ * should be preserved (for preformated environments).
+ */
+ DynamicTokenizer(CharReader &reader)
+ : reader(reader),
+ preserveWhitespaces(preserveWhitespaces),
+ location(reader.getSourceId()),
+ empty(true),
+ hasWhitespace(false)
+ {
+ }
+
+ /**
+ * Destructor of the DynamicTokenizer class.
+ */
+ ~DynamicTokenizer();
+
+ /**
+ * Registers the given string as a token. Returns a const pointer at a
+ * TokenDescriptor that will be used to reference the newly created token.
+ *
+ * @param token is the token string that should be registered.
+ * @return a pointer at a TokenDescriptor which is representative for the
+ * newly registered token. Returns nullptr if a token with this string
+ * was already registered.
+ */
+ const TokenDescriptor* registerToken(const std::string &token);
+
+ /**
+ * Unregisters the token belonging to the given TokenDescriptor.
+ *
+ * @param descr is a TokenDescriptor that was previously returned by
+ * registerToken.
+ * @return true if the operation was successful, false otherwise (e.g.
+ * because the given TokenDescriptor was already unregistered).
+ */
+ bool unregisterToken(const TokenDescriptor *descr);
+
+ /**
+ * Sets the whitespace mode.
+ *
+ * @param whitespaceMode defines how whitespace should be treated in text
+ * tokens.
+ */
+ void setWhitespaceMode(WhitespaceMode mode);
+
+ /**
+ * Returns the current value of the whitespace mode.
+ *
+ * @return the whitespace mode.
+ */
+ WhitespaceMode getWhitespaceMode();
+
+ /**
+ * Reads a new token from the CharReader and stores it in the given
+ * DynamicToken instance.
+ *
+ * @param token is a reference at the token instance into which the Token
+ * information should be written.
+ * @return true if a token could be read, false if the end of the stream
+ * has been reached.
+ */
+ bool read(DynamicToken &token);
+
+ /**
+ * TokenDescriptor representing an empty token.
+ */
+ static const *TokenDescriptor Empty;
+
+ /**
+ * TokenDescriptor representing generic text.
+ */
+ static const *TokenDescriptor Text;
+
+};
+
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+