Continue writing parser for plain document format

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-06 02:24:07 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-06 02:24:07 +0100
commit: 0376fdb483464ded73a5c1a8bba97b196af23b6d (patch)
tree: 43d2d5a8092ab4bcf6a991c8eee95414ccfc9f10 /src
parent: 9a153303908e9511526f916cc4771a91df6635ae (diff)
4 files changed, 514 insertions, 0 deletions
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/DynamicTokenTree.cpp
new file mode 100644
index 0000000..25f6a54
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenTree.cpp
@@ -0,0 +1,115 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "DynamicTokenTree.hpp"
+
+namespace ousia {
+
+/* Class DynamicTokenTree::Node */
+
+DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+
+/* Class DynamicTokenTree */
+
+bool DynamicTokenTree::registerToken(const std::string &token,
+                                     const TokenDescriptor *descriptor) noexcept
+{
+	// Abort if the token is empty -- this would taint the root node
+	if (token.empty()) {
+		return false;
+	}
+
+	// Iterate over each character in the given string and insert them as
+	// (new) nodes
+	Node *node = &root;
+	for (size_t i = 0; i < token.size(); i++) {
+		// Insert a new node if this one does not exist
+		const char c = token[i];
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})).first;
+		}
+		node = it->second.get();
+	}
+
+	// If the resulting node already has a descriptor set, we're screwed.
+	if (node->descriptor != nullptr) {
+		return false;
+	}
+
+	// Otherwise just set the descriptor to the given descriptor.
+	node->descriptor = descriptor;
+	return true;
+}
+
+bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+{
+	// We cannot remove empty tokens as we need to access the fist character
+	// upfront
+	if (token.empty()) {
+		return false;
+	}
+
+	// First pass -- search the node in the path that can be deleted
+	Node *subtreeRoot = &root;
+	char subtreeKey = token[0];
+	Node *node = &root;
+	for (size_t i = 0; i < token.size() - 1; i++) {
+		// Go to the next node, abort if the tree ends unexpectedly
+		auto it = node->children.find(token[i]);
+		if (it == node->children.end()) {
+			return false;
+		}
+
+		// Reset the subtree handler if this node has another descriptor
+		Node *child = it->second.get();
+		if ((child->descriptor != nullptr || child->children.size() > 1)) {
+			subtreeRoot = child;
+			subtreeKey = token[i + 1];
+		}
+		node = child;
+	}
+
+	// If the leaf node, we cannot delete the subtree. Set the
+	// descriptor to nullptr instead
+	if (!node->children.empty()) {
+		node->descriptor = nullptr;
+		return true;
+	}
+
+	// If we end up here, we can safely delete the complete subtree
+	subtreeRoot->children.erase(subtreeKey);
+	return true;
+}
+
+const TokenDescriptor *DynamicTokenTree::hasToken(
+    const std::string &token) const noexcept
+{
+	Node const *node = &root;
+	for (size_t i = 0; i < token.size(); i++) {
+		const char c = token[i];
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			return nullptr;
+		}
+		node = it->second.get();
+	}
+	return node->descriptor;
+}
+}
+
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/DynamicTokenTree.hpp
new file mode 100644
index 0000000..c5dc4de
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenTree.hpp
@@ -0,0 +1,136 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenTree.hpp
+ *
+ * Class representing a token tree that can be updated dynamically.
+ *
+ * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+
+#include <memory>
+#include <unordered_map>
+
+namespace ousia {
+
+class TokenDescriptor;
+
+/**
+ * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
+ * identify the longest consecutive token in the text. This is equivalent to a
+ * prefix trie.
+ *
+ * A token tree is a construct that structures all special tokens a
+ * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
+ * the token tree would look like this:
+ *
+ * \code{*.txt}
+ * a
+ * | \
+ * a $
+ * | \
+ * b c
+ * | |
+ * $ $
+ * \endcode
+ *
+ * Every node in the token tree is a valid end state that has a $ attached to
+ * it. During the search algorithm the Tokenizer goes through the tree and
+ * stores the last valid position. If a character follows that does not lead to
+ * a new node in the TokenTree the search ends (and starts again at this
+ * character). The token corresponding to the last valid position is returned.
+ *
+ * This allows us to uniquely identify the matching token given a certain
+ * input text. Note that this is a greedy matching approach that does not
+ * work if you're using truly ambiguous tokens (that have the same text).
+ */
+class DynamicTokenTree {
+public:
+	/**
+	 * Structure used to build the node tree.
+	 */
+	struct Node {
+		/**
+		 * Type used for the child map.
+		 */
+		using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+
+		/**
+		 * Map from single characters at the corresponding child nodes.
+		 */
+		ChildMap children;
+
+		/**
+		 * Reference at the corresponding token descriptor. Set to nullptr if
+		 * no token is attached to this node.
+		 */
+		TokenDescriptor const *descriptor;
+
+		/**
+		 * Default constructor, initializes the descriptor with nullptr.
+		 */
+		Node();
+	};
+
+private:
+	/**
+	 * Root node of the internal token tree.
+	 */
+	Node root;
+
+public:
+	/**
+	 * Registers a token containing the given string. Returns false if the
+	 * token already exists, true otherwise.
+	 *
+	 * @param token is the character sequence that should be registered as
+	 * token.
+	 * @param descriptor is the descriptor that should be set for this token.
+	 * @return true if the operation is successful, false otherwise.
+	 */
+	bool registerToken(const std::string &token,
+	                   const TokenDescriptor *descriptor) noexcept;
+
+	/**
+	 * Unregisters the token from the token tree. Returns true if the token was
+	 * unregistered successfully, false otherwise.
+	 *
+	 * @param token is the character sequence that should be unregistered.
+	 * @return true if the operation was successful, false otherwise.
+	 */
+	bool unregisterToken(const std::string &token) noexcept;
+
+	/**
+	 * Returns true, if the given token exists within the TokenTree. This
+	 * function is mostly thought for debugging and unit testing.
+	 *
+	 * @param token is the character sequence that should be searched.
+	 * @return the attached token descriptor or nullptr if the given token is
+	 * not found.
+	 */
+	const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
new file mode 100644
index 0000000..7690395
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -0,0 +1,73 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include <core/common/CharReader.hpp>
+
+#include "DynamicTokenizer.hpp"
+
+namespace ousia {
+
+/**
+ * The TokenDescriptor class is a simple wrapper around a standard string
+ * containing the character sequence of the token.
+ */
+class TokenDescriptor {
+	/**
+	 * The character sequence of the token.
+	 */
+	std::string str;
+
+	/**
+	 * Default constructor of the TokenDescriptor class. Used to describe
+	 * special tokens.
+	 */
+	TokenDescriptor();
+
+	/**
+	 * Constructor initializing the character sequence of the token.
+	 */
+	TokenDescriptor(const std::string &str) : str(str) {}
+};
+
+/* Class DynamicTokenizer */
+
+void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode)
+{
+	whitespaceMode = mode;
+}
+
+WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+{
+	return whitespaceMode;
+}
+
+
+/* Constant initializations */
+
+static const TokenDescriptor Empty;
+static const TokenDescriptor Text;
+static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
+static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+
+
+}
+
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
new file mode 100644
index 0000000..f7fef13
--- /dev/null
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -0,0 +1,190 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include <core/common/Location.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class TokenDescriptor;
+
+/**
+ * The DynamicToken structure describes a token discovered by the Tokenizer.
+ */
+struct DynamicToken {
+	/**
+	 * Pointer pointing at the TokenDescriptor instance this token corresponds
+	 * to. May be one of the special TokenDescriptors defined as static members
+	 * of the DynamicTokenizer class.
+	 */
+	TokenDescriptor const *descriptor;
+
+	/**
+	 * String that was matched.
+	 */
+	std::string str;
+
+	/**
+	 * Location from which the string was extracted.
+	 */
+	SourceLocation location;
+};
+
+/**
+ * Enum specifying the whitespace handling of the DynamicTokenizer class when
+ * reading non-token text.
+ */
+enum class WhitespaceMode {
+	/**
+	 * Preserves all whitespaces as they are found in the source file.
+	 */
+	PRESERVE,
+
+	/**
+	 * Trims whitespace at the beginning and the end of the found text.
+	 */
+	TRIM,
+
+	/**
+	 * Whitespaces are trimmed and collapsed, multiple whitespace characters
+	 * are replaced by a single space character.
+	 */
+	COLLAPSE
+};
+
+/**
+ * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * CharReader. It allows to register and unregister tokens while parsing and
+ * to modify the handling of whitespace characters.
+ */
+class DynamicTokenizer {
+private:
+	/**
+	 * Reference at the char reader.
+	 */
+	CharReader &reader;
+
+	/**
+	 * Flag defining whether whitespaces should be preserved or not.
+	 */
+	WhitespaceMode whitespaceMode;
+
+	/**
+	 * Vector containing all registered token descriptors.
+	 */
+	std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+
+public:
+	/**
+	 * Constructor of the DynamicTokenizer class.
+	 *
+	 * @param reader is the CharReader that should be used for reading the
+	 * tokens.
+	 * @param preserveWhitespaces should be set to true if all whitespaces
+	 * should be preserved (for preformated environments).
+	 */
+	DynamicTokenizer(CharReader &reader)
+	    : reader(reader),
+	      preserveWhitespaces(preserveWhitespaces),
+	      location(reader.getSourceId()),
+	      empty(true),
+	      hasWhitespace(false)
+	{
+	}
+
+	/**
+	 * Destructor of the DynamicTokenizer class.
+	 */
+	~DynamicTokenizer();
+
+	/**
+	 * Registers the given string as a token. Returns a const pointer at a
+	 * TokenDescriptor that will be used to reference the newly created token.
+	 *
+	 * @param token is the token string that should be registered.
+	 * @return a pointer at a TokenDescriptor which is representative for the
+	 * newly registered token. Returns nullptr if a token with this string
+	 * was already registered.
+	 */
+	const TokenDescriptor* registerToken(const std::string &token);
+
+	/**
+	 * Unregisters the token belonging to the given TokenDescriptor.
+	 *
+	 * @param descr is a TokenDescriptor that was previously returned by
+	 * registerToken.
+	 * @return true if the operation was successful, false otherwise (e.g.
+	 * because the given TokenDescriptor was already unregistered).
+	 */
+	bool unregisterToken(const TokenDescriptor *descr);
+
+	/**
+	 * Sets the whitespace mode.
+	 *
+	 * @param whitespaceMode defines how whitespace should be treated in text
+	 * tokens.
+	 */
+	void setWhitespaceMode(WhitespaceMode mode);
+
+	/**
+	 * Returns the current value of the whitespace mode.
+	 *
+	 * @return the whitespace mode.
+	 */
+	WhitespaceMode getWhitespaceMode();
+
+	/**
+	 * Reads a new token from the CharReader and stores it in the given
+	 * DynamicToken instance.
+	 *
+	 * @param token is a reference at the token instance into which the Token
+	 * information should be written.
+	 * @return true if a token could be read, false if the end of the stream
+	 * has been reached.
+	 */
+	bool read(DynamicToken &token);
+
+	/**
+	 * TokenDescriptor representing an empty token.
+	 */
+	static const *TokenDescriptor Empty;
+
+	/**
+	 * TokenDescriptor representing generic text.
+	 */
+	static const *TokenDescriptor Text;
+
+};
+
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-06 02:24:07 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-06 02:24:07 +0100
commit	0376fdb483464ded73a5c1a8bba97b196af23b6d (patch)
tree	43d2d5a8092ab4bcf6a991c8eee95414ccfc9f10 /src
parent	9a153303908e9511526f916cc4771a91df6635ae (diff)