Merge branch 'master' of somweyr.de:ousia

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-15 21:56:04 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-15 21:56:04 +0100
commit: d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch)
tree: 8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/core/parser/utils/Tokenizer.hpp
parent: 40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff)
parent: 4f2872d9968aec93bebff90d1238347c8a364949 (diff)
1 files changed, 231 insertions, 0 deletions
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
new file mode 100644
index 0000000..6b4e116
--- /dev/null
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -0,0 +1,231 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Tokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <core/common/Location.hpp>
+#include <core/common/Whitespace.hpp>
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
+/**
+ * The Token structure describes a single token discovered by the Tokenizer.
+ */
+struct Token {
+	/**
+	 * Id of the type of this token.
+	 */
+	TokenTypeId type;
+
+	/**
+	 * String that was matched.
+	 */
+	std::string content;
+
+	/**
+	 * Location from which the string was extracted.
+	 */
+	SourceLocation location;
+
+	/**
+	 * Default constructor, sets the token type to EmptyToken.
+	 */
+	Token() : type(EmptyToken) {}
+
+	/**
+	 * Constructor of the Token struct, initializes all three fields.
+	 *
+	 * @param type is the id corresponding to the type of the token.
+	 * @param content is the string content that has been extracted.
+	 * @param location is the location of the extracted string content in the
+	 * source file.
+	 */
+	Token(TokenTypeId type, const std::string &content,
+	             SourceLocation location)
+	    : type(type), content(content), location(location)
+	{
+	}
+
+	/**
+	 * Constructor of the Token struct, only initializes the token type.
+	 *
+	 * @param type is the id corresponding to the type of the token.
+	 */
+	Token(TokenTypeId type) : type(type) {}
+
+	/**
+	 * The getLocation function allows tokens to be passed directly as a
+	 * parameter to Logger or LoggableException instances.
+	 *
+	 * @return a const reference to the location field.
+	 */
+	const SourceLocation &getLocation() const { return location; }
+};
+
+/**
+ * The Tokenizer is used to extract tokens and chunks of text from a
+ * CharReader. It allows registering and unregistering tokens while parsing
+ * and modifying the handling of whitespace characters. Note that the
+ * Tokenizer always tries to extract the longest possible token from the
+ * CharReader.
+ */
+class Tokenizer {
+private:
+	/**
+	 * Internally used token trie. This object holds all registered tokens.
+	 */
+	TokenTrie trie;
+
+	/**
+	 * Mode defining how whitespace characters should be handled.
+	 */
+	WhitespaceMode whitespaceMode;
+
+	/**
+	 * Vector containing the strings of all registered tokens.
+	 */
+	std::vector<std::string> tokens;
+
+	/**
+	 * Next index in the tokens list where to search for a new token id.
+	 */
+	size_t nextTokenTypeId;
+
+	/**
+	 * Templated function used internally to read the current token. The
+	 * function is templated in order to force code generation for all six
+	 * combinations of whitespace modes and reading/peeking.
+	 *
+	 * @tparam TextHandler is the type to be used for the textHandler instance.
+	 * @tparam read specifies whether the function should start from and advance
+	 * the read pointer of the char reader.
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
+	 * @param token is the token structure into which the token information
+	 * should be written.
+	 * @return false if the end of the stream has been reached, true otherwise.
+	 */
+	template <typename TextHandler, bool read>
+	bool next(CharReader &reader, Token &token);
+
+public:
+	/**
+	 * Constructor of the Tokenizer class.
+	 *
+	 * @param whitespaceMode specifies how whitespace should be handled.
+	 */
+	Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+
+	/**
+	 * Registers the given string as a token. Returns a TokenTypeId that
+	 * will be used to reference the newly created token.
+	 *
+	 * @param token is the token string that should be registered.
+	 * @return a unique identifier for the registered token or EmptyToken if
+	 * an error occurred.
+	 */
+	TokenTypeId registerToken(const std::string &token);
+
+	/**
+	 * Unregisters the token belonging to the given TokenTypeId.
+	 *
+	 * @param type is the token type that should be unregistered. The
+	 * TokenTypeId must have been returned by a previous call to
+	 * registerToken.
+	 * @return true if the operation was successful, false otherwise (e.g.
+	 * because the given TokenTypeId was already unregistered).
+	 */
+	bool unregisterToken(TokenTypeId type);
+
+	/**
+	 * Returns the token string that was registered under the given
+	 * TokenTypeId or an empty string if an invalid TokenTypeId is
+	 * given.
+	 *
+	 * @param type is the TokenTypeId for which the corresponding token
+	 * string should be returned. It is expected to be an id that was
+	 * returned by registerToken.
+	 * @return the registered token string or an empty string if the given type
+	 * was invalid.
+	 */
+	std::string getTokenString(TokenTypeId type);
+
+	/**
+	 * Sets the whitespace mode.
+	 *
+	 * @param mode defines how whitespace should be treated in text
+	 * tokens.
+	 */
+	void setWhitespaceMode(WhitespaceMode mode);
+
+	/**
+	 * Returns the current value of the whitespace mode.
+	 *
+	 * @return the whitespace mode.
+	 */
+	WhitespaceMode getWhitespaceMode();
+
+	/**
+	 * Reads a new token from the CharReader and stores it in the given
+	 * Token instance.
+	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
+	 * @param token is a reference to the token instance into which the Token
+	 * information should be written.
+	 * @return true if a token could be read, false if the end of the stream
+	 * has been reached.
+	 */
+	bool read(CharReader &reader, Token &token);
+
+	/**
+	 * The peek method does not advance the read position of the char reader,
+	 * but reads the next token from the current char reader peek position.
+	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
+	 * @param token is a reference to the token instance into which the Token
+	 * information should be written.
+	 * @return true if a token could be read, false if the end of the stream
+	 * has been reached.
+	 */
+	bool peek(CharReader &reader, Token &token);
+};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2015-02-15 21:56:04 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2015-02-15 21:56:04 +0100
commit	d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch)
tree	8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/core/parser/utils/Tokenizer.hpp
parent	40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff)
parent	4f2872d9968aec93bebff90d1238347c8a364949 (diff)