From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001
From: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>
Date: Sat, 14 Feb 2015 23:47:11 +0100
Subject: Moved DynamicTokenizer and TokenTrie to parser/utils

---
 src/core/parser/utils/Tokenizer.hpp | 231 ++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 src/core/parser/utils/Tokenizer.hpp

(limited to 'src/core/parser/utils/Tokenizer.hpp')

diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
new file mode 100644
index 0000000..3e5aeb3
--- /dev/null
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -0,0 +1,231 @@
+/*
+    Ousía
+    Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
+/**
+ * The DynamicToken structure describes a token discovered by the Tokenizer.
+ */
+struct DynamicToken {
+    /**
+     * Id of the type of this token.
+     */
+    TokenTypeId type;
+
+    /**
+     * String that was matched.
+     */
+    std::string content;
+
+    /**
+     * Location from which the string was extracted.
+     */
+    SourceLocation location;
+
+    /**
+     * Default constructor.
+     */
+    DynamicToken() : type(EmptyToken) {}
+
+    /**
+     * Constructor of the DynamicToken struct.
+     *
+     * @param id represents the token type.
+     * @param content is the string content that has been extracted.
+     * @param location is the location of the extracted string content in the
+     * source file.
+     */
+    DynamicToken(TokenTypeId type, const std::string &content,
+                 SourceLocation location)
+        : type(type), content(content), location(location)
+    {
+    }
+
+    /**
+     * Constructor of the DynamicToken struct, only initializes the token type
+     *
+     * @param type is the id corresponding to the type of the token.
+     */
+    DynamicToken(TokenTypeId type) : type(type) {}
+
+    /**
+     * The getLocation function allows the tokens to be directly passed as
+     * parameter to Logger or LoggableException instances.
+     *
+     * @return a reference at the location field
+     */
+    const SourceLocation &getLocation() const { return location; }
+};
+
+/**
+ * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * CharReader. It allows to register and unregister tokens while parsing and
+ * to modify the handling of whitespace characters. Note that the
+ * DynamicTokenizer always tries to extract the longest possible token from the
+ * tokenizer.
+ */
+class DynamicTokenizer {
+private:
+    /**
+     * Internally used token trie. This object holds all registered tokens.
+     */
+    TokenTrie trie;
+
+    /**
+     * Flag defining whether whitespaces should be preserved or not.
+     */
+    WhitespaceMode whitespaceMode;
+
+    /**
+     * Vector containing all registered token types.
+     */
+    std::vector<std::string> tokens;
+
+    /**
+     * Next index in the tokens list where to search for a new token id.
+     */
+    size_t nextTokenTypeId;
+
+    /**
+     * Templated function used internally to read the current token. The
+     * function is templated in order to force code generation for all six
+     * combinations of whitespace modes and reading/peeking.
+     *
+     * @tparam TextHandler is the type to be used for the textHandler instance.
+     * @tparam read specifies whether the function should start from and advance
+     * the read pointer of the char reader.
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is the token structure into which the token information
+     * should be written.
+     * @return false if the end of the stream has been reached, true otherwise.
+     */
+    template <typename TextHandler, bool read>
+    bool next(CharReader &reader, DynamicToken &token);
+
+public:
+    /**
+     * Constructor of the DynamicTokenizer class.
+     *
+     * @param whitespaceMode specifies how whitespace should be handled.
+     */
+    DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+
+    /**
+     * Registers the given string as a token. Returns a const pointer at a
+     * TokenDescriptor that will be used to reference the newly created token.
+     *
+     * @param token is the token string that should be registered.
+     * @return a unique identifier for the registered token or EmptyToken if
+     * an error occurred.
+     */
+    TokenTypeId registerToken(const std::string &token);
+
+    /**
+     * Unregisters the token belonging to the given TokenTypeId.
+     *
+     * @param type is the token type that should be unregistered. The
+     * TokenTypeId must have been returned by registerToken.
+     * @return true if the operation was successful, false otherwise (e.g.
+     * because the given TokenDescriptor was already unregistered).
+     */
+    bool unregisterToken(TokenTypeId type);
+
+    /**
+     * Returns the token that was registered under the given TokenTypeId id or
+     * an empty string if an invalid TokenTypeId id is given.
+     *
+     * @param type is the TokenTypeId id for which the corresponding token
+     * string should be returned.
+     * @return the registered token string or an empty string if the given type
+     * was invalid.
+     */
+    std::string getTokenString(TokenTypeId type);
+
+    /**
+     * Sets the whitespace mode.
+     *
+     * @param whitespaceMode defines how whitespace should be treated in text
+     * tokens.
+     */
+    void setWhitespaceMode(WhitespaceMode mode);
+
+    /**
+     * Returns the current value of the whitespace mode.
+     *
+     * @return the whitespace mode.
+     */
+    WhitespaceMode getWhitespaceMode();
+
+    /**
+     * Reads a new token from the CharReader and stores it in the given
+     * DynamicToken instance.
+     *
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is a reference at the token instance into which the Token
+     * information should be written.
+     * @return true if a token could be read, false if the end of the stream
+     * has been reached.
+     */
+    bool read(CharReader &reader, DynamicToken &token);
+
+    /**
+     * The peek method does not advance the read position of the char reader,
+     * but reads the next token from the current char reader peek position.
+     *
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is a reference at the token instance into which the Token
+     * information should be written.
+     * @return true if a token could be read, false if the end of the stream
+     * has been reached.
+     */
+    bool peek(CharReader &reader, DynamicToken &token);
+};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+
--
cgit v1.2.3


From 9f9e51974e782c4eb6f393ca3d4c3382df093bf1 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>
Date: Sat, 14 Feb 2015 23:58:55 +0100
Subject: Moved Tokenizer to core/parser/utils and adapted name

---
 src/core/parser/utils/Tokenizer.cpp | 56 ++++++++++++++++++-------------------
 src/core/parser/utils/Tokenizer.hpp | 34 +++++++++++-----------
 2 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'src/core/parser/utils/Tokenizer.hpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 1fac25a..3c8177d 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -24,7 +24,7 @@
 #include
 #include
 
-#include "DynamicTokenizer.hpp"
+#include "Tokenizer.hpp"
 
 namespace ousia {
 
@@ -39,7 +39,7 @@ struct TokenMatch {
     /**
      * Token that was matched.
      */
-    DynamicToken token;
+    Token token;
 
     /**
      * Current length of the data within the text handler. The text buffer needs
@@ -117,10 +117,10 @@ public:
      * @param c is the character that should be appended to the current prefix.
      * @param lookups is a list to which new TokenLookup instances are added --
      * which could potentially be expanded in the next iteration.
-     * @param match is the DynamicToken instance to which the matching token
+     * @param match is the Token instance to which the matching token
      * should be written.
      * @param tokens is a reference at the internal token list of the
-     * DynamicTokenizer.
+     * Tokenizer.
      * @param end is the end byte offset of the current character.
      * @param sourceId is the source id of this file.
      */
@@ -143,7 +143,7 @@ public:
             size_t len = str.size();
             if (len > match.token.content.size()) {
                 match.token =
-                    DynamicToken{node->type, str, {sourceId, start, end}};
+                    Token{node->type, str, {sourceId, start, end}};
                 match.textLength = textLength;
                 match.textEnd = textEnd;
             }
@@ -181,15 +181,15 @@ static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
     }
 }
 
-/* Class DynamicTokenizer */
+/* Class Tokenizer */
 
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
     : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
 }
 
 template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
+bool Tokenizer::next(CharReader &reader, Token &token)
 {
     // If we're in the read mode, reset the char reader peek position to the
     // current read position
@@ -268,12 +268,12 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
         }
         token = match.token;
     } else {
-        token = DynamicToken{};
+        token = Token{};
     }
     return match.hasMatch();
 }
 
-bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
+bool Tokenizer::read(CharReader &reader, Token &token)
 {
     switch (whitespaceMode) {
         case WhitespaceMode::PRESERVE:
@@ -286,7 +286,7 @@ bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
     return false;
 }
 
-bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
+bool Tokenizer::peek(CharReader &reader, Token &token)
 {
     switch (whitespaceMode) {
         case WhitespaceMode::PRESERVE:
@@ -299,7 +299,7 @@ bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
     return false;
 }
 
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+TokenTypeId Tokenizer::registerToken(const std::string &token)
 {
     // Abort if an empty token should be registered
     if (token.empty()) {
@@ -337,7 +337,7 @@ TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
     return type;
 }
 
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+bool Tokenizer::unregisterToken(TokenTypeId type)
 {
     // Unregister the token from the trie, abort if an invalid type is given
     if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
@@ -348,7 +348,7 @@ bool DynamicTokenizer::unregisterToken(TokenTypeId type)
     return false;
 }
 
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+std::string Tokenizer::getTokenString(TokenTypeId type)
 {
     if (type < tokens.size()) {
         return tokens[type];
@@ -356,26 +356,26 @@ std::string DynamicTokenizer::getTokenString(TokenTypeId type)
     return std::string{};
 }
 
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
 {
     whitespaceMode = mode;
 }
 
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
 
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
 }

diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 3e5aeb3..6b4e116 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -17,7 +17,7 @@
  */
 
 /**
- * @file DynamicTokenizer.hpp
+ * @file Tokenizer.hpp
  *
  * Tokenizer that can be reconfigured at runtime used for parsing the plain
  * text format.
@@ -43,9 +43,9 @@ namespace ousia {
 class CharReader;
 
 /**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
+ * The Token structure describes a token discovered by the Tokenizer.
  */
-struct DynamicToken {
+struct Token {
     /**
      * Id of the type of this token.
      */
@@ -64,28 +64,28 @@ struct DynamicToken {
     /**
      * Default constructor.
      */
-    DynamicToken() : type(EmptyToken) {}
+    Token() : type(EmptyToken) {}
 
     /**
-     * Constructor of the DynamicToken struct.
+     * Constructor of the Token struct.
      *
      * @param id represents the token type.
     * @param content is the string content that has been extracted.
     * @param location is the location of the extracted string content in the
     * source file.
     */
-    DynamicToken(TokenTypeId type, const std::string &content,
+    Token(TokenTypeId type, const std::string &content,
                  SourceLocation location)
         : type(type), content(content), location(location)
     {
     }
 
     /**
-     * Constructor of the DynamicToken struct, only initializes the token type
+     * Constructor of the Token struct, only initializes the token type
      *
      * @param type is the id corresponding to the type of the token.
      */
-    DynamicToken(TokenTypeId type) : type(type) {}
+    Token(TokenTypeId type) : type(type) {}
 
     /**
      * The getLocation function allows the tokens to be directly passed as
@@ -97,13 +97,13 @@ struct DynamicToken {
 };
 
 /**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * The Tokenizer is used to extract tokens and chunks of text from a
  * CharReader. It allows to register and unregister tokens while parsing and
  * to modify the handling of whitespace characters. Note that the
- * DynamicTokenizer always tries to extract the longest possible token from the
+ * Tokenizer always tries to extract the longest possible token from the
  * tokenizer.
  */
-class DynamicTokenizer {
+class Tokenizer {
 private:
     /**
      * Internally used token trie. This object holds all registered tokens.
@@ -140,15 +140,15 @@ private:
      * @return false if the end of the stream has been reached, true otherwise.
      */
     template <typename TextHandler, bool read>
-    bool next(CharReader &reader, DynamicToken &token);
+    bool next(CharReader &reader, Token &token);
 
 public:
     /**
-     * Constructor of the DynamicTokenizer class.
+     * Constructor of the Tokenizer class.
      *
      * @param whitespaceMode specifies how whitespace should be handled.
      */
-    DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+    Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
 
     /**
      * Registers the given string as a token. Returns a const pointer at a
@@ -201,7 +201,7 @@ public:
 
     /**
      * Reads a new token from the CharReader and stores it in the given
-     * DynamicToken instance.
+     * Token instance.
      *
      * @param reader is the CharReader instance from which the data should be
     * read.
@@ -210,7 +210,7 @@ public:
      * @return true if a token could be read, false if the end of the stream
      * has been reached.
      */
-    bool read(CharReader &reader, DynamicToken &token);
+    bool read(CharReader &reader, Token &token);
 
     /**
      * The peek method does not advance the read position of the char reader,
@@ -223,7 +223,7 @@ public:
      * @return true if a token could be read, false if the end of the stream
      * has been reached.
      */
-    bool peek(CharReader &reader, DynamicToken &token);
+    bool peek(CharReader &reader, Token &token);
 };
 }
--
cgit v1.2.3
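
A minimal usage sketch of the renamed Tokenizer API, for orientation only. It is
not part of either patch above; the CharReader include path and the registered
token strings are illustrative assumptions, and the handling of plain-text
chunks is inferred from the class documentation rather than shown code.

#include "Tokenizer.hpp"

#include <core/common/CharReader.hpp>  // assumed location of CharReader

using namespace ousia;

// Tokenize the given reader, distinguishing registered tokens from text chunks.
static void dumpTokens(CharReader &reader)
{
    // COLLAPSE is the default mode declared in the header; it controls how
    // whitespace in text tokens is treated.
    Tokenizer tokenizer{WhitespaceMode::COLLAPSE};

    // Register two example token strings; registerToken returns EmptyToken
    // if registration fails (e.g. for an empty string).
    TokenTypeId lbrace = tokenizer.registerToken("{");
    TokenTypeId rbrace = tokenizer.registerToken("}");

    Token token;
    // read() advances the reader; peek() would read ahead without advancing
    // the read position.
    while (tokenizer.read(reader, token)) {
        if (token.type == lbrace || token.type == rbrace) {
            // A registered token was matched; token.content holds the matched
            // string and token.getLocation() its source range.
        } else {
            // Otherwise the token presumably carries a chunk of text between
            // registered tokens, with whitespace handled per the current mode.
        }
    }

    // Tokens can also be unregistered again while parsing.
    tokenizer.unregisterToken(lbrace);
    tokenizer.unregisterToken(rbrace);
}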