diff options
| field | value | date |
|---|---|---|
| author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-10-31 14:56:13 +0000 |
| committer | benjamin <benjamin@daaaf23c-2e50-4459-9457-1e69db5a47bf> | 2014-10-31 14:56:13 +0000 |
| commit | 72c1845961e77f7625db47ebd3de129aa90f4f5d (patch) | |
| tree | 790cfaba53fee7b02038bc7513d5bf62b974a4c7 | |
| parent | 9e233b3f13daebb9ac4c5cae0da073d0c6f782c0 (diff) | |
finished first draft of tokenizer code.
git-svn-id: file:///var/local/svn/basicwriter@90 daaaf23c-2e50-4459-9457-1e69db5a47bf
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | CMakeLists.txt | 24 |
| -rw-r--r-- | src/core/utils/BufferedCharReader.cpp | 4 |
| -rw-r--r-- | src/core/utils/BufferedCharReader.hpp | 6 |
| -rw-r--r-- | src/core/utils/Tokenizer.cpp | 113 |
| -rw-r--r-- | src/core/utils/Tokenizer.hpp | 48 |
| -rw-r--r-- | test/core/utils/BufferedCharReaderTest.cpp (renamed from test/core/utils/BufferedCharReader.cpp) | 19 |
6 files changed, 173 insertions, 41 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 726e1a3..8d06b4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,18 +96,19 @@ ADD_DEFINITIONS(
 
 # ousia_script library (containing the bindings needed for script engines)
 ADD_LIBRARY(ousia_core
-	src/core/script/Function.cpp
-	src/core/script/Object.cpp
-	src/core/script/ScriptEngine.cpp
-	src/core/script/Variant.cpp
-	src/core/utils/Tokenizer.cpp
-	src/core/utils/Utils.cpp
+	src/core/script/Function
+	src/core/script/Object
+	src/core/script/ScriptEngine
+	src/core/script/Variant
+	src/core/utils/BufferedCharReader
+	src/core/utils/Tokenizer
+	src/core/utils/Utils
 )
 
 # ousia_plugin_mozjs library
 ADD_LIBRARY(ousia_plugin_mozjs
-	src/plugins/mozjs/MozJsScriptEngine.cpp
+	src/plugins/mozjs/MozJsScriptEngine
 )
 
 TARGET_LINK_LIBRARIES(ousia_plugin_mozjs
@@ -130,12 +131,13 @@ IF(test)
 
 	# Add all unit test files
 	ADD_EXECUTABLE(ousia_test_core
-		test/core/utils/RangeSetTest
-		test/core/utils/TokenizerTest
-		test/core/utils/UtilsTest
 		test/core/script/FunctionTest
 		test/core/script/ObjectTest
 		test/core/script/VariantTest
+		test/core/utils/BufferedCharReaderTest
+		test/core/utils/RangeSetTest
+		test/core/utils/TokenizerTest
+		test/core/utils/UtilsTest
 	)
 
 	TARGET_LINK_LIBRARIES(ousia_test_core
@@ -156,7 +158,7 @@ IF(test)
 
 	# Add all unit test files
 	ADD_EXECUTABLE(ousia_test_plugin_mozjs
-		test/plugins/mozjs/MozJsScriptEngineTest.cpp
+		test/plugins/mozjs/MozJsScriptEngineTest
 	)
 
 	TARGET_LINK_LIBRARIES(ousia_test_plugin_mozjs
diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp
index 0377015..c13628f 100644
--- a/src/core/utils/BufferedCharReader.cpp
+++ b/src/core/utils/BufferedCharReader.cpp
@@ -1,6 +1,6 @@
 /*
-    SCAENEA IDL Compiler (scidlc)
-    Copyright (C) 2014 Andreas Stöckel
+    Ousía
+    Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp
index 86f43b5..b13cde6 100644
--- a/src/core/utils/BufferedCharReader.hpp
+++ b/src/core/utils/BufferedCharReader.hpp
@@ -1,6 +1,6 @@
 /*
-    SCAENEA IDL Compiler (scidlc)
-    Copyright (C) 2014 Andreas Stöckel
+    Ousía
+    Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -236,5 +236,5 @@ public:
 }
 }
 
-#endif /* _OUSISA_UTILS_BUFFERED_CHAR_READER_H_ */
+#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 38f7585..2c36438 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -16,6 +16,8 @@
     along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include <sstream>
+
 #include "Tokenizer.hpp"
 
 namespace ousia {
@@ -52,10 +54,10 @@ static std::map<char, TokenTreeNode> buildChildren(
 
 static int buildId(const std::map<std::string, int> &inputs)
 {
-	int tokenId = -1;
+	int tokenId = TOKEN_NONE;
 	for (auto &e : inputs) {
 		if (e.first.empty()) {
-			if (tokenId != -1) {
+			if (tokenId != TOKEN_NONE) {
 				throw TokenizerException{std::string{"Ambigous token found: "} +
 				                        std::to_string(e.second)};
 			} else {
@@ -68,8 +70,115 @@ static int buildId(const std::map<std::string, int> &inputs)
 
 TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
     : children(buildChildren(inputs)), tokenId(buildId(inputs))
+{
+}
+
+Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root)
+    : input(input), root(root)
+{
+}
+
+bool Tokenizer::prepare()
+{
+	std::stringstream buffer;
+	char c;
+	const int startColumn = input.getColumn();
+	const int startLine = input.getLine();
+	bool bufEmpty = true;
+	while (input.peek(&c)) {
+		if (root.children.find(c) != root.children.end()) {
+			// if there might be a special token, keep peeking forward
+			// until we find the token (or we don't).
+			TokenTreeNode const *n = &root;
+			std::stringstream tBuf;
+			int match = TOKEN_NONE;
+			while (true) {
+				tBuf << c;
+				n = &(n->children.at(c));
+				if (n->tokenId != TOKEN_NONE) {
+					// from here on we found a token. If we have something
+					// in our buffer already, we end the search now.
+					if (!bufEmpty) {
+						break;
+					} else {
+						// if we want to return this token ( = we have nothing
+						// in our buffer yet) we look greedily for the longest
+						// possible token we can construct.
+						input.consumePeek();
+					}
+				}
+				if (!input.peek(&c)) {
+					// if we are at the end we break off the search.
+					break;
+				}
+				if (n->children.find(c) == root.children.end()) {
+					// if we do not find a possible continuation anymore,
+					// break off the search.
+					break;
+				}
+			}
+			// check if we did indeed find a special token.
+			if (match != TOKEN_NONE) {
+				input.resetPeek();
+				if (bufEmpty) {
+					// if we did not have text before, construct that token.
+					peeked.push_back(Token{match, tBuf.str(), startColumn,
+					                       startLine, input.getColumn(),
+					                       input.getLine()});
+					return true;
+				} else {
+					// otherwise we return the text before the token.
+					peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
+					                       startColumn, startLine,
+					                       input.getColumn(), input.getLine()});
+					return true;
+				}
+			}
+		}
+		buffer << c;
+		bufEmpty = false;
+		input.consumePeek();
+	}
+	if (!bufEmpty) {
+		peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+		                       input.getColumn(), input.getLine()});
+		return true;
+	}
+	return false;
+}
+
+bool Tokenizer::next(Token &t) {
+	if (peeked.empty()) {
+		if (!prepare()) {
+			return false;
+		}
+	}
+	t = peeked.front();
+	peeked.pop_front();
+	resetPeek();
+	return true;
+}
+
+bool Tokenizer::peek(Token &t)
+{
+	if (peekCursor >= peeked.size()) {
+		if (!prepare()) {
+			return false;
+		}
+	}
+	t = peeked[peekCursor];
+	return true;
+}
+
+void Tokenizer::resetPeek() { peekCursor = 0; }
+
+void Tokenizer::consumePeek()
+{
+	while (peekCursor > 0) {
+		peeked.pop_front();
+		peekCursor--;
+	}
 }
 
 }
 }
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 24c4f30..924b670 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -21,7 +21,9 @@
 
 #include <istream>
 #include <map>
-#include <queue>
+#include <deque>
+
+#include "BufferedCharReader.hpp"
 
 namespace ousia {
 namespace utils {
@@ -44,33 +46,47 @@ public:
 };
 
 struct Token {
-	const int tokenId;
-	const std::string content;
-	const int column;
-	const int line;
-
-	Token(int tokenId, std::string content, int column, int line)
-	    : tokenId(tokenId), content(content), column(column), line(line)
+	int tokenId;
+	std::string content;
+	int startColumn;
+	int startLine;
+	int endColumn;
+	int endLine;
+
+	Token(int tokenId, std::string content, int startColumn, int startLine,
+	      int endColumn, int endLine)
+	    : tokenId(tokenId),
+	      content(content),
+	      startColumn(startColumn),
+	      startLine(startLine),
+	      endColumn(endColumn),
+	      endLine(endLine)
 	{
 	}
 };
 
+static const int TOKEN_NONE = -1;
+static const int TOKEN_TEXT = -2;
+
 class Tokenizer {
 private:
-	const std::istream &input;
-	const TokenTreeNode root;
-	const std::queue<Token> peekQueue;
+	BufferedCharReader &input;
+	const TokenTreeNode &root;
+	std::deque<Token> peeked;
+	unsigned int peekCursor = 0;
+
+	bool prepare();
 
 public:
-	Tokenizer(const TokenTreeNode &root, std::istream &input);
+	Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
 
-	bool hasNext();
+	bool next(Token &t);
 
-	const Token &next();
+	bool peek(Token &t);
 
-	const Token &peek();
+	void resetPeek();
 
-	void reset();
+	void consumePeek();
 };
 
 }
 }
diff --git a/test/core/utils/BufferedCharReader.cpp b/test/core/utils/BufferedCharReaderTest.cpp
index f8f668c..69c0974 100644
--- a/test/core/utils/BufferedCharReader.cpp
+++ b/test/core/utils/BufferedCharReaderTest.cpp
@@ -21,7 +21,10 @@
 
 #include "gtest/gtest.h"
 
-#include "BufferedCharReader.hpp"
+#include <core/utils/BufferedCharReader.hpp>
+
+namespace ousia{
+namespace utils{
 
 TEST(BufferedCharReaderTest, SimpleReadTest)
 {
@@ -29,7 +32,7 @@ TEST(BufferedCharReaderTest, SimpleReadTest)
 	char c;
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 	reader.feed(testStr);
 	reader.close();
 
@@ -58,7 +61,7 @@ TEST(BufferedCharReaderTest, SimplePeekTest)
 	char c;
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 	reader.feed(testStr);
 	reader.close();
 
@@ -97,7 +100,7 @@ TEST(BufferedCharReaderTest, SplittedPeakTest)
 	char c;
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 
 	// Try to peek the test string, feed char after char into the reader
 	std::string res;
@@ -132,7 +135,7 @@ TEST(BufferedCharReaderTest, RowColumnCounterTest)
 	char c;
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 	reader.feed(testStr);
 	reader.close();
 
@@ -162,7 +165,7 @@ TEST(BufferedCharReaderTest, LinebreakSubstitutionTest)
 	const std::string expStr("this\nis\njust\na test\n\ntest\n");
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 	reader.feed(testStr);
 
 	// Read all characters from the test string
@@ -183,7 +186,7 @@ TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test)
 	char c;
 
 	// Feed a test string into the reader
-	scaenea::compiler::BufferedCharReader reader;
+	BufferedCharReader reader;
 	reader.feed(testStr);
 	reader.close();
 
@@ -196,3 +199,5 @@ TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test)
 	ASSERT_EQ(6, reader.getColumn());
 }
 
+}
+}