implemented tokenizer test and started implementing CodeTokenizer under supervision of Maester Stoeckel.

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-14 17:41:03 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-14 17:41:03 +0100
commit: 165cf9a5c6ab03dab64d5eb5a5577f8c216bb832 (patch)
tree: 8ab8fd9baa491d92d9f21f2e9257450adab4f75f
parent: e3cf0a9d726c9d76f4938590691336fbf2e9f6d5 (diff)
9 files changed, 342 insertions, 20 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c04de7..c375e79 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,7 @@ ADD_LIBRARY(ousia_core
 	src/core/script/ScriptEngine
 	src/core/script/Variant
 	src/core/utils/BufferedCharReader
+	src/core/utils/CodeTokenizer
 	src/core/utils/CSSParser
 	src/core/utils/Tokenizer
 	src/core/utils/Utils
@@ -141,6 +142,7 @@ IF(TEST)
 		test/core/script/ObjectTest
 		test/core/script/VariantTest
 		test/core/utils/BufferedCharReaderTest
+		test/core/utils/CodeTokenizerTest
 		test/core/utils/CSSParserTest
 		test/core/utils/RangeSetTest
 		test/core/utils/TokenizerTest
diff --git a/build/.empty b/build/.empty
deleted file mode 100644
index e69de29..0000000
--- a/build/.empty
+++ /dev/null
diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp
index 1639152..e66eb34 100644
--- a/src/core/utils/CSSParser.cpp
+++ b/src/core/utils/CSSParser.cpp
@@ -44,11 +44,11 @@ static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
                                      {"/*", COMMENT_OPEN},
                                      {"*/", COMMENT_CLOSE}}};
 
-//StyleNode CSSParser::parse(BufferedCharReader &input) {
-//	Tokenizer tokenizer {input, CSS_ROOT};
-//	//TODO: implement
-//	
-//}
+StyleNode CSSParser::parse(BufferedCharReader &input) {
+	Tokenizer tokenizer {input, CSS_ROOT};
+	// TODO: implement. NOTE(review): nothing is returned yet, so calling this
+	// non-void function is undefined behaviour until a return is added.
+}
 
 
 
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
new file mode 100644
index 0000000..c1376af
--- /dev/null
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -0,0 +1,144 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cassert>
+
+#include "CodeTokenizer.hpp"
+
+namespace ousia {
+namespace utils {
+
+// Builds the finished token from the buffered text and empties the buffer.
+Token CodeTokenizer::constructToken(const Token& t)
+{
+	const std::string text = buf.str();
+	buf.str(std::string());  // start the next string/comment from scratch
+	return Token{returnTokenId, text, startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
+}
+
+void CodeTokenizer::buffer(const Token &t) { buf << t.content; }  // append raw text
+
+// Turns base tokens into string, comment and word tokens. Returns true if at
+// least one token was appended to peeked; throws TokenizerException for a
+// multiline token that is no linebreak and (after the assert) a bad state.
+bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
+{
+	auto it = descriptors.find(t.tokenId);
+	const CodeTokenMode mode =
+	    it != descriptors.end() ? it->second.mode : CodeTokenMode::NONE;
+	// A LINEBREAK token presumably ends on the next line, so it is exempt.
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token. Most likely you did not add "
+		    "a linebreak token to your tokenizer!");
+	}
+
+	switch (state) {
+		case CodeTokenizerState::NORMAL:
+			switch (mode) {
+				case CodeTokenMode::STRING_START_END:
+					state = CodeTokenizerState::IN_STRING;
+					break;
+				case CodeTokenMode::BLOCK_COMMENT_START:
+					state = CodeTokenizerState::IN_BLOCK_COMMENT;
+					break;
+				case CodeTokenMode::LINE_COMMENT:
+					state = CodeTokenizerState::IN_LINE_COMMENT;
+					break;
+				default: {
+					if (t.tokenId != TOKEN_TEXT) {
+						peeked.push_back(t);
+						return true;
+					}
+					// Split text into whitespace-separated words. The old loop
+					// never reset its word start, lost the last word and then
+					// appended the unsplit token as well.
+					const std::string &s = t.content;
+					const size_t before = peeked.size();
+					size_t begin = s.find_first_not_of(" \t");
+					while (begin != std::string::npos) {
+						const size_t end = s.find_first_of(" \t", begin);
+						const bool last = end == std::string::npos;
+						// The last word ends where the original token ends.
+						const int endColumn =
+						    last ? t.endColumn
+						         : t.startColumn + static_cast<int>(end);
+						peeked.push_back(Token{
+						    TOKEN_TEXT, s.substr(begin, end - begin),
+						    t.startColumn + static_cast<int>(begin),
+						    t.startLine, endColumn, t.endLine});
+						begin = last ? end : s.find_first_not_of(" \t", end);
+					}
+					// Whitespace-only text yields no token at all.
+					return peeked.size() != before;
+				}
+			}
+			startToken = t;
+			returnTokenId = it->second.id;
+			return false;
+		case CodeTokenizerState::IN_LINE_COMMENT:
+		case CodeTokenizerState::IN_BLOCK_COMMENT: {
+			// A line comment is closed by LINEBREAK, a block comment by
+			// BLOCK_COMMENT_END; every other token is comment content.
+			const CodeTokenMode closing =
+			    state == CodeTokenizerState::IN_LINE_COMMENT
+			        ? CodeTokenMode::LINEBREAK
+			        : CodeTokenMode::BLOCK_COMMENT_END;
+			if (mode == closing) {
+				state = CodeTokenizerState::NORMAL;
+				if (!ignoreComments) {
+					peeked.push_back(constructToken(t));
+				}
+				return !ignoreComments;
+			}
+			if (!ignoreComments) {
+				buffer(t);
+			}
+			return false;
+		}
+		case CodeTokenizerState::IN_STRING:
+			switch (mode) {
+				case CodeTokenMode::ESCAPE:
+					if (escaped) {
+						buffer(t);
+					}
+					escaped = !escaped;
+					return false;
+				case CodeTokenMode::STRING_START_END:
+					if (escaped) {
+						buffer(t);
+						escaped = false;
+						return false;
+					}
+					peeked.push_back(constructToken(t));
+					state = CodeTokenizerState::NORMAL;
+					return true;
+				default:
+					// TODO: handle escaped characters?
+					escaped = false;
+					buffer(t);
+					return false;
+			}
+	}
+	// No valid state: NDEBUG removes the assert, so throw, never fall off.
+	assert(false);
+	throw TokenizerException("CodeTokenizer is in an invalid state.");
+}
+}
+}
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
new file mode 100644
index 0000000..f26a74c
--- /dev/null
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -0,0 +1,83 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_
+#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_
+
+#include <map>
+#include <sstream>
+
+#include "BufferedCharReader.hpp"
+#include "Tokenizer.hpp"
+
+namespace ousia {
+namespace utils {
+
+enum class CodeTokenMode {
+	STRING_START_END,     // opens a string, and closes it when inside one
+	LINE_COMMENT,         // starts a comment that runs until a LINEBREAK
+	BLOCK_COMMENT_START,  // starts a block comment
+	BLOCK_COMMENT_END,    // ends a block comment
+	LINEBREAK,            // ends a line comment
+	ESCAPE,               // escapes the next token inside a string
+	NONE                  // no special role; also used for unknown token ids
+};
+
+struct CodeTokenDescriptor {
+	CodeTokenMode mode;  // role the base token plays for the CodeTokenizer
+	int id;  // id given to the string/comment token this base token opens
+
+	CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
+};
+
+
+enum class CodeTokenizerState {
+	NORMAL,            // outside of strings and comments
+	IN_BLOCK_COMMENT,  // after BLOCK_COMMENT_START, until BLOCK_COMMENT_END
+	IN_LINE_COMMENT,   // after LINE_COMMENT, until the next LINEBREAK
+	IN_STRING          // after STRING_START_END, until the next unescaped one
+};
+
+// Tokenizer that merges base tokens into string and comment tokens.
+class CodeTokenizer : public Tokenizer {
+private:
+	std::map<int, CodeTokenDescriptor> descriptors;  // base token id -> role
+	// Must start as NORMAL: doPrepare() switches on it for the first token.
+	CodeTokenizerState state = CodeTokenizerState::NORMAL;
+	std::stringstream buf;  // text collected for the token being built
+	Token startToken;       // token that opened the current string/comment
+	int returnTokenId = TOKEN_NONE;  // id for the token being built
+	bool escaped = false;   // set by an unescaped ESCAPE token in a string
+
+	Token constructToken(const Token& t);
+	void buffer(const Token& t);
+
+protected:
+	bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
+
+public:
+	bool ignoreComments = false;  // if true, comments produce no token
+
+	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
+	              std::map<int, CodeTokenDescriptor> descriptors)
+	    : Tokenizer(input, root), descriptors(descriptors) {}
+};
+}
+}
+
+#endif
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 2c36438..164a30f 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -96,6 +96,7 @@ bool Tokenizer::prepare()
 				tBuf << c;
 				n = &(n->children.at(c));
 				if (n->tokenId != TOKEN_NONE) {
+					match = n->tokenId;
 					// from here on we found a token. If we have something
 					// in our buffer already, we end the search now.
 					if (!bufEmpty) {
@@ -111,7 +112,7 @@ bool Tokenizer::prepare()
 					// if we are at the end we break off the search.
 					break;
 				}
-				if (n->children.find(c) == root.children.end()) {
+				if (n->children.find(c) == n->children.end()) {
 					// if we do not find a possible continuation anymore,
 					// break off the search.
 					break;
@@ -122,16 +123,21 @@ bool Tokenizer::prepare()
 				input.resetPeek();
 				if (bufEmpty) {
 					// if we did not have text before, construct that token.
-					peeked.push_back(Token{match, tBuf.str(), startColumn,
-					                       startLine, input.getColumn(),
-					                       input.getLine()});
-					return true;
+					if (doPrepare(
+					        Token{match, tBuf.str(), startColumn, startLine,
+					              input.getColumn(), input.getLine()},
+					        peeked)) {
+						return true;
+					}
+
 				} else {
 					// otherwise we return the text before the token.
-					peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
-					                       startColumn, startLine,
-					                       input.getColumn(), input.getLine()});
-					return true;
+					if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn,
+					                    startLine, input.getColumn(),
+					                    input.getLine()},
+					              peeked)) {
+						return true;
+					}
 				}
 			}
 		}
@@ -140,13 +146,19 @@ bool Tokenizer::prepare()
 		input.consumePeek();
 	}
 	if (!bufEmpty) {
-		peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
-		                       input.getColumn(), input.getLine()});
-		return true;
+		return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+		                       input.getColumn(), input.getLine()},
+		                 peeked);
 	}
 	return false;
 }
 
+bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
+{
+	peeked.push_back(t);  // default hook: pass the base token through unchanged
+	return true;          // exactly one token was appended
+}
+
 bool Tokenizer::next(Token &t)
 {
 	if (peeked.empty()) {
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 924b670..eb8eed4 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -45,6 +45,9 @@ public:
 	TokenTreeNode(const std::map<std::string, int> &inputs);
 };
 
+static const int TOKEN_NONE = -1;  // id meaning "no token"; default Token id
+static const int TOKEN_TEXT = -2;  // id of plain text found between tokens
+
 struct Token {
 	int tokenId;
 	std::string content;
@@ -63,10 +66,9 @@ struct Token {
 	      endLine(endLine)
 	{
 	}
-};
 
-static const int TOKEN_NONE = -1;
-static const int TOKEN_TEXT = -2;
+	Token() : tokenId(TOKEN_NONE) {}
+};
 
 class Tokenizer {
 private:
@@ -77,6 +79,21 @@ private:
 
 	bool prepare();
 
+protected:
+	/**
+	* This method is an interface to build multiple tokens from a single one in
+	* derived classes. This might be interesting if you want to implement
+	* further logic on text tokens or similar applications.
+	*
+	* @param t a Token the "basic" tokenizer found.
+	* @param peeked a reference to the deque containing all temporary Tokens.
+	* You are supposed to append your tokens there. In the trivial case you just
+	* put the given Token on top of the deque.
+	* @return false if no token was appended to the deque (meaning that you want
+	* to ignore the given token explicitly) and true in all other cases.
+	*/
+	virtual bool doPrepare(const Token &t, std::deque<Token> &peeked);
+
 public:
 	Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
 
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
new file mode 100644
index 0000000..d0f9a17
--- /dev/null
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -0,0 +1,30 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <core/utils/CodeTokenizer.hpp>
+
+namespace ousia {
+namespace utils {
+TEST(CodeTokenizer, testTokenizer)
+{
+	// TODO: placeholder without assertions; CodeTokenizer is not exercised yet.
+}
+}
+}
diff --git a/test/core/utils/TokenizerTest.cpp b/test/core/utils/TokenizerTest.cpp
index f441fd8..ba06c33 100644
--- a/test/core/utils/TokenizerTest.cpp
+++ b/test/core/utils/TokenizerTest.cpp
@@ -18,6 +18,8 @@
 
 #include <gtest/gtest.h>
 
+#include <core/utils/BufferedCharReader.hpp>
+
 #include <core/utils/Tokenizer.hpp>
 
 namespace ousia {
@@ -59,5 +61,37 @@ TEST(TokenTreeNode, testConstructor)
 	ASSERT_EQ(4, abd.tokenId);
 	ASSERT_EQ(0, abd.children.size());
 }
+
+TEST(Tokenizer, testTokenization)
+{
+	// "/" is a token on its own and also the prefix of the token "/*".
+	TokenTreeNode tree{{{"/", 1}, {"/*", 2}, {"*/", 3}}};
+	BufferedCharReader reader;
+	reader.feed("Test/Test /* Block Comment */");
+	//           12345678901234567890123456789
+	//           0        1         2
+
+	// Fields: id, content, startColumn, startLine, endColumn, endLine.
+	// Columns are 1-based; the end column is one past the last character.
+	std::vector<Token> expected = {
+	    {TOKEN_TEXT, "Test", 1, 1, 5, 1},
+	    {1, "/", 5, 1, 6, 1},
+	    {TOKEN_TEXT, "Test ", 6, 1, 11, 1},
+	    {2, "/*", 11, 1, 13, 1},
+	    {TOKEN_TEXT, " Block Comment ", 13, 1, 28, 1},
+	    {3, "*/", 28, 1, 30, 1}};
+	Tokenizer tokenizer{reader, tree};
+	Token actual;
+	for (const auto &want : expected) {
+		ASSERT_TRUE(tokenizer.next(actual));
+		ASSERT_EQ(want.tokenId, actual.tokenId);
+		ASSERT_EQ(want.content, actual.content);
+		ASSERT_EQ(want.startColumn, actual.startColumn);
+		ASSERT_EQ(want.startLine, actual.startLine);
+		ASSERT_EQ(want.endColumn, actual.endColumn);
+		ASSERT_EQ(want.endLine, actual.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(actual));  // input exhausted
+}
 }
 }
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-14 17:41:03 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-14 17:41:03 +0100
commit	165cf9a5c6ab03dab64d5eb5a5577f8c216bb832 (patch)
tree	8ab8fd9baa491d92d9f21f2e9257450adab4f75f
parent	e3cf0a9d726c9d76f4938590691336fbf2e9f6d5 (diff)