From 7b44735f31b8b5d236d66ea1a681abb99ac83bf9 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 15:43:35 +0100
Subject: added documentation for CodeTokenizer.

---
 src/core/utils/CodeTokenizer.hpp | 53 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

(limited to 'src/core/utils/CodeTokenizer.hpp')
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index f26a74c..18cf02a 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -28,6 +28,17 @@
 namespace ousia {
 namespace utils {
 
+/*
+ * This enum contains all special Tokens the CodeTokenizer supports, namely:
+ *
+ * 1.) An ambiguous Token - in most programming languages single-quotes ' or
+ * double-quotes " - to delimit string tokens.
+ * 2.) A start token for line comments, which would e.g. be // in Java.
+ * 3.) A start token for a block comment
+ * 4.) An end token for a block comment.
+ * 5.) The linebreak token (this does not have to be specified by the user)
+ * 6.) The escape token, which would e.g. be \ in java.
+ */
 enum class CodeTokenMode {
 	STRING_START_END,
 	LINE_COMMENT,
@@ -38,6 +49,10 @@ enum class CodeTokenMode {
 	NONE
 };
 
+/**
+ * A CodeTokenDescriptor draws the connection between an id returned by the
+ * underlying Tokenizer and the mode this token represents.
+ */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
 	int id;
@@ -45,7 +60,10 @@ struct CodeTokenDescriptor {
 	CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
 };
 
-
+/**
+ * The CodeTokenizer is a finite state machine with the states NORMAL,
+ * IN_BLOCK_COMMENT, IN_LINE_COMMENT and IN_STRING.
+ */
 enum class CodeTokenizerState {
 	NORMAL,
 	IN_BLOCK_COMMENT,
@@ -53,6 +71,14 @@ enum class CodeTokenizerState {
 	IN_STRING
 };
 
+/**
+ * The purpose of a CodeTokenizer is to make it easier to parse classical
+ * programming Code. It adds the following features to a regular Tokenizer:
+ * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens
+ * for the opening delimiter, the text and the closing delimiter.
+ * 2.) Escaping in String tokens.
+ * 3.) Comment Tokens (for line comments as well as block comments)
+ */
 class CodeTokenizer : public Tokenizer {
 private:
 	std::map<int, CodeTokenDescriptor> descriptors;
@@ -62,17 +88,36 @@ private:
 	int returnTokenId;
 	bool escaped = false;
 
-	Token constructToken(const Token& t);
-	void buffer(const Token& t);
+	Token constructToken(const Token &t);
+	void buffer(const Token &t);
 
 protected:
 	bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
 
 public:
+	/**
+	 * If you do not want comment tokens to be returned you can set this to
+	 * true.
+	 */
 	bool ignoreComments = false;
 
+	/**
+	 *
+	 * @param input a BufferedCharReader containing the input for this
+	 * tokenizer,
+	 * as with a regular tokenizer.
+	 * @param root a TokenTreeNode representing the root of the TokenTree.
+	 * Please note that you have to specify all tokenIDs here that you use
+	 * in the descriptors map.
+	 * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
+	 * In this way you can specify the meaning of certain Tokens. Say you
+	 * specified the Token "//" with the id 1 in the TokenTree. Then you could
+	 * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
+	 * and this CodeTokenizer would recognize the token "//" as starting a
+	 * line comment.
+	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
-	          std::map<int, CodeTokenDescriptor> descriptors)
+	              std::map<int, CodeTokenDescriptor> descriptors)
 	    : Tokenizer(input, root), descriptors(descriptors)
 	{
 	}
-- 
cgit v1.2.3


From 45c6acf226f7867f847a9085d84960d337b1a2ae Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 19:44:29 +0100
Subject: 1.) fixed the word tokenization mechanism in the CodeTokenizer which
 returned rubbish previously. 2.) Allowed multiline tokens if the mode is
 LINEBREAK (obviously). 3.) returned LINEBREAK tokens in normal mode. 4.)
 added a CodeTokenizer test.

---
 src/core/utils/CodeTokenizer.cpp      | 62 +++++++++++++++++++----------
 src/core/utils/CodeTokenizer.hpp      | 11 ++++--
 test/core/utils/CodeTokenizerTest.cpp | 74 ++++++++++++++++++++++++++++++++++-
 3 files changed, 122 insertions(+), 25 deletions(-)

(limited to 'src/core/utils/CodeTokenizer.hpp')

diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
 namespace ousia {
 namespace utils {
 
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
 {
 	std::string content = buf.str();
 	buf.str(std::string());
-	return Token{returnTokenId,          content,
-	             startToken.startColumn, startToken.startLine,
-	             t.endColumn,     t.endLine};
+	return Token{returnTokenId,        content,     startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
 }
 
 void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
 
 bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 {
-	if (t.startLine != t.endLine) {
-		throw TokenizerException(
-		    "We did not expect a multiline token. Most likely you did not add "
-		    "a linebreak token to your tokenizer!");
-	}
-
 	auto it = descriptors.find(t.tokenId);
 	CodeTokenMode mode = CodeTokenMode::NONE;
 	if (it != descriptors.end()) {
 		mode = it->second.mode;
 	}
+
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token (except linebreaks). Most "
+		    "likely you did not add a linebreak token to your tokenizer!");
+	}
+
 	switch (state) {
 		case CodeTokenizerState::NORMAL:
 			switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 				case CodeTokenMode::LINE_COMMENT:
 					state = CodeTokenizerState::IN_LINE_COMMENT;
 					break;
+				case CodeTokenMode::LINEBREAK:
+					peeked.push_back({it->second.id, t.content, t.startColumn,
+					                  t.startLine, t.endColumn, t.endLine});
+					return true;
 				default:
 					if (t.tokenId == TOKEN_TEXT) {
 						int begin = -1;
 						for (size_t c = 0; c < t.content.length(); c++) {
 							bool isWhitespace =
 							    t.content[c] == ' ' || t.content[c] == '\t';
-							if (begin >= 0 && isWhitespace) {
-								peeked.push_back(Token{
-								    TOKEN_TEXT,
-								    t.content.substr(begin, (int)c - begin),
-								    t.startColumn + begin, t.startLine,
-								    t.startColumn + (int)c, t.endLine});
-							}
-							if (!isWhitespace && begin < 0) {
-								begin = c;
+							if (begin < 0) {
+								// if we have not yet set our beginning,
+								// we wait for the first
+								// non-whitespace-character to set it.
+								if (!isWhitespace) {
+									begin = c;
+								}
+							} else {
+								// if we have set our beginning, we wait for the
+								// first whitespace character, which marks the
+								// end of the current word.
+								if (isWhitespace) {
+									peeked.push_back(Token{
+									    TOKEN_TEXT,
+									    t.content.substr(begin, (int)c - begin),
+									    t.startColumn + begin, t.startLine,
+									    t.startColumn + (int)c, t.endLine});
+									begin = -1;
+								}
 							}
 						}
+						if(begin >= 0){
+							peeked.push_back(Token{
+									TOKEN_TEXT,
+									t.content.substr(begin),
+									t.startColumn + begin, t.startLine,
+									t.endColumn, t.endLine});
+						}
+					} else {
+						peeked.push_back(t);
 					}
-					peeked.push_back(t);
 					return true;
 			}
 			startToken = t;
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index 18cf02a..fda4493 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -36,7 +36,7 @@ namespace utils {
  * 2.) A start token for line comments, which would e.g. be // in Java.
  * 3.) A start token for a block comment
  * 4.) An end token for a block comment.
- * 5.) The linebreak token (this does not have to be specified by the user)
+ * 5.) A linebreak token
  * 6.) The escape token, which would e.g. be \ in java.
  */
 enum class CodeTokenMode {
@@ -50,8 +50,11 @@ enum class CodeTokenMode {
 };
 
 /**
- * A CodeTokenDescriptor draws the connection between an id returned by the
- * underlying Tokenizer and the mode this token represents.
+ * A CodeTokenDescriptor defines the id the user likes to have returned for
+ * a Token of the mode specified, e.g. if you want to get the id 4 for a
+ * String Token the corresponding CodeTokenDescriptor would be inizialized
+ * with
+ * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
@@ -118,7 +121,7 @@ public:
 	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
 	              std::map<int, CodeTokenDescriptor> descriptors)
-	    : Tokenizer(input, root), descriptors(descriptors)
+	    : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
 	{
 	}
 };
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
 
 namespace ousia {
 namespace utils {
+
+static const int BLOCK_COMMENT = 30;
+static const int LINE_COMMENT = 31;
+static const int STRING = 20;
+static const int ESCAPE = 21;
+static const int LINEBREAK = 21;
+static const int CURLY_OPEN = 40;
+static const int CURLY_CLOSE = 41;
+
 TEST(CodeTokenizer, testTokenizer)
 {
-	
+	BufferedCharReader reader;
+	reader.feed("/**\n");                                 // 1
+	reader.feed(" * Some Block Comment\n");               // 2
+	reader.feed(" */\n");                                 // 3
+	reader.feed("var my_string = 'My \\'String\\'';\n");  // 4
+	reader.feed("// and a line comment\n");               // 5
+	reader.feed("var my_obj = { a = 4;}");                // 6
+	//           123456789012345678901234567890123456789012345678901234567890123456789
+	//           0        1         2         3         4         5         6
+	TokenTreeNode root{{{"/*", 1},
+	                    {"*/", 2},
+	                    {"//", 3},
+	                    {"'", 4},
+	                    {"\\", 5},
+	                    {"{", CURLY_OPEN},
+	                    {"}", CURLY_CLOSE},
+	                    {"\n", 6}}};
+	std::map<int, CodeTokenDescriptor> descriptors{
+	    // the block comment start Token has the id 1 and if the Tokenizer
+	    // returns a Block Comment Token that should have the id 30.
+	    {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+	    {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+	    {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+	    {4, {CodeTokenMode::STRING_START_END, STRING}},
+	    {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+	    {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+	std::vector<Token> expected = {
+	    {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+	    {LINEBREAK, "\n", 4, 3, 1, 4},
+	    {TOKEN_TEXT, "var", 1, 4, 4, 4},
+	    {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+	    {TOKEN_TEXT, "=", 15, 4, 16, 4},
+	    {STRING, "My 'String'", 17, 4, 32, 4},
+	    {TOKEN_TEXT, ";", 32, 4, 33, 4},
+	    {LINEBREAK, "\n", 33, 4, 1, 5},
+		//this is slightly counter-intuitive but makes sense if you think about
+		//it: As a line comment is ended by a line break the line break is
+		//technically still a part of the line comment and thus the ending
+		//is in the next line.
+	    {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+	    {TOKEN_TEXT, "var", 1, 6, 4, 6},
+	    {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+	    {TOKEN_TEXT, "=", 12, 6, 13, 6},
+	    {CURLY_OPEN, "{", 14, 6, 15, 6},
+	    {TOKEN_TEXT, "a", 16, 6, 17, 6},
+	    {TOKEN_TEXT, "=", 18, 6, 19, 6},
+	    {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+	    {CURLY_CLOSE, "}", 22, 6, 23, 6},
+	};
+
+	CodeTokenizer tokenizer{reader, root, descriptors};
+
+	Token t;
+	for (auto &te : expected) {
+		ASSERT_TRUE(tokenizer.next(t));
+		ASSERT_EQ(te.tokenId, t.tokenId);
+		ASSERT_EQ(te.content, t.content);
+		ASSERT_EQ(te.startColumn, t.startColumn);
+		ASSERT_EQ(te.startLine, t.startLine);
+		ASSERT_EQ(te.endColumn, t.endColumn);
+		ASSERT_EQ(te.endLine, t.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(t));
 }
 }
 }
-- 
cgit v1.2.3


From 50e7f4544e806a4196cced68365dd005afa1a982 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 19:44:45 +0100
Subject: one slight formatting change.

---
 src/core/utils/CodeTokenizer.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'src/core/utils/CodeTokenizer.hpp')

diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index fda4493..0fc0862 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -53,8 +53,7 @@ enum class CodeTokenMode {
  * A CodeTokenDescriptor defines the id the user likes to have returned for
  * a Token of the mode specified, e.g. if you want to get the id 4 for a
  * String Token the corresponding CodeTokenDescriptor would be inizialized
- * with
- * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
+ * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
-- 
cgit v1.2.3