From 2a270c5e0ec49442fa65f699fbfb30c6bdad69ae Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 11:50:12 +0100
Subject: added documentation to the Tokenizer header.

---
 src/core/utils/Tokenizer.hpp | 123 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index eb8eed4..3b1405a 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -28,6 +28,11 @@
 namespace ousia {
 namespace utils {
 
+	/**
+	 * This exception is currently only thrown if errors are made during the
+	 * initialization of the Tokenizer. Have a closer look at the documentation
+	 * of the TokenTreeNode constructor for more information.
+	 */
 class TokenizerException : public std::exception {
 public:
 	const std::string msg;
@@ -37,17 +42,83 @@ public:
 	virtual const char *what() const noexcept override { return msg.c_str(); }
 };
 
+
+/**
+ * The Tokenizer internally uses a TokenTree to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
+ *
+ * The TokenTree is a construct that structures all special tokens this
+ * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then
+ * the TokenTree would look like this:
+ *
+ * a
+ * | \
+ * a $
+ * | \
+ * b c
+ * | |
+ * $ $
+ * 
+ * Every node in the TokenTree is a valid end state that has a $ attached to it.
+ * During the search algorithm the Tokenizer goes through the tree and stores
+ * the last valid position. If a character follows that does not lead to a new
+ * node in the TokenTree the search ends (and starts again at this character).
+ * The token corresponding to the last valid position is returned.
+ *
+ * This allows us to uniquely identify the matching token given a certain
+ * input text. Note that this is a greedy matching approach that does not
+ * work if you're using truly ambiguous tokens (that have the same text).
+ *
+ * It is also not allowed that tokens have common middle parts but varying
+ * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and
+ * the input string "abc". In that case we start looking for "abd" at the
+ * start, won't find it when we hit "c", and start the scanning process
+ * anew. Thus the "bc" token is not found.
+ *
+ * For most (well-behaved) tokenization schemes this is not the case,
+ * though.
+ */
 class TokenTreeNode {
 public:
 	const std::map<char, TokenTreeNode> children;
 	const int tokenId;
 
+	/**
+	 * The TokenTreeNode constructor builds a TokenTree from the given token
+	 * specifications. The node returned by this constructor then is the root of
+	 * said TokenTree.
+	 * @param inputs Specifications of tokens in map form. Each specification
+	 * is a tuple of the text that should be matched and some unique ID (>= 0)
+	 * that is returned to you if that Token is found in the text.
+	 * An example for such a map would be
+	 * {
+	 *	{ "#" , 1},
+	 *  { "##", 2},
+	 *  { "/" , 3}
+	 * }
+	 * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE
+	 * (-1) and TOKEN_TEXT (-2).
+	 */
 	TokenTreeNode(const std::map<std::string, int> &inputs);
 };
 
+/**
+ * This is a reserved constant for the empty token.
+ */
 static const int TOKEN_NONE = -1;
+/**
+ * This is a reserved constant for every part of the input text that is not a
+ * specified token.
+ */
 static const int TOKEN_TEXT = -2;
 
+/**
+ * A token for us is identified by an integer tokenID (either one of the
+ * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants).
+ * Additionally we return the matched text (which should only be really interesting
+ * in case of TOKEN_TEXT tokens) and the position in the input text.
+ */
 struct Token {
 	int tokenId;
 	std::string content;
@@ -70,6 +141,25 @@ struct Token {
 	Token() : tokenId(TOKEN_NONE) {}
 };
 
+/**
+ * A Tokenizer has the purpose of subdividing an input text into tokens. In our
+ * definition here we distinguish between two kinds of tokens:
+ * 1.) User-specified tokens that match a fixed text.
+ * 2.) Any other text between those tokens.
+ * The user might want to specify the tokens '#{' and '#}' for example, because 
+ * they have some meaning in her code. The user sets the IDs to 1 and 2.
+ * Given the input text
+ * "some text #{ special command #} some text"
+ * the tokenizer would return the tokens:
+ * 1.) "some text " with the id TOKEN_TEXT (-2).
+ * 2.) "#{" with the id 1.
+ * 3.) " special command " with the id TOKEN_TEXT (-2).
+ * 4.) "#}" with the id 2.
+ * 5.) " some text" with the id TOKEN_TEXT (-2).
+ * This makes the subsequent parsing of files of a specific type easier.
+ * Note that in case of tokens that are prefixes of other tokens the
+ * longest possible match is returned.
+ */
 class Tokenizer {
 private:
 	BufferedCharReader &input;
@@ -95,14 +185,45 @@ protected:
 	virtual bool doPrepare(const Token &t, std::deque<Token> &peeked);
 
 public:
+	/**
+	 * @param input The input of a Tokenizer is given in the form of a
+	 * BufferedCharReader. Please refer to the respective documentation.
+	 * @param root This is meant to be the root of a TokenTree giving the
+	 * specification of user-defined tokens this Tokenizer should recognize.
+	 * The Tokenizer promises to not change the TokenTree such that you can
+	 * re-use the same specification for multiple inputs.
+	 * Please refer to the TokenTreeNode documentation for more information.
+	 */
 	Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
 
+	/**
+	 * The next method consumes one Token from the input stream and gives
+	 * it to the user (stored in the input argument).
+	 * 
+	 * @param t a Token reference that is set to the next found token.
+	 * @return true if a next token was found and false if the input is at its
+	 * end.
+	 */
 	bool next(Token &t);
-
+	/**
+	 * The peek method does not consume the next Token but buffers it and
+	 * shows it to the user (stored in the input argument).
+	 * 
+	 * @param t a Token reference that is set to the next found token.
+	 * @return true if a next token was found and false if the input is at its
+	 * end.
+	 */
 	bool peek(Token &t);
 
+	/**
+	 * Resets the peek pointer to the current position in the stream (to the
+	 * beginning of the buffer).
+	 */
 	void resetPeek();
 
+	/**
+	 * Clears the peek buffer, such that all peeked Tokens are consumed.
+	 */
 	void consumePeek();
 };
 }
-- 
cgit v1.2.3


From f333dd3f7c88ba93e9ac726fd1cfee6b817edd45 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 11:53:54 +0100
Subject: autoformat on Tokenizer docu.

---
 src/core/utils/Tokenizer.hpp | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 3b1405a..2debc75 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -28,11 +28,11 @@
 namespace ousia {
 namespace utils {
 
-	/**
-	 * This exception is currently only thrown if errors are made during the
-	 * initialization of the Tokenizer. Have a closer look at the documentation
-	 * of the TokenTreeNode constructor for more information.
-	 */
+/**
+ * This exception is currently only thrown if errors are made during the
+ * initialization of the Tokenizer. Have a closer look at the documentation
+ * of the TokenTreeNode constructor for more information.
+ */
 class TokenizerException : public std::exception {
 public:
 	const std::string msg;
@@ -42,7 +42,6 @@ public:
 	virtual const char *what() const noexcept override { return msg.c_str(); }
 };
 
-
 /**
  * The Tokenizer internally uses a TokenTree to be efficiently able to identify
  * the longest consecutive token in the text. This is equivalent to a prefix
@@ -59,7 +58,7 @@ public:
  * b c
  * | |
  * $ $
- * 
+ *
  * Every node in the TokenTree is a valid end state that has a $ attached to it.
  * During the search algorithm the Tokenizer goes through the tree and stores
  * the last valid position. If a character follows that does not lead to a new
@@ -116,8 +115,8 @@ static const int TOKEN_TEXT = -2;
 /**
  * A token for us is identified by an integer tokenID (either one of the
  * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants).
- * Additionally we return the matched text (which should only be really interesting
- * in case of TOKEN_TEXT tokens) and the position in the input text.
+ * Additionally we return the matched text (which should only be really
+ * interesting in case of TOKEN_TEXT tokens) and the position in the input text.
  */
 struct Token {
 	int tokenId;
@@ -146,7 +145,7 @@ struct Token {
  * definition here we distinguish between two kinds of tokens:
  * 1.) User-specified tokens that match a fixed text.
  * 2.) Any other text between those tokens.
- * The user might want to specify the tokens '#{' and '#}' for example, because 
+ * The user might want to specify the tokens '#{' and '#}' for example, because
  * they have some meaning in her code. The user sets the IDs to 1 and 2.
  * Given the input text
  * "some text #{ special command #} some text"
@@ -199,7 +198,7 @@ public:
 	/**
 	 * The next method consumes one Token from the input stream and gives
 	 * it to the user (stored in the input argument).
-	 * 
+	 *
 	 * @param t a Token reference that is set to the next found token.
 	 * @return true if a next token was found and false if the input is at its
 	 * end.
@@ -208,7 +207,7 @@ public:
 	/**
 	 * The peek method does not consume the next Token but buffers it and
 	 * shows it to the user (stored in the input argument).
-	 * 
+	 *
 	 * @param t a Token reference that is set to the next found token.
 	 * @return true if a next token was found and false if the input is at its
 	 * end.
-- 
cgit v1.2.3


From 7b44735f31b8b5d236d66ea1a681abb99ac83bf9 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 15:43:35 +0100
Subject: added documentation for CodeTokenizer.

---
 src/core/utils/CodeTokenizer.hpp | 53 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index f26a74c..18cf02a 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -28,6 +28,17 @@
 namespace ousia {
 namespace utils {
 
+/**
+ * This enum contains all special tokens the CodeTokenizer supports, namely:
+ *
+ * 1.) An ambiguous token - in most programming languages single-quotes ' or
+ * double-quotes " - to delimit string tokens.
+ * 2.) A start token for line comments, which would e.g. be // in Java.
+ * 3.) A start token for a block comment
+ * 4.) An end token for a block comment.
+ * 5.) The linebreak token (this does not have to be specified by the user)
+ * 6.) The escape token, which would e.g. be \ in java.
+ */
 enum class CodeTokenMode {
 	STRING_START_END,
 	LINE_COMMENT,
@@ -38,6 +49,10 @@ enum class CodeTokenMode {
 	NONE
 };
 
+/**
+ * A CodeTokenDescriptor draws the connection between an id returned by the
+ * underlying Tokenizer and the mode this token represents.
+ */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
 	int id;
@@ -45,7 +60,10 @@ struct CodeTokenDescriptor {
 	CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
 };
 
-
+/**
+ * The CodeTokenizer is a finite state machine with the states NORMAL,
+ * IN_BLOCK_COMMENT, IN_LINE_COMMENT and IN_STRING.
+ */
 enum class CodeTokenizerState {
 	NORMAL,
 	IN_BLOCK_COMMENT,
@@ -53,6 +71,14 @@ enum class CodeTokenizerState {
 	IN_STRING
 };
 
+/**
+ * The purpose of a CodeTokenizer is to make it easier to parse classical
+ * programming Code. It adds the following features to a regular Tokenizer:
+ * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens
+ * for the opening delimiter, the text and the closing delimiter.
+ * 2.) Escaping in String tokens.
+ * 3.) Comment Tokens (for line comments as well as block comments)
+ */
 class CodeTokenizer : public Tokenizer {
 private:
 	std::map<int, CodeTokenDescriptor> descriptors;
@@ -62,17 +88,36 @@ private:
 	int returnTokenId;
 	bool escaped = false;
 
-	Token constructToken(const Token& t);
-	void buffer(const Token& t);
+	Token constructToken(const Token &t);
+	void buffer(const Token &t);
 
 protected:
 	bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
 
 public:
+	/**
+	 * If you do not want comment tokens to be returned you can set this to
+	 * true.
+	 */
 	bool ignoreComments = false;
 
+	/**
+	 *
+	 * @param input a BufferedCharReader containing the input for this
+	 * tokenizer,
+	 * as with a regular tokenizer.
+	 * @param root a TokenTreeNode representing the root of the TokenTree.
+	 * Please note that you have to specify all tokenIDs here that you use
+	 * in the descriptors map.
+	 * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
+	 * In this way you can specify the meaning of certain Tokens. Say you
+	 * specified the Token "//" with the id 1 in the TokenTree. Then you could
+	 * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
+	 * and this CodeTokenizer would recognize the token "//" as starting a
+	 * line comment.
+	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
-	          std::map<int, CodeTokenDescriptor> descriptors)
+	              std::map<int, CodeTokenDescriptor> descriptors)
 	    : Tokenizer(input, root), descriptors(descriptors)
 	{
 	}
-- 
cgit v1.2.3


From ec31aae293f88e36190aa32169a97a776873567a Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 19:42:07 +0100
Subject: fixed a bug preventing the Tokenizer from finding the right token if
 a parse was incomplete beforehand. Also cleared the buffers if a subclass
 returns false from doPrepare. Failing to clear the buffers led to subsequent
 problems.

---
 src/core/utils/Tokenizer.cpp      | 26 +++++++++++++++++++++-----
 test/core/utils/TokenizerTest.cpp | 28 ++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 164a30f..a0ca3aa 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -82,8 +82,8 @@ bool Tokenizer::prepare()
 {
 	std::stringstream buffer;
 	char c;
-	const int startColumn = input.getColumn();
-	const int startLine = input.getLine();
+	int startColumn = input.getColumn();
+	int startLine = input.getLine();
 	bool bufEmpty = true;
 	while (input.peek(&c)) {
 		if (root.children.find(c) != root.children.end()) {
@@ -118,9 +118,10 @@ bool Tokenizer::prepare()
 					break;
 				}
 			}
+			//reset the peek pointer to the last valid position.
+			input.resetPeek();
 			// check if we did indeed find a special token.
 			if (match != TOKEN_NONE) {
-				input.resetPeek();
 				if (bufEmpty) {
 					// if we did not have text before, construct that token.
 					if (doPrepare(
@@ -128,8 +129,11 @@ bool Tokenizer::prepare()
 					              input.getColumn(), input.getLine()},
 					        peeked)) {
 						return true;
+					} else {
+						startColumn = input.getColumn();
+						startLine = input.getLine();
+						continue;
 					}
-
 				} else {
 					// otherwise we return the text before the token.
 					if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn,
@@ -137,8 +141,20 @@ bool Tokenizer::prepare()
 					                    input.getLine()},
 					              peeked)) {
 						return true;
-					}
+					} else{
+						//we need to clear the buffer here. After all the token
+						//corresponding to this buffer segment is already
+						//constructed.
+						buffer.str(std::string());
+						bufEmpty = true;
+						startColumn = input.getColumn();
+						startLine = input.getLine();
+						continue;
+					} 
 				}
+			} else{
+				//if we found nothing, read at least one character.
+				input.peek(&c);
 			}
 		}
 		buffer << c;
diff --git a/test/core/utils/TokenizerTest.cpp b/test/core/utils/TokenizerTest.cpp
index ba06c33..79cc01d 100644
--- a/test/core/utils/TokenizerTest.cpp
+++ b/test/core/utils/TokenizerTest.cpp
@@ -93,5 +93,33 @@ TEST(Tokenizer, testTokenization)
 	}
 	ASSERT_FALSE(tokenizer.next(t));
 }
+
+TEST(Tokenizer, testIncompleteTokens)
+{
+	TokenTreeNode root{{{"ab", 1}, {"c", 2}}};
+
+	BufferedCharReader reader;
+	reader.feed("ac");
+	//           1234567890
+	//           0        1
+
+	std::vector<Token> expected = {
+	    {TOKEN_TEXT, "a", 1, 1, 2, 1},
+	    {2, "c", 2, 1, 3, 1}};
+
+	Tokenizer tokenizer{reader, root};
+
+	Token t;
+	for (auto &te : expected) {
+		ASSERT_TRUE(tokenizer.next(t));
+		ASSERT_EQ(te.tokenId, t.tokenId);
+		ASSERT_EQ(te.content, t.content);
+		ASSERT_EQ(te.startColumn, t.startColumn);
+		ASSERT_EQ(te.startLine, t.startLine);
+		ASSERT_EQ(te.endColumn, t.endColumn);
+		ASSERT_EQ(te.endLine, t.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(t));
+}
 }
 }
-- 
cgit v1.2.3


From 45c6acf226f7867f847a9085d84960d337b1a2ae Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 19:44:29 +0100
Subject: 1.) fixed the word tokenization mechanism in the CodeTokenizer which
 returned rubbish previously. 2.) Allowed multiline tokens if the mode is
 LINEBREAK (obviously). 3.) returned LINEBREAK tokens in normal mode. 4.)
 added a CodeTokenizer test.

---
 src/core/utils/CodeTokenizer.cpp      | 62 +++++++++++++++++++----------
 src/core/utils/CodeTokenizer.hpp      | 11 ++++--
 test/core/utils/CodeTokenizerTest.cpp | 74 ++++++++++++++++++++++++++++++++++-
 3 files changed, 122 insertions(+), 25 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
 namespace ousia {
 namespace utils {
 
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
 {
 	std::string content = buf.str();
 	buf.str(std::string());
-	return Token{returnTokenId,          content,
-	             startToken.startColumn, startToken.startLine,
-	             t.endColumn,     t.endLine};
+	return Token{returnTokenId,        content,     startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
 }
 
 void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
 
 bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 {
-	if (t.startLine != t.endLine) {
-		throw TokenizerException(
-		    "We did not expect a multiline token. Most likely you did not add "
-		    "a linebreak token to your tokenizer!");
-	}
-
 	auto it = descriptors.find(t.tokenId);
 	CodeTokenMode mode = CodeTokenMode::NONE;
 	if (it != descriptors.end()) {
 		mode = it->second.mode;
 	}
+
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token (except linebreaks). Most "
+		    "likely you did not add a linebreak token to your tokenizer!");
+	}
+
 	switch (state) {
 		case CodeTokenizerState::NORMAL:
 			switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 				case CodeTokenMode::LINE_COMMENT:
 					state = CodeTokenizerState::IN_LINE_COMMENT;
 					break;
+				case CodeTokenMode::LINEBREAK:
+					peeked.push_back({it->second.id, t.content, t.startColumn,
+					                  t.startLine, t.endColumn, t.endLine});
+					return true;
 				default:
 					if (t.tokenId == TOKEN_TEXT) {
 						int begin = -1;
 						for (size_t c = 0; c < t.content.length(); c++) {
 							bool isWhitespace =
 							    t.content[c] == ' ' || t.content[c] == '\t';
-							if (begin >= 0 && isWhitespace) {
-								peeked.push_back(Token{
-								    TOKEN_TEXT,
-								    t.content.substr(begin, (int)c - begin),
-								    t.startColumn + begin, t.startLine,
-								    t.startColumn + (int)c, t.endLine});
-							}
-							if (!isWhitespace && begin < 0) {
-								begin = c;
+							if (begin < 0) {
+								// if we have not yet set our beginning,
+								// we wait for the first
+								// non-whitespace-character to set it.
+								if (!isWhitespace) {
+									begin = c;
+								}
+							} else {
+								// if we have set our beginning, we wait for the
+								// first whitespace character, which marks the
+								// end of the current word.
+								if (isWhitespace) {
+									peeked.push_back(Token{
+									    TOKEN_TEXT,
+									    t.content.substr(begin, (int)c - begin),
+									    t.startColumn + begin, t.startLine,
+									    t.startColumn + (int)c, t.endLine});
+									begin = -1;
+								}
 							}
 						}
+						if(begin >= 0){
+							peeked.push_back(Token{
+									TOKEN_TEXT,
+									t.content.substr(begin),
+									t.startColumn + begin, t.startLine,
+									t.endColumn, t.endLine});
+						}
+					} else {
+						peeked.push_back(t);
 					}
-					peeked.push_back(t);
 					return true;
 			}
 			startToken = t;
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index 18cf02a..fda4493 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -36,7 +36,7 @@ namespace utils {
  * 2.) A start token for line comments, which would e.g. be // in Java.
  * 3.) A start token for a block comment
  * 4.) An end token for a block comment.
- * 5.) The linebreak token (this does not have to be specified by the user)
+ * 5.) A linebreak token
  * 6.) The escape token, which would e.g. be \ in java.
  */
 enum class CodeTokenMode {
@@ -50,8 +50,11 @@ enum class CodeTokenMode {
 };
 
 /**
- * A CodeTokenDescriptor draws the connection between an id returned by the
- * underlying Tokenizer and the mode this token represents.
+ * A CodeTokenDescriptor defines the id the user likes to have returned for
+ * a Token of the mode specified, e.g. if you want to get the id 4 for a
+ * String Token the corresponding CodeTokenDescriptor would be inizialized
+ * with
+ * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
@@ -118,7 +121,7 @@ public:
 	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
 	              std::map<int, CodeTokenDescriptor> descriptors)
-	    : Tokenizer(input, root), descriptors(descriptors)
+	    : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
 	{
 	}
 };
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
 
 namespace ousia {
 namespace utils {
+
+static const int BLOCK_COMMENT = 30;
+static const int LINE_COMMENT = 31;
+static const int STRING = 20;
+static const int ESCAPE = 21;
+static const int LINEBREAK = 22;  // must not collide with ESCAPE
+static const int CURLY_OPEN = 40;
+static const int CURLY_CLOSE = 41;
+
 TEST(CodeTokenizer, testTokenizer)
 {
-	
+	BufferedCharReader reader;
+	reader.feed("/**\n");                                 // 1
+	reader.feed(" * Some Block Comment\n");               // 2
+	reader.feed(" */\n");                                 // 3
+	reader.feed("var my_string = 'My \\'String\\'';\n");  // 4
+	reader.feed("// and a line comment\n");               // 5
+	reader.feed("var my_obj = { a = 4;}");                // 6
+	//           123456789012345678901234567890123456789012345678901234567890123456789
+	//           0        1         2         3         4         5         6
+	TokenTreeNode root{{{"/*", 1},
+	                    {"*/", 2},
+	                    {"//", 3},
+	                    {"'", 4},
+	                    {"\\", 5},
+	                    {"{", CURLY_OPEN},
+	                    {"}", CURLY_CLOSE},
+	                    {"\n", 6}}};
+	std::map<int, CodeTokenDescriptor> descriptors{
+	    // the block comment start Token has the id 1 and if the Tokenizer
+	    // returns a Block Comment Token that should have the id 30.
+	    {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+	    {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+	    {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+	    {4, {CodeTokenMode::STRING_START_END, STRING}},
+	    {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+	    {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+	std::vector<Token> expected = {
+	    {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+	    {LINEBREAK, "\n", 4, 3, 1, 4},
+	    {TOKEN_TEXT, "var", 1, 4, 4, 4},
+	    {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+	    {TOKEN_TEXT, "=", 15, 4, 16, 4},
+	    {STRING, "My 'String'", 17, 4, 32, 4},
+	    {TOKEN_TEXT, ";", 32, 4, 33, 4},
+	    {LINEBREAK, "\n", 33, 4, 1, 5},
+		//this is slightly counter-intuitive but makes sense if you think about
+		//it: As a line comment is ended by a line break the line break is
+		//technically still a part of the line comment and thus the ending
+		//is in the next line.
+	    {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+	    {TOKEN_TEXT, "var", 1, 6, 4, 6},
+	    {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+	    {TOKEN_TEXT, "=", 12, 6, 13, 6},
+	    {CURLY_OPEN, "{", 14, 6, 15, 6},
+	    {TOKEN_TEXT, "a", 16, 6, 17, 6},
+	    {TOKEN_TEXT, "=", 18, 6, 19, 6},
+	    {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+	    {CURLY_CLOSE, "}", 22, 6, 23, 6},
+	};
+
+	CodeTokenizer tokenizer{reader, root, descriptors};
+
+	Token t;
+	for (auto &te : expected) {
+		ASSERT_TRUE(tokenizer.next(t));
+		ASSERT_EQ(te.tokenId, t.tokenId);
+		ASSERT_EQ(te.content, t.content);
+		ASSERT_EQ(te.startColumn, t.startColumn);
+		ASSERT_EQ(te.startLine, t.startLine);
+		ASSERT_EQ(te.endColumn, t.endColumn);
+		ASSERT_EQ(te.endLine, t.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(t));
 }
 }
 }
-- 
cgit v1.2.3


From 50e7f4544e806a4196cced68365dd005afa1a982 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Wed, 19 Nov 2014 19:44:45 +0100
Subject: one slight formatting change.

---
 src/core/utils/CodeTokenizer.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index fda4493..0fc0862 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -53,8 +53,7 @@ enum class CodeTokenMode {
  * A CodeTokenDescriptor defines the id the user likes to have returned for
  * a Token of the mode specified, e.g. if you want to get the id 4 for a
  * String Token the corresponding CodeTokenDescriptor would be inizialized
- * with
- * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
+ * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
-- 
cgit v1.2.3


From d2f14ec9b2d54c8addc03fef147be15327dd8623 Mon Sep 17 00:00:00 2001
From: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>
Date: Thu, 20 Nov 2014 10:26:02 +0100
Subject: continued specifying the CSS language and setting up a CSS Tokenizer.
 But setting up StyleNodes is still a complicated thing to get my head around.
 So I'll do a draft in Java first.

---
 src/core/utils/CSSParser.cpp | 45 ++++++++++++++++++++++++++++++++++----------
 src/core/utils/CSSParser.hpp |  3 +++
 2 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'src/core/utils')

diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp
index e66eb34..1763cc2 100644
--- a/src/core/utils/CSSParser.cpp
+++ b/src/core/utils/CSSParser.cpp
@@ -17,6 +17,7 @@
 */
 
 #include "BufferedCharReader.hpp"
+#include "CodeTokenizer.hpp"
 #include "Tokenizer.hpp"
 
 #include "CSSParser.hpp"
@@ -24,6 +25,7 @@
 namespace ousia {
 namespace utils {
 
+// CSS code tokens
 static const int CURLY_OPEN = 1;
 static const int CURLY_CLOSE = 2;
 static const int COLON = 3;
@@ -31,8 +33,19 @@ static const int SEMICOLON = 4;
 static const int HASH = 5;
 static const int BRACKET_OPEN = 6;
 static const int BRACKET_CLOSE = 7;
-static const int COMMENT_OPEN = 8;
-static const int COMMENT_CLOSE = 9;
+static const int PAREN_OPEN = 8;
+static const int PAREN_CLOSE = 9;
+// comments
+static const int COMMENT = 100;
+static const int COMMENT_OPEN = 101;
+static const int COMMENT_CLOSE = 102;
+// strings
+static const int STRING = 200;
+static const int SINGLE_QUOTE = 201;
+static const int DOUBLE_QUOTE = 202;
+static const int ESCAPE = 203;
+// general syntax
+static const int LINEBREAK = 300;
 
 static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
                                      {"}", CURLY_CLOSE},
@@ -41,16 +54,28 @@ static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
                                      {"#", HASH},
                                      {"[", BRACKET_OPEN},
                                      {"]", BRACKET_CLOSE},
+                                     {"(", PAREN_OPEN},
+                                     {")", PAREN_CLOSE},
                                      {"/*", COMMENT_OPEN},
-                                     {"*/", COMMENT_CLOSE}}};
-
-StyleNode CSSParser::parse(BufferedCharReader &input) {
-	Tokenizer tokenizer {input, CSS_ROOT};
-	//TODO: implement
-	
-}
-
+                                     {"*/", COMMENT_CLOSE},
+                                     {"\\", ESCAPE},
+                                     {"'", SINGLE_QUOTE},
+                                     {"\"", DOUBLE_QUOTE},
+                                     {"\n", LINEBREAK}}};
 
+static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = {
+    {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}},
+    {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}},
+    {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
+    {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
+    {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}},
+    {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
 
+StyleNode CSSParser::parse(BufferedCharReader &input)
+{
+	CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS};
+	tokenizer.ignoreComments = true;
+	// TODO: implement
+}
 }
 }
diff --git a/src/core/utils/CSSParser.hpp b/src/core/utils/CSSParser.hpp
index 0f9cd8f..c8b772d 100644
--- a/src/core/utils/CSSParser.hpp
+++ b/src/core/utils/CSSParser.hpp
@@ -156,6 +156,9 @@ public:
 };
 
 class CSSParser {
+
+private:
+
 public:
 	StyleNode parse(BufferedCharReader &input);
 };
-- 
cgit v1.2.3