1.) Fixed the word tokenization mechanism in the CodeTokenizer, which previously returned rubbish. 2.) Allowed multiline tokens if the mode is LINEBREAK (obviously). 3.) LINEBREAK tokens are now returned in normal mode. 4.) Added a CodeTokenizer test.

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
commit: 45c6acf226f7867f847a9085d84960d337b1a2ae (patch)
tree: d248b1155a87b89af0f877284ac69254b9024962 /src/core
parent: ec31aae293f88e36190aa32169a97a776873567a (diff)
2 files changed, 49 insertions, 24 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
 namespace ousia {
 namespace utils {
 
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
 {
 	std::string content = buf.str();
 	buf.str(std::string());
-	return Token{returnTokenId,          content,
-	             startToken.startColumn, startToken.startLine,
-	             t.endColumn,     t.endLine};
+	return Token{returnTokenId,        content,     startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
 }
 
 void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
 
 bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 {
-	if (t.startLine != t.endLine) {
-		throw TokenizerException(
-		    "We did not expect a multiline token. Most likely you did not add "
-		    "a linebreak token to your tokenizer!");
-	}
-
 	auto it = descriptors.find(t.tokenId);
 	CodeTokenMode mode = CodeTokenMode::NONE;
 	if (it != descriptors.end()) {
 		mode = it->second.mode;
 	}
+
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token (except linebreaks). Most "
+		    "likely you did not add a linebreak token to your tokenizer!");
+	}
+
 	switch (state) {
 		case CodeTokenizerState::NORMAL:
 			switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 				case CodeTokenMode::LINE_COMMENT:
 					state = CodeTokenizerState::IN_LINE_COMMENT;
 					break;
+				case CodeTokenMode::LINEBREAK:
+					peeked.push_back({it->second.id, t.content, t.startColumn,
+					                  t.startLine, t.endColumn, t.endLine});
+					return true;
 				default:
 					if (t.tokenId == TOKEN_TEXT) {
 						int begin = -1;
 						for (size_t c = 0; c < t.content.length(); c++) {
 							bool isWhitespace =
 							    t.content[c] == ' ' || t.content[c] == '\t';
-							if (begin >= 0 && isWhitespace) {
-								peeked.push_back(Token{
-								    TOKEN_TEXT,
-								    t.content.substr(begin, (int)c - begin),
-								    t.startColumn + begin, t.startLine,
-								    t.startColumn + (int)c, t.endLine});
-							}
-							if (!isWhitespace && begin < 0) {
-								begin = c;
+							if (begin < 0) {
+								// if we have not yet set our beginning,
+								// we wait for the first
+								// non-whitespace-character to set it.
+								if (!isWhitespace) {
+									begin = c;
+								}
+							} else {
+								// if we have set our beginning, we wait for the
+								// first whitespace character, which marks the
+								// end of the current word.
+								if (isWhitespace) {
+									peeked.push_back(Token{
+									    TOKEN_TEXT,
+									    t.content.substr(begin, (int)c - begin),
+									    t.startColumn + begin, t.startLine,
+									    t.startColumn + (int)c, t.endLine});
+									begin = -1;
+								}
 							}
 						}
+						if(begin >= 0){
+							peeked.push_back(Token{
+									TOKEN_TEXT,
+									t.content.substr(begin),
+									t.startColumn + begin, t.startLine,
+									t.endColumn, t.endLine});
+						}
+					} else {
+						peeked.push_back(t);
 					}
-					peeked.push_back(t);
 					return true;
 			}
 			startToken = t;
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index 18cf02a..fda4493 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -36,7 +36,7 @@ namespace utils {
  * 2.) A start token for line comments, which would e.g. be // in Java.
  * 3.) A start token for a block comment
  * 4.) An end token for a block comment.
- * 5.) The linebreak token (this does not have to be specified by the user)
+ * 5.) A linebreak token
  * 6.) The escape token, which would e.g. be \ in java.
  */
 enum class CodeTokenMode {
@@ -50,8 +50,11 @@ enum class CodeTokenMode {
 };
 
 /**
- * A CodeTokenDescriptor draws the connection between an id returned by the
- * underlying Tokenizer and the mode this token represents.
+ * A CodeTokenDescriptor defines the id the user likes to have returned for
+ * a Token of the mode specified, e.g. if you want to get the id 4 for a
+ * String Token the corresponding CodeTokenDescriptor would be initialized
+ * with
+ * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
@@ -118,7 +121,7 @@ public:
 	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
 	              std::map<int, CodeTokenDescriptor> descriptors)
-	    : Tokenizer(input, root), descriptors(descriptors)
+	    : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
 	{
 	}
 };
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-19 19:44:29 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-19 19:44:29 +0100
commit	45c6acf226f7867f847a9085d84960d337b1a2ae (patch)
tree	d248b1155a87b89af0f877284ac69254b9024962 /src/core
parent	ec31aae293f88e36190aa32169a97a776873567a (diff)