2 files changed, 49 insertions, 24 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
 namespace ousia {
 namespace utils {
 
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
 {
 	std::string content = buf.str();
 	buf.str(std::string());
-	return Token{returnTokenId,          content,
-	             startToken.startColumn, startToken.startLine,
-	             t.endColumn,     t.endLine};
+	return Token{returnTokenId,        content,     startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
 }
 
 void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
 
 bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 {
-	if (t.startLine != t.endLine) {
-		throw TokenizerException(
-		    "We did not expect a multiline token. Most likely you did not add "
-		    "a linebreak token to your tokenizer!");
-	}
-
 	auto it = descriptors.find(t.tokenId);
 	CodeTokenMode mode = CodeTokenMode::NONE;
 	if (it != descriptors.end()) {
 		mode = it->second.mode;
 	}
+
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token (except linebreaks). Most "
+		    "likely you did not add a linebreak token to your tokenizer!");
+	}
+
 	switch (state) {
 		case CodeTokenizerState::NORMAL:
 			switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 				case CodeTokenMode::LINE_COMMENT:
 					state = CodeTokenizerState::IN_LINE_COMMENT;
 					break;
+				case CodeTokenMode::LINEBREAK:
+					peeked.push_back({it->second.id, t.content, t.startColumn,
+					                  t.startLine, t.endColumn, t.endLine});
+					return true;
 				default:
 					if (t.tokenId == TOKEN_TEXT) {
 						int begin = -1;
 						for (size_t c = 0; c < t.content.length(); c++) {
 							bool isWhitespace =
 							    t.content[c] == ' ' || t.content[c] == '\t';
-							if (begin >= 0 && isWhitespace) {
-								peeked.push_back(Token{
-								    TOKEN_TEXT,
-								    t.content.substr(begin, (int)c - begin),
-								    t.startColumn + begin, t.startLine,
-								    t.startColumn + (int)c, t.endLine});
-							}
-							if (!isWhitespace && begin < 0) {
-								begin = c;
+							if (begin < 0) {
+								// if we have not yet set our beginning,
+								// we wait for the first
+								// non-whitespace-character to set it.
+								if (!isWhitespace) {
+									begin = c;
+								}
+							} else {
+								// if we have set our beginning, we wait for the
+								// first whitespace character, which marks the
+								// end of the current word.
+								if (isWhitespace) {
+									peeked.push_back(Token{
+									    TOKEN_TEXT,
+									    t.content.substr(begin, (int)c - begin),
+									    t.startColumn + begin, t.startLine,
+									    t.startColumn + (int)c, t.endLine});
+									begin = -1;
+								}
 							}
 						}
+						if(begin >= 0){
+							peeked.push_back(Token{
+									TOKEN_TEXT,
+									t.content.substr(begin),
+									t.startColumn + begin, t.startLine,
+									t.endColumn, t.endLine});
+						}
+					} else {
+						peeked.push_back(t);
 					}
-					peeked.push_back(t);
 					return true;
 			}
 			startToken = t;
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index 18cf02a..fda4493 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -36,7 +36,7 @@ namespace utils {
  * 2.) A start token for line comments, which would e.g. be // in Java.
  * 3.) A start token for a block comment
  * 4.) An end token for a block comment.
- * 5.) The linebreak token (this does not have to be specified by the user)
+ * 5.) A linebreak token
  * 6.) The escape token, which would e.g. be \ in java.
  */
 enum class CodeTokenMode {
@@ -50,8 +50,11 @@ enum class CodeTokenMode {
 };
 
 /**
- * A CodeTokenDescriptor draws the connection between an id returned by the
- * underlying Tokenizer and the mode this token represents.
+ * A CodeTokenDescriptor defines the id the user would like to have returned
+ * for a Token of the specified mode. For example, to get the id 4 for a
+ * String Token, the corresponding CodeTokenDescriptor would be initialized
+ * with
+ * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
  */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
@@ -118,7 +121,7 @@ public:
 	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
 	              std::map<int, CodeTokenDescriptor> descriptors)
-	    : Tokenizer(input, root), descriptors(descriptors)
+	    : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
 	{
 	}
 };