Merge branch 'master' of somweyr.de:ousia

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-11-20 17:40:36 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-11-20 17:40:36 +0100
commit: 6c8ee8084a8fa8317be69f5578d9b1052aee3b70 (patch)
tree: fe673a403e75d3478e1995fba2e40ec268f1cb8b /src/core/utils/CodeTokenizer.cpp
parent: adf0b5eaef95484a8d3b8ad1e6e6765018658bdc (diff)
parent: d2f14ec9b2d54c8addc03fef147be15327dd8623 (diff)
1 files changed, 42 insertions, 20 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
 namespace ousia {
 namespace utils {
 
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
 {
 	std::string content = buf.str();
 	buf.str(std::string());
-	return Token{returnTokenId,          content,
-	             startToken.startColumn, startToken.startLine,
-	             t.endColumn,     t.endLine};
+	return Token{returnTokenId,        content,     startToken.startColumn,
+	             startToken.startLine, t.endColumn, t.endLine};
 }
 
 void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
 
 bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 {
-	if (t.startLine != t.endLine) {
-		throw TokenizerException(
-		    "We did not expect a multiline token. Most likely you did not add "
-		    "a linebreak token to your tokenizer!");
-	}
-
 	auto it = descriptors.find(t.tokenId);
 	CodeTokenMode mode = CodeTokenMode::NONE;
 	if (it != descriptors.end()) {
 		mode = it->second.mode;
 	}
+
+	if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+		throw TokenizerException(
+		    "We did not expect a multiline token (except linebreaks). Most "
+		    "likely you did not add a linebreak token to your tokenizer!");
+	}
+
 	switch (state) {
 		case CodeTokenizerState::NORMAL:
 			switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
 				case CodeTokenMode::LINE_COMMENT:
 					state = CodeTokenizerState::IN_LINE_COMMENT;
 					break;
+				case CodeTokenMode::LINEBREAK:
+					peeked.push_back({it->second.id, t.content, t.startColumn,
+					                  t.startLine, t.endColumn, t.endLine});
+					return true;
 				default:
 					if (t.tokenId == TOKEN_TEXT) {
 						int begin = -1;
 						for (size_t c = 0; c < t.content.length(); c++) {
 							bool isWhitespace =
 							    t.content[c] == ' ' || t.content[c] == '\t';
-							if (begin >= 0 && isWhitespace) {
-								peeked.push_back(Token{
-								    TOKEN_TEXT,
-								    t.content.substr(begin, (int)c - begin),
-								    t.startColumn + begin, t.startLine,
-								    t.startColumn + (int)c, t.endLine});
-							}
-							if (!isWhitespace && begin < 0) {
-								begin = c;
+							if (begin < 0) {
+								// if we have not yet set our beginning,
+								// we wait for the first
+								// non-whitespace-character to set it.
+								if (!isWhitespace) {
+									begin = c;
+								}
+							} else {
+								// if we have set our beginning, we wait for the
+								// first whitespace character, which marks the
+								// end of the current word.
+								if (isWhitespace) {
+									peeked.push_back(Token{
+									    TOKEN_TEXT,
+									    t.content.substr(begin, (int)c - begin),
+									    t.startColumn + begin, t.startLine,
+									    t.startColumn + (int)c, t.endLine});
+									begin = -1;
+								}
 							}
 						}
+						if(begin >= 0){
+							peeked.push_back(Token{
+									TOKEN_TEXT,
+									t.content.substr(begin),
+									t.startColumn + begin, t.startLine,
+									t.endColumn, t.endLine});
+						}
+					} else {
+						peeked.push_back(t);
 					}
-					peeked.push_back(t);
 					return true;
 			}
 			startToken = t;
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2014-11-20 17:40:36 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2014-11-20 17:40:36 +0100
commit	6c8ee8084a8fa8317be69f5578d9b1052aee3b70 (patch)
tree	fe673a403e75d3478e1995fba2e40ec268f1cb8b /src/core/utils/CodeTokenizer.cpp
parent	adf0b5eaef95484a8d3b8ad1e6e6765018658bdc (diff)
parent	d2f14ec9b2d54c8addc03fef147be15327dd8623 (diff)