Implement endAtWhitespace flag which tells TokenizedDataReader to stop reading data after the first whitespace character

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-04-12 18:47:29 +0200
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2016-04-25 22:24:16 +0200
commit: 667d9c4a082552fb64c5ffe7b0bd6212c8a8b1b3 (patch)
tree: 100e8e3fbd86970dec9ef97c773419ac2bba291b /src
parent: 0884afe16263a110597671f60dcb4ff7df66f456 (diff)
2 files changed, 34 insertions, 8 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index 276cd54..7c7d4a7 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -367,11 +367,13 @@ public:
 	 * @param cursor is the position in the character buffer from which on the
 	 * next token should be read. The cursor will be updated to the position
 	 * beyond the returned token.
+	 * @param endAtWhitespace if true, only delivers data up to the first
+	 * whitespace character that follows non-whitespace data.
 	 * @return true if a token was returned, false if no more tokens are
 	 * available.
 	 */
 	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
-	          TokenizedDataCursor &cursor) const
+	          TokenizedDataCursor &cursor, bool endAtWhitespace) const
 	{
 		// Some variables for convenient access
 		size_t &bufPos = cursor.bufPos;
@@ -394,12 +396,28 @@ public:
 
 		// Calculate the buffer start and end character, based on the returned
 		// TokenMark instance
-		const size_t end = (it != marks.end()) ? it->bufStart : buf.size();
+		size_t end = (it != marks.end()) ? it->bufStart : buf.size();
 
 		// Depending on the whitespace mode, fetch all the data between the
 		// cursor position and the calculated end position and return a token
 		// containing that data.
 		if (bufPos < end && bufPos < buf.size()) {
+			// If endAtWhitespace is set to true, limit copying to the first
+			// whitespace character after non-whitespace
+			if (endAtWhitespace) {
+				bool hasNonWhitespace = false;
+				for (size_t i = bufPos; i < end; i++) {
+					const bool isWhitespace = Utils::isWhitespace(buf[i]);
+					if (isWhitespace) {
+						if (hasNonWhitespace) {
+							end = i;
+							break;
+						}
+					} else {
+						hasNonWhitespace = true;
+					}
+				}
+			}
 			switch (mode) {
 				case WhitespaceMode::PRESERVE: {
 					token = Token(
@@ -685,15 +703,15 @@ bool TokenizedDataReader::atEnd() const
 }
 
 bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
-                               WhitespaceMode mode)
+                               WhitespaceMode mode, bool endAtWhitespace)
 {
 	peekCursor = readCursor;
-	return impl->next(token, mode, tokens, readCursor);
+	return impl->next(token, mode, tokens, readCursor, endAtWhitespace);
 }
 
 bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
-                               WhitespaceMode mode)
+                               WhitespaceMode mode, bool endAtWhitespace)
 {
-	return impl->next(token, mode, tokens, peekCursor);
+	return impl->next(token, mode, tokens, peekCursor, endAtWhitespace);
 }
 }
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 95af95e..83821d7 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -307,11 +307,15 @@ public:
 	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
+	 * @param endAtWhitespace if true, only delivers data until the first
+	 * whitespace character after a sequence of non-whitespace characters. Does
+	 * not affect the delivery of non-data tokens.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
 	bool read(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::TRIM);
+	          WhitespaceMode mode = WhitespaceMode::TRIM,
+	          bool endAtWhitespace = false);
 
 	/**
 	 * Stores the next token in the given token reference, returns true if the
@@ -323,11 +327,15 @@ public:
 	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
+	 * @param endAtWhitespace if true, only delivers data until the first
+	 * whitespace character after a sequence of non-whitespace characters. Does
+	 * not affect the delivery of non-data tokens.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
 	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::TRIM);
+	          WhitespaceMode mode = WhitespaceMode::TRIM,
+	          bool endAtWhitespace = false);
 
 	/**
 	 * Consumes the peeked tokens, the read cursor will now be at the position
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-04-12 18:47:29 +0200
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2016-04-25 22:24:16 +0200
commit	667d9c4a082552fb64c5ffe7b0bd6212c8a8b1b3 (patch)
tree	100e8e3fbd86970dec9ef97c773419ac2bba291b /src
parent	0884afe16263a110597671f60dcb4ff7df66f456 (diff)