| | | |
|---|---|---|
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-04-12 18:47:29 +0200 |
| committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2016-04-25 22:24:16 +0200 |
| commit | 667d9c4a082552fb64c5ffe7b0bd6212c8a8b1b3 (patch) | |
| tree | 100e8e3fbd86970dec9ef97c773419ac2bba291b | |
| parent | 0884afe16263a110597671f60dcb4ff7df66f456 (diff) | |
Implement endAtWhitespace flag which tells TokenizedDataReader to stop reading data after the first whitespace character
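With the flag set, a caller can pull whitespace-delimited chunks out of the character buffer one `read()` call at a time, while non-data tokens are delivered unchanged. A minimal sketch of the call pattern, inferred from the new `endAtWhitespace` test case in the diff below (the include path and the `consume` callback are assumptions, not part of the commit):

```cpp
#include <string>

#include <core/parser/utils/TokenizedData.hpp>  // path assumed from repo layout

using namespace ousia;

// Splits buffered text such as "    a    b c   d" into the four data tokens
// "    a", "    b", " c" and "   d": each read() stops at the first
// whitespace character that follows a run of non-whitespace.
void readWhitespaceDelimited(TokenizedData &data,
                             void (*consume)(const std::string &))
{
	TokenizedDataReader reader = data.reader();
	Token token;
	// The trailing "true" is the new endAtWhitespace flag; PRESERVE keeps
	// the leading whitespace of every chunk intact.
	while (reader.read(token, TokenSet{}, WhitespaceMode::PRESERVE, true)) {
		consume(token.content);  // consume() is a hypothetical callback
	}
}
```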
| | | |
|---|---|---|
| -rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 30 |
| -rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 12 |
| -rw-r--r-- | test/core/parser/utils/TokenizedDataTest.cpp | 43 |
| -rw-r--r-- | test/core/parser/utils/TokenizedDataTestUtils.hpp | 41 |

4 files changed, 93 insertions, 33 deletions
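The heart of the change is a scan in `TokenizedDataImpl::next()` (first hunk below) that clips the end position of the pending data token. Extracted into a self-contained sketch, with `std::isspace` standing in for ousia's `Utils::isWhitespace`:

```cpp
#include <cctype>
#include <cstddef>
#include <string>

// Given the character buffer and the [bufPos, end) range of the pending
// data token, return the position of the first whitespace character that
// follows non-whitespace content, or `end` if there is none. Leading
// whitespace is deliberately skipped over, so a token keeps its indent.
size_t clipAtWhitespace(const std::string &buf, size_t bufPos, size_t end)
{
	bool hasNonWhitespace = false;
	for (size_t i = bufPos; i < end; i++) {
		const bool isWhitespace =
		    std::isspace(static_cast<unsigned char>(buf[i])) != 0;
		if (isWhitespace) {
			if (hasNonWhitespace) {
				return i;  // first whitespace after actual content
			}
		} else {
			hasNonWhitespace = true;
		}
	}
	return end;  // no whitespace after content; keep the original end
}
```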
```diff
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index 276cd54..7c7d4a7 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -367,11 +367,13 @@ public:
 	 * @param cursor is the position in the character buffer from which on the
 	 * next token should be read. The cursor will be updated to the position
 	 * beyond the returned token.
+	 * @param endAtWhitespace if true, only delivers data up to the next
+	 * whitespace.
 	 * @return true if a token was returned, false if no more tokens are
 	 * available.
 	 */
 	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
-	          TokenizedDataCursor &cursor) const
+	          TokenizedDataCursor &cursor, bool endAtWhitespace) const
 	{
 		// Some variables for convenient access
 		size_t &bufPos = cursor.bufPos;
@@ -394,12 +396,28 @@ public:
 		// Calculate the buffer start and end character, based on the returned
 		// TokenMark instance
-		const size_t end = (it != marks.end()) ? it->bufStart : buf.size();
+		size_t end = (it != marks.end()) ? it->bufStart : buf.size();
 
 		// Depending on the whitespace mode, fetch all the data between the
 		// cursor position and the calculated end position and return a token
 		// containing that data.
 		if (bufPos < end && bufPos < buf.size()) {
+			// If endAtWhitespace is set to true, limit copying to the first
+			// whitespace character after non-whitespace
+			if (endAtWhitespace) {
+				bool hasNonWhitespace = false;
+				for (size_t i = bufPos; i < end; i++) {
+					const bool isWhitespace = Utils::isWhitespace(buf[i]);
+					if (isWhitespace) {
+						if (hasNonWhitespace) {
+							end = i;
+							break;
+						}
+					} else {
+						hasNonWhitespace = true;
+					}
+				}
+			}
 			switch (mode) {
 				case WhitespaceMode::PRESERVE: {
 					token = Token(
@@ -685,15 +703,15 @@ bool TokenizedDataReader::atEnd() const
 }
 
 bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
-                               WhitespaceMode mode)
+                               WhitespaceMode mode, bool endAtWhitespace)
 {
 	peekCursor = readCursor;
-	return impl->next(token, mode, tokens, readCursor);
+	return impl->next(token, mode, tokens, readCursor, endAtWhitespace);
 }
 
 bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
-                               WhitespaceMode mode)
+                               WhitespaceMode mode, bool endAtWhitespace)
 {
-	return impl->next(token, mode, tokens, peekCursor);
+	return impl->next(token, mode, tokens, peekCursor, endAtWhitespace);
 }
 
 }
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 95af95e..83821d7 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -307,11 +307,15 @@ public:
 	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
+	 * @param endAtWhitespace if true, only delivers data until the first
+	 * whitespace character after a sequence of non-whitespace characters. Does
+	 * not affect the delivery of non-data tokens.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
 	bool read(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::TRIM);
+	          WhitespaceMode mode = WhitespaceMode::TRIM,
+	          bool endAtWhitespace = false);
 
 	/**
 	 * Stores the next token in the given token reference, returns true if the
@@ -323,11 +327,15 @@ public:
 	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
+	 * @param endAtWhitespace if true, only delivers data until the first
+	 * whitespace character after a sequence of non-whitespace characters. Does
+	 * not affect the delivery of non-data tokens.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
 	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::TRIM);
+	          WhitespaceMode mode = WhitespaceMode::TRIM,
+	          bool endAtWhitespace = false);
 
 	/**
 	 * Consumes the peeked tokens, the read cursor will now be at the position
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
index 31346bd..e468a50 100644
--- a/test/core/parser/utils/TokenizedDataTest.cpp
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -348,12 +348,13 @@ TEST(TokenizedData, specialTokenIndent)
 	const TokenSet tokens{Tokens::Indent, Tokens::Dedent};
 
 	TokenizedDataReader reader = data.reader();
-	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
-	            4, 4);
+	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4,
+	            4);
 	assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
 	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
 	            10, 10);
-	assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+	assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE,
+	           10, 37);
 	assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
 	            37, 37);
 	assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
@@ -372,11 +373,11 @@ TEST(TokenizedData, specialTokenIndent2)
 	TokenizedDataReader reader = data.reader();
 
 	assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
-	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
-	            3, 3);
+	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 3,
+	            3);
 	assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 3, 4);
-	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
-	            7, 7);
+	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 7,
+	            7);
 	assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 7, 8);
 	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
 	            12, 12);
@@ -403,12 +404,13 @@ TEST(TokenizedData, specialTokenIndentOverlap)
 	data.mark(5, 4, 4);
 
 	TokenizedDataReader reader = data.reader();
-	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
-	            4, 4);
+	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4,
+	            4);
 	assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
 	assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
 	            10, 10);
-	assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+	assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE,
+	           10, 37);
 	assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
 	            37, 37);
 	assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
@@ -417,5 +419,26 @@ TEST(TokenizedData, specialTokenIndentOverlap)
 	assertEnd(reader);
 }
 
+TEST(TokenizedData, endAtWhitespace)
+{
+	TokenizedData data;
+	data.append("    a    b c   d");
+	//           0123456789012345
+	//           0         1
+	const TokenSet tokens{};
+
+	data.mark(5, 4, 4);
+
+	TokenizedDataReader reader = data.reader();
+	assertTextEndAtWhitespace(reader, "    a", tokens, WhitespaceMode::PRESERVE,
+	                          0, 5);
+	assertTextEndAtWhitespace(reader, "    b", tokens, WhitespaceMode::PRESERVE,
+	                          5, 10);
+	assertTextEndAtWhitespace(reader, " c", tokens, WhitespaceMode::PRESERVE,
+	                          10, 12);
+	assertTextEndAtWhitespace(reader, "   d", tokens, WhitespaceMode::PRESERVE,
+	                          12, 16);
+	assertEnd(reader);
+}
 }
diff --git a/test/core/parser/utils/TokenizedDataTestUtils.hpp b/test/core/parser/utils/TokenizedDataTestUtils.hpp
index c384f9d..30f72ae 100644
--- a/test/core/parser/utils/TokenizedDataTestUtils.hpp
+++ b/test/core/parser/utils/TokenizedDataTestUtils.hpp
@@ -21,15 +21,17 @@
 
 namespace ousia {
 
-static void assertToken(TokenizedDataReader &reader, TokenId id,
-                 const std::string &text, const TokenSet &tokens = TokenSet{},
-                 WhitespaceMode mode = WhitespaceMode::TRIM,
-                 SourceOffset start = InvalidSourceOffset,
-                 SourceOffset end = InvalidSourceOffset,
-                 SourceId sourceId = InvalidSourceId)
+inline void assertToken(TokenizedDataReader &reader, TokenId id,
+                        const std::string &text,
+                        const TokenSet &tokens = TokenSet{},
+                        WhitespaceMode mode = WhitespaceMode::TRIM,
+                        SourceOffset start = InvalidSourceOffset,
+                        SourceOffset end = InvalidSourceOffset,
+                        SourceId sourceId = InvalidSourceId,
+                        bool endAtWhitespace = false)
 {
 	Token token;
-	ASSERT_TRUE(reader.read(token, tokens, mode));
+	ASSERT_TRUE(reader.read(token, tokens, mode, endAtWhitespace));
 	EXPECT_EQ(id, token.id);
 	EXPECT_EQ(text, token.content);
 	if (start != InvalidSourceOffset) {
@@ -41,23 +43,32 @@ static void assertToken(TokenizedDataReader &reader, TokenId id,
 	EXPECT_EQ(sourceId, token.getLocation().getSourceId());
 }
 
-static void assertText(TokenizedDataReader &reader, const std::string &text,
-                const TokenSet &tokens = TokenSet{},
-                WhitespaceMode mode = WhitespaceMode::TRIM,
-                SourceOffset start = InvalidSourceOffset,
-                SourceOffset end = InvalidSourceOffset,
-                SourceId id = InvalidSourceId)
+inline void assertText(TokenizedDataReader &reader, const std::string &text,
+                       const TokenSet &tokens = TokenSet{},
+                       WhitespaceMode mode = WhitespaceMode::TRIM,
+                       SourceOffset start = InvalidSourceOffset,
+                       SourceOffset end = InvalidSourceOffset,
+                       SourceId id = InvalidSourceId)
 {
 	assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id);
 }
 
-static void assertEnd(TokenizedDataReader &reader)
+inline void assertTextEndAtWhitespace(
+    TokenizedDataReader &reader, const std::string &text,
+    const TokenSet &tokens = TokenSet{},
+    WhitespaceMode mode = WhitespaceMode::TRIM,
+    SourceOffset start = InvalidSourceOffset,
+    SourceOffset end = InvalidSourceOffset, SourceId id = InvalidSourceId)
+{
+	assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id, true);
+}
+
+inline void assertEnd(TokenizedDataReader &reader)
 {
 	Token token;
 	ASSERT_TRUE(reader.atEnd());
 	ASSERT_FALSE(reader.read(token));
 }
 
-
 }
 
 #endif /* _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ */
```
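The new `endAtWhitespace` unit test makes the semantics concrete: for the input `"    a    b c   d"` under `WhitespaceMode::PRESERVE`, the reader yields `"    a"` (offsets 0 to 5), `"    b"` (5 to 10), `" c"` (10 to 12) and `"   d"` (12 to 16), so every chunk carries its leading whitespace and ends just before the whitespace that follows its non-whitespace run. The switch of the test helpers from `static` to `inline` looks like an orthogonal cleanup: a `static` function defined in a header gives every including translation unit its own copy (and unused-function warnings where a helper goes uncalled), while `inline` permits the definitions to be merged across translation units.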
