diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-04-12 18:47:29 +0200 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2016-04-25 22:24:16 +0200 |
commit | 667d9c4a082552fb64c5ffe7b0bd6212c8a8b1b3 (patch) | |
tree | 100e8e3fbd86970dec9ef97c773419ac2bba291b | |
parent | 0884afe16263a110597671f60dcb4ff7df66f456 (diff) |
Implement endAtWhitespace flag which tells TokenizedDataReader to stop reading data after the first whitespace character
-rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 30 | ||||
-rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 12 | ||||
-rw-r--r-- | test/core/parser/utils/TokenizedDataTest.cpp | 43 | ||||
-rw-r--r-- | test/core/parser/utils/TokenizedDataTestUtils.hpp | 41 |
4 files changed, 93 insertions, 33 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index 276cd54..7c7d4a7 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -367,11 +367,13 @@ public: * @param cursor is the position in the character buffer from which on the * next token should be read. The cursor will be updated to the position * beyond the returned token. + * @param endAtWhitespace if true, only delivers data up to the next + * whitespace. * @return true if a token was returned, false if no more tokens are * available. */ bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, - TokenizedDataCursor &cursor) const + TokenizedDataCursor &cursor, bool endAtWhitespace) const { // Some variables for convenient access size_t &bufPos = cursor.bufPos; @@ -394,12 +396,28 @@ public: // Calculate the buffer start and end character, based on the returned // TokenMark instance - const size_t end = (it != marks.end()) ? it->bufStart : buf.size(); + size_t end = (it != marks.end()) ? it->bufStart : buf.size(); // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data. 
if (bufPos < end && bufPos < buf.size()) { + // If endAtWhitespace is set to true, limit copying to the first + whitespace character after non-whitespace + if (endAtWhitespace) { + bool hasNonWhitespace = false; + for (size_t i = bufPos; i < end; i++) { + const bool isWhitespace = Utils::isWhitespace(buf[i]); + if (isWhitespace) { + if (hasNonWhitespace) { + end = i; + break; + } + } else { + hasNonWhitespace = true; + } + } + } switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( @@ -685,15 +703,15 @@ bool TokenizedDataReader::atEnd() const } bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, - WhitespaceMode mode) + WhitespaceMode mode, bool endAtWhitespace) { peekCursor = readCursor; - return impl->next(token, mode, tokens, readCursor); + return impl->next(token, mode, tokens, readCursor, endAtWhitespace); } bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, - WhitespaceMode mode) + WhitespaceMode mode, bool endAtWhitespace) { - return impl->next(token, mode, tokens, peekCursor); + return impl->next(token, mode, tokens, peekCursor, endAtWhitespace); } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 95af95e..83821d7 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -307,11 +307,15 @@ public: * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. + * @param endAtWhitespace if true, only delivers data until the first + * whitespace character after a sequence of non-whitespace characters. Does + * not affect the delivery of non-data tokens. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. 
*/ bool read(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM); + WhitespaceMode mode = WhitespaceMode::TRIM, + bool endAtWhitespace = false); /** * Stores the next token in the given token reference, returns true if the @@ -323,11 +327,15 @@ public: * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. + * @param endAtWhitespace if true, only delivers data until the first + * whitespace character after a sequence of non-whitespace characters. Does + * not affect the delivery of non-data tokens. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ bool peek(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM); + WhitespaceMode mode = WhitespaceMode::TRIM, + bool endAtWhitespace = false); /** * Consumes the peeked tokens, the read cursor will now be at the position diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index 31346bd..e468a50 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -348,12 +348,13 @@ TEST(TokenizedData, specialTokenIndent) const TokenSet tokens{Tokens::Indent, Tokens::Dedent}; TokenizedDataReader reader = data.reader(); - assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, - 4, 4); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4, + 4); assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 10, 10); - assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, + 10, 37); assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, 37, 37); assertToken(reader, Tokens::Dedent, "", 
tokens, WhitespaceMode::COLLAPSE, @@ -372,11 +373,11 @@ TEST(TokenizedData, specialTokenIndent2) TokenizedDataReader reader = data.reader(); assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); - assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, - 3, 3); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 3, + 3); assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 3, 4); - assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, - 7, 7); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 7, + 7); assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 7, 8); assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 12, 12); @@ -403,12 +404,13 @@ TEST(TokenizedData, specialTokenIndentOverlap) data.mark(5, 4, 4); TokenizedDataReader reader = data.reader(); - assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, - 4, 4); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4, + 4); assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 10, 10); - assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, + 10, 37); assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, 37, 37); assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, @@ -417,5 +419,26 @@ TEST(TokenizedData, specialTokenIndentOverlap) assertEnd(reader); } +TEST(TokenizedData, endAtWhitespace) +{ + TokenizedData data; + data.append(" a b c d"); + // 0123456789012345 + // 0 1 + const TokenSet tokens{}; + + data.mark(5, 4, 4); + + TokenizedDataReader reader = data.reader(); + assertTextEndAtWhitespace(reader, " a", tokens, WhitespaceMode::PRESERVE, + 0, 5); + assertTextEndAtWhitespace(reader, " b", tokens, 
WhitespaceMode::PRESERVE, + 5, 10); + assertTextEndAtWhitespace(reader, " c", tokens, WhitespaceMode::PRESERVE, + 10, 12); + assertTextEndAtWhitespace(reader, " d", tokens, WhitespaceMode::PRESERVE, + 12, 16); + assertEnd(reader); +} } diff --git a/test/core/parser/utils/TokenizedDataTestUtils.hpp b/test/core/parser/utils/TokenizedDataTestUtils.hpp index c384f9d..30f72ae 100644 --- a/test/core/parser/utils/TokenizedDataTestUtils.hpp +++ b/test/core/parser/utils/TokenizedDataTestUtils.hpp @@ -21,15 +21,17 @@ namespace ousia { -static void assertToken(TokenizedDataReader &reader, TokenId id, - const std::string &text, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset, - SourceId sourceId = InvalidSourceId) +inline void assertToken(TokenizedDataReader &reader, TokenId id, + const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId, + bool endAtWhitespace = false) { Token token; - ASSERT_TRUE(reader.read(token, tokens, mode)); + ASSERT_TRUE(reader.read(token, tokens, mode, endAtWhitespace)); EXPECT_EQ(id, token.id); EXPECT_EQ(text, token.content); if (start != InvalidSourceOffset) { @@ -41,23 +43,32 @@ static void assertToken(TokenizedDataReader &reader, TokenId id, EXPECT_EQ(sourceId, token.getLocation().getSourceId()); } -static void assertText(TokenizedDataReader &reader, const std::string &text, - const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset, - SourceId id = InvalidSourceId) +inline void assertText(TokenizedDataReader &reader, const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start 
= InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId id = InvalidSourceId) { assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); } -static void assertEnd(TokenizedDataReader &reader) +inline void assertTextEndAtWhitespace( + TokenizedDataReader &reader, const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, SourceId id = InvalidSourceId) +{ + assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id, true); +} + +inline void assertEnd(TokenizedDataReader &reader) { Token token; ASSERT_TRUE(reader.atEnd()); ASSERT_FALSE(reader.read(token)); } - } #endif /* _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ */ |