summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-04-12 18:47:29 +0200
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2016-04-25 22:24:16 +0200
commit667d9c4a082552fb64c5ffe7b0bd6212c8a8b1b3 (patch)
tree100e8e3fbd86970dec9ef97c773419ac2bba291b
parent0884afe16263a110597671f60dcb4ff7df66f456 (diff)
Implement endAtWhitespace flag which tells TokenizedDataReader to stop reading data after the first whitespace character
-rw-r--r--src/core/parser/utils/TokenizedData.cpp30
-rw-r--r--src/core/parser/utils/TokenizedData.hpp12
-rw-r--r--test/core/parser/utils/TokenizedDataTest.cpp43
-rw-r--r--test/core/parser/utils/TokenizedDataTestUtils.hpp41
4 files changed, 93 insertions, 33 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index 276cd54..7c7d4a7 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -367,11 +367,13 @@ public:
* @param cursor is the position in the character buffer from which on the
* next token should be read. The cursor will be updated to the position
* beyond the returned token.
+ * @param endAtWhitespace if true, only delivers data up to the next
+ * whitespace.
* @return true if a token was returned, false if no more tokens are
* available.
*/
bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
- TokenizedDataCursor &cursor) const
+ TokenizedDataCursor &cursor, bool endAtWhitespace) const
{
// Some variables for convenient access
size_t &bufPos = cursor.bufPos;
@@ -394,12 +396,28 @@ public:
// Calculate the buffer start and end character, based on the returned
// TokenMark instance
- const size_t end = (it != marks.end()) ? it->bufStart : buf.size();
+ size_t end = (it != marks.end()) ? it->bufStart : buf.size();
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
if (bufPos < end && bufPos < buf.size()) {
+ // If endAtWhitespace is set to true, limit copying to the first
+ // whitespace character after non-whitespace
+ if (endAtWhitespace) {
+ bool hasNonWhitespace = false;
+ for (size_t i = bufPos; i < end; i++) {
+ const bool isWhitespace = Utils::isWhitespace(buf[i]);
+ if (isWhitespace) {
+ if (hasNonWhitespace) {
+ end = i;
+ break;
+ }
+ } else {
+ hasNonWhitespace = true;
+ }
+ }
+ }
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
@@ -685,15 +703,15 @@ bool TokenizedDataReader::atEnd() const
}
bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
- WhitespaceMode mode)
+ WhitespaceMode mode, bool endAtWhitespace)
{
peekCursor = readCursor;
- return impl->next(token, mode, tokens, readCursor);
+ return impl->next(token, mode, tokens, readCursor, endAtWhitespace);
}
bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
- WhitespaceMode mode)
+ WhitespaceMode mode, bool endAtWhitespace)
{
- return impl->next(token, mode, tokens, peekCursor);
+ return impl->next(token, mode, tokens, peekCursor, endAtWhitespace);
}
}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 95af95e..83821d7 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -307,11 +307,15 @@ public:
* enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
+ * @param endAtWhitespace if true, only delivers data until the first
+ * whitespace character after a sequence of non-whitespace characters. Does
+ * not affect the delivery of non-data tokens.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
bool read(Token &token, const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::TRIM);
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ bool endAtWhitespace = false);
/**
* Stores the next token in the given token reference, returns true if the
@@ -323,11 +327,15 @@ public:
* enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
+ * @param endAtWhitespace if true, only delivers data until the first
+ * whitespace character after a sequence of non-whitespace characters. Does
+ * not affect the delivery of non-data tokens.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
bool peek(Token &token, const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::TRIM);
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ bool endAtWhitespace = false);
/**
* Consumes the peeked tokens, the read cursor will now be at the position
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
index 31346bd..e468a50 100644
--- a/test/core/parser/utils/TokenizedDataTest.cpp
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -348,12 +348,13 @@ TEST(TokenizedData, specialTokenIndent)
const TokenSet tokens{Tokens::Indent, Tokens::Dedent};
TokenizedDataReader reader = data.reader();
- assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
- 4, 4);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4,
+ 4);
assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
10, 10);
- assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE,
+ 10, 37);
assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
37, 37);
assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
@@ -372,11 +373,11 @@ TEST(TokenizedData, specialTokenIndent2)
TokenizedDataReader reader = data.reader();
assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
- assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
- 3, 3);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 3,
+ 3);
assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 3, 4);
- assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
- 7, 7);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 7,
+ 7);
assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 7, 8);
assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
12, 12);
@@ -403,12 +404,13 @@ TEST(TokenizedData, specialTokenIndentOverlap)
data.mark(5, 4, 4);
TokenizedDataReader reader = data.reader();
- assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
- 4, 4);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, 4,
+ 4);
assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
10, 10);
- assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE,
+ 10, 37);
assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
37, 37);
assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
@@ -417,5 +419,26 @@ TEST(TokenizedData, specialTokenIndentOverlap)
assertEnd(reader);
}
+TEST(TokenizedData, endAtWhitespace)
+{
+ TokenizedData data;
+ data.append(" a b c d");
+ // 0123456789012345
+ // 0 1
+ const TokenSet tokens{};
+
+ data.mark(5, 4, 4);
+
+ TokenizedDataReader reader = data.reader();
+ assertTextEndAtWhitespace(reader, " a", tokens, WhitespaceMode::PRESERVE,
+ 0, 5);
+ assertTextEndAtWhitespace(reader, " b", tokens, WhitespaceMode::PRESERVE,
+ 5, 10);
+ assertTextEndAtWhitespace(reader, " c", tokens, WhitespaceMode::PRESERVE,
+ 10, 12);
+ assertTextEndAtWhitespace(reader, " d", tokens, WhitespaceMode::PRESERVE,
+ 12, 16);
+ assertEnd(reader);
+}
}
diff --git a/test/core/parser/utils/TokenizedDataTestUtils.hpp b/test/core/parser/utils/TokenizedDataTestUtils.hpp
index c384f9d..30f72ae 100644
--- a/test/core/parser/utils/TokenizedDataTestUtils.hpp
+++ b/test/core/parser/utils/TokenizedDataTestUtils.hpp
@@ -21,15 +21,17 @@
namespace ousia {
-static void assertToken(TokenizedDataReader &reader, TokenId id,
- const std::string &text, const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::TRIM,
- SourceOffset start = InvalidSourceOffset,
- SourceOffset end = InvalidSourceOffset,
- SourceId sourceId = InvalidSourceId)
+inline void assertToken(TokenizedDataReader &reader, TokenId id,
+ const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId sourceId = InvalidSourceId,
+ bool endAtWhitespace = false)
{
Token token;
- ASSERT_TRUE(reader.read(token, tokens, mode));
+ ASSERT_TRUE(reader.read(token, tokens, mode, endAtWhitespace));
EXPECT_EQ(id, token.id);
EXPECT_EQ(text, token.content);
if (start != InvalidSourceOffset) {
@@ -41,23 +43,32 @@ static void assertToken(TokenizedDataReader &reader, TokenId id,
EXPECT_EQ(sourceId, token.getLocation().getSourceId());
}
-static void assertText(TokenizedDataReader &reader, const std::string &text,
- const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::TRIM,
- SourceOffset start = InvalidSourceOffset,
- SourceOffset end = InvalidSourceOffset,
- SourceId id = InvalidSourceId)
+inline void assertText(TokenizedDataReader &reader, const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId id = InvalidSourceId)
{
assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id);
}
-static void assertEnd(TokenizedDataReader &reader)
+inline void assertTextEndAtWhitespace(
+ TokenizedDataReader &reader, const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset, SourceId id = InvalidSourceId)
+{
+ assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id, true);
+}
+
+inline void assertEnd(TokenizedDataReader &reader)
{
Token token;
ASSERT_TRUE(reader.atEnd());
ASSERT_FALSE(reader.read(token));
}
-
}
#endif /* _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ */