-rw-r--r--  src/core/parser/utils/Tokenizer.cpp      |  45
-rw-r--r--  test/core/parser/utils/TokenizerTest.cpp | 148
2 files changed, 150 insertions(+), 43 deletions(-)
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 94d9cb0..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
const size_t dataStartOffset = data.size();
// If we do not have a match yet, start a new lookup from the root
- if (!bestMatch.hasMatch()) {
+ if (!bestMatch.hasMatch() || !bestMatch.primary) {
lookups.emplace_back(root, charStart, dataStartOffset);
}
@@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
continue;
}
- // If the matched token is primary, check whether it is better than
- // the current best match, if yes, replace the best match. In any
- // case just continue
- if (match.primary) {
- if (match.size() > bestMatch.size()) {
- bestMatch = match;
- }
- continue;
+ // Replace the best match if the current match is longer
+ if (match.size() > bestMatch.size()) {
+ bestMatch = match;
}
- // Otherwise -- if the matched token is a non-primary token (and no
- // primary token has been found until now) -- mark the match in the
- // TokenizedData
- if (!bestMatch.hasMatch()) {
+ // If the matched token is a non-primary token -- mark the match in
+ // the TokenizedData list
+ if (!match.primary) {
data.mark(match.token.id, data.size() - match.size() + 1,
match.size());
}
}
- // We have found a token and there are no more states to advance or the
- // text handler has found something -- abort to return the new token
- if (bestMatch.hasMatch()) {
- if ((nextLookups.empty() || data.size() > initialDataSize)) {
+
+ // If a token has been found and it is a primary token, check whether
+ // we have to abort; otherwise, if we have a non-primary match, reset
+ // it once it can no longer be advanced
+ if (bestMatch.hasMatch() && nextLookups.empty()) {
+ if (bestMatch.primary) {
break;
+ } else {
+ bestMatch = TokenMatch{};
}
- } else {
- // Record all incomming characters
- data.append(c, charStart, charEnd);
}
+ // Record all incoming characters
+ data.append(c, charStart, charEnd);
+
// Swap the lookups and the nextLookups list
lookups = std::move(nextLookups);
nextLookups.clear();
@@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
// If we found data, emit a corresponding data token
if (data.size() > initialDataSize &&
- (!bestMatch.hasMatch() ||
+ (!bestMatch.hasMatch() || !bestMatch.primary ||
bestMatch.dataStartOffset > initialDataSize)) {
// If we have a "bestMatch" which starts after text data has started,
// trim the TokenizedData to this offset
- if (bestMatch.dataStartOffset > initialDataSize) {
+ if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
data.trim(bestMatch.dataStartOffset);
}
// Create a token containing the data location
bestMatch.token = Token{data.getLocation()};
- } else if (bestMatch.hasMatch() &&
+ } else if (bestMatch.hasMatch() && bestMatch.primary &&
bestMatch.dataStartOffset == initialDataSize) {
data.trim(initialDataSize);
}
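
Taken together, the hunks above move non-primary matches into the same longest-match competition as primary ones, while still letting only primary matches terminate the scan. The following condensed sketch restates that control flow outside the surrounding state machine; TokenMatch is reduced to the two fields this commit cares about, and markNonPrimary is a hypothetical callback standing in for TokenizedData::mark(), not the real interface.

#include <cstddef>
#include <utility>

// Simplified stand-in: the real TokenMatch also carries the matched token
// and the dataStartOffset used for trimming.
struct TokenMatch {
	bool primary = false;    // primary tokens may end the scan
	std::size_t length = 0;  // number of matched characters
	bool hasMatch() const { return length > 0; }
	std::size_t size() const { return length; }
};

// Every match now competes for "longest match"; non-primary matches are
// additionally marked inline in the character data.
template <typename MarkFn>
void handleMatch(const TokenMatch &match, TokenMatch &bestMatch,
                 MarkFn &&markNonPrimary)
{
	if (match.size() > bestMatch.size()) {
		bestMatch = match;
	}
	if (!match.primary) {
		std::forward<MarkFn>(markNonPrimary)(match);
	}
}

// Once no lookup can be advanced, only a primary best match aborts the
// loop and becomes a standalone token; a non-primary best match is reset,
// since it already lives inside the TokenizedData buffer.
bool shouldEmitToken(TokenMatch &bestMatch, bool lookupsExhausted)
{
	if (bestMatch.hasMatch() && lookupsExhausted) {
		if (bestMatch.primary) {
			return true;
		}
		bestMatch = TokenMatch{};
	}
	return false;
}
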
diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 9f644c2..45fc77a 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -26,6 +26,60 @@
namespace ousia {
+static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer,
+ TokenId id, const std::string &text,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId sourceId = InvalidSourceId)
+{
+ Token token;
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ EXPECT_EQ(id, token.id);
+ EXPECT_EQ(text, token.content);
+ if (start != InvalidSourceOffset) {
+ EXPECT_EQ(start, token.getLocation().getStart());
+ }
+ if (end != InvalidSourceOffset) {
+ EXPECT_EQ(end, token.getLocation().getEnd());
+ }
+ EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+static void expectData(const std::string &expected, SourceOffset tokenStart,
+ SourceOffset tokenEnd, SourceOffset textStart,
+ SourceOffset textEnd, const Token &token,
+ TokenizedData &data,
+ WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ Token textToken;
+ TokenizedDataReader reader = data.reader();
+ ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
+
+ EXPECT_EQ(expected, textToken.content);
+ EXPECT_EQ(tokenStart, token.location.getStart());
+ EXPECT_EQ(tokenEnd, token.location.getEnd());
+ EXPECT_EQ(textStart, textToken.getLocation().getStart());
+ EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
+ EXPECT_TRUE(reader.atEnd());
+}
+
+static void assertDataToken(CharReader &reader, Tokenizer &tokenizer,
+ const std::string &expected,
+ SourceOffset tokenStart, SourceOffset tokenEnd,
+ SourceOffset textStart, SourceOffset textEnd,
+ WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+ Token token;
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+
+ expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data,
+ mode);
+}
+
TEST(Tokenizer, tokenRegistration)
{
Tokenizer tokenizer;
@@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration)
ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
}
-void expectData(const std::string &expected, SourceOffset tokenStart,
- SourceOffset tokenEnd, SourceOffset textStart,
- SourceOffset textEnd, const Token &token, TokenizedData &data,
- WhitespaceMode mode = WhitespaceMode::PRESERVE)
-{
- ASSERT_EQ(Tokens::Data, token.id);
-
- Token textToken;
- TokenizedDataReader reader = data.reader();
- ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
-
- EXPECT_EQ(expected, textToken.content);
- EXPECT_EQ(tokenStart, token.location.getStart());
- EXPECT_EQ(tokenEnd, token.location.getEnd());
- EXPECT_EQ(textStart, textToken.getLocation().getStart());
- EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
- EXPECT_TRUE(reader.atEnd());
-}
-
TEST(Tokenizer, textTokenPreserveWhitespace)
{
{
@@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens)
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
+TEST(Tokenizer, primaryNonPrimaryTokenInteraction)
+{
+ CharReader reader{"<<test1>><test2><<test3\\><<<test4>>>"};
+ // 01234567890123456789012 3456789012345
+ // 0         1         2          3
+
+ Tokenizer tokenizer;
+
+ TokenId tP1 = tokenizer.registerToken("<", true);
+ TokenId tP2 = tokenizer.registerToken(">", true);
+ TokenId tP3 = tokenizer.registerToken("\\>", true);
+ TokenId tN1 = tokenizer.registerToken("<<", false);
+ TokenId tN2 = tokenizer.registerToken(">>", false);
+
+ TokenSet tokens = TokenSet{tN1, tN2};
+
+ Token token, textToken;
+ {
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ TokenizedDataReader dataReader = data.reader();
+ assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2);
+ assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7);
+ assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9);
+ assertEnd(dataReader);
+ }
+
+ assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10);
+ assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15);
+ assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16);
+
+ {
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ TokenizedDataReader dataReader = data.reader();
+ assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16, 18);
+ assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23);
+ assertEnd(dataReader);
+ }
+
+ assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25);
+
+ {
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ TokenizedDataReader dataReader = data.reader();
+ assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25, 27);
+ assertEnd(dataReader);
+ }
+
+ assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28);
+
+ {
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ TokenizedDataReader dataReader = data.reader();
+ assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33);
+ assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33, 35);
+ assertEnd(dataReader);
+ }
+
+ assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36);
+
+ TokenizedData data;
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
+}
TEST(Tokenizer, ambiguousTokens2)
{
@@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2)
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
-
}
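
For readers skimming the new test: the contract it pins down is that Tokenizer::read() only ever returns primary tokens and data tokens, while non-primary tokens (here "<<" and ">>") are interleaved with the character data and must be pulled out of the TokenizedData buffer afterwards. A minimal consumer sketch, assuming the same interfaces the test uses (inside the ousia namespace, with the appropriate headers included), might look like this; it is not a verbatim excerpt from the library.

void consume(CharReader &reader, Tokenizer &tokenizer, TokenId tNonPrimary)
{
	while (true) {
		Token token;
		TokenizedData data;  // fresh buffer per call, as in the tests
		if (!tokenizer.read(reader, token, data)) {
			break;  // end of input
		}
		if (token.id == Tokens::Data) {
			// Non-primary tokens never surface here; they are embedded
			// in the character data and read back via the data reader.
			TokenizedDataReader dataReader = data.reader();
			Token inner;
			while (dataReader.read(inner, TokenSet{tNonPrimary},
			                       WhitespaceMode::PRESERVE)) {
				// inner.id is either Tokens::Data or tNonPrimary
			}
		} else {
			// token is a primary token, such as "<" in the test above
		}
	}
}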