From 31c83c05d257c9a7a336f12342c401f97d380674 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>
Date: Sun, 1 Mar 2015 13:50:15 +0100
Subject: Prefer longer non-primary tokens

---
 src/core/parser/utils/Tokenizer.cpp      |  45 +++++-----
 test/core/parser/utils/TokenizerTest.cpp | 148 ++++++++++++++++++++++++++-----
 2 files changed, 150 insertions(+), 43 deletions(-)

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 94d9cb0..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!bestMatch.hasMatch()) {
+		if (!bestMatch.hasMatch() || !bestMatch.primary) {
 			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
@@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 				continue;
 			}
 
-			// If the matched token is primary, check whether it is better than
-			// the current best match, if yes, replace the best match. In any
-			// case just continue
-			if (match.primary) {
-				if (match.size() > bestMatch.size()) {
-					bestMatch = match;
-				}
-				continue;
+			// Replace the best match if this match is longer
+			if (match.size() > bestMatch.size()) {
+				bestMatch = match;
 			}
 
-			// Otherwise -- if the matched token is a non-primary token (and no
-			// primary token has been found until now) -- mark the match in the
-			// TokenizedData
-			if (!bestMatch.hasMatch()) {
+			// If the matched token is a non-primary token -- mark the match in
+			// the TokenizedData list
+			if (!match.primary) {
 				data.mark(match.token.id, data.size() - match.size() + 1,
 				          match.size());
 			}
 		}
 
-		// We have found a token and there are no more states to advance or the
-		// text handler has found something -- abort to return the new token
-		if (bestMatch.hasMatch()) {
-			if ((nextLookups.empty() || data.size() > initialDataSize)) {
+
+		// Once no lookup can be advanced any further: abort if the best match
+		// is a primary token, otherwise (non-primary best match) reset it and
+		// continue reading
+		if (bestMatch.hasMatch() && nextLookups.empty()) {
+			if (bestMatch.primary) {
 				break;
+			} else {
+				bestMatch = TokenMatch{};
 			}
-		} else {
-			// Record all incomming characters
-			data.append(c, charStart, charEnd);
 		}
 
+		// Record all incoming characters
+		data.append(c, charStart, charEnd);
+
+
 		// Swap the lookups and the nextLookups list
 		lookups = std::move(nextLookups);
 		nextLookups.clear();
@@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 	// If we found data, emit a corresponding data token
 	if (data.size() > initialDataSize &&
-	    (!bestMatch.hasMatch() ||
+	    (!bestMatch.hasMatch() || !bestMatch.primary ||
 	     bestMatch.dataStartOffset > initialDataSize)) {
 		// If we have a "bestMatch" wich starts after text data has started,
 		// trim the TokenizedData to this offset
-		if (bestMatch.dataStartOffset > initialDataSize) {
+		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
 			data.trim(bestMatch.dataStartOffset);
 		}
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
-	} else if (bestMatch.hasMatch() &&
+	} else if (bestMatch.hasMatch() && bestMatch.primary &&
 	           bestMatch.dataStartOffset == initialDataSize) {
 		data.trim(initialDataSize);
 	}
diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 9f644c2..45fc77a 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -26,6 +26,60 @@
 
 namespace ousia {
 
+static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer,
+                               TokenId id, const std::string &text,
+                               SourceOffset start = InvalidSourceOffset,
+                               SourceOffset end = InvalidSourceOffset,
+                               SourceId sourceId = InvalidSourceId)
+{  // Reads the next token and checks its id, content and location.
+	Token token;
+	TokenizedData data;  // filled by read(), not inspected here
+	ASSERT_TRUE(tokenizer.read(reader, token, data));  // a token must be read
+	EXPECT_EQ(id, token.id);
+	EXPECT_EQ(text, token.content);
+	if (start != InvalidSourceOffset) {  // start offset is optional
+		EXPECT_EQ(start, token.getLocation().getStart());
+	}
+	if (end != InvalidSourceOffset) {  // end offset is optional
+		EXPECT_EQ(end, token.getLocation().getEnd());
+	}
+	EXPECT_EQ(sourceId, token.getLocation().getSourceId());  // always checked
+}
+
+static void expectData(const std::string &expected, SourceOffset tokenStart,
+                       SourceOffset tokenEnd, SourceOffset textStart,
+                       SourceOffset textEnd, const Token &token,
+                       TokenizedData &data,
+                       WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{  // Checks a data token and the single text segment stored in "data".
+	ASSERT_EQ(Tokens::Data, token.id);
+
+	Token textToken;
+	TokenizedDataReader reader = data.reader();
+	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));  // empty token set
+
+	EXPECT_EQ(expected, textToken.content);
+	EXPECT_EQ(tokenStart, token.location.getStart());  // span of outer token
+	EXPECT_EQ(tokenEnd, token.location.getEnd());
+	EXPECT_EQ(textStart, textToken.getLocation().getStart());  // span of text
+	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
+	EXPECT_TRUE(reader.atEnd());  // nothing may follow the text
+}
+
+static void assertDataToken(CharReader &reader, Tokenizer &tokenizer,
+                            const std::string &expected,
+                            SourceOffset tokenStart, SourceOffset tokenEnd,
+                            SourceOffset textStart, SourceOffset textEnd,
+                            WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{  // Reads the next token and verifies it with expectData().
+	Token token;
+	TokenizedData data;
+	ASSERT_TRUE(tokenizer.read(reader, token, data));  // a token must be read
+
+	expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data,
+	           mode);
+}
+
 TEST(Tokenizer, tokenRegistration)
 {
 	Tokenizer tokenizer;
@@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration)
 	ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
 }
 
-void expectData(const std::string &expected, SourceOffset tokenStart,
-                SourceOffset tokenEnd, SourceOffset textStart,
-                SourceOffset textEnd, const Token &token, TokenizedData &data,
-                WhitespaceMode mode = WhitespaceMode::PRESERVE)
-{
-	ASSERT_EQ(Tokens::Data, token.id);
-
-	Token textToken;
-	TokenizedDataReader reader = data.reader();
-	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
-
-	EXPECT_EQ(expected, textToken.content);
-	EXPECT_EQ(tokenStart, token.location.getStart());
-	EXPECT_EQ(tokenEnd, token.location.getEnd());
-	EXPECT_EQ(textStart, textToken.getLocation().getStart());
-	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
-	EXPECT_TRUE(reader.atEnd());
-}
-
 TEST(Tokenizer, textTokenPreserveWhitespace)
 {
 	{
@@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens)
 	ASSERT_FALSE(tokenizer.read(reader, token, data));
 }
 
+TEST(Tokenizer, primaryNonPrimaryTokenInteraction)
+{  // Checks how overlapping primary and non-primary tokens are split.
+	CharReader reader{"<<test1>><test2><<test3\\><<<test4>>>"};
+	//                 01234567890123456789012 3456789012345
+	//                 0         1         2          3
+
+	Tokenizer tokenizer;
+
+	TokenId tP1 = tokenizer.registerToken("<", true);  // presumably primary
+	TokenId tP2 = tokenizer.registerToken(">", true);
+	TokenId tP3 = tokenizer.registerToken("\\>", true);
+	TokenId tN1 = tokenizer.registerToken("<<", false);  // non-primary (assumed)
+	TokenId tN2 = tokenizer.registerToken(">>", false);
+
+	TokenSet tokens = TokenSet{tN1, tN2};  // token set handed to the data reader
+
+	Token token;
+	{  // "<<test1>>": one data token containing "<<", "test1" and ">>"
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2);
+		assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10);  // "<test2>"
+	assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15);
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16);
+
+	{  // "<<test3": the data token ends before the primary "\>"
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16, 18);
+		assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25);
+
+	{  // "<<<": the first two characters form the non-primary "<<"
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25, 27);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28);  // third "<"
+
+	{  // "test4>>": text plus non-primary ">>"; the last ">" is primary
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33, 35);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36);
+
+	TokenizedData data;
+	ASSERT_FALSE(tokenizer.read(reader, token, data));  // no further token
+}
 
 TEST(Tokenizer, ambiguousTokens2)
 {
@@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2)
 		ASSERT_FALSE(tokenizer.read(reader, token, data));
 	}
 }
-
 }
 
-- 
cgit v1.2.3