diff options
| -rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 45 | ||||
| -rw-r--r-- | test/core/parser/utils/TokenizerTest.cpp | 148 | 
2 files changed, 150 insertions, 43 deletions
| diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 94d9cb0..8d540a6 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  		const size_t dataStartOffset = data.size();  		// If we do not have a match yet, start a new lookup from the root -		if (!bestMatch.hasMatch()) { +		if (!bestMatch.hasMatch() || !bestMatch.primary) {  			lookups.emplace_back(root, charStart, dataStartOffset);  		} @@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  				continue;  			} -			// If the matched token is primary, check whether it is better than -			// the current best match, if yes, replace the best match. In any -			// case just continue -			if (match.primary) { -				if (match.size() > bestMatch.size()) { -					bestMatch = match; -				} -				continue; +			// Replace the best match with longest token +			if (match.size() > bestMatch.size()) { +				bestMatch = match;  			} -			// Otherwise -- if the matched token is a non-primary token (and no -			// primary token has been found until now) -- mark the match in the -			// TokenizedData -			if (!bestMatch.hasMatch()) { +			// If the matched token is a non-primary token -- mark the match in +			// the TokenizedData list +			if (!match.primary) {  				data.mark(match.token.id, data.size() - match.size() + 1,  				          match.size());  			}  		} -		// We have found a token and there are no more states to advance or the -		// text handler has found something -- abort to return the new token -		if (bestMatch.hasMatch()) { -			if ((nextLookups.empty() || data.size() > initialDataSize)) { + +		// If a token has been found and the token is a primary token, check +		// whether we have to abort, otherwise if we have a non-primary match, +		// reset it once it can no longer be advanced +		if (bestMatch.hasMatch() && nextLookups.empty()) { +			if (bestMatch.primary) {  				break; +			} else { +				bestMatch = TokenMatch{};  			} -		} else { -			// Record all incomming characters -			data.append(c, charStart, charEnd);  		} +		// Record all incomming characters +		data.append(c, charStart, charEnd); + +  		// Swap the lookups and the nextLookups list  		lookups = std::move(nextLookups);  		nextLookups.clear(); @@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  	// If we found data, emit a corresponding data token  	if (data.size() > initialDataSize && -	    (!bestMatch.hasMatch() || +	    (!bestMatch.hasMatch() || !bestMatch.primary ||  	     bestMatch.dataStartOffset > initialDataSize)) {  		// If we have a "bestMatch" wich starts after text data has started,  		// trim the TokenizedData to this offset -		if (bestMatch.dataStartOffset > initialDataSize) { +		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {  			data.trim(bestMatch.dataStartOffset);  		}  		// Create a token containing the data location  		bestMatch.token = Token{data.getLocation()}; -	} else if (bestMatch.hasMatch() && +	} else if (bestMatch.hasMatch() && bestMatch.primary &&  	           bestMatch.dataStartOffset == initialDataSize) {  		data.trim(initialDataSize);  	} diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 9f644c2..45fc77a 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -26,6 +26,60 @@  namespace ousia { +static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer, +                               TokenId id, const std::string &text, +                               SourceOffset start = InvalidSourceOffset, +                               SourceOffset end = InvalidSourceOffset, +                               SourceId sourceId = InvalidSourceId) +{ +	Token token; +	TokenizedData data; +	ASSERT_TRUE(tokenizer.read(reader, token, data)); +	EXPECT_EQ(id, token.id); +	EXPECT_EQ(text, token.content); +	if (start != InvalidSourceOffset) { +		EXPECT_EQ(start, token.getLocation().getStart()); +	} +	if (end != InvalidSourceOffset) { +		EXPECT_EQ(end, token.getLocation().getEnd()); +	} +	EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +static void expectData(const std::string &expected, SourceOffset tokenStart, +                       SourceOffset tokenEnd, SourceOffset textStart, +                       SourceOffset textEnd, const Token &token, +                       TokenizedData &data, +                       WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ +	ASSERT_EQ(Tokens::Data, token.id); + +	Token textToken; +	TokenizedDataReader reader = data.reader(); +	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode)); + +	EXPECT_EQ(expected, textToken.content); +	EXPECT_EQ(tokenStart, token.location.getStart()); +	EXPECT_EQ(tokenEnd, token.location.getEnd()); +	EXPECT_EQ(textStart, textToken.getLocation().getStart()); +	EXPECT_EQ(textEnd, textToken.getLocation().getEnd()); +	EXPECT_TRUE(reader.atEnd()); +} + +static void assertDataToken(CharReader &reader, Tokenizer &tokenizer, +                            const std::string &expected, +                            SourceOffset tokenStart, SourceOffset tokenEnd, +                            SourceOffset textStart, SourceOffset textEnd, +                            WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ +	Token token; +	TokenizedData data; +	ASSERT_TRUE(tokenizer.read(reader, token, data)); + +	expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data, +	           mode); +} +  TEST(Tokenizer, tokenRegistration)  {  	Tokenizer tokenizer; @@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration)  	ASSERT_EQ("d", tokenizer.lookupToken(1U).string);  } -void expectData(const std::string &expected, SourceOffset tokenStart, -                SourceOffset tokenEnd, SourceOffset textStart, -                SourceOffset textEnd, const Token &token, TokenizedData &data, -                WhitespaceMode mode = WhitespaceMode::PRESERVE) -{ -	ASSERT_EQ(Tokens::Data, token.id); - -	Token textToken; -	TokenizedDataReader reader = data.reader(); -	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode)); - -	EXPECT_EQ(expected, textToken.content); -	EXPECT_EQ(tokenStart, token.location.getStart()); -	EXPECT_EQ(tokenEnd, token.location.getEnd()); -	EXPECT_EQ(textStart, textToken.getLocation().getStart()); -	EXPECT_EQ(textEnd, textToken.getLocation().getEnd()); -	EXPECT_TRUE(reader.atEnd()); -} -  TEST(Tokenizer, textTokenPreserveWhitespace)  {  	{ @@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens)  	ASSERT_FALSE(tokenizer.read(reader, token, data));  } +TEST(Tokenizer, primaryNonPrimaryTokenInteraction) +{ +	CharReader reader{"<<test1>><test2><<test3\\><<<test4>>>"}; +	//                 01234567890123456789012 3456789012345 +	//                 0         1         2          3 + +	Tokenizer tokenizer; + +	TokenId tP1 = tokenizer.registerToken("<", true); +	TokenId tP2 = tokenizer.registerToken(">", true); +	TokenId tP3 = tokenizer.registerToken("\\>", true); +	TokenId tN1 = tokenizer.registerToken("<<", false); +	TokenId tN2 = tokenizer.registerToken(">>", false); + +	TokenSet tokens = TokenSet{tN1, tN2}; + +	Token token, textToken; +	{ +		TokenizedData data; +		ASSERT_TRUE(tokenizer.read(reader, token, data)); +		ASSERT_EQ(Tokens::Data, token.id); + +		TokenizedDataReader dataReader = data.reader(); +		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2); +		assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7); +		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9); +		assertEnd(dataReader); +	} + +	assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10); +	assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15); +	assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16); + +	{ +		TokenizedData data; +		ASSERT_TRUE(tokenizer.read(reader, token, data)); +		ASSERT_EQ(Tokens::Data, token.id); + +		TokenizedDataReader dataReader = data.reader(); +		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16, 18); +		assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23); +		assertEnd(dataReader); +	} + +	assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25); + +	{ +		TokenizedData data; +		ASSERT_TRUE(tokenizer.read(reader, token, data)); +		ASSERT_EQ(Tokens::Data, token.id); + +		TokenizedDataReader dataReader = data.reader(); +		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25, 27); +		assertEnd(dataReader); +	} + +	assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28); + +	{ +		TokenizedData data; +		ASSERT_TRUE(tokenizer.read(reader, token, data)); +		ASSERT_EQ(Tokens::Data, token.id); + +		TokenizedDataReader dataReader = data.reader(); +		assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33); +		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33, 35); +		assertEnd(dataReader); +	} + +	assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36); + +	TokenizedData data; +	ASSERT_FALSE(tokenizer.read(reader, token, data)); +}  TEST(Tokenizer, ambiguousTokens2)  { @@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2)  		ASSERT_FALSE(tokenizer.read(reader, token, data));  	}  } -  } | 
