From 84c9abc3e9762c4486ddc5ca0352a5d697a51987 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Wed, 25 Feb 2015 23:09:26 +0100
Subject: start of branch, commit log will be rewritten

---
 src/core/parser/utils/Tokenizer.cpp | 264 ++++++++++++++++--------------------
 1 file changed, 118 insertions(+), 146 deletions(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
 #include <core/common/CharReader.hpp>
 #include <core/common/Exceptions.hpp>
 #include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include "TokenizedData.hpp"
 #include "Tokenizer.hpp"
 
 namespace ousia {
@@ -42,26 +42,33 @@ struct TokenMatch {
 	Token token;
 
 	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
 	 */
-	size_t textLength;
+	size_t dataStartOffset;
 
 	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
+	 * Set to true if the matched token is a primary token.
 	 */
-	size_t textEnd;
+	bool primary;
 
 	/**
 	 * Constructor of the TokenMatch class.
 	 */
-	TokenMatch() : textLength(0), textEnd(0) {}
+	TokenMatch() : dataStartOffset(0), primary(false) {}
 
 	/**
 	 * Returns true if this TokenMatch instance actually represents a match.
+	 *
+	 * @return true if the TokenMatch actually has a match.
+	 */
+	bool hasMatch() const { return token.id != Tokens::Empty; }
+
+	/**
+	 * Returns the length of the matched token.
+	 *
+	 * @return the length of the token string.
 	 */
-	bool hasMatch() { return token.id != Tokens::Empty; }
+	size_t size() const { return token.content.size(); }
 };
 
 /* Internal class TokenLookup */
@@ -83,36 +90,28 @@ private:
 	size_t start;
 
 	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
 	 */
-	size_t textLength;
-
-	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
-	 */
-	size_t textEnd;
+	size_t dataStartOffset;
 
 public:
	/**
 	 * Constructor of the TokenLookup class.
 	 *
 	 * @param node is the current node.
-	 * @param start is the start position.
-	 * @param textLength is the text buffer length of the previous text token.
-	 * @param textEnd is the current end location of the previous text token.
+	 * @param start is the start position in the source file.
+	 * @param dataStartOffset is the current length of the TokenizedData buffer.
 	 */
-	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
-	            size_t textEnd)
-	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
+	TokenLookup(const TokenTrie::Node *node, size_t start,
+	            size_t dataStartOffset)
+	    : node(node), start(start), dataStartOffset(dataStartOffset)
 	{
 	}
 
 	/**
 	 * Tries to extend the current path in the token trie with the given
-	 * character. If a complete token is matched, stores this match in the
-	 * tokens list (in case it is longer than any previous token).
+	 * character. If a complete token is matched, stores the match in the given
+	 * TokenMatch reference and returns true.
 	 *
 	 * @param c is the character that should be appended to the current prefix.
	 * @param lookups is a list to which new TokenLookup instances are added --
@@ -123,73 +122,49 @@ public:
 	 * Tokenizer.
 	 * @param end is the end byte offset of the current character.
 	 * @param sourceId is the source id of this file.
+	 * @return true if a token was matched, false otherwise.
 	 */
-	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
-	             const std::vector<std::string> &tokens, SourceOffset end,
-	             SourceId sourceId)
+	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+	             const std::vector<Tokenizer::TokenDescriptor> &tokens,
+	             SourceOffset end, SourceId sourceId)
 	{
-		// Check whether we can continue the current token path with the given
-		// character without visiting an already visited node
+		// Set to true once a token has been matched
+		bool res = false;
+
+		// Check whether we can continue the current token path, if not, abort
 		auto it = node->children.find(c);
 		if (it == node->children.end()) {
-			return;
+			return res;
 		}
 
 		// Check whether the new node represents a complete token and whether it
 		// is longer than the current token. If yes, replace the current token.
 		node = it->second.get();
-		if (node->type != Tokens::Empty) {
-			const std::string &str = tokens[node->type];
-			size_t len = str.size();
-			if (len > match.token.content.size()) {
-				match.token =
-				    Token{node->type, str, {sourceId, start, end}};
-				match.textLength = textLength;
-				match.textEnd = textEnd;
-			}
+		if (node->id != Tokens::Empty) {
+			const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+			match.token = Token(node->id, descr.string,
+			                    SourceLocation(sourceId, start, end));
+			match.dataStartOffset = dataStartOffset;
+			match.primary = descr.primary;
+			res = true;
 		}
 
 		// If this state can possibly be advanced, store it in the states list.
 		if (!node->children.empty()) {
 			lookups.emplace_back(*this);
 		}
+		return res;
 	}
 };
 
-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
-                           SourceId sourceId)
-{
-	if (match.hasMatch()) {
-		match.token.content =
-		    std::string{handler.textBuf.data(), match.textLength};
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, match.textEnd};
-	} else {
-		match.token.content = handler.toString();
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, handler.textEnd};
-	}
-	match.token.id = Tokens::Data;
-}
 }
 
 /* Class Tokenizer */
 
-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
-    : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}
 
-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 {
 	// If we're in the read mode, reset the char reader peek position to the
 	// current read position
@@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token)
 	// Prepare the lookups in the token trie
 	const TokenTrie::Node *root = trie.getRoot();
-	TokenMatch match;
+	TokenMatch bestMatch;
 	std::vector<TokenLookup> lookups;
 	std::vector<TokenLookup> nextLookups;
 
-	// Instantiate the text handler
-	TextHandler textHandler;
-
 	// Peek characters from the reader and try to advance the current token tree
 	// cursor
 	char c;
+	const size_t initialDataSize = data.size();
 	size_t charStart = reader.getPeekOffset();
 	const SourceId sourceId = reader.getSourceId();
 	while (reader.peek(c)) {
 		const size_t charEnd = reader.getPeekOffset();
-		const size_t textLength = textHandler.textBuf.size();
-		const size_t textEnd = textHandler.textEnd;
+		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!match.hasMatch()) {
-			TokenLookup{root, charStart, textLength, textEnd}.advance(
-			    c, nextLookups, match, tokens, charEnd, sourceId);
+		if (!bestMatch.hasMatch()) {
+			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
 		// Try to advance all other lookups with the new character
+		TokenMatch match;
 		for (TokenLookup &lookup : lookups) {
-			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+			// Continue if the current lookup could not be advanced
+			if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+			                    sourceId)) {
+				continue;
+			}
+
+			// If the matched token is primary, check whether it is better than
+			// the current best match, if yes, replace the best match. In any
+			// case just continue
+			if (match.primary) {
+				if (match.size() > bestMatch.size()) {
+					bestMatch = match;
+				}
+				continue;
+			}
+
+			// Otherwise -- if the matched token is a non-primary token (and no
+			// primary token has been found until now) -- mark the match in the
+			// TokenizedData
+			if (!bestMatch.hasMatch()) {
+				data.mark(match.token.id, data.size() - match.size() + 1,
+				          match.size());
+			}
 		}
 
 		// We have found a token and there are no more states to advance or the
 		// text handler has found something -- abort to return the new token
-		if (match.hasMatch()) {
-			if ((nextLookups.empty() || textHandler.hasText())) {
+		if (bestMatch.hasMatch()) {
+			if ((nextLookups.empty() || data.size() > initialDataSize)) {
 				break;
 			}
 		} else {
 			// Record all incoming characters
-			textHandler.append(c, charStart, charEnd);
+			data.append(c, charStart, charEnd);
 		}
 
 		// Swap the lookups and the nextLookups list
@@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token)
 		charStart = charEnd;
 	}
 
-	// If we found text, emit that text
-	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
-		buildDataToken(textHandler, match, sourceId);
+	// If we found data, emit a corresponding data token
+	if (data.size() > initialDataSize &&
+	    (!bestMatch.hasMatch() ||
+	     bestMatch.dataStartOffset > initialDataSize)) {
+		// If we have a "bestMatch" which starts after text data has started,
+		// trim the TokenizedData to this offset
+		if (bestMatch.dataStartOffset > initialDataSize) {
+			data.trim(bestMatch.dataStartOffset);
+		}
+
+		// Create a token containing the data location
+		bestMatch.token = Token{data.getLocation()};
 	}
 
 	// Move the read/peek cursor to the end of the token, abort if an error
 	// happens while doing so
-	if (match.hasMatch()) {
+	if (bestMatch.hasMatch()) {
 		// Make sure we have a valid location
-		if (match.token.location.getEnd() == InvalidSourceOffset) {
+		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
 			throw OusiaException{"Token end position offset out of range"};
 		}
 
 		// Seek to the end of the current token
-		const size_t end = match.token.location.getEnd();
+		const size_t end = bestMatch.token.location.getEnd();
 		if (read) {
 			reader.seek(end);
 		} else {
 			reader.seekPeekCursor(end);
 		}
-		token = match.token;
+		token = bestMatch.token;
 	} else {
 		token = Token{};
 	}
-	return match.hasMatch();
+	return bestMatch.hasMatch();
 }
 
-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
 {
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, true>(reader, token);
-	}
-	return false;
+	return next<true>(reader, token, data);
 }
 
-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
 {
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, false>(reader, token);
-	}
-	return false;
+	return next<false>(reader, token, data);
 }
 
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
 {
 	// Abort if an empty token should be registered
 	if (token.empty()) {
@@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// Search for a new slot in the tokens list
 	TokenId type = Tokens::Empty;
 	for (size_t i = nextTokenId; i < tokens.size(); i++) {
-		if (tokens[i].empty()) {
-			tokens[i] = token;
+		if (!tokens[i].valid()) {
+			tokens[i] = TokenDescriptor(token, primary);
 			type = i;
 			break;
 		}
 	}
@@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// No existing slot was found, add a new one -- make sure we do not
 	// override the special token type handles
 	if (type == Tokens::Empty) {
 		type = tokens.size();
-		if (type == Tokens::Data || type == Tokens::Empty) {
+		if (type >= Tokens::MaxTokenId) {
 			throw OusiaException{"Token type ids depleted!"};
 		}
-		tokens.emplace_back(token);
+		tokens.emplace_back(token, primary);
 	}
 	nextTokenId = type + 1;
 
-	// Try to register the token in the trie -- if this fails, remove it
-	// from the tokens list
+	// Try to register the token in the trie -- if this fails, remove it from
+	// the tokens list
 	if (!trie.registerToken(token, type)) {
-		tokens[type] = std::string{};
+		tokens[type] = TokenDescriptor();
 		nextTokenId = type;
 		return Tokens::Empty;
 	}
 	return type;
 }
 
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
 {
 	// Unregister the token from the trie, abort if an invalid type is given
-	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
-		tokens[type] = std::string{};
-		nextTokenId = type;
+	if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+		tokens[id] = TokenDescriptor();
+		nextTokenId = id;
 		return true;
 	}
 	return false;
 }
 
-std::string Tokenizer::getTokenString(TokenId type)
-{
-	if (type < tokens.size()) {
-		return tokens[type];
-	}
-	return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
 
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
 {
-	whitespaceMode = mode;
+	if (id < tokens.size()) {
+		return tokens[id];
+	}
+	return EmptyTokenDescriptor;
 }
 
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
 }
--
cgit v1.2.3
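
Note on the interface introduced by the commit above: next() now collects all characters that do not complete a registered token in a TokenizedData buffer and returns a location-only Tokens::Data token for them, so callers pass a TokenizedData instance alongside the Token. The sketch below shows one way to drive the new read() loop. It is illustrative only -- the include paths are taken from the patch, the TokenizedDataReader calls (reader(), read(), atEnd()) are the ones exercised by the unit tests added in the last commit of this series, and the "*" token plus the main() scaffold are made up for the example:

    #include <iostream>

    #include <core/common/CharReader.hpp>

    #include "TokenizedData.hpp"
    #include "Tokenizer.hpp"

    using namespace ousia;

    int main()
    {
        // Register "*" as a (primary) token; any marker would do.
        Tokenizer tokenizer;
        TokenId star = tokenizer.registerToken("*", true);

        CharReader reader{"a*b"};
        Token token;
        while (true) {
            // Each call fills "data" with the characters preceding the
            // returned token; it returns false once the input is exhausted.
            TokenizedData data;
            if (!tokenizer.read(reader, token, data)) {
                break;
            }
            if (token.id == star) {
                std::cout << "token: " << token.content << std::endl;
            } else if (token.id == Tokens::Data) {
                // The characters themselves live in the TokenizedData
                // buffer and are read back through a TokenizedDataReader.
                TokenizedDataReader dataReader = data.reader();
                Token text;
                while (dataReader.read(text, TokenSet{},
                                       WhitespaceMode::PRESERVE)) {
                    std::cout << "data: " << text.content << std::endl;
                }
            }
        }
        return 0;
    }

For the input "a*b" this is expected to print "data: a", "token: *" and "data: b" in that order.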
From 81e009aa22b5018b055ddda689cd3e78336a164b Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sat, 28 Feb 2015 15:47:30 +0100
Subject: Always call trim if a bestMatch has been found

---
 src/core/parser/utils/Tokenizer.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index e78b0f4..94d9cb0 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -156,7 +156,6 @@ public:
 		return res;
 	}
 };
-
 }
 
 /* Class Tokenizer */
@@ -252,6 +251,9 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
+	} else if (bestMatch.hasMatch() &&
+	           bestMatch.dataStartOffset == initialDataSize) {
+		data.trim(initialDataSize);
 	}
 
 	// Move the read/peek cursor to the end of the token, abort if an error
@@ -269,6 +271,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		} else {
 			reader.seekPeekCursor(end);
 		}
+
 		token = bestMatch.token;
 	} else {
 		token = Token{};
--
cgit v1.2.3
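
The fix above closes a corner case of the first commit: when the best match starts exactly at initialDataSize, the token's characters have already been appended to the data buffer, but the old code only trimmed when the match started after the beginning of the data region. A standalone sketch of this buffer discipline -- plain std::string stands in for TokenizedData here, no ousia API is involved:

    #include <cassert>
    #include <string>

    int main()
    {
        std::string data;                 // stands in for TokenizedData
        const size_t initialDataSize = data.size();

        // While a multi-character token is still being matched, its
        // characters are recorded as ordinary data...
        data.push_back('*');
        data.push_back('*');

        // ...so once the token "**" is confirmed -- including a match that
        // starts right at initialDataSize -- the buffer must be trimmed
        // back, or the token's characters would also be emitted as data.
        data.resize(initialDataSize);     // plays the role of data.trim()

        assert(data.empty());
        return 0;
    }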
From 31c83c05d257c9a7a336f12342c401f97d380674 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 1 Mar 2015 13:50:15 +0100
Subject: Prefer longer non-primary tokens

---
 src/core/parser/utils/Tokenizer.cpp      |  45 +++++-----
 test/core/parser/utils/TokenizerTest.cpp | 148 ++++++++++++++++++++++++++-----
 2 files changed, 150 insertions(+), 43 deletions(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 94d9cb0..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!bestMatch.hasMatch()) {
+		if (!bestMatch.hasMatch() || !bestMatch.primary) {
 			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
@@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 				continue;
 			}
 
-			// If the matched token is primary, check whether it is better than
-			// the current best match, if yes, replace the best match. In any
-			// case just continue
-			if (match.primary) {
-				if (match.size() > bestMatch.size()) {
-					bestMatch = match;
-				}
-				continue;
+			// Replace the best match with the longest token
+			if (match.size() > bestMatch.size()) {
+				bestMatch = match;
 			}
 
-			// Otherwise -- if the matched token is a non-primary token (and no
-			// primary token has been found until now) -- mark the match in the
-			// TokenizedData
-			if (!bestMatch.hasMatch()) {
+			// If the matched token is a non-primary token -- mark the match in
+			// the TokenizedData list
+			if (!match.primary) {
 				data.mark(match.token.id, data.size() - match.size() + 1,
 				          match.size());
 			}
 		}
 
-		// We have found a token and there are no more states to advance or the
-		// text handler has found something -- abort to return the new token
-		if (bestMatch.hasMatch()) {
-			if ((nextLookups.empty() || data.size() > initialDataSize)) {
+		// If a token has been found and the token is a primary token, check
+		// whether we have to abort, otherwise if we have a non-primary match,
+		// reset it once it can no longer be advanced
+		if (bestMatch.hasMatch() && nextLookups.empty()) {
+			if (bestMatch.primary) {
 				break;
+			} else {
+				bestMatch = TokenMatch{};
 			}
-		} else {
-			// Record all incoming characters
-			data.append(c, charStart, charEnd);
 		}
 
+		// Record all incoming characters
+		data.append(c, charStart, charEnd);
+
 		// Swap the lookups and the nextLookups list
 		lookups = std::move(nextLookups);
 		nextLookups.clear();
@@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 	// If we found data, emit a corresponding data token
 	if (data.size() > initialDataSize &&
-	    (!bestMatch.hasMatch() ||
+	    (!bestMatch.hasMatch() || !bestMatch.primary ||
 	     bestMatch.dataStartOffset > initialDataSize)) {
 		// If we have a "bestMatch" which starts after text data has started,
 		// trim the TokenizedData to this offset
-		if (bestMatch.dataStartOffset > initialDataSize) {
+		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
 			data.trim(bestMatch.dataStartOffset);
 		}
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
-	} else if (bestMatch.hasMatch() &&
+	} else if (bestMatch.hasMatch() && bestMatch.primary &&
 	           bestMatch.dataStartOffset == initialDataSize) {
 		data.trim(initialDataSize);
 	}

diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 9f644c2..45fc77a 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -26,6 +26,60 @@
 
 namespace ousia {
 
+static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer,
+                               TokenId id, const std::string &text,
+                               SourceOffset start = InvalidSourceOffset,
+                               SourceOffset end = InvalidSourceOffset,
+                               SourceId sourceId = InvalidSourceId)
+{
+	Token token;
+	TokenizedData data;
+	ASSERT_TRUE(tokenizer.read(reader, token, data));
+	EXPECT_EQ(id, token.id);
+	EXPECT_EQ(text, token.content);
+	if (start != InvalidSourceOffset) {
+		EXPECT_EQ(start, token.getLocation().getStart());
+	}
+	if (end != InvalidSourceOffset) {
+		EXPECT_EQ(end, token.getLocation().getEnd());
+	}
+	EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+static void expectData(const std::string &expected, SourceOffset tokenStart,
+                       SourceOffset tokenEnd, SourceOffset textStart,
+                       SourceOffset textEnd, const Token &token,
+                       TokenizedData &data,
+                       WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+	ASSERT_EQ(Tokens::Data, token.id);
+
+	Token textToken;
+	TokenizedDataReader reader = data.reader();
+	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
+
+	EXPECT_EQ(expected, textToken.content);
+	EXPECT_EQ(tokenStart, token.location.getStart());
+	EXPECT_EQ(tokenEnd, token.location.getEnd());
+	EXPECT_EQ(textStart, textToken.getLocation().getStart());
+	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
+	EXPECT_TRUE(reader.atEnd());
+}
+
+static void assertDataToken(CharReader &reader, Tokenizer &tokenizer,
+                            const std::string &expected,
+                            SourceOffset tokenStart, SourceOffset tokenEnd,
+                            SourceOffset textStart, SourceOffset textEnd,
+                            WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+	Token token;
+	TokenizedData data;
+	ASSERT_TRUE(tokenizer.read(reader, token, data));
+
+	expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data,
+	           mode);
+}
+
 TEST(Tokenizer, tokenRegistration)
 {
 	Tokenizer tokenizer;
@@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration)
 	ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
 }
 
-void expectData(const std::string &expected, SourceOffset tokenStart,
-                SourceOffset tokenEnd, SourceOffset textStart,
-                SourceOffset textEnd, const Token &token, TokenizedData &data,
-                WhitespaceMode mode = WhitespaceMode::PRESERVE)
-{
-	ASSERT_EQ(Tokens::Data, token.id);
-
-	Token textToken;
-	TokenizedDataReader reader = data.reader();
-	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
-
-	EXPECT_EQ(expected, textToken.content);
-	EXPECT_EQ(tokenStart, token.location.getStart());
-	EXPECT_EQ(tokenEnd, token.location.getEnd());
-	EXPECT_EQ(textStart, textToken.getLocation().getStart());
-	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
-	EXPECT_TRUE(reader.atEnd());
-}
-
 TEST(Tokenizer, textTokenPreserveWhitespace)
 {
 	{
@@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens)
 	ASSERT_FALSE(tokenizer.read(reader, token, data));
 }
 
+TEST(Tokenizer, primaryNonPrimaryTokenInteraction)
+{
+	CharReader reader{"<<test1>><test2><<test3\\><<<test4>>>"};
+	//                 01234567890123456789012 3456789012345
+	//                 0         1         2          3
+
+	Tokenizer tokenizer;
+
+	TokenId tP1 = tokenizer.registerToken("<", true);
+	TokenId tP2 = tokenizer.registerToken(">", true);
+	TokenId tP3 = tokenizer.registerToken("\\>", true);
+	TokenId tN1 = tokenizer.registerToken("<<", false);
+	TokenId tN2 = tokenizer.registerToken(">>", false);
+
+	TokenSet tokens = TokenSet{tN1, tN2};
+
+	Token token, textToken;
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2);
+		assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10);
+	assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15);
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16,
+		            18);
+		assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25,
+		            27);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33,
+		            35);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36);
+
+	TokenizedData data;
+	ASSERT_FALSE(tokenizer.read(reader, token, data));
+}
+
 TEST(Tokenizer, ambiguousTokens2)
 {
@@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2)
 		ASSERT_FALSE(tokenizer.read(reader, token, data));
 	}
 }
-
 }
--
cgit v1.2.3
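
Taken together, the three commits give primary tokens (returned as standalone Token objects) and non-primary tokens (merely marked inside the surrounding data token) different roles, with the longest candidate winning in both cases. A condensed sketch of that behavior in the style of the tests above -- it assumes the same gtest scaffolding and the TokenizedDataReader semantics from TokenizerTest.cpp; the test name and input string are invented for the example:

    TEST(TokenizerExample, primaryVsNonPrimary)
    {
        CharReader reader{"<a>>b"};
        Tokenizer tokenizer;
        TokenId lt = tokenizer.registerToken("<", true);   // primary
        TokenId gg = tokenizer.registerToken(">>", false); // non-primary

        // "<" is primary and is therefore returned as a token of its own.
        Token token;
        {
            TokenizedData data;
            ASSERT_TRUE(tokenizer.read(reader, token, data));
            EXPECT_EQ(lt, token.id);
        }

        // "a>>b" comes back as a single data token; ">>" is only marked in
        // the TokenizedData buffer and surfaces when reading with it in the
        // TokenSet.
        {
            TokenizedData data;
            ASSERT_TRUE(tokenizer.read(reader, token, data));
            ASSERT_EQ(Tokens::Data, token.id);

            TokenizedDataReader dataReader = data.reader();
            Token part;
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ("a", part.content);
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ(gg, part.id);
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ("b", part.content);
            EXPECT_TRUE(dataReader.atEnd());
        }
    }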