From 84c9abc3e9762c4486ddc5ca0352a5d697a51987 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Wed, 25 Feb 2015 23:09:26 +0100
Subject: start of branch, commit log will be rewritten

---
 src/core/parser/utils/Tokenizer.cpp | 264 ++++++++++++++++--------------------
 1 file changed, 118 insertions(+), 146 deletions(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
 #include <core/common/CharReader.hpp>
 #include <core/common/Exceptions.hpp>
 #include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include "TokenizedData.hpp"
 #include "Tokenizer.hpp"
 
 namespace ousia {
@@ -42,26 +42,33 @@ struct TokenMatch {
 	Token token;
 
 	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
 	 */
-	size_t textLength;
+	size_t dataStartOffset;
 
 	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
+	 * Set to true if the matched token is a primary token.
 	 */
-	size_t textEnd;
+	bool primary;
 
 	/**
 	 * Constructor of the TokenMatch class.
 	 */
-	TokenMatch() : textLength(0), textEnd(0) {}
+	TokenMatch() : dataStartOffset(0), primary(false) {}
 
 	/**
 	 * Returns true if this TokenMatch instance actually represents a match.
+	 *
+	 * @return true if the TokenMatch actually has a match.
+	 */
+	bool hasMatch() const { return token.id != Tokens::Empty; }
+
+	/**
+	 * Returns the length of the matched token.
+	 *
+	 * @return the length of the token string.
 	 */
-	bool hasMatch() { return token.id != Tokens::Empty; }
+	size_t size() const { return token.content.size(); }
 };
 
 /* Internal class TokenLookup */
@@ -83,36 +90,28 @@ private:
 	size_t start;
 
 	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
 	 */
-	size_t textLength;
-
-	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
-	 */
-	size_t textEnd;
+	size_t dataStartOffset;
 
 public:
	/**
 	 * Constructor of the TokenLookup class.
 	 *
 	 * @param node is the current node.
-	 * @param start is the start position.
-	 * @param textLength is the text buffer length of the previous text token.
-	 * @param textEnd is the current end location of the previous text token.
+	 * @param start is the start position in the source file.
+	 * @param dataStartOffset is the current length of the TokenizedData buffer.
 	 */
-	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
-	            size_t textEnd)
-	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
+	TokenLookup(const TokenTrie::Node *node, size_t start,
+	            size_t dataStartOffset)
+	    : node(node), start(start), dataStartOffset(dataStartOffset)
 	{
 	}
 
 	/**
 	 * Tries to extend the current path in the token trie with the given
-	 * character. If a complete token is matched, stores this match in the
-	 * tokens list (in case it is longer than any previous token).
+	 * character. If a complete token is matched, stores the match in the given
+	 * TokenMatch reference and returns true.
 	 *
 	 * @param c is the character that should be appended to the current prefix.
	 * @param lookups is a list to which new TokenLookup instances are added --
@@ -123,73 +122,49 @@ public:
 	 * Tokenizer.
 	 * @param end is the end byte offset of the current character.
 	 * @param sourceId is the source id of this file.
+	 * @return true if a token was matched, false otherwise.
 	 */
-	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
-	             const std::vector<std::string> &tokens, SourceOffset end,
-	             SourceId sourceId)
+	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+	             const std::vector<Tokenizer::TokenDescriptor> &tokens,
+	             SourceOffset end, SourceId sourceId)
 	{
-		// Check whether we can continue the current token path with the given
-		// character without visiting an already visited node
+		// Set to true once a token has been matched
+		bool res = false;
+
+		// Check whether we can continue the current token path, if not, abort
 		auto it = node->children.find(c);
 		if (it == node->children.end()) {
-			return;
+			return res;
 		}
 
 		// Check whether the new node represents a complete token and whether it
 		// is longer than the current token. If yes, replace the current token.
 		node = it->second.get();
-		if (node->type != Tokens::Empty) {
-			const std::string &str = tokens[node->type];
-			size_t len = str.size();
-			if (len > match.token.content.size()) {
-				match.token =
-				    Token{node->type, str, {sourceId, start, end}};
-				match.textLength = textLength;
-				match.textEnd = textEnd;
-			}
+		if (node->id != Tokens::Empty) {
+			const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+			match.token = Token(node->id, descr.string,
+			                    SourceLocation(sourceId, start, end));
+			match.dataStartOffset = dataStartOffset;
+			match.primary = descr.primary;
+			res = true;
 		}
 
 		// If this state can possibly be advanced, store it in the states list.
 		if (!node->children.empty()) {
 			lookups.emplace_back(*this);
 		}
+		return res;
 	}
 };
 
-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
-                           SourceId sourceId)
-{
-	if (match.hasMatch()) {
-		match.token.content =
-		    std::string{handler.textBuf.data(), match.textLength};
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, match.textEnd};
-	} else {
-		match.token.content = handler.toString();
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, handler.textEnd};
-	}
-	match.token.id = Tokens::Data;
-}
 }
 
 /* Class Tokenizer */
 
-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
-    : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}
 
-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 {
 	// If we're in the read mode, reset the char reader peek position to the
 	// current read position
@@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token)
 	// Prepare the lookups in the token trie
 	const TokenTrie::Node *root = trie.getRoot();
-	TokenMatch match;
+	TokenMatch bestMatch;
 	std::vector<TokenLookup> lookups;
 	std::vector<TokenLookup> nextLookups;
 
-	// Instantiate the text handler
-	TextHandler textHandler;
-
 	// Peek characters from the reader and try to advance the current token tree
 	// cursor
 	char c;
+	const size_t initialDataSize = data.size();
 	size_t charStart = reader.getPeekOffset();
 	const SourceId sourceId = reader.getSourceId();
 	while (reader.peek(c)) {
 		const size_t charEnd = reader.getPeekOffset();
-		const size_t textLength = textHandler.textBuf.size();
-		const size_t textEnd = textHandler.textEnd;
+		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!match.hasMatch()) {
-			TokenLookup{root, charStart, textLength, textEnd}.advance(
-			    c, nextLookups, match, tokens, charEnd, sourceId);
+		if (!bestMatch.hasMatch()) {
+			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
 		// Try to advance all other lookups with the new character
+		TokenMatch match;
 		for (TokenLookup &lookup : lookups) {
-			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+			// Continue if the current lookup could not be advanced
+			if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+			                    sourceId)) {
+				continue;
+			}
+
+			// If the matched token is primary, check whether it is better than
+			// the current best match, if yes, replace the best match. In any
+			// case just continue
+			if (match.primary) {
+				if (match.size() > bestMatch.size()) {
+					bestMatch = match;
+				}
+				continue;
+			}
+
+			// Otherwise -- if the matched token is a non-primary token (and no
+			// primary token has been found until now) -- mark the match in the
+			// TokenizedData
+			if (!bestMatch.hasMatch()) {
+				data.mark(match.token.id, data.size() - match.size() + 1,
+				          match.size());
+			}
 		}
 
 		// We have found a token and there are no more states to advance or the
 		// text handler has found something -- abort to return the new token
-		if (match.hasMatch()) {
-			if ((nextLookups.empty() || textHandler.hasText())) {
+		if (bestMatch.hasMatch()) {
+			if ((nextLookups.empty() || data.size() > initialDataSize)) {
 				break;
 			}
 		} else {
 			// Record all incoming characters
-			textHandler.append(c, charStart, charEnd);
+			data.append(c, charStart, charEnd);
 		}
 
 		// Swap the lookups and the nextLookups list
@@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token)
 		charStart = charEnd;
 	}
 
-	// If we found text, emit that text
-	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
-		buildDataToken(textHandler, match, sourceId);
+	// If we found data, emit a corresponding data token
+	if (data.size() > initialDataSize &&
+	    (!bestMatch.hasMatch() ||
+	     bestMatch.dataStartOffset > initialDataSize)) {
+		// If we have a "bestMatch" which starts after text data has started,
+		// trim the TokenizedData to this offset
+		if (bestMatch.dataStartOffset > initialDataSize) {
+			data.trim(bestMatch.dataStartOffset);
+		}
+
+		// Create a token containing the data location
+		bestMatch.token = Token{data.getLocation()};
 	}
 
 	// Move the read/peek cursor to the end of the token, abort if an error
 	// happens while doing so
-	if (match.hasMatch()) {
+	if (bestMatch.hasMatch()) {
 		// Make sure we have a valid location
-		if (match.token.location.getEnd() == InvalidSourceOffset) {
+		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
 			throw OusiaException{"Token end position offset out of range"};
 		}
 
 		// Seek to the end of the current token
-		const size_t end = match.token.location.getEnd();
+		const size_t end = bestMatch.token.location.getEnd();
 		if (read) {
 			reader.seek(end);
 		} else {
 			reader.seekPeekCursor(end);
 		}
-		token = match.token;
+		token = bestMatch.token;
 	} else {
 		token = Token{};
 	}
-	return match.hasMatch();
+	return bestMatch.hasMatch();
 }
 
-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
 {
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, true>(reader, token);
-	}
-	return false;
+	return next<true>(reader, token, data);
 }
 
-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
 {
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, false>(reader, token);
-	}
-	return false;
+	return next<false>(reader, token, data);
 }
 
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
 {
 	// Abort if an empty token should be registered
 	if (token.empty()) {
@@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// Search for a new slot in the tokens list
 	TokenId type = Tokens::Empty;
 	for (size_t i = nextTokenId; i < tokens.size(); i++) {
-		if (tokens[i].empty()) {
-			tokens[i] = token;
+		if (!tokens[i].valid()) {
+			tokens[i] = TokenDescriptor(token, primary);
 			type = i;
 			break;
 		}
 	}
@@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// No existing slot was found, add a new one -- make sure we do not
 	// override the special token type handles
 	if (type == Tokens::Empty) {
 		type = tokens.size();
-		if (type == Tokens::Data || type == Tokens::Empty) {
+		if (type >= Tokens::MaxTokenId) {
 			throw OusiaException{"Token type ids depleted!"};
 		}
-		tokens.emplace_back(token);
+		tokens.emplace_back(token, primary);
 	}
 	nextTokenId = type + 1;
 
-	// Try to register the token in the trie -- if this fails, remove it
-	// from the tokens list
+	// Try to register the token in the trie -- if this fails, remove it from
+	// the tokens list
 	if (!trie.registerToken(token, type)) {
-		tokens[type] = std::string{};
+		tokens[type] = TokenDescriptor();
 		nextTokenId = type;
 		return Tokens::Empty;
 	}
 	return type;
 }
 
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
 {
 	// Unregister the token from the trie, abort if an invalid type is given
-	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
-		tokens[type] = std::string{};
-		nextTokenId = type;
+	if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+		tokens[id] = TokenDescriptor();
+		nextTokenId = id;
 		return true;
 	}
 	return false;
 }
 
-std::string Tokenizer::getTokenString(TokenId type)
-{
-	if (type < tokens.size()) {
-		return tokens[type];
-	}
-	return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
 
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
 {
-	whitespaceMode = mode;
+	if (id < tokens.size()) {
+		return tokens[id];
+	}
+	return EmptyTokenDescriptor;
 }
 
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
 }
--
cgit v1.2.3
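
Note on the interface introduced by the commit above: next() now collects all characters that do not complete a registered token in a TokenizedData buffer and returns a location-only Tokens::Data token for them, so callers pass a TokenizedData instance alongside the Token. The sketch below shows one way to drive the new read() loop. It is illustrative only -- the include paths are taken from the patch, the TokenizedDataReader calls (reader(), read(), atEnd()) are the ones exercised by the unit tests added in the last commit of this series, and the "*" token plus the main() scaffold are made up for the example:

    #include <iostream>

    #include <core/common/CharReader.hpp>

    #include "TokenizedData.hpp"
    #include "Tokenizer.hpp"

    using namespace ousia;

    int main()
    {
        // Register "*" as a (primary) token; any marker would do.
        Tokenizer tokenizer;
        TokenId star = tokenizer.registerToken("*", true);

        CharReader reader{"a*b"};
        Token token;
        while (true) {
            // Each call fills "data" with the characters preceding the
            // returned token; it returns false once the input is exhausted.
            TokenizedData data;
            if (!tokenizer.read(reader, token, data)) {
                break;
            }
            if (token.id == star) {
                std::cout << "token: " << token.content << std::endl;
            } else if (token.id == Tokens::Data) {
                // The characters themselves live in the TokenizedData
                // buffer and are read back through a TokenizedDataReader.
                TokenizedDataReader dataReader = data.reader();
                Token text;
                while (dataReader.read(text, TokenSet{},
                                       WhitespaceMode::PRESERVE)) {
                    std::cout << "data: " << text.content << std::endl;
                }
            }
        }
        return 0;
    }

For the input "a*b" this is expected to print "data: a", "token: *" and "data: b" in that order.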
From 81e009aa22b5018b055ddda689cd3e78336a164b Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sat, 28 Feb 2015 15:47:30 +0100
Subject: Always call trim if a bestMatch has been found

---
 src/core/parser/utils/Tokenizer.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index e78b0f4..94d9cb0 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -156,7 +156,6 @@ public:
 		return res;
 	}
 };
-
 }
 
 /* Class Tokenizer */
@@ -252,6 +251,9 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
+	} else if (bestMatch.hasMatch() &&
+	           bestMatch.dataStartOffset == initialDataSize) {
+		data.trim(initialDataSize);
 	}
 
 	// Move the read/peek cursor to the end of the token, abort if an error
@@ -269,6 +271,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		} else {
 			reader.seekPeekCursor(end);
 		}
+
 		token = bestMatch.token;
 	} else {
 		token = Token{};
--
cgit v1.2.3
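
The fix above closes a corner case of the first commit: when the best match starts exactly at initialDataSize, the token's characters have already been appended to the data buffer, but the old code only trimmed when the match started after the beginning of the data region. A standalone sketch of this buffer discipline -- plain std::string stands in for TokenizedData here, no ousia API is involved:

    #include <cassert>
    #include <string>

    int main()
    {
        std::string data;                 // stands in for TokenizedData
        const size_t initialDataSize = data.size();

        // While a multi-character token is still being matched, its
        // characters are recorded as ordinary data...
        data.push_back('*');
        data.push_back('*');

        // ...so once the token "**" is confirmed -- including a match that
        // starts right at initialDataSize -- the buffer must be trimmed
        // back, or the token's characters would also be emitted as data.
        data.resize(initialDataSize);     // plays the role of data.trim()

        assert(data.empty());
        return 0;
    }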
From 31c83c05d257c9a7a336f12342c401f97d380674 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 1 Mar 2015 13:50:15 +0100
Subject: Prefer longer non-primary tokens

---
 src/core/parser/utils/Tokenizer.cpp      |  45 +++++-----
 test/core/parser/utils/TokenizerTest.cpp | 148 ++++++++++++++++++++++++++-----
 2 files changed, 150 insertions(+), 43 deletions(-)

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 94d9cb0..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!bestMatch.hasMatch()) {
+		if (!bestMatch.hasMatch() || !bestMatch.primary) {
 			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
@@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 				continue;
 			}
 
-			// If the matched token is primary, check whether it is better than
-			// the current best match, if yes, replace the best match. In any
-			// case just continue
-			if (match.primary) {
-				if (match.size() > bestMatch.size()) {
-					bestMatch = match;
-				}
-				continue;
+			// Replace the best match with the longest token
+			if (match.size() > bestMatch.size()) {
+				bestMatch = match;
 			}
 
-			// Otherwise -- if the matched token is a non-primary token (and no
-			// primary token has been found until now) -- mark the match in the
-			// TokenizedData
-			if (!bestMatch.hasMatch()) {
+			// If the matched token is a non-primary token -- mark the match in
+			// the TokenizedData list
+			if (!match.primary) {
 				data.mark(match.token.id, data.size() - match.size() + 1,
 				          match.size());
 			}
 		}
 
-		// We have found a token and there are no more states to advance or the
-		// text handler has found something -- abort to return the new token
-		if (bestMatch.hasMatch()) {
-			if ((nextLookups.empty() || data.size() > initialDataSize)) {
+		// If a token has been found and the token is a primary token, check
+		// whether we have to abort, otherwise if we have a non-primary match,
+		// reset it once it can no longer be advanced
+		if (bestMatch.hasMatch() && nextLookups.empty()) {
+			if (bestMatch.primary) {
 				break;
+			} else {
+				bestMatch = TokenMatch{};
 			}
-		} else {
-			// Record all incoming characters
-			data.append(c, charStart, charEnd);
 		}
 
+		// Record all incoming characters
+		data.append(c, charStart, charEnd);
+
 		// Swap the lookups and the nextLookups list
 		lookups = std::move(nextLookups);
 		nextLookups.clear();
@@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 	// If we found data, emit a corresponding data token
 	if (data.size() > initialDataSize &&
-	    (!bestMatch.hasMatch() ||
+	    (!bestMatch.hasMatch() || !bestMatch.primary ||
 	     bestMatch.dataStartOffset > initialDataSize)) {
 		// If we have a "bestMatch" which starts after text data has started,
 		// trim the TokenizedData to this offset
-		if (bestMatch.dataStartOffset > initialDataSize) {
+		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
 			data.trim(bestMatch.dataStartOffset);
 		}
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
-	} else if (bestMatch.hasMatch() &&
+	} else if (bestMatch.hasMatch() && bestMatch.primary &&
 	           bestMatch.dataStartOffset == initialDataSize) {
 		data.trim(initialDataSize);
 	}

diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 9f644c2..45fc77a 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -26,6 +26,60 @@
 
 namespace ousia {
 
+static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer,
+                               TokenId id, const std::string &text,
+                               SourceOffset start = InvalidSourceOffset,
+                               SourceOffset end = InvalidSourceOffset,
+                               SourceId sourceId = InvalidSourceId)
+{
+	Token token;
+	TokenizedData data;
+	ASSERT_TRUE(tokenizer.read(reader, token, data));
+	EXPECT_EQ(id, token.id);
+	EXPECT_EQ(text, token.content);
+	if (start != InvalidSourceOffset) {
+		EXPECT_EQ(start, token.getLocation().getStart());
+	}
+	if (end != InvalidSourceOffset) {
+		EXPECT_EQ(end, token.getLocation().getEnd());
+	}
+	EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+static void expectData(const std::string &expected, SourceOffset tokenStart,
+                       SourceOffset tokenEnd, SourceOffset textStart,
+                       SourceOffset textEnd, const Token &token,
+                       TokenizedData &data,
+                       WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+	ASSERT_EQ(Tokens::Data, token.id);
+
+	Token textToken;
+	TokenizedDataReader reader = data.reader();
+	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
+
+	EXPECT_EQ(expected, textToken.content);
+	EXPECT_EQ(tokenStart, token.location.getStart());
+	EXPECT_EQ(tokenEnd, token.location.getEnd());
+	EXPECT_EQ(textStart, textToken.getLocation().getStart());
+	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
+	EXPECT_TRUE(reader.atEnd());
+}
+
+static void assertDataToken(CharReader &reader, Tokenizer &tokenizer,
+                            const std::string &expected,
+                            SourceOffset tokenStart, SourceOffset tokenEnd,
+                            SourceOffset textStart, SourceOffset textEnd,
+                            WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+	Token token;
+	TokenizedData data;
+	ASSERT_TRUE(tokenizer.read(reader, token, data));
+
+	expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data,
+	           mode);
+}
+
 TEST(Tokenizer, tokenRegistration)
 {
 	Tokenizer tokenizer;
@@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration)
 	ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
 }
 
-void expectData(const std::string &expected, SourceOffset tokenStart,
-                SourceOffset tokenEnd, SourceOffset textStart,
-                SourceOffset textEnd, const Token &token, TokenizedData &data,
-                WhitespaceMode mode = WhitespaceMode::PRESERVE)
-{
-	ASSERT_EQ(Tokens::Data, token.id);
-
-	Token textToken;
-	TokenizedDataReader reader = data.reader();
-	ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
-
-	EXPECT_EQ(expected, textToken.content);
-	EXPECT_EQ(tokenStart, token.location.getStart());
-	EXPECT_EQ(tokenEnd, token.location.getEnd());
-	EXPECT_EQ(textStart, textToken.getLocation().getStart());
-	EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
-	EXPECT_TRUE(reader.atEnd());
-}
-
 TEST(Tokenizer, textTokenPreserveWhitespace)
 {
 	{
@@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens)
 	ASSERT_FALSE(tokenizer.read(reader, token, data));
 }
 
+TEST(Tokenizer, primaryNonPrimaryTokenInteraction)
+{
+	CharReader reader{"<<test1>><test2><<test3\\><<<test4>>>"};
+	//                 01234567890123456789012 3456789012345
+	//                 0         1         2          3
+
+	Tokenizer tokenizer;
+
+	TokenId tP1 = tokenizer.registerToken("<", true);
+	TokenId tP2 = tokenizer.registerToken(">", true);
+	TokenId tP3 = tokenizer.registerToken("\\>", true);
+	TokenId tN1 = tokenizer.registerToken("<<", false);
+	TokenId tN2 = tokenizer.registerToken(">>", false);
+
+	TokenSet tokens = TokenSet{tN1, tN2};
+
+	Token token, textToken;
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2);
+		assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10);
+	assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15);
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16,
+		            18);
+		assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25,
+		            27);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28);
+
+	{
+		TokenizedData data;
+		ASSERT_TRUE(tokenizer.read(reader, token, data));
+		ASSERT_EQ(Tokens::Data, token.id);
+
+		TokenizedDataReader dataReader = data.reader();
+		assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33);
+		assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33,
+		            35);
+		assertEnd(dataReader);
+	}
+
+	assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36);
+
+	TokenizedData data;
+	ASSERT_FALSE(tokenizer.read(reader, token, data));
+}
+
 TEST(Tokenizer, ambiguousTokens2)
 {
@@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2)
 		ASSERT_FALSE(tokenizer.read(reader, token, data));
 	}
 }
-
 }
--
cgit v1.2.3
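
Taken together, the three commits give primary tokens (returned as standalone Token objects) and non-primary tokens (merely marked inside the surrounding data token) different roles, with the longest candidate winning in both cases. A condensed sketch of that behavior in the style of the tests above -- it assumes the same gtest scaffolding and the TokenizedDataReader semantics from TokenizerTest.cpp; the test name and input string are invented for the example:

    TEST(TokenizerExample, primaryVsNonPrimary)
    {
        CharReader reader{"<a>>b"};
        Tokenizer tokenizer;
        TokenId lt = tokenizer.registerToken("<", true);   // primary
        TokenId gg = tokenizer.registerToken(">>", false); // non-primary

        // "<" is primary and is therefore returned as a token of its own.
        Token token;
        {
            TokenizedData data;
            ASSERT_TRUE(tokenizer.read(reader, token, data));
            EXPECT_EQ(lt, token.id);
        }

        // "a>>b" comes back as a single data token; ">>" is only marked in
        // the TokenizedData buffer and surfaces when reading with it in the
        // TokenSet.
        {
            TokenizedData data;
            ASSERT_TRUE(tokenizer.read(reader, token, data));
            ASSERT_EQ(Tokens::Data, token.id);

            TokenizedDataReader dataReader = data.reader();
            Token part;
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ("a", part.content);
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ(gg, part.id);
            ASSERT_TRUE(dataReader.read(part, TokenSet{gg},
                                        WhitespaceMode::PRESERVE));
            EXPECT_EQ("b", part.content);
            EXPECT_TRUE(dataReader.atEnd());
        }
    }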