From 4854509f8add1e2ff167623fb0e8d4216d9d6023 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 8 Feb 2015 17:54:27 +0100
Subject: Implemented DynamicTokenizer and unit tests

---
 src/plugins/plain/DynamicTokenizer.cpp | 514 +++++++++++++++++++++++++++++++--
 1 file changed, 493 insertions(+), 21 deletions(-)

(limited to 'src/plugins/plain/DynamicTokenizer.cpp')

diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index 7690395..a8f2317 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -17,57 +17,529 @@
  */
 
 #include
-#include
-#include
+#include
 #include
+#include
+#include
 
 #include "DynamicTokenizer.hpp"
 
 namespace ousia {
 
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+	/**
+	 * Token that was matched.
+	 */
+	DynamicToken token;
+
+	/**
+	 * Current length of the data within the text handler. The text buffer
+	 * needs to be trimmed to this length if this token matches.
+	 */
+	size_t textLength;
+
+	/**
+	 * End location of the current text handler. This location needs to be
+	 * used for the text token that is emitted before the actual token.
+	 */
+	size_t textEnd;
+
+	/**
+	 * Constructor of the TokenMatch class.
+	 */
+	TokenMatch() : textLength(0), textEnd(0) {}
+
+	/**
+	 * Returns true if this TokenMatch instance actually represents a match.
+	 */
+	bool hasMatch() { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+	/**
+	 * Current node within the token trie.
+	 */
+	TokenTrie::Node const *node;
+
+	/**
+	 * Start offset within the source file.
+	 */
+	size_t start;
+
+	/**
+	 * Current length of the data within the text handler. The text buffer
+	 * needs to be trimmed to this length if this token matches.
+	 */
+	size_t textLength;
+
+	/**
+	 * End location of the current text handler. This location needs to be
+	 * used for the text token that is emitted before the actual token.
+	 */
+	size_t textEnd;
+
+public:
+	/**
+	 * Constructor of the TokenLookup class.
+	 *
+	 * @param node is the current node.
+	 * @param start is the start position.
+	 * @param textLength is the text buffer length of the previous text token.
+	 * @param textEnd is the current end location of the previous text token.
+	 */
+	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+	            size_t textEnd)
+	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
+	{
+	}
+
+	/**
+	 * Tries to extend the current path in the token trie with the given
+	 * character. If a complete token is matched, stores the match in the
+	 * given TokenMatch instance (in case it is longer than any previous
+	 * match).
+	 *
+	 * @param c is the character that should be appended to the current
+	 * prefix.
+	 * @param lookups is a list to which new TokenLookup instances are added
+	 * -- these could potentially be expanded in the next iteration.
+	 * @param match is the TokenMatch instance to which the matching token
+	 * should be written.
+	 * @param tokens is a reference to the internal token list of the
+	 * DynamicTokenizer.
+	 * @param end is the end byte offset of the current character.
+	 * @param sourceId is the source id of this file.
+	 */
+	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+	             const std::vector<std::string> &tokens, SourceOffset end,
+	             SourceId sourceId)
+	{
+		// Check whether we can continue the current token path with the given
+		// character without visiting an already visited node
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			return;
+		}
+
+		// Check whether the new node represents a complete token and whether
+		// it is longer than the current token. If yes, replace the current
+		// token.
+		node = it->second.get();
+		if (node->type != EmptyToken) {
+			const std::string &str = tokens[node->type];
+			size_t len = str.size();
+			if (len > match.token.content.size()) {
+				match.token =
+				    DynamicToken{node->type, str, {sourceId, start, end}};
+				match.textLength = textLength;
+				match.textEnd = textEnd;
+			}
+		}
+
+		// If this state can possibly be advanced, store it in the states list.
+		if (!node->children.empty()) {
+			lookups.emplace_back(*this);
+		}
+	}
+};
+
+/* Internal class TextHandlerBase */
+
+/**
+ * Base class used for those classes that may be used as TextHandler in the
+ * DynamicTokenizer::next function.
+ */
+class TextHandlerBase {
+public:
+	/**
+	 * Start position of the extracted text.
+	 */
+	size_t textStart;
+
+	/**
+	 * End position of the extracted text.
+	 */
+	size_t textEnd;
+
+	/**
+	 * Buffer containing the extracted text.
+	 */
+	std::vector<char> textBuf;
+
+	/**
+	 * Constructor of the TextHandlerBase base class. Initializes the start
+	 * and end position with zeros.
+	 */
+	TextHandlerBase() : textStart(0), textEnd(0) {}
+
+	/**
+	 * Transforms the given token match into a text token containing the
+	 * extracted text.
+	 *
+	 * @param match is the match whose token should be replaced by the
+	 * extracted text token.
+	 * @param sourceId is the source id of the underlying file.
+	 */
+	void buildTextToken(TokenMatch &match, SourceId sourceId)
+	{
+		if (match.hasMatch()) {
+			match.token.content =
+			    std::string{textBuf.data(), match.textLength};
+			match.token.location =
+			    SourceLocation{sourceId, textStart, match.textEnd};
+		} else {
+			match.token.content = std::string{textBuf.data(), textBuf.size()};
+			match.token.location = SourceLocation{sourceId, textStart, textEnd};
+		}
+		match.token.type = TextToken;
+	}
+
+	/**
+	 * Returns true if this text handler has found any text and a text token
+	 * could be emitted.
+	 *
+	 * @return true if the internal data buffer is non-empty.
+	 */
+	bool hasText() { return !textBuf.empty(); }
+};
+
+/* Internal class PreservingTextHandler */
+
+/**
+ * The PreservingTextHandler class preserves all characters unmodified,
+ * including whitespace characters.
+ */
+class PreservingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
+
+	/**
+	 * Appends the given character to the internal text buffer, does not
+	 * eliminate whitespace.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
+	 */
+	void append(char c, size_t start, size_t end)
+	{
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+		textBuf.push_back(c);
+	}
+};
+
+/* Internal class TrimmingTextHandler */
+
 /**
- * The TokenDescriptor class is a simple wrapper around a standard string
- * containing the character sequence of the token.
+ * The TrimmingTextHandler class trims all whitespace characters at the
+ * beginning and the end of a text section but leaves all other characters
+ * unmodified, including interior whitespace characters.
  */
-class TokenDescriptor {
+class TrimmingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
+
 	/**
-	 * The character sequence of the token.
+	 * Buffer used internally to temporarily store all whitespace characters.
+	 * They are only added to the output buffer if another non-whitespace
+	 * character is reached.
 	 */
-	std::string str;
+	std::vector<char> whitespaceBuf;
 
 	/**
-	 * Default constructor of the TokenDescriptor class. Used to describe
-	 * special tokens.
+	 * Appends the given character to the internal text buffer, eliminates
+	 * whitespace characters at the beginning and end of the text.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
 	 */
-	TokenDescriptor();
+	void append(char c, size_t start, size_t end)
+	{
+		// Handle whitespace characters
+		if (Utils::isWhitespace(c)) {
+			if (!textBuf.empty()) {
+				whitespaceBuf.push_back(c);
+			}
+			return;
+		}
+
+		// Set the start and end offset correctly
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+
+		// Store the character
+		if (!whitespaceBuf.empty()) {
+			textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+			               whitespaceBuf.end());
+			whitespaceBuf.clear();
+		}
+		textBuf.push_back(c);
+	}
+};
+
+/* Internal class CollapsingTextHandler */
+
+/**
+ * The CollapsingTextHandler trims whitespace characters at the beginning and
+ * end of the text and reduces multiple whitespace characters to a single
+ * blank.
+ */
+class CollapsingTextHandler : public TextHandlerBase {
+public:
+	using TextHandlerBase::TextHandlerBase;
 
 	/**
-	 * Constructor initializing the character sequence of the token.
+	 * Flag set to true if a whitespace character was reached.
 	 */
-	TokenDescriptor(const std::string &str) : str(str) {}
+	bool hasWhitespace = false;
+
+	/**
+	 * Appends the given character to the internal text buffer, eliminates
+	 * redundant whitespace characters.
+	 *
+	 * @param c is the character that should be appended to the internal
+	 * buffer.
+	 * @param start is the start byte offset of the given character.
+	 * @param end is the end byte offset of the given character.
+	 */
+	void append(char c, size_t start, size_t end)
+	{
+		// Handle whitespace characters
+		if (Utils::isWhitespace(c)) {
+			if (!textBuf.empty()) {
+				hasWhitespace = true;
+			}
+			return;
+		}
+
+		// Set the start and end offset correctly
+		if (textBuf.empty()) {
+			textStart = start;
+		}
+		textEnd = end;
+
+		// Store the character
+		if (hasWhitespace) {
+			textBuf.push_back(' ');
+			hasWhitespace = false;
+		}
+		textBuf.push_back(c);
+	}
 };
+}
 
 /* Class DynamicTokenizer */
 
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+DynamicTokenizer::DynamicTokenizer(CharReader &reader,
+                                   WhitespaceMode whitespaceMode)
+    : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
-	whitespaceMode = mode;
 }
 
-WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+template <typename TextHandler, bool read>
+bool DynamicTokenizer::next(DynamicToken &token)
 {
-	return whitespaceMode;
+	// If we're in the read mode, reset the char reader peek position to the
+	// current read position
+	if (read) {
+		reader.resetPeek();
+	}
+
+	// Prepare the lookups in the token trie
+	const TokenTrie::Node *root = trie.getRoot();
+	TokenMatch match;
+	std::vector<TokenLookup> lookups;
+	std::vector<TokenLookup> nextLookups;
+
+	// Instantiate the text handler
+	TextHandler textHandler;
+
+	// Peek characters from the reader and try to advance the current token
+	// tree cursor
+	char c;
+	size_t charStart = reader.getPeekOffset();
+	const SourceId sourceId = reader.getSourceId();
+	while (reader.peek(c)) {
+		const size_t charEnd = reader.getPeekOffset();
+		const size_t textLength = textHandler.textBuf.size();
+		const size_t textEnd = textHandler.textEnd;
+
+		// If we do not have a match yet, start a new lookup from the root
+		if (!match.hasMatch()) {
+			TokenLookup{root, charStart, textLength, textEnd}.advance(
+			    c, nextLookups, match, tokens, charEnd, sourceId);
+		}
+
+		// Try to advance all other lookups with the new character
+		for (TokenLookup &lookup : lookups) {
+			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+		}
+
+		// We have found a token and there are no more states to advance or
+		// the text handler has found something -- abort to return the new
+		// token
+		if (match.hasMatch()) {
+			if ((nextLookups.empty() || textHandler.hasText())) {
+				break;
+			}
+		} else {
+			// Record all incoming characters
+			textHandler.append(c, charStart, charEnd);
+		}
+
+		// Swap the lookups and the nextLookups list
+		lookups = std::move(nextLookups);
+		nextLookups.clear();
+
+		// Advance the offset
+		charStart = charEnd;
+	}
+
+	// If we found text, emit that text
+	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
+		textHandler.buildTextToken(match, sourceId);
+	}
+
+	// Move the read/peek cursor to the end of the token, abort if an error
+	// happens while doing so
+	if (match.hasMatch()) {
+		// Make sure we have a valid location
+		if (match.token.location.getEnd() == InvalidSourceOffset) {
+			throw OusiaException{"Token end position offset out of range"};
+		}
+
+		// Seek to the end of the current token
+		const size_t end = match.token.location.getEnd();
+		if (read) {
+			reader.seek(end);
+		} else {
+			reader.seekPeekCursor(end);
+		}
+		token = match.token;
+	} else {
+		token = DynamicToken{};
+	}
+	return match.hasMatch();
+}
+
+bool DynamicTokenizer::read(DynamicToken &token)
+{
+	switch (whitespaceMode) {
+		case WhitespaceMode::PRESERVE:
+			return next<PreservingTextHandler, true>(token);
+		case WhitespaceMode::TRIM:
+			return next<TrimmingTextHandler, true>(token);
+		case WhitespaceMode::COLLAPSE:
+			return next<CollapsingTextHandler, true>(token);
+	}
+	return false;
+}
+
+bool DynamicTokenizer::peek(DynamicToken &token)
+{
+	switch (whitespaceMode) {
+		case WhitespaceMode::PRESERVE:
+			return next<PreservingTextHandler, false>(token);
+		case WhitespaceMode::TRIM:
+			return next<TrimmingTextHandler, false>(token);
+		case WhitespaceMode::COLLAPSE:
+			return next<CollapsingTextHandler, false>(token);
+	}
+	return false;
 }
 
+TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+{
+	// Abort if an empty token should be registered
+	if (token.empty()) {
+		return EmptyToken;
+	}
+
+	// Search for a new slot in the tokens list
+	TokenTypeId type = EmptyToken;
+	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+		if (tokens[i].empty()) {
+			tokens[i] = token;
+			type = i;
+			break;
+		}
+	}
 
-/* Constant initializations */
+	// No existing slot was found, add a new one -- make sure we do not
+	// override the special token type handles
+	if (type == EmptyToken) {
+		type = tokens.size();
+		if (type == TextToken || type == EmptyToken) {
+			throw OusiaException{"Token type ids depleted!"};
+		}
+		tokens.emplace_back(token);
+	}
+	nextTokenTypeId = type + 1;
 
-static const TokenDescriptor Empty;
-static const TokenDescriptor Text;
-static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
-static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+	// Try to register the token in the trie -- if this fails, remove it
+	// from the tokens list
+	if (!trie.registerToken(token, type)) {
+		tokens[type] = std::string();
+		nextTokenTypeId = type;
+		return EmptyToken;
+	}
+	return type;
+}
+
+bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+{
+	// Unregister the token from the trie, abort if an invalid type is given
+	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+		tokens[type] = std::string{};
+		nextTokenTypeId = type;
+		return true;
+	}
+	return false;
+}
+
+std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+{
+	if (type < tokens.size()) {
+		return tokens[type];
+	}
+	return std::string{};
+}
+
+void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+{
+	whitespaceMode = mode;
+}
+
+WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+
+/* Explicitly instantiate all possible instantiations of the "next" member
+   function */
+template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingTextHandler, true>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
+    DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
+    DynamicToken &token);
 }
--
cgit v1.2.3
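A minimal usage sketch of the API introduced by this commit (not part of the patch itself): the tokenizer is bound to a CharReader at construction time, custom tokens are registered up front, and read() is called in a loop until the end of the stream. The include path and the helper function name are assumptions for illustration; DynamicToken, TokenTypeId, WhitespaceMode and the member functions are taken from the diff above. Note that the following commit changes these signatures.

    // Sketch only -- the include path is assumed, not taken from the patch
    #include <plugins/plain/DynamicTokenizer.hpp>

    using namespace ousia;

    // Hypothetical helper: print all tokens of the given reader
    void dumpTokens(CharReader &reader)
    {
        DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE};

        // Register ":" as a token; all other characters are collected
        // into TextToken runs by the active text handler
        const TokenTypeId colon = tokenizer.registerToken(":");

        DynamicToken token;
        while (tokenizer.read(token)) {
            if (token.type == colon) {
                // ":" matched via the token trie (longest match wins)
            } else {
                // token.type == TextToken; token.content holds the text,
                // token.location the byte range within the source
            }
        }
    }
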
From f713b1d393230e7083727d457623fdac878eb248 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 8 Feb 2015 18:48:07 +0100
Subject: DynamicTokenizer now gets the reader as a parameter to read and peek
 -- the beauty of this tokenizer is that it has no internal state depending
 on the reader, so it doesn't need to hold a reference to it

---
 src/plugins/plain/DynamicTokenizer.cpp      | 35 ++++++------
 src/plugins/plain/DynamicTokenizer.hpp      | 22 ++++----
 test/plugins/plain/DynamicTokenizerTest.cpp | 81 ++++++++++++++---------------
 3 files changed, 67 insertions(+), 71 deletions(-)

(limited to 'src/plugins/plain/DynamicTokenizer.cpp')

diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index a8f2317..f2cfcd1 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -345,14 +345,13 @@ public:
 
 /* Class DynamicTokenizer */
 
-DynamicTokenizer::DynamicTokenizer(CharReader &reader,
-                                   WhitespaceMode whitespaceMode)
-    : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0)
+DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
 }
 
 template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(DynamicToken &token)
+bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
 {
 	// If we're in the read mode, reset the char reader peek position to the
 	// current read position
@@ -437,28 +436,28 @@ bool DynamicTokenizer::next(DynamicToken &token)
 	return match.hasMatch();
 }
 
-bool DynamicTokenizer::read(DynamicToken &token)
+bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
 {
 	switch (whitespaceMode) {
 		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, true>(token);
+			return next<PreservingTextHandler, true>(reader, token);
 		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, true>(token);
+			return next<TrimmingTextHandler, true>(reader, token);
 		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, true>(token);
+			return next<CollapsingTextHandler, true>(reader, token);
 	}
 	return false;
 }
 
-bool DynamicTokenizer::peek(DynamicToken &token)
+bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
 {
 	switch (whitespaceMode) {
 		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, false>(token);
+			return next<PreservingTextHandler, false>(reader, token);
 		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, false>(token);
+			return next<TrimmingTextHandler, false>(reader, token);
 		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, false>(token);
+			return next<CollapsingTextHandler, false>(reader, token);
 	}
 	return false;
 }
@@ -530,16 +529,16 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
 template bool DynamicTokenizer::next<PreservingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<PreservingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
-    DynamicToken &token);
+    CharReader &reader, DynamicToken &token);
 
 }
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index 760bebf..0b4dd39 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -118,11 +118,6 @@ enum class WhitespaceMode {
  */
 class DynamicTokenizer {
 private:
-	/**
-	 * CharReader instance from which the tokens should be read.
-	 */
-	CharReader &reader;
-
 	/**
 	 * Internally used token trie. This object holds all registered tokens.
 	 */
@@ -151,23 +146,22 @@ private:
 	 * @tparam TextHandler is the type to be used for the textHandler instance.
 	 * @tparam read specifies whether the function should start from and
 	 * advance the read pointer of the char reader.
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is the token structure into which the token information
 	 * should be written.
	 * @return false if the end of the stream has been reached, true otherwise.
 	 */
 	template <typename TextHandler, bool read>
-	bool next(DynamicToken &token);
+	bool next(CharReader &reader, DynamicToken &token);
 
 public:
 	/**
 	 * Constructor of the DynamicTokenizer class.
 	 *
-	 * @param reader is the CharReader that should be used for reading the
-	 * tokens.
 	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	DynamicTokenizer(CharReader &reader,
-	                 WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Registers the given string as a token. Returns a const pointer at a
@@ -222,23 +216,27 @@ public:
 	 * Reads a new token from the CharReader and stores it in the given
 	 * DynamicToken instance.
 	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is a reference to the token instance into which the token
 	 * information should be written.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool read(DynamicToken &token);
+	bool read(CharReader &reader, DynamicToken &token);
 
 	/**
 	 * The peek method does not advance the read position of the char reader,
 	 * but reads the next token from the current char reader peek position.
 	 *
+	 * @param reader is the CharReader instance from which the data should be
+	 * read.
 	 * @param token is a reference to the token instance into which the token
 	 * information should be written.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool peek(DynamicToken &token);
+	bool peek(CharReader &reader, DynamicToken &token);
 };
 
 }
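The read/peek split declared above can be illustrated with a short sketch (a sketch only -- the string-constructed CharReader is borrowed from the tests below). peek() advances only the internal peek cursor, while read() first resets the peek cursor to the read position, so peeked tokens are returned again by later read() calls; this is exactly what the simplePeekToken test in the following diff verifies:

    CharReader reader{"test1:test2"};
    DynamicTokenizer tokenizer;      // WhitespaceMode::COLLAPSE by default
    tokenizer.registerToken(":");

    DynamicToken token;
    tokenizer.peek(reader, token);   // TextToken "test1", peek cursor moves
    tokenizer.peek(reader, token);   // token ":"
    tokenizer.read(reader, token);   // TextToken "test1" again -- the read
                                     // cursor was never advanced by peek()
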
ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -107,17 +106,17 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM}; + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -125,7 +124,7 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } @@ -135,10 +134,10 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -146,17 +145,17 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(33U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE}; + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -164,21 +163,21 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(32U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } } TEST(DynamicTokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -194,7 +193,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -210,7 +209,7 @@ TEST(DynamicTokenizer, simpleReadToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -227,14 +226,14 @@ TEST(DynamicTokenizer, simpleReadToken) TEST(DynamicTokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer{reader}; + DynamicTokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, 
token.type); ASSERT_EQ("test1", token.content); @@ -248,7 +247,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -262,7 +261,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.peek(token)); + ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -276,7 +275,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test1", token.content); @@ -290,7 +289,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); ASSERT_EQ(":", token.content); @@ -304,7 +303,7 @@ TEST(DynamicTokenizer, simplePeekToken) { DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("test2", token.content); @@ -320,7 +319,7 @@ TEST(DynamicTokenizer, simplePeekToken) TEST(DynamicTokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer(reader); + DynamicTokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -329,7 +328,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); DynamicToken token; - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("a", token.content); @@ -338,7 +337,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(token)); + ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(t2, token.type); ASSERT_EQ("bc", token.content); @@ -347,7 +346,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(token)); + ASSERT_FALSE(tokenizer.read(reader, token)); } TEST(DynamicTokenizer, commentTestWhitespacePreserve) @@ -355,7 +354,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE); + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -371,14 +370,14 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } TEST(DynamicTokenizer, commentTestWhitespaceCollapse) @@ -386,7 +385,7 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE); + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = 
tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); @@ -402,14 +401,14 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) DynamicToken t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(t)); + EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); EXPECT_EQ(te.content, t.content); EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(t)); + ASSERT_FALSE(tokenizer.read(reader, t)); } } -- cgit v1.2.3