summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/plugins/plain/DynamicTokenizer.cpp514
-rw-r--r--src/plugins/plain/DynamicTokenizer.hpp154
-rw-r--r--test/plugins/plain/DynamicTokenizerTest.cpp416
4 files changed, 1016 insertions, 70 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f9b224d..867ca6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,6 +197,7 @@ TARGET_LINK_LIBRARIES(ousia_xml
)
ADD_LIBRARY(ousia_plain
+ src/plugins/plain/DynamicTokenizer
src/plugins/plain/TokenTrie
src/plugins/plain/PlainFormatStreamReader
)
@@ -325,6 +326,7 @@ IF(TEST)
ADD_EXECUTABLE(ousia_test_plain
test/plugins/plain/TokenTrieTest
+ test/plugins/plain/DynamicTokenizerTest
test/plugins/plain/PlainFormatStreamReaderTest
)
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index 7690395..a8f2317 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -17,57 +17,529 @@
*/
#include <memory>
-#include <string>
-#include <unordered_map>
+#include <vector>
#include <core/common/CharReader.hpp>
+#include <core/common/Exceptions.hpp>
+#include <core/common/Utils.hpp>
#include "DynamicTokenizer.hpp"
namespace ousia {
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+ /**
+ * Token that was matched.
+ */
+ DynamicToken token;
+
+ /**
+ * Current length of the data within the text handler. The text buffer needs
+ * to be trimmed to this length if this token matches.
+ */
+ size_t textLength;
+
+ /**
+ * End location of the current text handler. This location needs to be used
+ * for the text token that is emitted before the actual token.
+ */
+ size_t textEnd;
+
+ /**
+ * Constructor of the TokenMatch class.
+ */
+ TokenMatch() : textLength(0), textEnd(0) {}
+
+ /**
+ * Returns true if this TokenMatch instance actually represents a match.
+ */
+ bool hasMatch() { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+ /**
+ * Current node within the token trie.
+ */
+ TokenTrie::Node const *node;
+
+ /**
+ * Start offset within the source file.
+ */
+ size_t start;
+
+ /**
+ * Current length of the data within the text handler. The text buffer needs
+ * to be trimmed to this length if this token matches.
+ */
+ size_t textLength;
+
+ /**
+ * End location of the current text handler. This location needs to be used
+ * for the text token that is emitted before the actual token.
+ */
+ size_t textEnd;
+
+public:
+ /**
+ * Constructor of the TokenLookup class.
+ *
+ * @param node is the current node.
+ * @param start is the start position.
+ * @param textLength is the text buffer length of the previous text token.
+ * @param textEnd is the current end location of the previous text token.
+ */
+ TokenLookup(const TokenTrie::Node *node, size_t start,
+ size_t textLength, size_t textEnd)
+ : node(node), start(start), textLength(textLength), textEnd(textEnd)
+ {
+ }
+
+ /**
+ * Tries to extend the current path in the token trie with the given
+ * character. If a complete token is matched, stores this match in the
+ * tokens list (in case it is longer than any previous token).
+ *
+ * @param c is the character that should be appended to the current prefix.
+ * @param lookups is a list to which new TokeLookup instances are added --
+ * which could potentially be expanded in the next iteration.
+ * @param match is the DynamicToken instance to which the matching token
+ * should be written.
+ * @param tokens is a reference at the internal token list of the
+ * DynamicTokenizer.
+ * @param end is the end byte offset of the current character.
+ * @param sourceId is the source if of this file.
+ */
+ void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+ const std::vector<std::string> &tokens, SourceOffset end,
+ SourceId sourceId)
+ {
+ // Check whether we can continue the current token path with the given
+ // character without visiting an already visited node
+ auto it = node->children.find(c);
+ if (it == node->children.end()) {
+ return;
+ }
+
+ // Check whether the new node represents a complete token a whether it
+ // is longer than the current token. If yes, replace the current token.
+ node = it->second.get();
+ if (node->type != EmptyToken) {
+ const std::string &str = tokens[node->type];
+ size_t len = str.size();
+ if (len > match.token.content.size()) {
+ match.token =
+ DynamicToken{node->type, str, {sourceId, start, end}};
+ match.textLength = textLength;
+ match.textEnd = textEnd;
+ }
+ }
+
+ // If this state can possibly be advanced, store it in the states list.
+ if (!node->children.empty()) {
+ lookups.emplace_back(*this);
+ }
+ }
+};
+
+/* Internal class TextHandlerBase */
+
+/**
+ * Base class used for those classes that may be used as TextHandler in the
+ * DynamicTokenizer::next function.
+ */
+class TextHandlerBase {
+public:
+ /**
+ * Start position of the extracted text.
+ */
+ size_t textStart;
+
+ /**
+ * End position of the extracted text.
+ */
+ size_t textEnd;
+
+ /**
+ * Buffer containing the extracted text.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Constructor of the TextHandlerBase base class. Initializes the start and
+ * end position with zeros.
+ */
+ TextHandlerBase() : textStart(0), textEnd(0) {}
+
+ /**
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param token is the output token to which the text should be written.
+ * @param sourceId is the source id of the underlying file.
+ */
+ void buildTextToken(TokenMatch &match, SourceId sourceId)
+ {
+ if (match.hasMatch()) {
+ match.token.content =
+ std::string{textBuf.data(), match.textLength};
+ match.token.location =
+ SourceLocation{sourceId, textStart, match.textEnd};
+ } else {
+ match.token.content = std::string{textBuf.data(), textBuf.size()};
+ match.token.location = SourceLocation{sourceId, textStart, textEnd};
+ }
+ match.token.type = TextToken;
+ }
+
+ /**
+ * Returns true if this whitespace handler has found any text and a text
+ * token could be emitted.
+ *
+ * @return true if the internal data buffer is non-empty.
+ */
+ bool hasText() { return !textBuf.empty(); }
+};
+
+/* Internal class PreservingTextHandler */
+
+/**
+ * The PreservingTextHandler class preserves all characters unmodified,
+ * including whitepace characters.
+ */
+class PreservingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
+ /**
+ * Appends the given character to the internal text buffer, does not
+ * eliminate whitespace.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class TrimmingTextHandler */
+
/**
- * The TokenDescriptor class is a simple wrapper around a standard string
- * containing the character sequence of the token.
+ * The TrimmingTextHandler class trims all whitespace characters at the begin
+ * and the end of a text section but leaves all other characters unmodified,
+ * including whitepace characters.
*/
-class TokenDescriptor {
+class TrimmingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
/**
- * The character sequence of the token.
+ * Buffer used internally to temporarily store all whitespace characters.
+ * They are only added to the output buffer if another non-whitespace
+ * character is reached.
*/
- std::string str;
+ std::vector<char> whitespaceBuf;
/**
- * Default constructor of the TokenDescriptor class. Used to describe
- * special tokens.
+ * Appends the given character to the internal text buffer, eliminates
+ * whitespace characters at the begin and end of the text.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
*/
- TokenDescriptor();
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ whitespaceBuf.push_back(c);
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (!whitespaceBuf.empty()) {
+ textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+ whitespaceBuf.end());
+ whitespaceBuf.clear();
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class CollapsingTextHandler */
+
+/**
+ * The CollapsingTextHandler trims characters at the beginning and end of the
+ * text and reduced multiple whitespace characters to a single blank.
+ */
+class CollapsingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
/**
- * Constructor initializing the character sequence of the token.
+ * Flag set to true if a whitespace character was reached.
*/
- TokenDescriptor(const std::string &str) : str(str) {}
+ bool hasWhitespace = false;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * redundant whitespace characters.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ hasWhitespace = true;
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (hasWhitespace) {
+ textBuf.push_back(' ');
+ hasWhitespace = false;
+ }
+ textBuf.push_back(c);
+ }
};
+}
/* Class DynamicTokenizer */
-void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode)
+DynamicTokenizer::DynamicTokenizer(CharReader &reader,
+ WhitespaceMode whitespaceMode)
+ : reader(reader), whitespaceMode(whitespaceMode), nextTokenTypeId(0)
{
- whitespaceMode = mode;
}
-WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+template <typename TextHandler, bool read>
+bool DynamicTokenizer::next(DynamicToken &token)
{
- return whitespaceMode;
+ // If we're in the read mode, reset the char reader peek position to the
+ // current read position
+ if (read) {
+ reader.resetPeek();
+ }
+
+ // Prepare the lookups in the token trie
+ const TokenTrie::Node *root = trie.getRoot();
+ TokenMatch match;
+ std::vector<TokenLookup> lookups;
+ std::vector<TokenLookup> nextLookups;
+
+ // Instantiate the text handler
+ TextHandler textHandler;
+
+ // Peek characters from the reader and try to advance the current token tree
+ // cursor
+ char c;
+ size_t charStart = reader.getPeekOffset();
+ const SourceId sourceId = reader.getSourceId();
+ while (reader.peek(c)) {
+ const size_t charEnd = reader.getPeekOffset();
+ const size_t textLength = textHandler.textBuf.size();
+ const size_t textEnd = textHandler.textEnd;
+
+ // If we do not have a match yet, start a new lookup from the root
+ if (!match.hasMatch()) {
+ TokenLookup{root, charStart, textLength, textEnd}.advance(
+ c, nextLookups, match, tokens, charEnd, sourceId);
+ }
+
+ // Try to advance all other lookups with the new character
+ for (TokenLookup &lookup : lookups) {
+ lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+ }
+
+ // We have found a token and there are no more states to advance or the
+ // text handler has found something -- abort to return the new token
+ if (match.hasMatch()) {
+ if ((nextLookups.empty() || textHandler.hasText())) {
+ break;
+ }
+ } else {
+ // Record all incomming characters
+ textHandler.append(c, charStart, charEnd);
+ }
+
+ // Swap the lookups and the nextLookups list
+ lookups = std::move(nextLookups);
+ nextLookups.clear();
+
+ // Advance the offset
+ charStart = charEnd;
+ }
+
+ // If we found text, emit that text
+ if (textHandler.hasText() &&
+ (!match.hasMatch() || match.textLength > 0)) {
+ textHandler.buildTextToken(match, sourceId);
+ }
+
+ // Move the read/peek cursor to the end of the token, abort if an error
+ // happens while doing so
+ if (match.hasMatch()) {
+ // Make sure we have a valid location
+ if (match.token.location.getEnd() == InvalidSourceOffset) {
+ throw OusiaException{"Token end position offset out of range"};
+ }
+
+ // Seek to the end of the current token
+ const size_t end = match.token.location.getEnd();
+ if (read) {
+ reader.seek(end);
+ } else {
+ reader.seekPeekCursor(end);
+ }
+ token = match.token;
+ } else {
+ token = DynamicToken{};
+ }
+ return match.hasMatch();
+}
+
+bool DynamicTokenizer::read(DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, true>(token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, true>(token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, true>(token);
+ }
+ return false;
+}
+
+bool DynamicTokenizer::peek(DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, false>(token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, false>(token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, false>(token);
+ }
+ return false;
}
+TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+{
+ // Abort if an empty token should be registered
+ if (token.empty()) {
+ return EmptyToken;
+ }
+
+ // Search for a new slot in the tokens list
+ TokenTypeId type = EmptyToken;
+ for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+ if (tokens[i].empty()) {
+ tokens[i] = token;
+ type = i;
+ break;
+ }
+ }
-/* Constant initializations */
+ // No existing slot was found, add a new one -- make sure we do not
+ // override the special token type handles
+ if (type == EmptyToken) {
+ type = tokens.size();
+ if (type == TextToken || type == EmptyToken) {
+ throw OusiaException{"Token type ids depleted!"};
+ }
+ tokens.emplace_back(token);
+ }
+ nextTokenTypeId = type + 1;
-static const TokenDescriptor Empty;
-static const TokenDescriptor Text;
-static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
-static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+ // Try to register the token in the trie -- if this fails, remove it
+ // from the tokens list
+ if (!trie.registerToken(token, type)) {
+ tokens[type] = std::string();
+ nextTokenTypeId = type;
+ return EmptyToken;
+ }
+ return type;
+}
+
+bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+{
+ // Unregister the token from the trie, abort if an invalid type is given
+ if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+ tokens[type] = std::string{};
+ nextTokenTypeId = type;
+ return true;
+ }
+ return false;
+}
+
+std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+{
+ if (type < tokens.size()) {
+ return tokens[type];
+ }
+ return std::string{};
+}
+
+void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+{
+ whitespaceMode = mode;
+}
+WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+/* Explicitly instantiate all possible instantiations of the "next" member
+ function */
+template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+ DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+ DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
+ DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingTextHandler, true>(
+ DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
+ DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
+ DynamicToken &token);
}
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index f7fef13..760bebf 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -28,34 +28,63 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#include <set>
+#include <string>
+#include <vector>
+
#include <core/common/Location.hpp>
+#include "TokenTrie.hpp"
+
namespace ousia {
// Forward declarations
class CharReader;
-class TokenDescriptor;
/**
* The DynamicToken structure describes a token discovered by the Tokenizer.
*/
struct DynamicToken {
/**
- * Pointer pointing at the TokenDescriptor instance this token corresponds
- * to. May be one of the special TokenDescriptors defined as static members
- * of the DynamicTokenizer class.
+ * Id of the type of this token.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* String that was matched.
*/
- std::string str;
+ std::string content;
/**
* Location from which the string was extracted.
*/
SourceLocation location;
+
+ /**
+ * Default constructor.
+ */
+ DynamicToken() : type(EmptyToken) {}
+
+ /**
+ * Constructor of the DynamicToken struct.
+ *
+ * @param id represents the token type.
+ * @param content is the string content that has been extracted.
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ DynamicToken(TokenTypeId type, const std::string &content,
+ SourceLocation location)
+ : type(type), content(content), location(location)
+ {
+ }
+
+ /**
+ * Constructor of the DynamicToken struct, only initializes the token type
+ *
+ * @param type is the id corresponding to the type of the token.
+ */
+ DynamicToken(TokenTypeId type) : type(type) {}
};
/**
@@ -64,43 +93,70 @@ struct DynamicToken {
*/
enum class WhitespaceMode {
/**
- * Preserves all whitespaces as they are found in the source file.
- */
+ * Preserves all whitespaces as they are found in the source file.
+ */
PRESERVE,
/**
- * Trims whitespace at the beginning and the end of the found text.
- */
+ * Trims whitespace at the beginning and the end of the found text.
+ */
TRIM,
/**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
COLLAPSE
};
/**
* The DynamicTokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters.
+ * to modify the handling of whitespace characters. Note that the
+ * DynamicTokenizer always tries to extract the longest possible token from the
+ * tokenizer.
*/
class DynamicTokenizer {
private:
/**
- * Reference at the char reader.
+ * CharReader instance from which the tokens should be read.
*/
CharReader &reader;
/**
+ * Internally used token trie. This object holds all registered tokens.
+ */
+ TokenTrie trie;
+
+ /**
* Flag defining whether whitespaces should be preserved or not.
*/
WhitespaceMode whitespaceMode;
/**
- * Vector containing all registered token descriptors.
+ * Vector containing all registered token types.
+ */
+ std::vector<std::string> tokens;
+
+ /**
+ * Next index in the tokens list where to search for a new token id.
*/
- std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+ size_t nextTokenTypeId;
+
+ /**
+ * Templated function used internally to read the current token. The
+ * function is templated in order to force code generation for all six
+ * combiations of whitespace modes and reading/peeking.
+ *
+ * @tparam TextHandler is the type to be used for the textHandler instance.
+ * @tparam read specifies whether the function should start from and advance
+ * the read pointer of the char reader.
+ * @param token is the token structure into which the token information
+ * should be written.
+ * @return false if the end of the stream has been reached, true otherwise.
+ */
+ template <typename TextHandler, bool read>
+ bool next(DynamicToken &token);
public:
/**
@@ -108,43 +164,44 @@ public:
*
* @param reader is the CharReader that should be used for reading the
* tokens.
- * @param preserveWhitespaces should be set to true if all whitespaces
- * should be preserved (for preformated environments).
- */
- DynamicTokenizer(CharReader &reader)
- : reader(reader),
- preserveWhitespaces(preserveWhitespaces),
- location(reader.getSourceId()),
- empty(true),
- hasWhitespace(false)
- {
- }
-
- /**
- * Destructor of the DynamicTokenizer class.
+ * @param whitespaceMode specifies how whitespace should be handled.
*/
- ~DynamicTokenizer();
+ DynamicTokenizer(CharReader &reader,
+ WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* Registers the given string as a token. Returns a const pointer at a
* TokenDescriptor that will be used to reference the newly created token.
*
* @param token is the token string that should be registered.
- * @return a pointer at a TokenDescriptor which is representative for the
- * newly registered token. Returns nullptr if a token with this string
- * was already registered.
+ * @return a unique identifier for the registered token or EmptyToken if
+ * an error occured.
*/
- const TokenDescriptor* registerToken(const std::string &token);
+ TokenTypeId registerToken(const std::string &token);
/**
- * Unregisters the token belonging to the given TokenDescriptor.
+ * Unregisters the token belonging to the given TokenTypeId.
*
- * @param descr is a TokenDescriptor that was previously returned by
- * registerToken.
+ * @param type is the token type that should be unregistered. The
+ *TokenTypeId
+ * must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
* because the given TokenDescriptor was already unregistered).
*/
- bool unregisterToken(const TokenDescriptor *descr);
+ bool unregisterToken(TokenTypeId type);
+
+ /**
+ * Returns the token that was registered under the given TokenTypeId id or
+ *an
+ * empty string if an invalid TokenTypeId id is given.
+ *
+ * @param type is the TokenTypeId id for which the corresponding token
+ *string
+ * should be returned.
+ * @return the registered token string or an empty string if the given type
+ * was invalid.
+ */
+ std::string getTokenString(TokenTypeId type);
/**
* Sets the whitespace mode.
@@ -173,17 +230,16 @@ public:
bool read(DynamicToken &token);
/**
- * TokenDescriptor representing an empty token.
- */
- static const *TokenDescriptor Empty;
-
- /**
- * TokenDescriptor representing generic text.
+ * The peek method does not advance the read position of the char reader,
+ * but reads the next token from the current char reader peek position.
+ *
+ * @param token is a reference at the token instance into which the Token
+ * information should be written.
+ * @return true if a token could be read, false if the end of the stream
+ * has been reached.
*/
- static const *TokenDescriptor Text;
-
+ bool peek(DynamicToken &token);
};
-
}
#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
diff --git a/test/plugins/plain/DynamicTokenizerTest.cpp b/test/plugins/plain/DynamicTokenizerTest.cpp
index e69de29..63fa466 100644
--- a/test/plugins/plain/DynamicTokenizerTest.cpp
+++ b/test/plugins/plain/DynamicTokenizerTest.cpp
@@ -0,0 +1,416 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <core/common/CharReader.hpp>
+#include <plugins/plain/DynamicTokenizer.hpp>
+
+namespace ousia {
+
+TEST(DynamicTokenizer, tokenRegistration)
+{
+ CharReader reader{"test"};
+ DynamicTokenizer tokenizer{reader};
+
+ ASSERT_EQ(EmptyToken, tokenizer.registerToken(""));
+
+ ASSERT_EQ(0U, tokenizer.registerToken("a"));
+ ASSERT_EQ(EmptyToken, tokenizer.registerToken("a"));
+ ASSERT_EQ("a", tokenizer.getTokenString(0U));
+
+ ASSERT_EQ(1U, tokenizer.registerToken("b"));
+ ASSERT_EQ(EmptyToken, tokenizer.registerToken("b"));
+ ASSERT_EQ("b", tokenizer.getTokenString(1U));
+
+ ASSERT_EQ(2U, tokenizer.registerToken("c"));
+ ASSERT_EQ(EmptyToken, tokenizer.registerToken("c"));
+ ASSERT_EQ("c", tokenizer.getTokenString(2U));
+
+ ASSERT_TRUE(tokenizer.unregisterToken(1U));
+ ASSERT_FALSE(tokenizer.unregisterToken(1U));
+ ASSERT_EQ("", tokenizer.getTokenString(1U));
+
+ ASSERT_EQ(1U, tokenizer.registerToken("d"));
+ ASSERT_EQ(EmptyToken, tokenizer.registerToken("d"));
+ ASSERT_EQ("d", tokenizer.getTokenString(1U));
+}
+
+TEST(DynamicTokenizer, textTokenPreserveWhitespace)
+{
+ {
+ CharReader reader{" this \t is only a \n\n test text "};
+ // 012345 6789012345678 9 0123456789012345
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ(" this \t is only a \n\n test text ", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(36U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+
+ {
+ CharReader reader{"this \t is only a \n\n test text"};
+ // 01234 5678901234567 8 9012345678901
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::PRESERVE};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(32U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+}
+
+TEST(DynamicTokenizer, textTokenTrimWhitespace)
+{
+ {
+ CharReader reader{" this \t is only a \n\n test text "};
+ // 012345 6789012345678 9 0123456789012345
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(1U, loc.getStart());
+ ASSERT_EQ(33U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+
+ {
+ CharReader reader{"this \t is only a \n\n test text"};
+ // 01234 5678901234567 8 9012345678901
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::TRIM};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(32U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+}
+
+TEST(DynamicTokenizer, textTokenCollapseWhitespace)
+{
+ {
+ CharReader reader{" this \t is only a \n\n test text "};
+ // 012345 6789012345678 9 0123456789012345
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("this is only a test text", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(1U, loc.getStart());
+ ASSERT_EQ(33U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+
+ {
+ CharReader reader{"this \t is only a \n\n test text"};
+ // 01234 5678901234567 8 9012345678901
+ // 0 1 2 3
+ DynamicTokenizer tokenizer{reader, WhitespaceMode::COLLAPSE};
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("this is only a test text", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(32U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+ }
+}
+
+TEST(DynamicTokenizer, simpleReadToken)
+{
+ CharReader reader{"test1:test2"};
+ DynamicTokenizer tokenizer{reader};
+
+ const TokenTypeId tid = tokenizer.registerToken(":");
+ ASSERT_EQ(0U, tid);
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test1", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(5U, loc.getEnd());
+
+ char c;
+ ASSERT_TRUE(reader.peek(c));
+ ASSERT_EQ(':', c);
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(tid, token.type);
+ ASSERT_EQ(":", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(5U, loc.getStart());
+ ASSERT_EQ(6U, loc.getEnd());
+
+ char c;
+ ASSERT_TRUE(reader.peek(c));
+ ASSERT_EQ('t', c);
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test2", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(6U, loc.getStart());
+ ASSERT_EQ(11U, loc.getEnd());
+
+ char c;
+ ASSERT_FALSE(reader.peek(c));
+ }
+}
+
+TEST(DynamicTokenizer, simplePeekToken)
+{
+ CharReader reader{"test1:test2"};
+ DynamicTokenizer tokenizer{reader};
+
+ const TokenTypeId tid = tokenizer.registerToken(":");
+ ASSERT_EQ(0U, tid);
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.peek(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test1", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(5U, loc.getEnd());
+ ASSERT_EQ(0U, reader.getOffset());
+ ASSERT_EQ(5U, reader.getPeekOffset());
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.peek(token));
+
+ ASSERT_EQ(tid, token.type);
+ ASSERT_EQ(":", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(5U, loc.getStart());
+ ASSERT_EQ(6U, loc.getEnd());
+ ASSERT_EQ(0U, reader.getOffset());
+ ASSERT_EQ(6U, reader.getPeekOffset());
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.peek(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test2", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(6U, loc.getStart());
+ ASSERT_EQ(11U, loc.getEnd());
+ ASSERT_EQ(0U, reader.getOffset());
+ ASSERT_EQ(11U, reader.getPeekOffset());
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test1", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(5U, loc.getEnd());
+ ASSERT_EQ(5U, reader.getOffset());
+ ASSERT_EQ(5U, reader.getPeekOffset());
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(tid, token.type);
+ ASSERT_EQ(":", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(5U, loc.getStart());
+ ASSERT_EQ(6U, loc.getEnd());
+ ASSERT_EQ(6U, reader.getOffset());
+ ASSERT_EQ(6U, reader.getPeekOffset());
+ }
+
+ {
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("test2", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(6U, loc.getStart());
+ ASSERT_EQ(11U, loc.getEnd());
+ ASSERT_EQ(11U, reader.getOffset());
+ ASSERT_EQ(11U, reader.getPeekOffset());
+ }
+}
+
+TEST(DynamicTokenizer, ambiguousTokens)
+{
+ CharReader reader{"abc"};
+ DynamicTokenizer tokenizer(reader);
+
+ TokenTypeId t1 = tokenizer.registerToken("abd");
+ TokenTypeId t2 = tokenizer.registerToken("bc");
+
+ ASSERT_EQ(0U, t1);
+ ASSERT_EQ(1U, t2);
+
+ DynamicToken token;
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(TextToken, token.type);
+ ASSERT_EQ("a", token.content);
+
+ SourceLocation loc = token.location;
+ ASSERT_EQ(0U, loc.getStart());
+ ASSERT_EQ(1U, loc.getEnd());
+
+ ASSERT_TRUE(tokenizer.read(token));
+
+ ASSERT_EQ(t2, token.type);
+ ASSERT_EQ("bc", token.content);
+
+ loc = token.location;
+ ASSERT_EQ(1U, loc.getStart());
+ ASSERT_EQ(3U, loc.getEnd());
+
+ ASSERT_FALSE(tokenizer.read(token));
+}
+
+TEST(DynamicTokenizer, commentTestWhitespacePreserve)
+{
+ CharReader reader{"Test/Test /* Block Comment */", 0};
+ // 012345678901234567890123456789
+ // 0 1 2
+ DynamicTokenizer tokenizer(reader, WhitespaceMode::PRESERVE);
+
+ const TokenTypeId t1 = tokenizer.registerToken("/");
+ const TokenTypeId t2 = tokenizer.registerToken("/*");
+ const TokenTypeId t3 = tokenizer.registerToken("*/");
+
+ std::vector<DynamicToken> expected = {
+ {TextToken, "Test", SourceLocation{0, 0, 4}},
+ {t1, "/", SourceLocation{0, 4, 5}},
+ {TextToken, "Test ", SourceLocation{0, 5, 10}},
+ {t2, "/*", SourceLocation{0, 10, 12}},
+ {TextToken, " Block Comment ", SourceLocation{0, 12, 27}},
+ {t3, "*/", SourceLocation{0, 27, 29}}};
+
+ DynamicToken t;
+ for (auto &te : expected) {
+ EXPECT_TRUE(tokenizer.read(t));
+ EXPECT_EQ(te.type, t.type);
+ EXPECT_EQ(te.content, t.content);
+ EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
+ EXPECT_EQ(te.location.getStart(), t.location.getStart());
+ EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
+ }
+ ASSERT_FALSE(tokenizer.read(t));
+}
+
+TEST(DynamicTokenizer, commentTestWhitespaceCollapse)
+{
+ CharReader reader{"Test/Test /* Block Comment */", 0};
+ // 012345678901234567890123456789
+ // 0 1 2
+ DynamicTokenizer tokenizer(reader, WhitespaceMode::COLLAPSE);
+
+ const TokenTypeId t1 = tokenizer.registerToken("/");
+ const TokenTypeId t2 = tokenizer.registerToken("/*");
+ const TokenTypeId t3 = tokenizer.registerToken("*/");
+
+ std::vector<DynamicToken> expected = {
+ {TextToken, "Test", SourceLocation{0, 0, 4}},
+ {t1, "/", SourceLocation{0, 4, 5}},
+ {TextToken, "Test", SourceLocation{0, 5, 9}},
+ {t2, "/*", SourceLocation{0, 10, 12}},
+ {TextToken, "Block Comment", SourceLocation{0, 13, 26}},
+ {t3, "*/", SourceLocation{0, 27, 29}}};
+
+ DynamicToken t;
+ for (auto &te : expected) {
+ EXPECT_TRUE(tokenizer.read(t));
+ EXPECT_EQ(te.type, t.type);
+ EXPECT_EQ(te.content, t.content);
+ EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
+ EXPECT_EQ(te.location.getStart(), t.location.getStart());
+ EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
+ }
+ ASSERT_FALSE(tokenizer.read(t));
+}
+
+}
+