summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/core/parser/utils/TokenizedData.cpp365
-rw-r--r--src/core/parser/utils/TokenizedData.hpp189
-rw-r--r--test/core/parser/utils/TokenizedDataTest.cpp526
4 files changed, 1082 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f9a47d2..ea5c3aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,6 +190,7 @@ ADD_LIBRARY(ousia_core
src/core/parser/stack/TypesystemHandler
src/core/parser/utils/SourceOffsetVector
src/core/parser/utils/Token
+ src/core/parser/utils/TokenizedData
src/core/parser/utils/Tokenizer
src/core/parser/utils/TokenTrie
src/core/resource/Resource
@@ -325,6 +326,7 @@ IF(TEST)
test/core/parser/stack/StackTest
test/core/parser/stack/StateTest
test/core/parser/utils/SourceOffsetVectorTest
+ test/core/parser/utils/TokenizedDataTest
test/core/parser/utils/TokenizerTest
test/core/parser/utils/TokenTrieTest
test/core/resource/ResourceLocatorTest
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
new file mode 100644
index 0000000..fc7bfaf
--- /dev/null
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -0,0 +1,365 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include <core/common/Utils.hpp>
+
+#include "SourceOffsetVector.hpp"
+#include "TokenizedData.hpp"
+
+namespace ousia {
+namespace {
+/**
+ * Structure used to represent the position of a token in the internal
+ * character buffer.
+ */
+struct TokenMark {
+ /**
+ * Relative position of the token in the buffer.
+ */
+ size_t bufStart;
+
+ /**
+ * TokenId of the associated token.
+ */
+ TokenId id;
+
+ /**
+ * Length of the token.
+ */
+ TokenLength len;
+
+ /**
+ * Constructor of the TokenMark structure, initializes all members with the
+ * given values.
+ *
+ * @param id is the id of the token that is marked here.
+ * @param bufStart is the start position of the TokenMark in the internal
+ * character buffer.
+ * @param len is the length of the token.
+ */
+ TokenMark(TokenId id, size_t bufStart, TokenLength len)
+ : bufStart(bufStart), id(id), len(len)
+ {
+ }
+
+ /**
+ * Creates a dummy TokenMark instance used for comparison purposes. Due to
+ * its maximum length, this TokenMark will always sort before any real
+ * TokenMark with the same start offset.
+ *
+ * @param bufStart is the start position of the TokenMark in the internal
+ * character buffer.
+ */
+ TokenMark(size_t bufStart)
+ : bufStart(bufStart),
+ id(Tokens::Empty),
+ len(std::numeric_limits<TokenLength>::max())
+ {
+ }
+
+ /**
+ * Operator used for sorting TokenMark instances. They are sorted in such a
+ * way that the instances with smallest bufStart come first and for equal
+ * bufStart values instances with the largest length come first.
+ *
+ * @param m1 is the left-hand side TokenMark instance of the comparison.
+ * @param m2 is the right-hand side TokenMark instance of the comparison.
+ */
+ friend bool operator<(const TokenMark &m1, const TokenMark &m2)
+ {
+ return (m1.bufStart < m2.bufStart) ||
+ (m1.bufStart == m2.bufStart && m1.len > m2.len);
+ }
+};
+}
+
+/**
+ * Structure used to hold all the internal data structures that may be
+ * exchanged between TokenizedData instances.
+ */
+class TokenizedDataImpl {
+private:
+ /**
+ * SourceId representing the source file from which the current content is
+ * being read.
+ */
+ SourceId sourceId;
+
+ /**
+ * Buffer containing the actual character data.
+ */
+ std::vector<char> buf;
+
+ /**
+ * Vector containing all token marks.
+ */
+ std::vector<TokenMark> marks;
+
+ /**
+ * Vector storing all the character offsets efficiently.
+ */
+ SourceOffsetVector offsets;
+
+ /**
+ * Flag indicating whether the internal "marks" vector is sorted.
+ */
+ bool sorted;
+
+public:
+ /**
+ * Constructor of TokenizedDataImpl. Takes the SourceId that should be used
+ * for returned tokens.
+ *
+ * @param sourceId is the source identifier that should be used for
+ * constructing the location when returning tokens.
+ */
+ TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+
+ /**
+ * Appends a complete string to the internal character buffer and extends
+ * the text regions in the regions map.
+ *
+ * @param data is the string that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @return the current size of the internal byte buffer. The returned value
+ * is intended to be used for the "mark" function.
+ */
+ size_t append(const std::string &data, SourceOffset offsStart)
+ { // Append the data to the internal buffer
+ buf.insert(buf.end(), data.begin(), data.end());
+
+ // Extend the text regions, interpolate the source position (this may
+ // yield incorrect results)
+ const size_t size = buf.size();
+ for (SourceOffset offs = offsStart; offs < offsStart + data.size();
+ offs++) {
+ offsets.storeOffset(offs, offs + 1);
+ }
+
+ return size;
+ }
+
+ /**
+ * Appends a single character to the internal character buffer and extends
+ * the text regions in the regions map.
+ *
+ * @param c is the character that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @param offsEnd is the end offset in bytes in the input file.
+ * @return the current size of the internal byte buffer. The returned value
+ * is intended to be used for the "mark" function.
+ */
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+ {
+ // Add the character to the list and store the location of the character
+ // in the source file
+ buf.push_back(c);
+ offsets.storeOffset(offsStart, offsEnd);
+ return buf.size();
+ }
+
+ /**
+ * Stores a token at the given position.
+ *
+ * @param id is the token that should be stored.
+ * @param bufStart is the start position in the internal buffer. Use the
+ * values returned by append to calculate the start position.
+ * @param len is the length of the token.
+ */
+ void mark(TokenId id, size_t bufStart, TokenLength len)
+ {
+ // Push the new instance back onto the list
+ marks.emplace_back(id, bufStart, len);
+
+ // Update the sorted flag as soon as more than one element is in the
+ // list
+ if (marks.size() > 1U) {
+ sorted = sorted && *(marks.end() - 2) < *(marks.end() - 1);
+ }
+ }
+
+ /**
+ * Returns the next token or a text token if no explicit token is available.
+ * Advances the given cursor to the end of the returned token.
+ *
+ * @param token is the Token instance to which the token should be written.
+ * @param mode is the WhitespaceMode to be used for processing the
+ * extracted text.
+ * @param tokens is a set of enabled tokens. Tokens that are not in this set
+ * are ignored and returned as text.
+ * @param cursor is the position in the character buffer from which on the
+ * next token should be read. The cursor will be updated to the position
+ * beyond the returned token.
+ * @return true if a token was returned, false if no more tokens are
+ * available.
+ */
+ bool next(Token &token, WhitespaceMode mode,
+ const std::unordered_set<TokenId> &tokens, size_t &cursor)
+ {
+ // Sort the "marks" vector if it has not been sorted yet.
+ if (!sorted) {
+ std::sort(marks.begin(), marks.end());
+ sorted = true;
+ }
+
+ // Fetch the next larger TokenMark instance, make sure the token is in
+ // the "enabled" list
+ auto it =
+ std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
+ while (it != marks.end() && tokens.count(it->id) == 0) {
+ it++;
+ }
+
+ // Calculate the buffer start and end character, based on the returned
+ // TokenMark instance
+ const size_t end = (it != marks.end()) ? it->bufStart : buf.size();
+
+ // Depending on the whitespace mode, fetch all the data between the
+ // cursor position and the calculated end position and return a token
+ // containing that data.
+ if (cursor < end && cursor < buf.size()) {
+ switch (mode) {
+ case WhitespaceMode::PRESERVE: {
+ token = Token(
+ Tokens::Data, std::string(&buf[cursor], end - cursor),
+ SourceLocation(sourceId,
+ offsets.loadOffset(cursor).first,
+ offsets.loadOffset(end).first));
+ cursor = end;
+ return true;
+ }
+ case WhitespaceMode::TRIM:
+ case WhitespaceMode::COLLAPSE: {
+ // Calculate the collapsed string and the corresponding
+ // trimmed region
+ size_t stringStart;
+ size_t stringEnd;
+ std::string content;
+ if (mode == WhitespaceMode::TRIM) {
+ content = Utils::trim(&buf[cursor], end - cursor,
+ stringStart, stringEnd);
+ } else {
+ content = Utils::collapse(&buf[cursor], end - cursor,
+ stringStart, stringEnd);
+ }
+
+ // If the resulting string is empty (only whitespaces),
+ // abort
+ if (content.empty()) {
+ cursor = end;
+ break;
+ }
+
+ // Calculate the absolute positions and return the token
+ stringStart += cursor;
+ stringEnd += cursor;
+ token = Token(
+ Tokens::Data, content,
+ SourceLocation(sourceId,
+ offsets.loadOffset(stringStart).first,
+ offsets.loadOffset(stringEnd).first));
+ cursor = end;
+ return true;
+ }
+ }
+ }
+
+ // If start equals end, we're currently directly at a token
+ // instance. Return this token and advance the cursor to the end of
+ // the token.
+ if (cursor == end && it != marks.end()) {
+ const size_t tokenStart = it->bufStart;
+ const size_t tokenEnd = it->bufStart + it->len;
+ token = Token(
+ it->id, std::string(&buf[tokenStart], it->len),
+ SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
+ offsets.loadOffset(tokenEnd).first));
+ cursor = tokenEnd;
+ return true;
+ }
+
+ // We've failed. There is no token. Only void. Reset token and return
+ // false.
+ token = Token();
+ return false;
+ }
+
+ /**
+ * Returns the current size of the internal buffer.
+ *
+ * @return the size of the internal character buffer.
+ */
+ size_t getSize() { return buf.size(); }
+};
+
+/* Class TokenizedData */
+
+TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
+
+TokenizedData::TokenizedData(SourceId sourceId)
+ : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+{
+}
+
+TokenizedData::~TokenizedData() {}
+
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+{
+ return impl->append(data, offsStart);
+}
+
+size_t TokenizedData::append(char c, SourceOffset offsStart,
+ SourceOffset offsEnd)
+{
+ return impl->append(c, offsStart, offsEnd);
+}
+
+void TokenizedData::mark(TokenId id, TokenLength len)
+{
+ impl->mark(id, impl->getSize() - len, len);
+}
+
+void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
+{
+ impl->mark(id, bufStart, len);
+}
+
+bool TokenizedData::next(Token &token, WhitespaceMode mode)
+{
+ return impl->next(token, mode, tokens, cursor);
+}
+
+bool TokenizedData::text(Token &token, WhitespaceMode mode)
+{
+ // Copy the current cursor position to not update the actual cursor position
+ // if the operation was not successful
+ size_t cursorCopy = cursor;
+ if (!impl->next(token, mode, tokens, cursorCopy) ||
+ token.id != Tokens::Data) {
+ return false;
+ }
+
+ // There is indeed a text token, update the internal cursor position
+ cursor = cursorCopy;
+ return true;
+}
+}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
new file mode 100644
index 0000000..38125c4
--- /dev/null
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -0,0 +1,189 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file TokenizedData.hpp
+ *
+ * The TokenizedData class defined in this file stores string data extracted
+ * from a document including user defined tokens. Tokens can be dynamically
+ * enabled and disabled, and the data up to the next enabled token can be
+ * returned. Additionally, the data provided by the TokenizedData class is
+ * processed according to a whitespace mode that can be dynamically updated.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_TOKENIZED_DATA_HPP_
+#define _OUSIA_TOKENIZED_DATA_HPP_
+
+#include <cstdint>
+#include <memory>
+#include <unordered_set>
+
+#include <core/common/Location.hpp>
+#include <core/common/Whitespace.hpp>
+
+#include "Token.hpp"
+
+namespace ousia {
+
+// Forward declaration
+class TokenizedDataImpl;
+
+/**
+ * The TokenizedData class stores data extracted from a user defined document.
+ * As users are capable of defining their own tokens and these are only valid
+ * in certain scopes TokenizedData allows to divide the stored data into chunks
+ * separated by tokens.
+ */
+class TokenizedData {
+private:
+ /**
+ * Shared pointer pointing at the internal data. This data is shared when
+ * copying TokenizedData instances, which corresponds to forking a
+ * TokenizedData instance.
+ */
+ std::shared_ptr<TokenizedDataImpl> impl;
+
+ /**
+ * Contains all currently enabled token ids.
+ */
+ std::unordered_set<TokenId> tokens;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ * This information is not shared with the other instances of TokenizedData
+ * pointing at the same location.
+ */
+ size_t cursor;
+
+public:
+ /**
+ * Default constructor, creates a new instance of TokenizedData, sets the
+ * internal SourceId to the InvalidSourceId constant.
+ */
+ TokenizedData();
+
+ /**
+ * Creates a new instance of TokenizedData, takes a SourceId.
+ *
+ * @param sourceId is the source identifier that should be used for
+ * constructing the location when returning tokens.
+ */
+ TokenizedData(SourceId sourceId);
+
+ /**
+ * Destructor. Needs to be defined explicitly for freeing a shared pointer
+ * of the incomplete TokenizedDataImpl type.
+ */
+ ~TokenizedData();
+
+ /**
+ * Appends a complete string to the internal character buffer. Note that the
+ * start and end positions for each character in the given data string will
+ * be interpolated and may thus be incorrect (e.g. when multi-character
+ * linebreaks or multi-character characters (not handled now) are read).
+ *
+ * @param data is the string that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @return the current size of the internal byte buffer. The returned value
+ * is intended to be used for the "mark" function.
+ */
+ size_t append(const std::string &data, SourceOffset offsStart = 0);
+
+ /**
+ * Appends a single character to the internal character buffer.
+ *
+ * @param c is the character that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @param offsEnd is the end offset in bytes in the input file.
+ * @return the current size of the internal byte buffer. The returned value
+ * is intended to be used for the "mark" function.
+ */
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+
+ /**
+ * Stores a token ending at the last character of the current buffer.
+ *
+ * @param id is the id of the token for which the mark should be stored.
+ * @param len is the length of the token.
+ */
+ void mark(TokenId id, TokenLength len);
+
+ /**
+ * Stores a token at the given position.
+ *
+ * @param id is the id of the token for which the mark should be stored.
+ * @param bufStart is the start position in the internal buffer. Use the
+ * values returned by append to calculate the start position.
+ * @param len is the length of the token.
+ */
+ void mark(TokenId id, size_t bufStart, TokenLength len);
+
+ /**
+ * Enables a single token id. Enabled tokens will no longer be returned as
+ * text. Instead, when querying for the next token, TokenizedData will
+ * return them as token and not as part of a Text token.
+ *
+ * @param id is the TokenId of the token that should be enabled.
+ */
+ void enableToken(TokenId id) { tokens.insert(id); }
+
+ /**
+ * Enables a set of token ids. Enabled tokens will no longer be returned as
+ * text. Instead, when querying for the next token, TokenizedData will
+ * return them as token and not as part of a Text token.
+ *
+ * @param ids is the set of TokenIds of the tokens that should be enabled.
+ */
+ void enableToken(const std::unordered_set<TokenId> &ids)
+ {
+ tokens.insert(ids.begin(), ids.end());
+ }
+
+ /**
+ * Stores the next token in the given token reference, returns true if the
+ * operation was successful, false if there are no more tokens.
+ *
+ * @param token is an output parameter into which the read token will be
+ * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param mode is the whitespace mode that should be used when a text token
+ * is returned.
+ * @return true if the operation was successful and there is a next token,
+ * false if there are no more tokens.
+ */
+ bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+
+ /**
+ * Stores the next text token in the given token reference, returns true if
+ * the operation was successful (there was indeed a text token), false if
+ * the next token is not a text token or there were no more tokens.
+ *
+ * @param token is an output parameter into which the read token will be
+ * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param mode is the whitespace mode that should be used when a text token
+ * is returned.
+ * @return true if the operation was successful and there is a next token,
+ * false if there are no more tokens.
+ */
+ bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+};
+}
+
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
+
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
new file mode 100644
index 0000000..231bad9
--- /dev/null
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -0,0 +1,526 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <core/parser/utils/TokenizedData.hpp>
+
+namespace ousia {
+
+TEST(TokenizedData, dataWhitespacePreserve)
+{
+ TokenizedData data;
+ ASSERT_EQ(16U, data.append(" test1 test2 "));
+ // 0123456789012345
+ // 0 1
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" test1 test2 ", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(16U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, dataWhitespaceTrim)
+{
+ TokenizedData data;
+ ASSERT_EQ(16U, data.append(" test1 test2 "));
+ // 0123456789012345
+ // 0 1
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("test1 test2", token.content);
+ EXPECT_EQ(1U, token.getLocation().getStart());
+ EXPECT_EQ(14U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+}
+
+TEST(TokenizedData, dataWhitespaceCollapse)
+{
+ TokenizedData data;
+ ASSERT_EQ(16U, data.append(" test1 test2 "));
+ // 0123456789012345
+ // 0 1
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("test1 test2", token.content);
+ EXPECT_EQ(1U, token.getLocation().getStart());
+ EXPECT_EQ(14U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+}
+
+TEST(TokenizedData, singleToken)
+{
+ TokenizedData data;
+ ASSERT_EQ(2U, data.append("$$"));
+ data.mark(5, 0, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, singleDisabledToken)
+{
+ TokenizedData data;
+ ASSERT_EQ(2U, data.append("$$"));
+ data.mark(5, 0, 2);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, dualToken)
+{
+ TokenizedData data;
+ ASSERT_EQ(2U, data.append("$$"));
+ data.mark(6, 0, 1);
+ data.mark(5, 0, 2);
+ data.mark(6, 1, 1);
+
+ data.enableToken(5);
+ data.enableToken(6);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, dualTokenShorterEnabled)
+{
+ TokenizedData data;
+ ASSERT_EQ(2U, data.append("$$"));
+ data.mark(6, 0, 1);
+ data.mark(5, 0, 2);
+ data.mark(6, 1, 1);
+
+ data.enableToken(6);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(6U, token.id);
+ EXPECT_EQ("$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(1U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(6U, token.id);
+ EXPECT_EQ("$", token.content);
+ EXPECT_EQ(1U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, dualTokenLongerEnabled)
+{
+ TokenizedData data;
+ ASSERT_EQ(2U, data.append("$$"));
+ data.mark(5, 0, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, tokensAndDataPreserveWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ test $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" test ", token.content);
+ EXPECT_EQ(2U, token.getLocation().getStart());
+ EXPECT_EQ(8U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, tokensAndDataTrimWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ test $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("test", token.content);
+ EXPECT_EQ(3U, token.getLocation().getStart());
+ EXPECT_EQ(7U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+}
+
+TEST(TokenizedData, tokensAndDataCollapseWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ test $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("test", token.content);
+ EXPECT_EQ(3U, token.getLocation().getStart());
+ EXPECT_EQ(7U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+}
+
+TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" ", token.content);
+ EXPECT_EQ(2U, token.getLocation().getStart());
+ EXPECT_EQ(8U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+}
+
+TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(10U, data.append("$$ $$"));
+ // 0123456789
+ data.mark(5, 0, 2);
+ data.mark(5, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(8U, token.getLocation().getStart());
+ EXPECT_EQ(10U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+}
+
+TEST(TokenizedData, textPreserveWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(6U, data.append(" $$ "));
+ // 012345
+ data.mark(5, 2, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" ", token.content);
+ EXPECT_EQ(0U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(2U, token.getLocation().getStart());
+ EXPECT_EQ(4U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" ", token.content);
+ EXPECT_EQ(4U, token.getLocation().getStart());
+ EXPECT_EQ(6U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE));
+ ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+}
+
+TEST(TokenizedData, textTrimWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(6U, data.append(" $$ "));
+ // 012345
+ data.mark(5, 2, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(2U, token.getLocation().getStart());
+ EXPECT_EQ(4U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
+ ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+}
+
+TEST(TokenizedData, textCollapseWhitespace)
+{
+ TokenizedData data;
+ ASSERT_EQ(6U, data.append(" $$ "));
+ // 012345
+ data.mark(5, 2, 2);
+
+ data.enableToken(5);
+
+ Token token;
+ ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(5U, token.id);
+ EXPECT_EQ("$$", token.content);
+ EXPECT_EQ(2U, token.getLocation().getStart());
+ EXPECT_EQ(4U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+ ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+}
+
+TEST(TokenizedData, appendChars)
+{
+ TokenizedData data;
+ ASSERT_EQ(1U, data.append('t', 5, 7));
+ ASSERT_EQ(2U, data.append('e', 7, 8));
+ ASSERT_EQ(3U, data.append('s', 8, 10));
+ ASSERT_EQ(4U, data.append('t', 10, 12));
+
+ Token token;
+ ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("test", token.content);
+ EXPECT_EQ(5U, token.getLocation().getStart());
+ EXPECT_EQ(12U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+ ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+}
+
+TEST(TokenizedData, copy)
+{
+ TokenizedData data;
+ ASSERT_EQ(7U, data.append(" a $ b "));
+ // 0123456
+ data.mark(6, 3, 1);
+ data.enableToken(6);
+
+ Token token;
+ ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("a", token.content);
+ EXPECT_EQ(1U, token.getLocation().getStart());
+ EXPECT_EQ(2U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+
+ TokenizedData dataCopy = data;
+
+ ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(6U, token.id);
+ EXPECT_EQ("$", token.content);
+ EXPECT_EQ(3U, token.getLocation().getStart());
+ EXPECT_EQ(4U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(6U, token.id);
+ EXPECT_EQ("$", token.content);
+ EXPECT_EQ(3U, token.getLocation().getStart());
+ EXPECT_EQ(4U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+
+ ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ(" b ", token.content);
+ EXPECT_EQ(4U, token.getLocation().getStart());
+ EXPECT_EQ(7U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+ ASSERT_FALSE(data.next(token));
+
+ ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE));
+ EXPECT_EQ(Tokens::Data, token.id);
+ EXPECT_EQ("b", token.content);
+ EXPECT_EQ(5U, token.getLocation().getStart());
+ EXPECT_EQ(6U, token.getLocation().getEnd());
+ EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+ ASSERT_FALSE(dataCopy.next(token));
+}
+}
+