-rw-r--r--  CMakeLists.txt                                108
-rw-r--r--  src/core/common/SourceContextReader.cpp         5
-rw-r--r--  src/core/common/Token.hpp                       6
-rw-r--r--  src/core/common/Utils.cpp                       6
-rw-r--r--  src/core/common/Utils.hpp                      53
-rw-r--r--  src/core/parser/utils/TokenizedData.cpp       286
-rw-r--r--  src/core/parser/utils/TokenizedData.hpp        70
-rw-r--r--  src/core/parser/utils/Tokenizer.cpp             7
-rw-r--r--  src/core/parser/utils/Tokenizer.hpp             2
-rw-r--r--  test/core/parser/utils/TokenizedDataTest.cpp  598
10 files changed, 591 insertions, 550 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54f971c..225e63d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,14 +181,14 @@ ADD_LIBRARY(ousia_core
src/core/parser/ParserContext
src/core/parser/ParserScope
src/core/parser/stack/Callbacks
- src/core/parser/stack/DocumentHandler
- src/core/parser/stack/DomainHandler
- src/core/parser/stack/GenericParserStates
- src/core/parser/stack/Handler
- src/core/parser/stack/ImportIncludeHandler
+# src/core/parser/stack/DocumentHandler
+# src/core/parser/stack/DomainHandler
+# src/core/parser/stack/GenericParserStates
+# src/core/parser/stack/Handler
+# src/core/parser/stack/ImportIncludeHandler
src/core/parser/stack/State
- src/core/parser/stack/Stack
- src/core/parser/stack/TypesystemHandler
+# src/core/parser/stack/Stack
+# src/core/parser/stack/TypesystemHandler
src/core/parser/utils/SourceOffsetVector
src/core/parser/utils/TokenizedData
src/core/parser/utils/Tokenizer
@@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core
# ousia_core
#)
-ADD_LIBRARY(ousia_osml
- src/formats/osml/OsmlParser
- src/formats/osml/OsmlStreamParser
-)
+#ADD_LIBRARY(ousia_osml
+# src/formats/osml/OsmlParser
+# src/formats/osml/OsmlStreamParser
+#)
-TARGET_LINK_LIBRARIES(ousia_osml
- ousia_core
-)
+#TARGET_LINK_LIBRARIES(ousia_osml
+# ousia_core
+#)
ADD_LIBRARY(ousia_osxml
src/formats/osxml/OsxmlAttributeLocator
src/formats/osxml/OsxmlEventParser
- src/formats/osxml/OsxmlParser
+# src/formats/osxml/OsxmlParser
)
TARGET_LINK_LIBRARIES(ousia_osxml
@@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml
# Command line interface
-ADD_EXECUTABLE(ousia
- src/cli/Main
-)
+#ADD_EXECUTABLE(ousia
+# src/cli/Main
+#)
-TARGET_LINK_LIBRARIES(ousia
- ousia_core
- ousia_filesystem
- ousia_html
- ousia_xml
- ousia_osml
- ousia_osxml
- ${Boost_LIBRARIES}
-)
+#TARGET_LINK_LIBRARIES(ousia
+# ousia_core
+# ousia_filesystem
+# ousia_html
+# ousia_xml
+# ousia_osml
+# ousia_osxml
+# ${Boost_LIBRARIES}
+#)
# If testing is enabled, build the unit tests
IF(TEST)
@@ -323,11 +323,11 @@ IF(TEST)
test/core/model/StyleTest
test/core/model/TypesystemTest
test/core/parser/ParserScopeTest
- test/core/parser/stack/StackTest
+# test/core/parser/stack/StackTest
test/core/parser/stack/StateTest
test/core/parser/utils/SourceOffsetVectorTest
test/core/parser/utils/TokenizedDataTest
- test/core/parser/utils/TokenizerTest
+# test/core/parser/utils/TokenizerTest
test/core/parser/utils/TokenTrieTest
test/core/resource/ResourceLocatorTest
test/core/resource/ResourceRequestTest
@@ -383,29 +383,29 @@ IF(TEST)
# ousia_mozjs
# )
- ADD_EXECUTABLE(ousia_test_osml
- test/formats/osml/OsmlParserTest
- test/formats/osml/OsmlStreamParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osml
+# test/formats/osml/OsmlParserTest
+# test/formats/osml/OsmlStreamParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osml
+# ousia_filesystem
+# )
- ADD_EXECUTABLE(ousia_test_osxml
- test/formats/osxml/OsxmlEventParserTest
- test/formats/osxml/OsxmlParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osxml
+# test/formats/osxml/OsxmlEventParserTest
+# test/formats/osxml/OsxmlParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osxml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osxml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osxml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osxml
+# ousia_filesystem
+# )
ADD_EXECUTABLE(ousia_test_xml
test/plugins/xml/XmlOutputTest
@@ -423,8 +423,8 @@ IF(TEST)
ADD_TEST(ousia_test_filesystem ousia_test_filesystem)
ADD_TEST(ousia_test_html ousia_test_html)
# ADD_TEST(ousia_test_mozjs ousia_test_mozjs)
- ADD_TEST(ousia_test_osml ousia_test_osml)
- ADD_TEST(ousia_test_osxml ousia_test_osxml)
+# ADD_TEST(ousia_test_osml ousia_test_osml)
+# ADD_TEST(ousia_test_osxml ousia_test_osxml)
ADD_TEST(ousia_test_xml ousia_test_xml)
ENDIF()
@@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION share/ousia
OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE
)
-INSTALL(TARGETS ousia
- RUNTIME DESTINATION bin
-)
+#INSTALL(TARGETS ousia
+# RUNTIME DESTINATION bin
+#)
IF(INSTALL_GEDIT_HIGHLIGHTER)
INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
index d5d379c..f7dbdf3 100644
--- a/src/core/common/SourceContextReader.cpp
+++ b/src/core/common/SourceContextReader.cpp
@@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader,
ctx.relLen = end - start; // end >= start (I2)
// Remove linebreaks at the beginning and the end
- const std::pair<size_t, size_t> b =
- Utils::trim(lineBuf, Utils::isLinebreak);
+ const std::pair<size_t, size_t> b = Utils::trim(
+ lineBuf,
+ [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
ssize_t s = b.first, e = b.second;
s = std::min(s, static_cast<ssize_t>(ctx.relPos));
diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp
index 07d7c8f..0cf56b0 100644
--- a/src/core/common/Token.hpp
+++ b/src/core/common/Token.hpp
@@ -91,10 +91,10 @@ constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
/**
- * Token which represents an unindentation -- issued if the indentation of
+ * Token which represents a dedentation -- issued if the indentation of
* this line is smaller than the indentation of the previous line.
*/
-constexpr TokenId Unindent = std::numeric_limits<TokenId>::max() - 6;
+constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
/**
* Maximum token id to be used. Tokens allocated for users should not surpass
@@ -165,7 +165,7 @@ struct Token {
* @return true if the TokenId indicates that this token is a "special"
* token.
*/
-
+	bool isSpecial() const { return id > Tokens::MaxTokenId; }
/**
* The getLocation function allows the tokens to be directly passed as
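
A minimal sketch of the new isSpecial() predicate (illustrative code, not part of this patch; token id 5 stands for a user-registered token):

	Token newline{Tokens::Newline, "\n", SourceLocation{}};
	Token user{5, "$$", SourceLocation{}};
	assert(newline.isSpecial());  // reserved ids lie above Tokens::MaxTokenId
	assert(!user.isSpecial());    // user ids stay at or below Tokens::MaxTokenId
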
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index a77951e..85d2c28 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename)
return std::string{};
}
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::startsWith(const std::string &s, const std::string &prefix)
{
return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 7d96562..82a8f8c 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -124,14 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
* Trims the given string or vector of chars by returning the start and end
* index.
*
@@ -153,8 +145,8 @@ public:
*
* @param s is the container that should be trimmed.
* @param len is the number of elements in the container.
- * @param f is a function that returns true for values that should be
- * removed.
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
*/
@@ -163,7 +155,7 @@ public:
{
size_t start = 0;
for (size_t i = 0; i < len; i++) {
- if (!f(s[i])) {
+ if (!f(i)) {
start = i;
break;
}
@@ -171,7 +163,7 @@ public:
size_t end = 0;
for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
+ if (!f(i)) {
end = i + 1;
break;
}
@@ -198,17 +190,33 @@ public:
* the collapsed version of the string ends.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
*/
- template <class T>
- static std::string trim(const T &s, size_t len, size_t &start, size_t &end)
+ template <class T, class Filter>
+ static std::string trim(const T &s, size_t len, size_t &start, size_t &end,
+ Filter f)
{
- auto res = trim(s, len, isWhitespace);
+ auto res = trim(s, len, f);
start = res.first;
end = res.second;
return std::string(&s[start], end - start);
}
/**
+ * Removes whitespace at the beginning and the end of the given string.
+ *
+ * @param s is the string that should be trimmed.
+ * @return a trimmed copy of s.
+ */
+ static std::string trim(const std::string &s)
+ {
+ std::pair<size_t, size_t> bounds =
+ trim(s, [&s](size_t i) { return isWhitespace(s[i]); });
+ return s.substr(bounds.first, bounds.second - bounds.first);
+ }
+
+ /**
* Collapses the whitespaces in the given string (trims the string and
* replaces all whitespace characters by a single one).
*
@@ -219,7 +227,8 @@ public:
{
size_t start;
size_t end;
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -236,7 +245,8 @@ public:
static std::string collapse(const std::string &s, size_t &start,
size_t &end)
{
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -244,6 +254,8 @@ public:
* replaces all whitespace characters by a single one).
*
* @tparam T is the string type that should be used.
+ * @tparam Filter is a filter function used for detecting the character
+ * indices that might be removed.
* @param s is the string in which the whitespace should be collapsed.
* @param len is the length of the input string
* @param start is an output parameter which is set to the offset at which
@@ -252,9 +264,9 @@ public:
* the collapsed version of the string ends.
* @return a copy of s with collapsed whitespace.
*/
- template <class T>
+ template <class T, class Filter>
static std::string collapse(const T &s, size_t len, size_t &start,
- size_t &end)
+ size_t &end, Filter f)
{
// Result vector
std::vector<char> res;
@@ -268,8 +280,7 @@ public:
bool hadWhitespace = false;
for (size_t i = 0; i < len; i++) {
const char c = s[i];
- const bool whitespace = isWhitespace(c);
- if (whitespace) {
+ if (f(i)) {
hadWhitespace = !res.empty();
} else {
// Adapt the start and end position
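
The point of the index-based Filter is that a predicate can now consult data parallel to the string, which the old character-based isWhitespace callback could not. A sketch under that assumption (hypothetical names; the "protect" mask mirrors what TokenizedData does below):

	std::string s = " a  b ";
	std::vector<bool> protect = {false, false, false, false, false, true};
	std::pair<size_t, size_t> b = Utils::trim(
	    s, [&](size_t i) { return Utils::isWhitespace(s[i]) && !protect[i]; });
	// b == {1, 6}: the trailing space is protected and survives, "a  b " remains
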
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index 0ec56af..aeefa26 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -48,6 +48,17 @@ struct TokenMark {
TokenLength len;
/**
+ * Specifies whether the token is special or not.
+ */
+ bool special;
+
+ /**
+ * Maximum token length.
+ */
+ static constexpr TokenLength MaxTokenLength =
+ std::numeric_limits<TokenLength>::max();
+
+ /**
* Constructor of the TokenMark structure, initializes all members with the
* given values.
*
@@ -55,9 +66,10 @@ struct TokenMark {
* @param bufStart is the start position of the TokenMark in the internal
* character buffer.
* @param len is the length of the token.
+	 * @param special modifies the sort order; special tokens are preferred.
*/
- TokenMark(TokenId id, size_t bufStart, TokenLength len)
- : bufStart(bufStart), id(id), len(len)
+ TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+ : bufStart(bufStart), id(id), len(len), special(special)
{
}
@@ -72,7 +84,8 @@ struct TokenMark {
TokenMark(size_t bufStart)
: bufStart(bufStart),
id(Tokens::Empty),
- len(std::numeric_limits<TokenLength>::max())
+ len(MaxTokenLength),
+ special(true)
{
}
@@ -86,8 +99,22 @@ struct TokenMark {
*/
friend bool operator<(const TokenMark &m1, const TokenMark &m2)
{
- return (m1.bufStart < m2.bufStart) ||
- (m1.bufStart == m2.bufStart && m1.len > m2.len);
+ // Prefer the mark with the smaller bufStart
+ if (m1.bufStart < m2.bufStart) {
+ return true;
+ }
+
+ // Special handling for marks with the same bufStart
+ if (m1.bufStart == m2.bufStart) {
+ // If exactly one of the two marks is special, return true if this
+ // one is special
+ if (m1.special != m2.special) {
+ return m1.special;
+ }
+ // Otherwise prefer longer marks
+ return m1.len > m2.len;
+ }
+ return false;
}
};
}
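
In short, the new comparator prefers special marks and then longer marks at the same buffer position (illustrative values; TokenMark is internal to this translation unit):

	TokenMark indent(Tokens::Indent, 10, 0, true);  // special, zero length
	TokenMark user(5, 10, 2, false);                // user token, length 2
	TokenMark user2(6, 10, 3, false);               // user token, length 3
	assert(indent < user);  // same bufStart: the special mark sorts first
	assert(user2 < user);   // same bufStart, both regular: the longer mark wins
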
@@ -110,6 +137,11 @@ private:
std::vector<char> buf;
/**
+	 * Buffer storing the "protected" flag of the character data.
+ */
+ std::vector<bool> protectedChars;
+
+ /**
* Vector storing all the character offsets efficiently.
*/
SourceOffsetVector offsets;
@@ -120,6 +152,26 @@ private:
mutable std::vector<TokenMark> marks;
/**
+ * Position of the first linebreak in a sequence of linebreaks.
+ */
+ size_t firstLinebreak;
+
+ /**
+ * Current indentation level.
+ */
+ uint16_t currentIndentation;
+
+ /**
+ * Last indentation level.
+ */
+ uint16_t lastIndentation;
+
+ /**
+ * Number of linebreaks without any content between them.
+ */
+ uint16_t numLinebreaks;
+
+ /**
* Flag indicating whether the internal "marks" vector is sorted.
*/
mutable bool sorted;
@@ -132,7 +184,7 @@ public:
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
- TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+ TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }
/**
* Appends a complete string to the internal character buffer and extends
@@ -140,25 +192,22 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+	 * affected by whitespace handling; they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart)
- { // Append the data to the internal buffer
- buf.insert(buf.end(), data.begin(), data.end());
-
- // Extend the text regions, interpolate the source position (this may
- // yield incorrect results)
- const size_t size = buf.size();
+ size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+ {
for (size_t i = 0; i < data.size(); i++) {
if (offsStart != InvalidSourceOffset) {
- offsets.storeOffset(offsStart + i, offsStart + i + 1);
+ append(data[i], offsStart + i, offsStart + i + 1, protect);
} else {
- offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset);
+ append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+ protect);
}
}
-
- return size;
+ return size();
}
/**
@@ -168,16 +217,86 @@ public:
* @param c is the character that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param offsEnd is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+	 * affected by whitespace handling; it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect)
{
// Add the character to the list and store the location of the character
// in the source file
buf.push_back(c);
+ protectedChars.push_back(protect);
offsets.storeOffset(offsStart, offsEnd);
- return buf.size();
+
+ // Insert special tokens
+ const size_t size = buf.size();
+ const bool isWhitespace = Utils::isWhitespace(c);
+ const bool isLinebreak = Utils::isLinebreak(c);
+
+ // Handle linebreaks
+ if (isLinebreak) {
+			// Mark the linebreak character as a Newline token
+ mark(Tokens::Newline, size - 1, 1, false);
+
+			// This linebreak starts a new linebreak sequence
+ if (numLinebreaks == 0) {
+ firstLinebreak = size - 1;
+ }
+
+ // Reset the indentation
+ currentIndentation = 0;
+
+ // Increment the number of linebreaks
+ numLinebreaks++;
+
+ const size_t markStart = firstLinebreak;
+ const size_t markLength = size - firstLinebreak;
+
+			// Issue two consecutive linebreaks as a Paragraph token
+ if (numLinebreaks == 2) {
+ mark(Tokens::Paragraph, markStart, markLength, false);
+ }
+
+			// Issue three or more consecutive linebreaks as a Section token
+ if (numLinebreaks >= 3) {
+ mark(Tokens::Section, markStart, markLength, false);
+ }
+ } else if (isWhitespace) {
+ // Count the whitespace characters at the beginning of the line
+ if (numLinebreaks > 0) {
+				// Implement the UNIX/Python rule for tabs: Tabs extend to the
+ // next multiple of eight.
+ if (c == '\t') {
+ currentIndentation = (currentIndentation + 8) & ~7;
+ } else {
+ currentIndentation++;
+ }
+ }
+ }
+
+		// Issue indent and dedent tokens
+ if (!isWhitespace && numLinebreaks > 0) {
+			// If this line is indented further than the previous one,
+			// issue an "Indent" token
+ if (currentIndentation > lastIndentation) {
+ mark(Tokens::Indent, size - 1, 0, true);
+ }
+
+			// If this line is indented less than the previous one,
+			// issue a "Dedent" token
+ if (currentIndentation < lastIndentation) {
+ mark(Tokens::Dedent, size - 1, 0, true);
+ }
+
+ // Reset the internal state machine
+ lastIndentation = currentIndentation;
+ numLinebreaks = 0;
+ }
+
+ return size;
}
/**
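
As a worked example of the state machine above (not part of the patch), appending "a\n\tb\nc" to an empty buffer issues the following marks:

	// 'a'  at 0 -> no mark; resets numLinebreaks to 0
	// '\n' at 1 -> Newline mark [1, 2); firstLinebreak = 1
	// '\t' at 2 -> indentation advances to the next multiple of eight:
	//              (0 + 8) & ~7 == 8
	// 'b'  at 3 -> 8 > lastIndentation (0) -> zero-length Indent mark at 3
	// '\n' at 4 -> Newline mark [4, 5)
	// 'c'  at 5 -> 0 < lastIndentation (8) -> zero-length Dedent mark at 5
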
@@ -187,11 +306,12 @@ public:
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
+	 * @param special tags the mark as "special", preferring it in the sort order
*/
- void mark(TokenId id, size_t bufStart, TokenLength len)
+ void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
{
// Push the new instance back onto the list
- marks.emplace_back(id, bufStart, len);
+ marks.emplace_back(id, bufStart, len, special);
// Update the sorted flag as soon as more than one element is in the
// list
@@ -215,9 +335,13 @@ public:
* @return true if a token was returned, false if no more tokens are
* available.
*/
- bool next(Token &token, WhitespaceMode mode,
- const std::unordered_set<TokenId> &tokens, size_t &cursor) const
+ bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+ TokenizedDataCursor &cursor) const
{
+ // Some variables for convenient access
+ size_t &bufPos = cursor.bufPos;
+ size_t &markPos = cursor.markPos;
+
// Sort the "marks" vector if it has not been sorted yet.
if (!sorted) {
std::sort(marks.begin(), marks.end());
@@ -226,8 +350,8 @@ public:
// Fetch the next larger TokenMark instance, make sure the token is in
// the "enabled" list and within the buffer range
- auto it =
- std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
+ auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+ TokenMark(bufPos));
while (it != marks.end() && (tokens.count(it->id) == 0 ||
it->bufStart + it->len > buf.size())) {
it++;
@@ -240,15 +364,15 @@ public:
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
- if (cursor < end && cursor < buf.size()) {
+ if (bufPos < end && bufPos < buf.size()) {
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
- Tokens::Data, std::string(&buf[cursor], end - cursor),
+ Tokens::Data, std::string(&buf[bufPos], end - bufPos),
SourceLocation(sourceId,
- offsets.loadOffset(cursor).first,
+ offsets.loadOffset(bufPos).first,
offsets.loadOffset(end).first));
- cursor = end;
+ bufPos = end;
return true;
}
case WhitespaceMode::TRIM:
@@ -258,30 +382,35 @@ public:
size_t stringStart;
size_t stringEnd;
std::string content;
+ const char *cBuf = &buf[bufPos];
+ auto filter = [cBuf, this](size_t i) -> bool {
+ return Utils::isWhitespace(cBuf[i]) &&
+ !protectedChars[i];
+ };
if (mode == WhitespaceMode::TRIM) {
- content = Utils::trim(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::trim(cBuf, end - bufPos, stringStart,
+ stringEnd, filter);
} else {
- content = Utils::collapse(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::collapse(
+ cBuf, end - bufPos, stringStart, stringEnd, filter);
}
// If the resulting string is empty (only whitespaces),
// abort
if (content.empty()) {
- cursor = end;
+ bufPos = end;
break;
}
// Calculate the absolute positions and return the token
- stringStart += cursor;
- stringEnd += cursor;
+ stringStart += bufPos;
+ stringEnd += bufPos;
token = Token(
Tokens::Data, content,
SourceLocation(sourceId,
offsets.loadOffset(stringStart).first,
offsets.loadOffset(stringEnd).first));
- cursor = end;
+ bufPos = end;
return true;
}
}
@@ -290,14 +419,18 @@ public:
// If start equals end, we're currently directly at a token
// instance. Return this token and advance the cursor to the end of
// the token.
- if (cursor == end && it != marks.end()) {
+ if (bufPos == end && it != marks.end()) {
const size_t tokenStart = it->bufStart;
const size_t tokenEnd = it->bufStart + it->len;
token = Token(
it->id, std::string(&buf[tokenStart], it->len),
SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
offsets.loadOffset(tokenEnd).first));
- cursor = tokenEnd;
+
+ // Update the cursor, consume the token by incrementing the marks
+ // pos counter
+ bufPos = tokenEnd;
+ markPos = it - marks.begin() + 1;
return true;
}
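
The reason the cursor carries a markPos in addition to a bufPos: zero-length marks such as Indent and Dedent do not advance bufPos when they are consumed, so a search that always starts at marks.begin() would keep finding the same mark. Starting at marks.begin() + markPos makes consuming such a mark an explicit step:

	// Sketch: after reading the zero-length Indent mark at bufPos == 4,
	// bufPos is still 4, but markPos points past the mark, so the next
	// lower_bound(marks.begin() + markPos, ...) cannot return it again.
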
@@ -314,8 +447,12 @@ public:
void clear()
{
buf.clear();
- marks.clear();
+ protectedChars.clear();
offsets.clear();
+ marks.clear();
+ currentIndentation = 0;
+ lastIndentation = 0;
+ numLinebreaks = 1; // Assume the stream starts with a linebreak
sorted = true;
}
@@ -367,39 +504,35 @@ public:
TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
TokenizedData::TokenizedData(SourceId sourceId)
- : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+ : impl(std::make_shared<TokenizedDataImpl>(sourceId))
{
}
TokenizedData::~TokenizedData() {}
-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+ bool protect)
{
- return impl->append(data, offsStart);
+ return impl->append(data, offsStart, protect);
}
size_t TokenizedData::append(char c, SourceOffset offsStart,
- SourceOffset offsEnd)
+ SourceOffset offsEnd, bool protect)
{
- return impl->append(c, offsStart, offsEnd);
+ return impl->append(c, offsStart, offsEnd, protect);
}
void TokenizedData::mark(TokenId id, TokenLength len)
{
- impl->mark(id, impl->size() - len, len);
+ impl->mark(id, impl->size() - len, len, false);
}
void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
- impl->mark(id, bufStart, len);
+ impl->mark(id, bufStart, len, false);
}
-void TokenizedData::clear()
-{
- impl->clear();
- tokens.clear();
- cursor = 0;
-}
+void TokenizedData::clear() { impl->clear(); }
void TokenizedData::trim(size_t length) { impl->trim(length); }
@@ -412,49 +545,42 @@ SourceLocation TokenizedData::getLocation() const
return impl->getLocation();
}
-TokenizedDataReader reader() const
+TokenizedDataReader TokenizedData::reader() const
{
- return TokenizedDataReader(impl, std::unordered_set<TokenId>{}, 0, 0);
+ return TokenizedDataReader(impl, TokenizedDataCursor(),
+ TokenizedDataCursor());
}
/* Class TokenizedDataReader */
+TokenizedDataReader::TokenizedDataReader(
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
TokenizedDataReaderFork TokenizedDataReader::fork()
{
- return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor);
+ return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
}
-bool TokenizedDataReader::atEnd() const { return readCursor >= size(); }
+bool TokenizedDataReader::atEnd() const
+{
+ return readCursor.bufPos >= impl->size();
+}
-bool TokenizedData::read(Token &token, const TokenSet &tokens,
- WhitespaceMode mode)
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
{
peekCursor = readCursor;
return impl->next(token, mode, tokens, readCursor);
}
-bool TokenizedData::peek(Token &token, const TokenSet &tokens,
- WhitespaceMode mode)
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
{
return impl->next(token, mode, tokens, peekCursor);
}
-
-Variant TokenizedData::text(WhitespaceMode mode)
-{
- // Copy the current cursor position to not update the actual cursor position
- // if the operation was not successful
- size_t cursorCopy = cursor;
- Token token;
- if (!impl->next(token, mode, tokens, cursorCopy) ||
- token.id != Tokens::Data) {
- return Variant{nullptr};
- }
-
- // There is indeed a text token, update the internal cursor position and
- // return the token as variant.
- cursor = cursorCopy;
- Variant res = Variant::fromString(token.content);
- res.setLocation(token.getLocation());
- return res;
-}
}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 85b80ae..b72ca02 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -36,7 +36,6 @@
#include <unordered_set>
#include <core/common/Location.hpp>
-#include <core/common/Variant.hpp>
#include <core/common/Whitespace.hpp>
#include <core/common/Token.hpp>
@@ -48,6 +47,28 @@ class TokenizedDataReader;
class TokenizedDataReaderFork;
/**
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
+ */
+struct TokenizedDataCursor {
+ /**
+ * Position within the byte buffer.
+ */
+ size_t bufPos;
+
+ /**
+ * Position within the token mark buffer.
+ */
+ size_t markPos;
+
+ /**
+ * Default constructor. The resulting cursor points at the beginning of the
+ * stream.
+ */
+ TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
* The TokenizedData class stores data extracted from a user defined document.
* The data stored in TokenizedData
*/
@@ -88,10 +109,13 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+	 * affected by whitespace handling; they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart = 0);
+ size_t append(const std::string &data, SourceOffset offsStart = 0,
+ bool protect = false);
/**
* Appends a single character to the internal character buffer.
@@ -99,10 +123,13 @@ public:
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+	 * affected by whitespace handling; it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect = false);
/**
* Stores a token ending at the last character of the current buffer.
@@ -187,15 +214,16 @@ private:
/**
* Position from which the last element was read from the internal buffer.
*/
- size_t readCursor;
+ TokenizedDataCursor readCursor;
/**
* Position from which the last element was peeked from the internal buffer.
*/
- size_t peekCursor;
+ TokenizedDataCursor peekCursor;
+protected:
/**
- * Private constructor of TokenizedDataReader, taking a reference to the
+ * Protected constructor of TokenizedDataReader, taking a reference to the
* internal TokenizedDataImpl structure storing the data that is accessed by
* the reader.
*
@@ -205,8 +233,9 @@ private:
* @param peekCursor is the cursor position from which tokens and text are
* peeked.
*/
- TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl,
- size_t readCursor, size_t peekCursor);
+ TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor);
public:
/**
@@ -237,7 +266,7 @@ public:
* false if there are no more tokens.
*/
bool read(Token &token, const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
* Stores the next token in the given token reference, returns true if the
@@ -253,7 +282,7 @@ public:
* false if there are no more tokens.
*/
bool peek(Token &token, const TokenSet &tokens = TokenSet{},
- WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
* Consumes the peeked tokens, the read cursor will now be at the position
@@ -265,20 +294,6 @@ public:
* Resets the peek cursor to the position of the read cursor.
*/
void resetPeek() { peekCursor = readCursor; }
-
- /**
- * Stores the next text token in the given token reference, returns true if
- * the operation was successful (there was indeed a text token), false if
- * the next token is not a text token or there were no more tokens.
- *
- * @param token is an output parameter into which the read token will be
- * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
- * @param mode is the whitespace mode that should be used when a text token
- * is returned.
- * @return a string variant with the data if there is any data or a nullptr
- * variant if there is no text.
- */
- Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
};
/**
@@ -309,8 +324,9 @@ private:
* peeked.
*/
TokenizedDataReaderFork(TokenizedDataReader &parent,
- std::shared_ptr<TokenizedDataImpl> impl,
- size_t readCursor, size_t peekCursor)
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
: TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
{
}
@@ -320,7 +336,7 @@ public:
* Commits the read/peek progress to the underlying parent.
*/
void commit() { parent = *this; }
-}
+};
}
#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
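
A minimal usage sketch of the reworked reader API (hypothetical driver code; token id 5 again stands for a user-registered token):

	#include <core/parser/utils/TokenizedData.hpp>

	using namespace ousia;

	void sketch()
	{
		TokenizedData data;
		data.append("$$ test");
		data.mark(5, 0, 2);               // token "$$" at the buffer start

		TokenizedDataReader reader = data.reader();
		Token token;
		reader.read(token, TokenSet{5});  // -> id 5, content "$$"

		// A fork advances independently of its parent until commit()
		// copies the fork's cursors back into the parent reader.
		TokenizedDataReaderFork fork = reader.fork();
		if (fork.read(token, TokenSet{5})) {
			fork.commit();
		}
	}
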
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 51787cd..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -156,6 +156,7 @@ public:
return res;
}
};
+
}
/* Class Tokenizer */
@@ -229,12 +230,6 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
} else {
// Record all incoming characters
data.append(c, charStart, charEnd);
-
- // Special token processing
- // TODO: Build a special state machine for this in another class
- if (c == '\n') {
- data.mark(Tokens::Newline, 1);
- }
}
// Swap the lookups and the nextLookups list
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 2ddb9c9..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -28,7 +28,7 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#include <set>
+#include <cstdint>
#include <string>
#include <vector>
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
index 6bd7234..dfe2526 100644
--- a/test/core/parser/utils/TokenizedDataTest.cpp
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -22,6 +22,43 @@
namespace ousia {
+void assertToken(TokenizedDataReader &reader, TokenId id,
+ const std::string &text, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId sourceId = InvalidSourceId)
+{
+ Token token;
+ ASSERT_TRUE(reader.read(token, tokens, mode));
+ EXPECT_EQ(id, token.id);
+ EXPECT_EQ(text, token.content);
+ if (start != InvalidSourceOffset) {
+ EXPECT_EQ(start, token.getLocation().getStart());
+ }
+ if (end != InvalidSourceOffset) {
+ EXPECT_EQ(end, token.getLocation().getEnd());
+ }
+ EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+void assertText(TokenizedDataReader &reader, const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId id = InvalidSourceId)
+{
+ assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id);
+}
+
+void assertEnd(TokenizedDataReader &reader)
+{
+ Token token;
+ ASSERT_TRUE(reader.atEnd());
+ ASSERT_FALSE(reader.read(token));
+}
+
TEST(TokenizedData, dataWhitespacePreserve)
{
TokenizedData data;
@@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test1 test2 ", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(16U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE,
+ 0, 16);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceTrim)
@@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceCollapse)
@@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, singleToken)
@@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, singleDisabledToken)
@@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualToken)
@@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(5);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0,
+ 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenShorterEnabled)
@@ -142,383 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(1U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenLongerEnabled)
{
TokenizedData data;
ASSERT_EQ(2U, data.append("$$"));
+ data.mark(6, 0, 1);
data.mark(5, 0, 2);
+ data.mark(6, 1, 1);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataPreserveWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+	ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+	assertText(reader, " test    text ", TokenSet{5}, WhitespaceMode::PRESERVE,
+ 2, 16);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataTrimWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+	ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+	assertText(reader, "test    text", TokenSet{5}, WhitespaceMode::TRIM, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataCollapseWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+	ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace)
{
TokenizedData data;
-	ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+	ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
-	EXPECT_EQ("      ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+	assertText(reader, "    ", TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace)
{
TokenizedData data;
-	ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+	ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace)
{
TokenizedData data;
-	ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+	ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8);
+ assertEnd(reader);
}
-TEST(TokenizedData, textPreserveWhitespace)
+TEST(TokenizedData, appendChars)
{
TokenizedData data;
-	ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
-
- data.enableToken(5);
-
- Variant text;
- text = data.text(WhitespaceMode::PRESERVE);
-	EXPECT_EQ("  ", text.asString());
- EXPECT_EQ(0U, text.getLocation().getStart());
- EXPECT_EQ(2U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
+ ASSERT_EQ(1U, data.append('t', 5, 7));
+ ASSERT_EQ(2U, data.append('e', 7, 8));
+ ASSERT_EQ(3U, data.append('s', 8, 10));
+ ASSERT_EQ(4U, data.append('t', 10, 12));
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- text = data.text(WhitespaceMode::PRESERVE);
-	EXPECT_EQ("  ", text.asString());
- EXPECT_EQ(4U, text.getLocation().getStart());
- EXPECT_EQ(6U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
-
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12);
+ assertEnd(reader);
}
-TEST(TokenizedData, textTrimWhitespace)
+TEST(TokenizedData, protectedWhitespace)
{
TokenizedData data;
-	ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
+ ASSERT_EQ(4U, data.append("test", 10));
+	ASSERT_EQ(11U, data.append("   test", 14, true));
- data.enableToken(5);
-
- Token token;
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM));
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM));
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+	assertText(reader, "test   test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10,
+ 21);
+ assertEnd(reader);
}
-TEST(TokenizedData, textCollapseWhitespace)
+TEST(TokenizedData, specialNewlineToken)
{
TokenizedData data;
-	ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
-
- data.enableToken(5);
+	data.append("a\nb\n   \nc\n");
+ // 0 12 3456 78 9
+
+ const TokenSet tokens{Tokens::Newline};
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 3, 4);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 7, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 9, 10);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE));
+TEST(TokenizedData, specialParagraphToken)
+{
+ TokenizedData data;
+	data.append("a\nb\n   \nc\n");
+ // 0 12 3456 78 9
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+ const TokenSet tokens{Tokens::Paragraph};
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+	assertToken(reader, Tokens::Paragraph, "\n   \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertEnd(reader);
}
-TEST(TokenizedData, appendChars)
+TEST(TokenizedData, specialSectionToken)
{
TokenizedData data;
- ASSERT_EQ(1U, data.append('t', 5, 7));
- ASSERT_EQ(2U, data.append('e', 7, 8));
- ASSERT_EQ(3U, data.append('s', 8, 10));
- ASSERT_EQ(4U, data.append('t', 10, 12));
+	data.append("a\nb\n   \n \t  \n");
+ // 0 12 3456 789 01 2
+ // 0 1
- Variant text = data.text(WhitespaceMode::COLLAPSE);
- ASSERT_EQ("test", text.asString());
- EXPECT_EQ(5U, text.getLocation().getStart());
- EXPECT_EQ(12U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
+ const TokenSet tokens{Tokens::Section};
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+	assertToken(reader, Tokens::Section, "\n   \n \t  \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 13);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+TEST(TokenizedData, specialTokenPrecedence)
+{
+ TokenizedData data;
+ data.append("a\nb\n\nc\n\n\nd");
+ // 0 12 3 45 6 7 89
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 5);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 6, 9);
+ assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10);
+ assertEnd(reader);
}
-TEST(TokenizedData, copy)
+TEST(TokenizedData, specialTokenPrecedence2)
{
TokenizedData data;
- ASSERT_EQ(7U, data.append(" a $ b "));
- // 0123456
- data.mark(6, 3, 1);
- data.enableToken(6);
+ data.append("\nb\n\nc\n\n\n");
+ // 0 12 3 45 6 7
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 0, 1);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 2, 4);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 5, 8);
+ assertEnd(reader);
+}
- Variant text;
- Token token;
+TEST(TokenizedData, specialTokenIndent)
+{
+ TokenizedData data;
+	data.append("    test\n\ttest2\n        test3  \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent};
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
+}
- text = data.text(WhitespaceMode::COLLAPSE);
- ASSERT_EQ("a", text.asString());
- EXPECT_EQ(1U, text.getLocation().getStart());
- EXPECT_EQ(2U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
-
- ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE));
-
- TokenizedData dataCopy = data;
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- text = data.text(WhitespaceMode::PRESERVE);
- ASSERT_EQ(" b ", text.asString());
- EXPECT_EQ(4U, text.getLocation().getStart());
- EXPECT_EQ(7U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
- ASSERT_FALSE(data.next(token));
-
- text = dataCopy.text(WhitespaceMode::COLLAPSE);
- ASSERT_EQ("b", text.asString());
- EXPECT_EQ(5U, text.getLocation().getStart());
- EXPECT_EQ(6U, text.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId());
- ASSERT_FALSE(data.next(token));
+TEST(TokenizedData, specialTokenIndentOverlap)
+{
+ TokenizedData data;
+	data.append("    test\n\ttest2\n        test3  \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5};
+
+ data.mark(5, 4, 4);
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
}
+
}