diff options
-rw-r--r-- | CMakeLists.txt | 108 | ||||
-rw-r--r-- | src/core/common/SourceContextReader.cpp | 5 | ||||
-rw-r--r-- | src/core/common/Token.hpp | 6 | ||||
-rw-r--r-- | src/core/common/Utils.cpp | 6 | ||||
-rw-r--r-- | src/core/common/Utils.hpp | 53 | ||||
-rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 286 | ||||
-rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 70 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 7 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 2 | ||||
-rw-r--r-- | test/core/parser/utils/TokenizedDataTest.cpp | 598 |
10 files changed, 591 insertions, 550 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 54f971c..225e63d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,14 +181,14 @@ ADD_LIBRARY(ousia_core src/core/parser/ParserContext src/core/parser/ParserScope src/core/parser/stack/Callbacks - src/core/parser/stack/DocumentHandler - src/core/parser/stack/DomainHandler - src/core/parser/stack/GenericParserStates - src/core/parser/stack/Handler - src/core/parser/stack/ImportIncludeHandler +# src/core/parser/stack/DocumentHandler +# src/core/parser/stack/DomainHandler +# src/core/parser/stack/GenericParserStates +# src/core/parser/stack/Handler +# src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State - src/core/parser/stack/Stack - src/core/parser/stack/TypesystemHandler +# src/core/parser/stack/Stack +# src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector src/core/parser/utils/TokenizedData src/core/parser/utils/Tokenizer @@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core # ousia_core #) -ADD_LIBRARY(ousia_osml - src/formats/osml/OsmlParser - src/formats/osml/OsmlStreamParser -) +#ADD_LIBRARY(ousia_osml +# src/formats/osml/OsmlParser +# src/formats/osml/OsmlStreamParser +#) -TARGET_LINK_LIBRARIES(ousia_osml - ousia_core -) +#TARGET_LINK_LIBRARIES(ousia_osml +# ousia_core +#) ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser - src/formats/osxml/OsxmlParser +# src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml # Command line interface -ADD_EXECUTABLE(ousia - src/cli/Main -) +#ADD_EXECUTABLE(ousia +# src/cli/Main +#) -TARGET_LINK_LIBRARIES(ousia - ousia_core - ousia_filesystem - ousia_html - ousia_xml - ousia_osml - ousia_osxml - ${Boost_LIBRARIES} -) +#TARGET_LINK_LIBRARIES(ousia +# ousia_core +# ousia_filesystem +# ousia_html +# ousia_xml +# ousia_osml +# ousia_osxml +# ${Boost_LIBRARIES} +#) # If testing is enabled, build the unit tests IF(TEST) @@ 
-323,11 +323,11 @@ IF(TEST) test/core/model/StyleTest test/core/model/TypesystemTest test/core/parser/ParserScopeTest - test/core/parser/stack/StackTest +# test/core/parser/stack/StackTest test/core/parser/stack/StateTest test/core/parser/utils/SourceOffsetVectorTest test/core/parser/utils/TokenizedDataTest - test/core/parser/utils/TokenizerTest +# test/core/parser/utils/TokenizerTest test/core/parser/utils/TokenTrieTest test/core/resource/ResourceLocatorTest test/core/resource/ResourceRequestTest @@ -383,29 +383,29 @@ IF(TEST) # ousia_mozjs # ) - ADD_EXECUTABLE(ousia_test_osml - test/formats/osml/OsmlParserTest - test/formats/osml/OsmlStreamParserTest - ) +# ADD_EXECUTABLE(ousia_test_osml +# test/formats/osml/OsmlParserTest +# test/formats/osml/OsmlStreamParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osml - ${GTEST_LIBRARIES} - ousia_core - ousia_osml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osml +# ousia_filesystem +# ) - ADD_EXECUTABLE(ousia_test_osxml - test/formats/osxml/OsxmlEventParserTest - test/formats/osxml/OsxmlParserTest - ) +# ADD_EXECUTABLE(ousia_test_osxml +# test/formats/osxml/OsxmlEventParserTest +# test/formats/osxml/OsxmlParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osxml - ${GTEST_LIBRARIES} - ousia_core - ousia_osxml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osxml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osxml +# ousia_filesystem +# ) ADD_EXECUTABLE(ousia_test_xml test/plugins/xml/XmlOutputTest @@ -423,8 +423,8 @@ IF(TEST) ADD_TEST(ousia_test_filesystem ousia_test_filesystem) ADD_TEST(ousia_test_html ousia_test_html) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) - ADD_TEST(ousia_test_osml ousia_test_osml) - ADD_TEST(ousia_test_osxml ousia_test_osxml) +# ADD_TEST(ousia_test_osml ousia_test_osml) +# ADD_TEST(ousia_test_osxml ousia_test_osxml) ADD_TEST(ousia_test_xml ousia_test_xml) ENDIF() @@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION 
share/ousia OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE ) -INSTALL(TARGETS ousia - RUNTIME DESTINATION bin -) +#INSTALL(TARGETS ousia +# RUNTIME DESTINATION bin +#) IF(INSTALL_GEDIT_HIGHLIGHTER) INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp index d5d379c..f7dbdf3 100644 --- a/src/core/common/SourceContextReader.cpp +++ b/src/core/common/SourceContextReader.cpp @@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader, ctx.relLen = end - start; // end >= start (I2) // Remove linebreaks at the beginning and the end - const std::pair<size_t, size_t> b = - Utils::trim(lineBuf, Utils::isLinebreak); + const std::pair<size_t, size_t> b = Utils::trim( + lineBuf, + [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); }); ssize_t s = b.first, e = b.second; s = std::min(s, static_cast<ssize_t>(ctx.relPos)); diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index 07d7c8f..0cf56b0 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -91,10 +91,10 @@ constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; /** - * Token which represents an unindentation -- issued if the indentation of + * Token which represents a dedentation -- issued if the indentation of * this line is smaller than the indentation of the previous line. */ -constexpr TokenId Unindent = std::numeric_limits<TokenId>::max() - 6; +constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6; /** * Maximum token id to be used. Tokens allocated for users should not surpass @@ -165,7 +165,7 @@ struct Token { * @return true if the TokenId indicates that this token is a "special" * token. 
*/ - + bool isSpecial() const {return id > Tokens::MaxTokenId;} /** * The getLocation function allows the tokens to be directly passed as diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index a77951e..85d2c28 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename) return std::string{}; } -std::string Utils::trim(const std::string &s) -{ - std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::startsWith(const std::string &s, const std::string &prefix) { return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 7d96562..82a8f8c 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -124,14 +124,6 @@ public: static bool hasNonWhitepaceChar(const std::string &s); /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** * Trims the given string or vector of chars by returning the start and end * index. * @@ -153,8 +145,8 @@ public: * * @param s is the container that should be trimmed. * @param len is the number of elements in the container. - * @param f is a function that returns true for values that should be - * removed. + * @param f is a function that returns true for values at a certain index + * that should be removed. * @return start and end index. 
Note that "end" points at the character * beyond the end, thus "end" minus "start" */ @@ -163,7 +155,7 @@ public: { size_t start = 0; for (size_t i = 0; i < len; i++) { - if (!f(s[i])) { + if (!f(i)) { start = i; break; } @@ -171,7 +163,7 @@ public: size_t end = 0; for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) { - if (!f(s[i])) { + if (!f(i)) { end = i + 1; break; } @@ -198,17 +190,33 @@ public: * the collapsed version of the string ends. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" + * @param f is a function that returns true for values at a certain index + * that should be removed. */ - template <class T> - static std::string trim(const T &s, size_t len, size_t &start, size_t &end) + template <class T, class Filter> + static std::string trim(const T &s, size_t len, size_t &start, size_t &end, + Filter f) { - auto res = trim(s, len, isWhitespace); + auto res = trim(s, len, f); start = res.first; end = res.second; return std::string(&s[start], end - start); } /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s) + { + std::pair<size_t, size_t> bounds = + trim(s, [&s](size_t i) { return isWhitespace(s[i]); }); + return s.substr(bounds.first, bounds.second - bounds.first); + } + + /** * Collapses the whitespaces in the given string (trims the string and * replaces all whitespace characters by a single one). 
* @@ -219,7 +227,8 @@ public: { size_t start; size_t end; - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -236,7 +245,8 @@ public: static std::string collapse(const std::string &s, size_t &start, size_t &end) { - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -244,6 +254,8 @@ public: * replaces all whitespace characters by a single one). * * @tparam T is the string type that should be used. + * @tparam Filter is a filter function used for detecting the character + * indices that might be removed. * @param s is the string in which the whitespace should be collapsed. * @param len is the length of the input string * @param start is an output parameter which is set to the offset at which @@ -252,9 +264,9 @@ public: * the collapsed version of the string ends. * @return a copy of s with collapsed whitespace. */ - template <class T> + template <class T, class Filter> static std::string collapse(const T &s, size_t len, size_t &start, - size_t &end) + size_t &end, Filter f) { // Result vector std::vector<char> res; @@ -268,8 +280,7 @@ public: bool hadWhitespace = false; for (size_t i = 0; i < len; i++) { const char c = s[i]; - const bool whitespace = isWhitespace(c); - if (whitespace) { + if (f(i)) { hadWhitespace = !res.empty(); } else { // Adapt the start and end position diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index 0ec56af..aeefa26 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -48,6 +48,17 @@ struct TokenMark { TokenLength len; /** + * Specifies whether the token is special or not. + */ + bool special; + + /** + * Maximum token length. 
+ */ + static constexpr TokenLength MaxTokenLength = + std::numeric_limits<TokenLength>::max(); + + /** * Constructor of the TokenMark structure, initializes all members with the * given values. * @@ -55,9 +66,10 @@ struct TokenMark { * @param bufStart is the start position of the TokenMark in the internal * character buffer. * @param len is the length of the token. + * @param special modifies the sort order, special tokens are preferred. */ - TokenMark(TokenId id, size_t bufStart, TokenLength len) - : bufStart(bufStart), id(id), len(len) + TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) + : bufStart(bufStart), id(id), len(len), special(special) { } @@ -72,7 +84,8 @@ struct TokenMark { TokenMark(size_t bufStart) : bufStart(bufStart), id(Tokens::Empty), - len(std::numeric_limits<TokenLength>::max()) + len(MaxTokenLength), + special(true) { } @@ -86,8 +99,22 @@ struct TokenMark { */ friend bool operator<(const TokenMark &m1, const TokenMark &m2) { - return (m1.bufStart < m2.bufStart) || - (m1.bufStart == m2.bufStart && m1.len > m2.len); + // Prefer the mark with the smaller bufStart + if (m1.bufStart < m2.bufStart) { + return true; + } + + // Special handling for marks with the same bufStart + if (m1.bufStart == m2.bufStart) { + // If exactly one of the two marks is special, return true if this + // one is special + if (m1.special != m2.special) { + return m1.special; + } + // Otherwise prefer longer marks + return m1.len > m2.len; + } + return false; } }; } @@ -110,6 +137,11 @@ private: std::vector<char> buf; /** + * Bitset storing the "protected" flag of the character data. + */ + std::vector<bool> protectedChars; + + /** * Vector storing all the character offsets efficiently. */ SourceOffsetVector offsets; @@ -120,6 +152,26 @@ private: mutable std::vector<TokenMark> marks; /** + * Position of the first linebreak in a sequence of linebreaks. + */ + size_t firstLinebreak; + + /** + * Current indentation level. 
+ */ + uint16_t currentIndentation; + + /** + * Last indentation level. + */ + uint16_t lastIndentation; + + /** + * Number of linebreaks without any content between them. + */ + uint16_t numLinebreaks; + + /** * Flag indicating whether the internal "marks" vector is sorted. */ mutable bool sorted; @@ -132,7 +184,7 @@ public: * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ - TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); } /** * Appends a complete string to the internal character buffer and extends @@ -140,25 +192,22 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart) - { // Append the data to the internal buffer - buf.insert(buf.end(), data.begin(), data.end()); - - // Extend the text regions, interpolate the source position (this may - // yield incorrect results) - const size_t size = buf.size(); + size_t append(const std::string &data, SourceOffset offsStart, bool protect) + { for (size_t i = 0; i < data.size(); i++) { if (offsStart != InvalidSourceOffset) { - offsets.storeOffset(offsStart + i, offsStart + i + 1); + append(data[i], offsStart + i, offsStart + i + 1, protect); } else { - offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); + append(data[i], InvalidSourceOffset, InvalidSourceOffset, + protect); } } - - return size; + return size(); } /** @@ -168,16 +217,86 @@ public: * @param c is the character that should be appended to the buffer. 
* @param offsStart is the start offset in bytes in the input file. * @param offsEnd is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect) { // Add the character to the list and store the location of the character // in the source file buf.push_back(c); + protectedChars.push_back(protect); offsets.storeOffset(offsStart, offsEnd); - return buf.size(); + + // Insert special tokens + const size_t size = buf.size(); + const bool isWhitespace = Utils::isWhitespace(c); + const bool isLinebreak = Utils::isLinebreak(c); + + // Handle linebreaks + if (isLinebreak) { + // Mark linebreaks as linebreak + mark(Tokens::Newline, size - 1, 1, false); + + // The linebreak sequence started at the previous character + if (numLinebreaks == 0) { + firstLinebreak = size - 1; + } + + // Reset the indentation + currentIndentation = 0; + + // Increment the number of linebreaks + numLinebreaks++; + + const size_t markStart = firstLinebreak; + const size_t markLength = size - firstLinebreak; + + // Issue two consecutive linebreaks as paragraph token + if (numLinebreaks == 2) { + mark(Tokens::Paragraph, markStart, markLength, false); + } + + // Issue three or more consecutive linebreaks as section token + if (numLinebreaks >= 3) { + mark(Tokens::Section, markStart, markLength, false); + } + } else if (isWhitespace) { + // Count the whitespace characters at the beginning of the line + if (numLinebreaks > 0) { + // Implement the UNIX/Python rule for tabs: Tabs extend to the + // next multiple of eight. 
+ if (c == '\t') { + currentIndentation = (currentIndentation + 8) & ~7; + } else { + currentIndentation++; + } + } + } + + // Issue indent and dedent tokens + if (!isWhitespace && numLinebreaks > 0) { + // Issue a larger indentation than that in the previous line as + // "Indent" token + if (currentIndentation > lastIndentation) { + mark(Tokens::Indent, size - 1, 0, true); + } + + // Issue a smaller indentation than that in the previous line as + // "Dedent" token + if (currentIndentation < lastIndentation) { + mark(Tokens::Dedent, size - 1, 0, true); + } + + // Reset the internal state machine + lastIndentation = currentIndentation; + numLinebreaks = 0; + } + + return size; } /** @@ -187,11 +306,12 @@ public: * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. + * @param special tags the mark as "special", preferring it in the sort order */ - void mark(TokenId id, size_t bufStart, TokenLength len) + void mark(TokenId id, size_t bufStart, TokenLength len, bool special) { // Push the new instance back onto the list - marks.emplace_back(id, bufStart, len); + marks.emplace_back(id, bufStart, len, special); // Update the sorted flag as soon as more than one element is in the // list @@ -215,9 +335,13 @@ public: * @return true if a token was returned, false if no more tokens are * available. */ - bool next(Token &token, WhitespaceMode mode, - const std::unordered_set<TokenId> &tokens, size_t &cursor) const + bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, + TokenizedDataCursor &cursor) const { + // Some variables for convenient access + size_t &bufPos = cursor.bufPos; + size_t &markPos = cursor.markPos; + // Sort the "marks" vector if it has not been sorted yet. 
if (!sorted) { std::sort(marks.begin(), marks.end()); @@ -226,8 +350,8 @@ public: // Fetch the next larger TokenMark instance, make sure the token is in // the "enabled" list and within the buffer range - auto it = - std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); + auto it = std::lower_bound(marks.begin() + markPos, marks.end(), + TokenMark(bufPos)); while (it != marks.end() && (tokens.count(it->id) == 0 || it->bufStart + it->len > buf.size())) { it++; @@ -240,15 +364,15 @@ public: // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data. - if (cursor < end && cursor < buf.size()) { + if (bufPos < end && bufPos < buf.size()) { switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( - Tokens::Data, std::string(&buf[cursor], end - cursor), + Tokens::Data, std::string(&buf[bufPos], end - bufPos), SourceLocation(sourceId, - offsets.loadOffset(cursor).first, + offsets.loadOffset(bufPos).first, offsets.loadOffset(end).first)); - cursor = end; + bufPos = end; return true; } case WhitespaceMode::TRIM: @@ -258,30 +382,35 @@ public: size_t stringStart; size_t stringEnd; std::string content; + const char *cBuf = &buf[bufPos]; + auto filter = [cBuf, this](size_t i) -> bool { + return Utils::isWhitespace(cBuf[i]) && + !protectedChars[i]; + }; if (mode == WhitespaceMode::TRIM) { - content = Utils::trim(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::trim(cBuf, end - bufPos, stringStart, + stringEnd, filter); } else { - content = Utils::collapse(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::collapse( + cBuf, end - bufPos, stringStart, stringEnd, filter); } // If the resulting string is empty (only whitespaces), // abort if (content.empty()) { - cursor = end; + bufPos = end; break; } // Calculate the absolute positions and return the token - stringStart += cursor; - stringEnd += cursor; + 
stringStart += bufPos; + stringEnd += bufPos; token = Token( Tokens::Data, content, SourceLocation(sourceId, offsets.loadOffset(stringStart).first, offsets.loadOffset(stringEnd).first)); - cursor = end; + bufPos = end; return true; } } @@ -290,14 +419,18 @@ public: // If start equals end, we're currently directly at a token // instance. Return this token and advance the cursor to the end of // the token. - if (cursor == end && it != marks.end()) { + if (bufPos == end && it != marks.end()) { const size_t tokenStart = it->bufStart; const size_t tokenEnd = it->bufStart + it->len; token = Token( it->id, std::string(&buf[tokenStart], it->len), SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, offsets.loadOffset(tokenEnd).first)); - cursor = tokenEnd; + + // Update the cursor, consume the token by incrementing the marks + // pos counter + bufPos = tokenEnd; + markPos = it - marks.begin() + 1; return true; } @@ -314,8 +447,12 @@ public: void clear() { buf.clear(); - marks.clear(); + protectedChars.clear(); offsets.clear(); + marks.clear(); + currentIndentation = 0; + lastIndentation = 0; + numLinebreaks = 1; // Assume the stream starts with a linebreak sorted = true; } @@ -367,39 +504,35 @@ public: TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} TokenizedData::TokenizedData(SourceId sourceId) - : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) + : impl(std::make_shared<TokenizedDataImpl>(sourceId)) { } TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, + bool protect) { - return impl->append(data, offsStart); + return impl->append(data, offsStart, protect); } size_t TokenizedData::append(char c, SourceOffset offsStart, - SourceOffset offsEnd) + SourceOffset offsEnd, bool protect) { - return impl->append(c, offsStart, offsEnd); + return impl->append(c, offsStart, offsEnd, protect); } 
void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->size() - len, len); + impl->mark(id, impl->size() - len, len, false); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) { - impl->mark(id, bufStart, len); + impl->mark(id, bufStart, len, false); } -void TokenizedData::clear() -{ - impl->clear(); - tokens.clear(); - cursor = 0; -} +void TokenizedData::clear() { impl->clear(); } void TokenizedData::trim(size_t length) { impl->trim(length); } @@ -412,49 +545,42 @@ SourceLocation TokenizedData::getLocation() const return impl->getLocation(); } -TokenizedDataReader reader() const +TokenizedDataReader TokenizedData::reader() const { - return TokenizedDataReader(impl, std::unordered_set<TokenId>{}, 0, 0); + return TokenizedDataReader(impl, TokenizedDataCursor(), + TokenizedDataCursor()); } /* Class TokenizedDataReader */ +TokenizedDataReader::TokenizedDataReader( + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + TokenizedDataReaderFork TokenizedDataReader::fork() { - return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor); + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); } -bool TokenizedDataReader::atEnd() const { return readCursor >= size(); } +bool TokenizedDataReader::atEnd() const +{ + return readCursor.bufPos >= impl->size(); +} -bool TokenizedData::read(Token &token, const TokenSet &tokens, - WhitespaceMode mode) +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) { peekCursor = readCursor; return impl->next(token, mode, tokens, readCursor); } -bool TokenizedData::peek(Token &token, const TokenSet &tokens, - WhitespaceMode mode) +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) { return impl->next(token, mode, tokens, peekCursor); } - 
-Variant TokenizedData::text(WhitespaceMode mode) -{ - // Copy the current cursor position to not update the actual cursor position - // if the operation was not successful - size_t cursorCopy = cursor; - Token token; - if (!impl->next(token, mode, tokens, cursorCopy) || - token.id != Tokens::Data) { - return Variant{nullptr}; - } - - // There is indeed a text token, update the internal cursor position and - // return the token as variant. - cursor = cursorCopy; - Variant res = Variant::fromString(token.content); - res.setLocation(token.getLocation()); - return res; -} } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 85b80ae..b72ca02 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,7 +36,6 @@ #include <unordered_set> #include <core/common/Location.hpp> -#include <core/common/Variant.hpp> #include <core/common/Whitespace.hpp> #include <core/common/Token.hpp> @@ -48,6 +47,28 @@ class TokenizedDataReader; class TokenizedDataReaderFork; /** + * Internally used structure representing a cursor within the TokenizedData + * stream. + */ +struct TokenizedDataCursor { + /** + * Position within the byte buffer. + */ + size_t bufPos; + + /** + * Position within the token mark buffer. + */ + size_t markPos; + + /** + * Default constructor. The resulting cursor points at the beginning of the + * stream. + */ + TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** * The TokenizedData class stores data extracted from a user defined document. * The data stored in TokenizedData */ @@ -88,10 +109,13 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. 
The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart = 0); + size_t append(const std::string &data, SourceOffset offsStart = 0, + bool protect = false); /** * Appends a single character to the internal character buffer. @@ -99,10 +123,13 @@ public: * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect = false); /** * Stores a token ending at the last character of the current buffer. @@ -187,15 +214,16 @@ private: /** * Position from which the last element was read from the internal buffer. */ - size_t readCursor; + TokenizedDataCursor readCursor; /** * Position from which the last element was peeked from the internal buffer. */ - size_t peekCursor; + TokenizedDataCursor peekCursor; +protected: /** - * Private constructor of TokenizedDataReader, taking a reference to the + * Protected constructor of TokenizedDataReader, taking a reference to the * internal TokenizedDataImpl structure storing the data that is accessed by * the reader. * @@ -205,8 +233,9 @@ private: * @param peekCursor is the cursor position from which tokens and text are * peeked. 
*/ - TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl, - size_t readCursor, size_t peekCursor); + TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor); public: /** @@ -237,7 +266,7 @@ public: * false if there are no more tokens. */ bool read(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::COLLAPSE); + WhitespaceMode mode = WhitespaceMode::TRIM); /** * Stores the next token in the given token reference, returns true if the @@ -253,7 +282,7 @@ public: * false if there are no more tokens. */ bool peek(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::COLLAPSE); + WhitespaceMode mode = WhitespaceMode::TRIM); /** * Consumes the peeked tokens, the read cursor will now be at the position @@ -265,20 +294,6 @@ public: * Resets the peek cursor to the position of the read cursor. */ void resetPeek() { peekCursor = readCursor; } - - /** - * Stores the next text token in the given token reference, returns true if - * the operation was successful (there was indeed a text token), false if - * the next token is not a text token or there were no more tokens. - * - * @param token is an output parameter into which the read token will be - * stored. The TokenId is set to Tokens::Empty if there are no more tokens. - * @param mode is the whitespace mode that should be used when a text token - * is returned. - * @return a string variant with the data if there is any data or a nullptr - * variant if there is no text. - */ - Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE); }; /** @@ -309,8 +324,9 @@ private: * peeked. 
*/ TokenizedDataReaderFork(TokenizedDataReader &parent, - std::shared_ptr<TokenizedDataImpl> impl, - size_t readCursor, size_t peekCursor) + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) { } @@ -320,7 +336,7 @@ public: * Commits the read/peek progress to the underlying parent. */ void commit() { parent = *this; } -} +}; } #endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 51787cd..e78b0f4 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -156,6 +156,7 @@ public: return res; } }; + } /* Class Tokenizer */ @@ -229,12 +230,6 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) } else { // Record all incomming characters data.append(c, charStart, charEnd); - - // Special token processing - // TODO: Build a special state machine for this in another class - if (c == '\n') { - data.mark(Tokens::Newline, 1); - } } // Swap the lookups and the nextLookups list diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index 2ddb9c9..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -28,7 +28,7 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include <set> +#include <cstdint> #include <string> #include <vector> diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index 6bd7234..dfe2526 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -22,6 +22,43 @@ namespace ousia { +void assertToken(TokenizedDataReader &reader, TokenId id, + const std::string &text, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = 
InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId) +{ + Token token; + ASSERT_TRUE(reader.read(token, tokens, mode)); + EXPECT_EQ(id, token.id); + EXPECT_EQ(text, token.content); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, token.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, token.getLocation().getEnd()); + } + EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +void assertText(TokenizedDataReader &reader, const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId id = InvalidSourceId) +{ + assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); +} + +void assertEnd(TokenizedDataReader &reader) +{ + Token token; + ASSERT_TRUE(reader.atEnd()); + ASSERT_FALSE(reader.read(token)); +} + TEST(TokenizedData, dataWhitespacePreserve) { TokenizedData data; @@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test1 test2 ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(16U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE, + 0, 16); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceTrim) @@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - 
EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceCollapse) @@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, singleToken) @@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken) ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, singleDisabledToken) @@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken) ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, 
token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, dualToken) @@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(5); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0, + 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenShorterEnabled) @@ -142,383 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(1U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, 6, "$", 
TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenLongerEnabled) { TokenizedData data; ASSERT_EQ(2U, data.append("$$")); + data.mark(6, 0, 1); data.mark(5, 0, 2); + data.mark(6, 1, 1); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataPreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader 
reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, " test text ", TokenSet{5}, WhitespaceMode::PRESERVE, + 2, 16); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::TRIM, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 
2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, 
token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, " ", TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - 
data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8); + assertEnd(reader); } -TEST(TokenizedData, textPreserveWhitespace) +TEST(TokenizedData, appendChars) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); - - data.enableToken(5); - - Variant text; - text = data.text(WhitespaceMode::PRESERVE); - EXPECT_EQ(" ", text.asString()); - EXPECT_EQ(0U, text.getLocation().getStart()); - EXPECT_EQ(2U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + ASSERT_EQ(1U, data.append('t', 5, 7)); + ASSERT_EQ(2U, data.append('e', 7, 8)); + ASSERT_EQ(3U, data.append('s', 8, 10)); + ASSERT_EQ(4U, data.append('t', 10, 12)); - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - text = data.text(WhitespaceMode::PRESERVE); - EXPECT_EQ(" ", text.asString()); - EXPECT_EQ(4U, text.getLocation().getStart()); - EXPECT_EQ(6U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); - - 
ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12); + assertEnd(reader); } -TEST(TokenizedData, textTrimWhitespace) +TEST(TokenizedData, protectedWhitespace) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); + ASSERT_EQ(4U, data.append("test", 10)); + ASSERT_EQ(11U, data.append(" test", 14, true)); - data.enableToken(5); - - Token token; - ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM)); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM)); - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10, + 21); + assertEnd(reader); } -TEST(TokenizedData, textCollapseWhitespace) +TEST(TokenizedData, specialNewlineToken) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); - - data.enableToken(5); + data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 + + const TokenSet tokens{Tokens::Newline}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 3, 4); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 7, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + 
assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 9, 10); + assertEnd(reader); +} - Token token; - ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); +TEST(TokenizedData, specialParagraphToken) +{ + TokenizedData data; + data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + const TokenSet tokens{Tokens::Paragraph}; - ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Paragraph, "\n \n", tokens, + WhitespaceMode::COLLAPSE, 3, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + assertEnd(reader); } -TEST(TokenizedData, appendChars) +TEST(TokenizedData, specialSectionToken) { TokenizedData data; - ASSERT_EQ(1U, data.append('t', 5, 7)); - ASSERT_EQ(2U, data.append('e', 7, 8)); - ASSERT_EQ(3U, data.append('s', 8, 10)); - ASSERT_EQ(4U, data.append('t', 10, 12)); + data.append("a\nb\n \n \t \n"); + // 0 12 3456 789 01 2 + // 0 1 - Variant text = data.text(WhitespaceMode::COLLAPSE); - ASSERT_EQ("test", text.asString()); - EXPECT_EQ(5U, text.getLocation().getStart()); - EXPECT_EQ(12U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + const TokenSet tokens{Tokens::Section}; - ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Section, "\n \n \t \n", tokens, + WhitespaceMode::COLLAPSE, 3, 13); + assertEnd(reader); +} - Token token; - 
ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); +TEST(TokenizedData, specialTokenPrecedence) +{ + TokenizedData data; + data.append("a\nb\n\nc\n\n\nd"); + // 0 12 3 45 6 7 89 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 3, 5); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 6, 9); + assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10); + assertEnd(reader); } -TEST(TokenizedData, copy) +TEST(TokenizedData, specialTokenPrecedence2) { TokenizedData data; - ASSERT_EQ(7U, data.append(" a $ b ")); - // 0123456 - data.mark(6, 3, 1); - data.enableToken(6); + data.append("\nb\n\nc\n\n\n"); + // 0 12 3 45 6 7 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 0, 1); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 2, 4); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 5, 8); + assertEnd(reader); +} - Variant text; - Token token; +TEST(TokenizedData, specialTokenIndent) +{ + TokenizedData data; + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent}; + + TokenizedDataReader reader = 
data.reader(); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); + assertEnd(reader); +} - text = data.text(WhitespaceMode::COLLAPSE); - ASSERT_EQ("a", text.asString()); - EXPECT_EQ(1U, text.getLocation().getStart()); - EXPECT_EQ(2U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); - - ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); - - TokenizedData dataCopy = data; - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - text = data.text(WhitespaceMode::PRESERVE); - ASSERT_EQ(" b ", text.asString()); - EXPECT_EQ(4U, text.getLocation().getStart()); - EXPECT_EQ(7U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); - ASSERT_FALSE(data.next(token)); - - text = dataCopy.text(WhitespaceMode::COLLAPSE); - ASSERT_EQ("b", text.asString()); - EXPECT_EQ(5U, text.getLocation().getStart()); - EXPECT_EQ(6U, text.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); - ASSERT_FALSE(data.next(token)); 
+TEST(TokenizedData, specialTokenIndentOverlap) +{ + TokenizedData data; + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5}; + + data.mark(5, 4, 4); + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); + assertEnd(reader); } + } |