author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-25 23:09:26 +0100
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-25 23:09:26 +0100
commit | 84c9abc3e9762c4486ddc5ca0352a5d697a51987 (patch)
tree | b95db6ab2c2c6c2fba430218411a4ddf1d31b19f
parent | 8891dea26a1653a003b4171155e155d3aa6689ae (diff)
start of branch, commit log will be rewritten
29 files changed, 1501 insertions, 1577 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index ea5c3aa..225e63d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ ADD_LIBRARY(ousia_core src/core/common/Rtti src/core/common/RttiBuilder src/core/common/SourceContextReader + src/core/common/Token src/core/common/Utils src/core/common/Variant src/core/common/VariantConverter @@ -180,16 +181,15 @@ ADD_LIBRARY(ousia_core src/core/parser/ParserContext src/core/parser/ParserScope src/core/parser/stack/Callbacks - src/core/parser/stack/DocumentHandler - src/core/parser/stack/DomainHandler - src/core/parser/stack/GenericParserStates - src/core/parser/stack/Handler - src/core/parser/stack/ImportIncludeHandler +# src/core/parser/stack/DocumentHandler +# src/core/parser/stack/DomainHandler +# src/core/parser/stack/GenericParserStates +# src/core/parser/stack/Handler +# src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State - src/core/parser/stack/Stack - src/core/parser/stack/TypesystemHandler +# src/core/parser/stack/Stack +# src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector - src/core/parser/utils/Token src/core/parser/utils/TokenizedData src/core/parser/utils/Tokenizer src/core/parser/utils/TokenTrie @@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core # ousia_core #) -ADD_LIBRARY(ousia_osml - src/formats/osml/OsmlParser - src/formats/osml/OsmlStreamParser -) +#ADD_LIBRARY(ousia_osml +# src/formats/osml/OsmlParser +# src/formats/osml/OsmlStreamParser +#) -TARGET_LINK_LIBRARIES(ousia_osml - ousia_core -) +#TARGET_LINK_LIBRARIES(ousia_osml +# ousia_core +#) ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser - src/formats/osxml/OsxmlParser +# src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml # Command line interface -ADD_EXECUTABLE(ousia - src/cli/Main -) +#ADD_EXECUTABLE(ousia +# src/cli/Main +#) -TARGET_LINK_LIBRARIES(ousia - ousia_core - ousia_filesystem - ousia_html - ousia_xml - ousia_osml - ousia_osxml - ${Boost_LIBRARIES} -) +#TARGET_LINK_LIBRARIES(ousia +# ousia_core +# ousia_filesystem +# ousia_html +# ousia_xml +# ousia_osml +# ousia_osxml +# ${Boost_LIBRARIES} +#) # If testing is enabled, build the unit tests IF(TEST) @@ -323,11 +323,11 @@ IF(TEST) test/core/model/StyleTest test/core/model/TypesystemTest test/core/parser/ParserScopeTest - test/core/parser/stack/StackTest +# test/core/parser/stack/StackTest test/core/parser/stack/StateTest test/core/parser/utils/SourceOffsetVectorTest test/core/parser/utils/TokenizedDataTest - test/core/parser/utils/TokenizerTest +# test/core/parser/utils/TokenizerTest test/core/parser/utils/TokenTrieTest test/core/resource/ResourceLocatorTest test/core/resource/ResourceRequestTest @@ -383,29 +383,29 @@ IF(TEST) # ousia_mozjs # ) - ADD_EXECUTABLE(ousia_test_osml - test/formats/osml/OsmlParserTest - test/formats/osml/OsmlStreamParserTest - ) +# ADD_EXECUTABLE(ousia_test_osml +# test/formats/osml/OsmlParserTest +# test/formats/osml/OsmlStreamParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osml - ${GTEST_LIBRARIES} - ousia_core - ousia_osml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osml +# ousia_filesystem +# ) - ADD_EXECUTABLE(ousia_test_osxml - test/formats/osxml/OsxmlEventParserTest - test/formats/osxml/OsxmlParserTest - ) +# ADD_EXECUTABLE(ousia_test_osxml +# test/formats/osxml/OsxmlEventParserTest +# test/formats/osxml/OsxmlParserTest +# ) - 
TARGET_LINK_LIBRARIES(ousia_test_osxml - ${GTEST_LIBRARIES} - ousia_core - ousia_osxml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osxml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osxml +# ousia_filesystem +# ) ADD_EXECUTABLE(ousia_test_xml test/plugins/xml/XmlOutputTest @@ -423,8 +423,8 @@ IF(TEST) ADD_TEST(ousia_test_filesystem ousia_test_filesystem) ADD_TEST(ousia_test_html ousia_test_html) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) - ADD_TEST(ousia_test_osml ousia_test_osml) - ADD_TEST(ousia_test_osxml ousia_test_osxml) +# ADD_TEST(ousia_test_osml ousia_test_osml) +# ADD_TEST(ousia_test_osxml ousia_test_osxml) ADD_TEST(ousia_test_xml ousia_test_xml) ENDIF() @@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION share/ousia OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE ) -INSTALL(TARGETS ousia - RUNTIME DESTINATION bin -) +#INSTALL(TARGETS ousia +# RUNTIME DESTINATION bin +#) IF(INSTALL_GEDIT_HIGHLIGHTER) INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp index d5d379c..f7dbdf3 100644 --- a/src/core/common/SourceContextReader.cpp +++ b/src/core/common/SourceContextReader.cpp @@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader, ctx.relLen = end - start; // end >= start (I2) // Remove linebreaks at the beginning and the end - const std::pair<size_t, size_t> b = - Utils::trim(lineBuf, Utils::isLinebreak); + const std::pair<size_t, size_t> b = Utils::trim( + lineBuf, + [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); }); ssize_t s = b.first, e = b.second; s = std::min(s, static_cast<ssize_t>(ctx.relPos)); diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp index 8bcdbb5..8bcdbb5 100644 --- a/src/core/parser/utils/Token.cpp +++ b/src/core/common/Token.cpp diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp index f907450..0cf56b0 100644 --- a/src/core/parser/utils/Token.hpp +++ b/src/core/common/Token.hpp @@ -30,6 +30,7 @@ #include <cstdint> #include <limits> #include <string> +#include <unordered_set> #include <core/common/Location.hpp> @@ -46,6 +47,11 @@ using TokenId = uint32_t; using TokenLength = uint16_t; /** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set<TokenId>; + +/** * Namespace containing constants for TokenId instances with special meaning. */ namespace Tokens { @@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2; /** * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. + * newlines occur with optionally any amount of whitespace between them. The + * paragraph token is not repeated until more text is reached. */ constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3; /** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amount of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; + +/** * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. + * this line is larger than the indentation of the previous line.
*/ -constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4; +constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; + +/** + * Token which represents a dedentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6; /** * Maximum token id to be used. Tokens allocated for users should not surpass @@ -109,6 +129,17 @@ struct Token { Token() : id(Tokens::Empty) {} /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(SourceLocation location) + : id(Tokens::Data), location(location) + { + } + + /** * Constructor of the Token struct. * * @param id represents the token id. @@ -129,6 +160,14 @@ struct Token { Token(TokenId id) : id(id) {} /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + bool isSpecial() const {return id > Tokens::MaxTokenId;} + + /** * The getLocation function allows the tokens to be directly passed as * parameter to Logger or LoggableException instances. * diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index a77951e..85d2c28 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename) return std::string{}; } -std::string Utils::trim(const std::string &s) -{ - std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::startsWith(const std::string &s, const std::string &prefix) { return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 7d96562..82a8f8c 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -124,14 +124,6 @@ public: static bool hasNonWhitepaceChar(const std::string &s); /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** * Trims the given string or vector of chars by returning the start and end * index. * @@ -153,8 +145,8 @@ public: * * @param s is the container that should be trimmed. * @param len is the number of elements in the container. - * @param f is a function that returns true for values that should be - * removed. + * @param f is a function that returns true for values at a certain index + * that should be removed. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" */ @@ -163,7 +155,7 @@ { size_t start = 0; for (size_t i = 0; i < len; i++) { - if (!f(s[i])) { + if (!f(i)) { start = i; break; } @@ -171,7 +163,7 @@ size_t end = 0; for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) { - if (!f(s[i])) { + if (!f(i)) { end = i + 1; break; } @@ -198,17 +190,33 @@ * the collapsed version of the string ends. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" + * @param f is a function that returns true for values at a certain index + * that should be removed.
*/ - template <class T> - static std::string trim(const T &s, size_t len, size_t &start, size_t &end) + template <class T, class Filter> + static std::string trim(const T &s, size_t len, size_t &start, size_t &end, + Filter f) { - auto res = trim(s, len, isWhitespace); + auto res = trim(s, len, f); start = res.first; end = res.second; return std::string(&s[start], end - start); } /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s) + { + std::pair<size_t, size_t> bounds = + trim(s, [&s](size_t i) { return isWhitespace(s[i]); }); + return s.substr(bounds.first, bounds.second - bounds.first); + } + + /** * Collapses the whitespaces in the given string (trims the string and * replaces all whitespace characters by a single one). * @@ -219,7 +227,8 @@ public: { size_t start; size_t end; - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -236,7 +245,8 @@ public: static std::string collapse(const std::string &s, size_t &start, size_t &end) { - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -244,6 +254,8 @@ public: * replaces all whitespace characters by a single one). * * @tparam T is the string type that should be used. + * @tparam Filter is a filter function used for detecting the character + * indices that might be removed. * @param s is the string in which the whitespace should be collapsed. * @param len is the length of the input string * @param start is an output parameter which is set to the offset at which @@ -252,9 +264,9 @@ public: * the collapsed version of the string ends. * @return a copy of s with collapsed whitespace. */ - template <class T> + template <class T, class Filter> static std::string collapse(const T &s, size_t len, size_t &start, - size_t &end) + size_t &end, Filter f) { // Result vector std::vector<char> res; @@ -268,8 +280,7 @@ public: bool hadWhitespace = false; for (size_t i = 0; i < len; i++) { const char c = s[i]; - const bool whitespace = isWhitespace(c); - if (whitespace) { + if (f(i)) { hadWhitespace = !res.empty(); } else { // Adapt the start and end position diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include <string> -#include <vector> - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. 
- */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, std::vector<char> &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. 
- */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. - */ -template <typename WhitespaceHandler, typename Buffer> -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index bb04bd3..d44176a 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@ #include <core/model/Domain.hpp> #include <core/model/Project.hpp> #include <core/model/Typesystem.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserScope.hpp> #include <core/parser/ParserContext.hpp> @@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data(TokenizedData &data) { + // TODO: Handle this correctly + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // For now, accept "no data" as success + return true; + } + // We're past the region in which explicit fields can be defined in the // parent structure element scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. - if (!convertData(field, data, logger())) { + if (!convertData(field, text, logger())) { return false; } - parent->createChildDocumentPrimitive(data, fieldIdx); + parent->createChildDocumentPrimitive(text, fieldIdx); return true; } @@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification.
forks.emplace_back(logger().fork()); - if (!convertData(primitiveField, data, forks.back())) { + if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data) createPath(fieldIdx, path, parent); // Then create the primitive element - parent->createChildDocumentPrimitive(data); + parent->createChildDocumentPrimitive(text); return true; } @@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data) if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", - data); + text); } else { logger().error("Could not read data with any of the possible fields:", - data); + text); size_t f = 0; for (auto field : defaultFields) { logger().note(std::string("Field ") + @@ -471,4 +479,4 @@ namespace RttiTypes { const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>( "DocumentField").parent(&Node); } -}
\ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 862081c..dda7d8b 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -167,7 +167,7 @@ public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; @@ -213,4 +213,4 @@ extern const Rtti DocumentField; } } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
\ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..3d413e8 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@ #include <core/common/Exceptions.hpp> #include <core/common/Logger.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserContext.hpp> #include "Callbacks.hpp" @@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool EmptyHandler::data(Variant &data) +bool EmptyHandler::data(TokenizedData &data) { // Support any data return true; @@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(Variant &data) +bool StaticHandler::data(TokenizedData &data) { - logger().error("Did not expect any data here", data); - return false; + if (data.text(WhitespaceMode::TRIM) != nullptr) { + logger().error("Did not expect any data here", data); + return false; + } + return true; } /* Class StaticFieldHandler */ @@ -227,12 +231,19 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data(TokenizedData &data) { + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // Providing no data here is ok as long as the "doHandle" callback + // function has already been called + return handled; + } + // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(data, args); + doHandle(text, args); return true; } @@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - data); + text); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..929466d 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -31,6 +31,7 @@ namespace ousia { class ParserScope; class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -158,40 +159,63 @@ protected: */ const std::string &name() const; -public: - /** - * Virtual destructor. - */ - virtual ~Handler(); - /** * Calls the corresponding function in the Callbacks instance. Sets the * whitespace mode that specifies how string data should be processed. The * calls to this function are placed on a stack by the underlying Stack - * class. + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * * @param whitespaceMode specifies one of the three WhitespaceMode constants * PRESERVE, TRIM or COLLAPSE. */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); + void pushWhitespaceMode(WhitespaceMode whitespaceMode); /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. 
This function + can only undo pushes that were performed by the pushWhitespaceMode() + method of the same handler. */ - void registerToken(const std::string &token); + void popWhitespaceMode(); /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. + * Calls the corresponding function in the Callbacks instance. Pushes a + * list of tokens that should be reported to this handler onto a stack + * maintained by the underlying Stack class. This function should be called + * from the "fieldStart" callback and the "start" callback. * - * @param token is the token string that should be unregistered. + * @param tokens is a list of tokens that should be reported to this handler + * instance via the "token" method. */ - void unregisterToken(const std::string &token); + void pushTokens(const std::vector<std::string> &tokens); + + /** + * Pops a previously pushed token list. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushes that were performed by the pushTokens() + * method of the same handler. + */ + void popTokens(); + + + /** + * Calls the corresponding function in the Callbacks instance. This method + * registers the given tokens as tokens that are generally available; tokens + * must be explicitly enabled using the "pushTokens" and "popTokens" methods. + * Tokens that have not been registered are not guaranteed to be reported, + * even though they are enabled. + */ + void registerTokens(const std::vector<std::string> &tokens); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); /** * Returns the command name for which the handler was created. @@ -299,11 +323,11 @@ public: * Handler instance. Should return true if the data could be handled, false * otherwise. * - * @param data is a string variant containing the character data and its - * location. + * @param data is an instance of TokenizedData containing the segmented + * character data and its location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(Variant &data) = 0; + virtual bool data(TokenizedData &data) = 0; }; /** @@ -333,7 +357,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; /** * Creates an instance of the EmptyHandler class.
@@ -359,7 +383,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; /** @@ -412,7 +436,7 @@ protected: public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; } } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..309c9a0 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,6 +19,7 @@ #include <core/common/Logger.hpp> #include <core/common/Utils.hpp> #include <core/common/Exceptions.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserScope.hpp> #include <core/parser/ParserContext.hpp> @@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } -void Stack::data(const Variant &data) +void Stack::data(TokenizedData data) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); + // TODO: Rewrite this function for token handling + // TODO: This loop needs to be refactored out + while (!data.atEnd()) { + // End handlers that already had a default field and are currently not + // active. + endOverdueHandlers(); - while (true) { - // Check whether there is any command the data can be sent to + const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + + // Check whether there is any command the data can be sent to -- if not, + // make sure the data actually is data if (stack.empty()) { - throw LoggableException("No command here to receive data.", data); + if (hasNonWhitespaceText) { + throw LoggableException("No command here to receive data.", data); + } + return; } // Fetch the current command handler information @@ -440,7 +449,10 @@ void Stack::data(const Variant &data) // If the "hadDefaultField" flag is set, we already issued an error // message if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); + if (hasNonWhitespaceText) { + logger().error("Did not expect any data here", data); + } + return; } } @@ -454,8 +466,16 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - Variant dataCopy = data; - valid = info.handler->data(dataCopy); + // Create a fork of the TokenizedData and let the handler work + // on it + TokenizedData dataFork = data; + valid = info.handler->data(dataFork); + + // If the data was validly handled by the handler, commit the + // change + if (valid) { + data = dataFork; + } } catch (LoggableException ex) { loggerFork.log(ex); @@ -482,6 +502,19 @@ void Stack::data(const Variant &data) } } +void Stack::data(const Variant &stringData) +{ + // Fetch the SourceLocation of the given stringData variant + SourceLocation loc = stringData.getLocation(); + + // Create a TokenizedData instance and feed the given string data into it + TokenizedData tokenizedData(loc.getSourceId()); + tokenizedData.append(stringData.asString(), loc.getStart()); + + // Call the actual "data" method + data(tokenizedData); +} + void Stack::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty @@ -584,4 +617,4 @@ void Stack::token(Variant token) // TODO } } -}
\ No newline at end of file +} diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..cd29b28 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -44,6 +44,7 @@ namespace ousia { // Forward declarations class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -292,13 +293,24 @@ public: void command(const Variant &name, const Variant::mapType &args); /** - * Function that shuold be called whenever character data is found in the + * Function that should be called whenever character data is found in the * input stream. May only be called if there currently is a command on the * stack. * - * @param data is a string variant containing the data that has been found. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. + */ + void data(TokenizedData data); + + /** + * Function that should be called whenever character data is found in the + * input stream. The given string variant is converted into a TokenizedData + * instance internally. + * + * @param stringData is a string variant containing the data that has been + * found. */ - void data(const Variant &data); + void data(const Variant &stringData); /** * Function that should be called whenever a new field starts. Fields of the diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..aaebe7d 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -127,7 +127,7 @@ public: * read. * @return a pair containing start and end source offset. */ - std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) + std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); @@ -157,7 +157,31 @@ public: /** * Returns the number of characters for which offsets are stored. */ - size_t size() { return lens.size(); } + size_t size() const { return lens.size(); } + + /** + * Trims the length of the SourceOffsetVector to the given length. + * Removes all offsets that lie beyond the trimmed region. + * + * @param length is the number of characters to which the SourceOffsetVector + * instance should be trimmed. + */ + void trim(size_t length) { + if (length < size()) { + lens.resize(length); + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + } + } + + /** + * Resets the SourceOffsetVector to the state it had when it was + * constructed. + */ + void clear() { + lens.clear(); + offsets.clear(); + lastEnd = 0; + } }; } diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenId type) noexcept + TokenId id) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { return false; } @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != Tokens::Empty) { + if (node->id != Tokens::Empty) { return false; } // Otherwise just set the type to the given type.
- node->type = type; + node->id = id; return true; } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != Tokens::Empty || node->children.size() > 1) && + if ((node->id != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept } // If the node type is already Tokens::Empty, we cannot do anything here - if (node->type == Tokens::Empty) { + if (node->id == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. Set the // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = Tokens::Empty; + node->id = Tokens::Empty; return true; } @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept } node = it->second.get(); } - return node->type; + return node->id; } } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@ #include <limits> #include <unordered_map> -#include "Token.hpp" +#include <core/common/Token.hpp> namespace ousia { @@ -75,10 +75,9 @@ public: ChildMap children; /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. + * Id of the token represented by this node. */ - TokenId type; + TokenId id; /** * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param type is the descriptor that should be set for this token. + * @param id is the TokenId that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, TokenId type) noexcept; + bool registerToken(const std::string &token, TokenId id) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..aeefa26 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -48,6 +48,17 @@ struct TokenMark { TokenLength len; /** + * Specifies whether the token is special or not. + */ + bool special; + + /** + * Maximum token length. + */ + static constexpr TokenLength MaxTokenLength = + std::numeric_limits<TokenLength>::max(); + + /** * Constructor of the TokenMark structure, initializes all members with the * given values. * @@ -55,9 +66,10 @@ * @param bufStart is the start position of the TokenMark in the internal * character buffer. * @param len is the length of the token. + * @param special modifies the sort order, special tokens are preferred.
*/ - TokenMark(TokenId id, size_t bufStart, TokenLength len) - : bufStart(bufStart), id(id), len(len) + TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) + : bufStart(bufStart), id(id), len(len), special(special) { } @@ -72,7 +84,8 @@ TokenMark(size_t bufStart) : bufStart(bufStart), id(Tokens::Empty), - len(std::numeric_limits<TokenLength>::max()) + len(MaxTokenLength), + special(true) { } @@ -86,8 +99,22 @@ */ friend bool operator<(const TokenMark &m1, const TokenMark &m2) { - return (m1.bufStart < m2.bufStart) || - (m1.bufStart == m2.bufStart && m1.len > m2.len); + // Prefer the mark with the smaller bufStart + if (m1.bufStart < m2.bufStart) { + return true; + } + + // Special handling for marks with the same bufStart + if (m1.bufStart == m2.bufStart) { + // If exactly one of the two marks is special, return true if this + // one is special + if (m1.special != m2.special) { + return m1.special; + } + // Otherwise prefer longer marks + return m1.len > m2.len; + } + return false; } }; } @@ -110,9 +137,9 @@ private: std::vector<char> buf; /** - * Vector containing all token marks. + * Buffer storing the "protected" flag of the character data. */ - std::vector<TokenMark> marks; + std::vector<bool> protectedChars; /** * Vector storing all the character offsets efficiently. @@ -120,9 +147,34 @@ SourceOffsetVector offsets; /** + * Vector containing all token marks. + */ + mutable std::vector<TokenMark> marks; + + /** + * Position of the first linebreak in a sequence of linebreaks. + */ + size_t firstLinebreak; + + /** + * Current indentation level. + */ + uint16_t currentIndentation; + + /** + * Last indentation level. + */ + uint16_t lastIndentation; + + /** + * Number of linebreaks without any content between them. + */ + uint16_t numLinebreaks; + + /** * Flag indicating whether the internal "marks" vector is sorted. */ - bool sorted; + mutable bool sorted; public: /** @@ -132,7 +184,7 @@ * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ - TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); } /** * Appends a complete string to the internal character buffer and extends * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. + * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function.
*/ - size_t append(const std::string &data, SourceOffset offsStart) - { // Append the data to the internal buffer - buf.insert(buf.end(), data.begin(), data.end()); - - // Extend the text regions, interpolate the source position (this may - // yield incorrect results) - const size_t size = buf.size(); - for (SourceOffset offs = offsStart; offs < offsStart + data.size(); - offs++) { - offsets.storeOffset(offs, offs + 1); + size_t append(const std::string &data, SourceOffset offsStart, bool protect) + { + for (size_t i = 0; i < data.size(); i++) { + if (offsStart != InvalidSourceOffset) { + append(data[i], offsStart + i, offsStart + i + 1, protect); + } else { + append(data[i], InvalidSourceOffset, InvalidSourceOffset, + protect); + } } - - return size; + return size(); } /** @@ -165,16 +217,86 @@ * @param c is the character that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. * @param offsEnd is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect) { // Add the character to the list and store the location of the character // in the source file buf.push_back(c); + protectedChars.push_back(protect); offsets.storeOffset(offsStart, offsEnd); - return buf.size(); + + // Insert special tokens + const size_t size = buf.size(); + const bool isWhitespace = Utils::isWhitespace(c); + const bool isLinebreak = Utils::isLinebreak(c); + + // Handle linebreaks + if (isLinebreak) { + // Mark linebreaks as linebreak + mark(Tokens::Newline, size - 1, 1, false); + + // The linebreak sequence started at the previous character + if (numLinebreaks == 0) { + firstLinebreak = size - 1; + } + + // Reset the indentation + currentIndentation = 0; + + // Increment the number of linebreaks + numLinebreaks++; + + const size_t markStart = firstLinebreak; + const size_t markLength = size - firstLinebreak; + + // Issue two consecutive linebreaks as a paragraph token + if (numLinebreaks == 2) { + mark(Tokens::Paragraph, markStart, markLength, false); + } + + // Issue three consecutive linebreaks as a section token + if (numLinebreaks >= 3) { + mark(Tokens::Section, markStart, markLength, false); + } + } else if (isWhitespace) { + // Count the whitespace characters at the beginning of the line + if (numLinebreaks > 0) { + // Implement the UNIX/Python rule for tabs: Tabs extend to the + // next multiple of eight.
+ if (c == '\t') { + currentIndentation = (currentIndentation + 8) & ~7; + } else { + currentIndentation++; + } + } + } + + // Issue indent and dedent tokens + if (!isWhitespace && numLinebreaks > 0) { + // Issue a larger indentation than that in the previous line as + // "Indent" token + if (currentIndentation > lastIndentation) { + mark(Tokens::Indent, size - 1, 0, true); + } + + // Issue a smaller indentation than that in the previous line as + // "Dedent" token + if (currentIndentation < lastIndentation) { + mark(Tokens::Dedent, size - 1, 0, true); + } + + // Reset the internal state machine + lastIndentation = currentIndentation; + numLinebreaks = 0; + } + + return size; } /** @@ -184,11 +306,12 @@ * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. + * @param special tags the mark as "special", preferring it in the sort order */ - void mark(TokenId id, size_t bufStart, TokenLength len) + void mark(TokenId id, size_t bufStart, TokenLength len, bool special) { // Push the new instance back onto the list - marks.emplace_back(id, bufStart, len); + marks.emplace_back(id, bufStart, len, special); // Update the sorted flag as soon as more than one element is in the // list @@ -212,9 +335,13 @@ * @return true if a token was returned, false if no more tokens are * available. */ - bool next(Token &token, WhitespaceMode mode, - const std::unordered_set<TokenId> &tokens, size_t &cursor) + bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, + TokenizedDataCursor &cursor) const { + // Some variables for convenient access + size_t &bufPos = cursor.bufPos; + size_t &markPos = cursor.markPos; + // Sort the "marks" vector if it has not been sorted yet. if (!sorted) { std::sort(marks.begin(), marks.end()); @@ -222,10 +349,11 @@ } // Fetch the next larger TokenMark instance, make sure the token is in - // the "enabled" list - auto it = - std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); - while (it != marks.end() && tokens.count(it->id) == 0) { + // the "enabled" list and within the buffer range + auto it = std::lower_bound(marks.begin() + markPos, marks.end(), + TokenMark(bufPos)); + while (it != marks.end() && (tokens.count(it->id) == 0 || + it->bufStart + it->len > buf.size())) { it++; } @@ -236,15 +364,15 @@ // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data.
- if (cursor < end && cursor < buf.size()) { + if (bufPos < end && bufPos < buf.size()) { switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( - Tokens::Data, std::string(&buf[cursor], end - cursor), + Tokens::Data, std::string(&buf[bufPos], end - bufPos), SourceLocation(sourceId, - offsets.loadOffset(cursor).first, + offsets.loadOffset(bufPos).first, offsets.loadOffset(end).first)); - cursor = end; + bufPos = end; return true; } case WhitespaceMode::TRIM: @@ -254,30 +382,35 @@ size_t stringStart; size_t stringEnd; std::string content; + const char *cBuf = &buf[bufPos]; + auto filter = [cBuf, this](size_t i) -> bool { + return Utils::isWhitespace(cBuf[i]) && + !protectedChars[i]; + }; if (mode == WhitespaceMode::TRIM) { - content = Utils::trim(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::trim(cBuf, end - bufPos, stringStart, + stringEnd, filter); } else { - content = Utils::collapse(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::collapse( + cBuf, end - bufPos, stringStart, stringEnd, filter); } // If the resulting string is empty (only whitespaces), // abort if (content.empty()) { - cursor = end; + bufPos = end; break; } // Calculate the absolute positions and return the token - stringStart += cursor; - stringEnd += cursor; + stringStart += bufPos; + stringEnd += bufPos; token = Token( Tokens::Data, content, SourceLocation(sourceId, offsets.loadOffset(stringStart).first, offsets.loadOffset(stringEnd).first)); - cursor = end; + bufPos = end; return true; } } @@ -286,14 +419,18 @@ // If start equals end, we're currently directly at a token // instance. Return this token and advance the cursor to the end of // the token. - if (cursor == end && it != marks.end()) { + if (bufPos == end && it != marks.end()) { const size_t tokenStart = it->bufStart; const size_t tokenEnd = it->bufStart + it->len; token = Token( it->id, std::string(&buf[tokenStart], it->len), SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, offsets.loadOffset(tokenEnd).first)); - cursor = tokenEnd; + + // Update the cursor, consume the token by incrementing the marks + // pos counter + bufPos = tokenEnd; + markPos = it - marks.begin() + 1; return true; } @@ -304,11 +441,62 @@ } /** + * Resets the TokenizedDataImpl instance to the state it had when it was + * constructed. + */ + void clear() + { + buf.clear(); + protectedChars.clear(); + offsets.clear(); + marks.clear(); + currentIndentation = 0; + lastIndentation = 0; + numLinebreaks = 1; // Assume the stream starts with a linebreak + sorted = true; + } + + /** + * Trims the length of the TokenizedDataImpl instance to the given length. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) + { + if (length < size()) { + buf.resize(length); + offsets.trim(length); + } + } + + /** * Returns the current size of the internal buffer. * * @return the size of the internal character buffer. */ - size_t getSize() { return buf.size(); } + size_t size() const { return buf.size(); } + + /** + * Returns true if no data is in the data buffer. + * + * @return true if the "buf" instance has no data. + */ + bool empty() const { return buf.empty(); } + + /** + * Returns the current location of all data in the buffer. + * + * @return the location of the entire data represented by this instance.
+ */ + SourceLocation getLocation() const + { + if (empty()) { + return SourceLocation{sourceId}; + } + return SourceLocation{sourceId, offsets.loadOffset(0).first, + offsets.loadOffset(size()).second}; + } }; /* Class TokenizedData */ @@ -316,50 +504,83 @@ public: TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} TokenizedData::TokenizedData(SourceId sourceId) - : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) + : impl(std::make_shared<TokenizedDataImpl>(sourceId)) { } TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, + bool protect) { - return impl->append(data, offsStart); + return impl->append(data, offsStart, protect); } size_t TokenizedData::append(char c, SourceOffset offsStart, - SourceOffset offsEnd) + SourceOffset offsEnd, bool protect) { - return impl->append(c, offsStart, offsEnd); + return impl->append(c, offsStart, offsEnd, protect); } void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->getSize() - len, len); + impl->mark(id, impl->size() - len, len, false); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) { - impl->mark(id, bufStart, len); + impl->mark(id, bufStart, len, false); } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { impl->clear(); } + +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const { - return impl->next(token, mode, tokens, cursor); + return impl->getLocation(); } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +TokenizedDataReader TokenizedData::reader() const { - // Copy the current cursor position to not update the actual cursor position - // if the operation was not successful - size_t cursorCopy = cursor; - if (!impl->next(token, mode, tokens, cursorCopy) || - token.id != Tokens::Data) { - return false; - } + return TokenizedDataReader(impl, TokenizedDataCursor(), + TokenizedDataCursor()); +} + +/* Class TokenizedDataReader */ - // There is indeed a text token, update the internal cursor position - cursor = cursorCopy; - return true; +TokenizedDataReader::TokenizedDataReader( + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const +{ + return readCursor.bufPos >= impl->size(); +} + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + peekCursor = readCursor; + return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, peekCursor); } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..b72ca02 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -37,40 +37,48 @@ #include <core/common/Location.hpp> #include <core/common/Whitespace.hpp> - -#include "Token.hpp" +#include <core/common/Token.hpp> 
namespace ousia { // Forward declaration class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork; /** - * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. + * Internally used structure representing a cursor within the TokenizedData + * stream. */ -class TokenizedData { -private: +struct TokenizedDataCursor { /** - * Shared pointer pointing at the internal data. This data is shared when - * copying TokenizedData instances, which corresponds to forking a - * TokenizedData instance. + * Position within the byte buffer. */ - std::shared_ptr<TokenizedDataImpl> impl; + size_t bufPos; /** - * Contains all currently enabled token ids. + * Position within the token mark buffer. */ - std::unordered_set<TokenId> tokens; + size_t markPos; /** - * Position from which the last element was read from the internal buffer. - * This information is not shared with the other instances of TokenizedData - * pointing at the same location. + * Default constructor. The resulting cursor points at the beginning of the + * stream. + */ + TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * The data stored in TokenizedData is read using the TokenizedDataReader + * class. + */ +class TokenizedData { +private: + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. */ - size_t cursor; + std::shared_ptr<TokenizedDataImpl> impl; public: /** @@ -101,10 +109,13 @@ * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart = 0); + size_t append(const std::string &data, SourceOffset offsStart = 0, + bool protect = false); /** * Appends a single character to the internal character buffer. * * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect = false); /** * Stores a token ending at the last character of the current buffer. @@ -136,54 +150,194 @@ void mark(TokenId id, size_t bufStart, TokenLength len); /** - * Enables a single token id. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Resets the TokenizedData instance to the state it had when it was + * constructed. + */ + void clear(); + + /** + * Trims the length of the TokenizedData instance to the given length.
+	 * Note that this function does not remove any token matches for
+	 * performance reasons; it merely renders them inaccessible. Appending new
+	 * data after calling trim will make the token marks accessible again. Thus
+	 * this method should be the last function called to modify the data buffer
+	 * and the token marks.
+	 *
+	 * @param length is the number of characters to which the TokenizedData
+	 * instance should be trimmed.
+	 */
+	void trim(size_t length);
+
+	/**
+	 * Returns the number of characters currently represented by this
+	 * TokenizedData instance.
+	 */
+	size_t size() const;
+
+	/**
+	 * Returns true if the TokenizedData instance is empty, false otherwise.
	 *
-	 * @param id is the TokenId of the token that should be enabled.
+	 * @return true if no data is stored inside the TokenizedData instance.
	 */
-	void enableToken(TokenId id) { tokens.insert(id); }
+	bool empty() const;

	/**
-	 * Enables a set of token ids. Enabled tokens will no longer be returned as
-	 * text. Instead, when querying for the next token, TokenizedData will
-	 * return them as token and not as part of a Text token.
+	 * Returns the location of the entire TokenizedData instance.
	 *
-	 * @param ids is the TokenId of the token that should be enabled.
+	 * @return the location of the entire data represented by this instance.
	 */
-	void enableToken(const std::unordered_set<TokenId> &ids)
-	{
-		tokens.insert(ids.begin(), ids.end());
-	}
+	SourceLocation getLocation() const;
+
+	/**
+	 * Returns a TokenizedDataReader instance that can be used to access the
+	 * data.
+	 *
+	 * @return a new TokenizedDataReader instance pointing at the beginning of
+	 * the internal buffer.
+	 */
+	TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read and peek access to the data
+ * stored in a TokenizedData instance.
+ */
+class TokenizedDataReader {
+private:
+	friend TokenizedData;
+
+	/**
+	 * Shared pointer pointing at the internal data. This data is shared with
+	 * all the TokenizedDataReader instances.
+	 */
+	std::shared_ptr<const TokenizedDataImpl> impl;
+
+	/**
+	 * Position from which the last element was read from the internal buffer.
+	 */
+	TokenizedDataCursor readCursor;
+
+	/**
+	 * Position from which the last element was peeked from the internal buffer.
+	 */
+	TokenizedDataCursor peekCursor;
+
+protected:
+	/**
+	 * Protected constructor of TokenizedDataReader, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader.
+	 *
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
+	 */
+	TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+	                    const TokenizedDataCursor &readCursor,
+	                    const TokenizedDataCursor &peekCursor);

+public:
+	/**
+	 * Returns a new TokenizedDataReaderFork from which tokens and text can be
+	 * read without advancing this reader instance.
+	 */
+	TokenizedDataReaderFork fork();
+
+	/**
+	 * Returns true if this reader is at the end of the TokenizedData instance.
+	 *
+	 * @return true if the end of the TokenizedData instance has been reached.
+	 */
+	bool atEnd() const;

	/**
	 * Stores the next token in the given token reference, returns true if the
-	 * operation was successful, false if there are no more tokens.
+	 * operation was successful, false if there are no more tokens. Advances the
+	 * internal read cursor and resets the peek cursor.
	 *
	 * @param token is an output parameter into which the read token will be
	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
	 * @param mode is the whitespace mode that should be used when a text token
	 * is returned.
	 * @return true if the operation was successful and there is a next token,
	 * false if there are no more tokens.
	 */
-	bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	bool read(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::TRIM);

	/**
-	 * Stores the next text token in the given token reference, returns true if
-	 * the operation was successful (there was indeed a text token), false if
-	 * the next token is not a text token or there were no more tokens.
+	 * Stores the next token in the given token reference, returns true if the
+	 * operation was successful, false if there are no more tokens. Does not
+	 * advance the read cursor.
	 *
	 * @param token is an output parameter into which the read token will be
	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
	 * @param mode is the whitespace mode that should be used when a text token
	 * is returned.
	 * @return true if the operation was successful and there is a next token,
	 * false if there are no more tokens.
	 */
-	bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::TRIM);
+
+	/**
+	 * Consumes the peeked tokens; the read cursor will now be at the position
+	 * of the peek cursor.
+	 */
+	void consumePeek() { readCursor = peekCursor; }
+
+	/**
+	 * Resets the peek cursor to the position of the read cursor.
+	 */
+	void resetPeek() { peekCursor = readCursor; }
+};
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader using its fork() method. Changes made on the fork can
+ * be committed back to the parent reader.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+	friend TokenizedDataReader;
+
+	/**
+	 * Reference pointing at the parent TokenizedDataReader to which changes may
+	 * be committed.
+	 */
+	TokenizedDataReader &parent;
+
+	/**
+	 * Private constructor of TokenizedDataReaderFork, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader and a reference at the parent TokenizedDataReader.
+	 *
+	 * @param parent is the TokenizedDataReader instance to which the current
+	 * read/peek progress may be committed.
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
	 */
+	TokenizedDataReaderFork(TokenizedDataReader &parent,
+	                        std::shared_ptr<const TokenizedDataImpl> impl,
+	                        const TokenizedDataCursor &readCursor,
+	                        const TokenizedDataCursor &peekCursor)
+	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+	{
+	}
+
+public:
+	/**
+	 * Commits the read/peek progress to the underlying parent.
+	 */
+	void commit() { parent = *this; }
+};
}

-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
 #include <core/common/CharReader.hpp>
 #include <core/common/Exceptions.hpp>
 #include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>

+#include "TokenizedData.hpp"
 #include "Tokenizer.hpp"

 namespace ousia {

@@ -42,26 +42,33 @@ struct TokenMatch {
	Token token;

	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
	 */
-	size_t textLength;
+	size_t dataStartOffset;

	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
+	 * Set to true if the matched token is a primary token.
	 */
-	size_t textEnd;
+	bool primary;

	/**
	 * Constructor of the TokenMatch class.
	 */
-	TokenMatch() : textLength(0), textEnd(0) {}
+	TokenMatch() : dataStartOffset(0), primary(false) {}

	/**
	 * Returns true if this TokenMatch instance actually represents a match.
+	 *
+	 * @return true if the TokenMatch actually has a match.
+	 */
+	bool hasMatch() const { return token.id != Tokens::Empty; }
+
+	/**
+	 * Returns the length of the matched token.
+	 *
+	 * @return the length of the token string.
	 */
-	bool hasMatch() { return token.id != Tokens::Empty; }
+	size_t size() const { return token.content.size(); }
};

/* Internal class TokenLookup */

@@ -83,36 +90,28 @@ private:
	size_t start;

	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
+	 * Position at which this token starts in the TokenizedData instance.
	 */
-	size_t textLength;
-
-	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
-	 */
-	size_t textEnd;
+	size_t dataStartOffset;

public:
	/**
	 * Constructor of the TokenLookup class.
	 *
	 * @param node is the current node.
-	 * @param start is the start position.
-	 * @param textLength is the text buffer length of the previous text token.
-	 * @param textEnd is the current end location of the previous text token.
+	 * @param start is the start position in the source file.
+	 * @param dataStartOffset is the current length of the TokenizedData buffer.
	 */
-	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
-	            size_t textEnd)
-	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
+	TokenLookup(const TokenTrie::Node *node, size_t start,
+	            size_t dataStartOffset)
+	    : node(node), start(start), dataStartOffset(dataStartOffset)
	{
	}

	/**
	 * Tries to extend the current path in the token trie with the given
-	 * character. If a complete token is matched, stores this match in the
-	 * tokens list (in case it is longer than any previous token).
+	 * character. If a complete token is matched, stores the match in the given
+	 * TokenMatch reference and returns true.
	 *
	 * @param c is the character that should be appended to the current prefix.
	 * @param lookups is a list to which new TokenLookup instances are added --
@@ -123,73 +122,49 @@ public:
	 * Tokenizer.
	 * @param end is the end byte offset of the current character.
	 * @param sourceId is the source id of this file.
+	 * @return true if a token was matched, false otherwise.
	 */
-	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
-	             const std::vector<std::string> &tokens, SourceOffset end,
-	             SourceId sourceId)
+	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+	             const std::vector<Tokenizer::TokenDescriptor> &tokens,
+	             SourceOffset end, SourceId sourceId)
	{
-		// Check whether we can continue the current token path with the given
-		// character without visiting an already visited node
+		// Set to true once a token has been matched
+		bool res = false;
+
+		// Check whether we can continue the current token path, if not, abort
		auto it = node->children.find(c);
		if (it == node->children.end()) {
-			return;
+			return res;
		}

-		// Check whether the new node represents a complete token and whether it
-		// is longer than the current token. If yes, replace the current token.
+		// Check whether the new node represents a complete token; if yes,
+		// store it in the given match reference.
		node = it->second.get();
-		if (node->type != Tokens::Empty) {
-			const std::string &str = tokens[node->type];
-			size_t len = str.size();
-			if (len > match.token.content.size()) {
-				match.token =
-				    Token{node->type, str, {sourceId, start, end}};
-				match.textLength = textLength;
-				match.textEnd = textEnd;
-			}
+		if (node->id != Tokens::Empty) {
+			const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+			match.token = Token(node->id, descr.string,
+			                    SourceLocation(sourceId, start, end));
+			match.dataStartOffset = dataStartOffset;
+			match.primary = descr.primary;
+			res = true;
		}

		// If this state can possibly be advanced, store it in the states list.
		if (!node->children.empty()) {
			lookups.emplace_back(*this);
		}
+		return res;
	}
};

-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
-                           SourceId sourceId)
-{
-	if (match.hasMatch()) {
-		match.token.content =
-		    std::string{handler.textBuf.data(), match.textLength};
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, match.textEnd};
-	} else {
-		match.token.content = handler.toString();
-		match.token.location =
-		    SourceLocation{sourceId, handler.textStart, handler.textEnd};
-	}
-	match.token.id = Tokens::Data;
-}
}

/* Class Tokenizer */

-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
-    : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}

-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
	// If we're in the read mode, reset the char reader peek position to the
	// current read position
@@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token)

	// Prepare the lookups in the token trie
	const TokenTrie::Node *root = trie.getRoot();
-	TokenMatch match;
+	TokenMatch bestMatch;
	std::vector<TokenLookup> lookups;
	std::vector<TokenLookup> nextLookups;

-	// Instantiate the text handler
-	TextHandler textHandler;
-
	// Peek characters from the reader and try to advance the current token tree
	// cursor
	char c;
+	const size_t initialDataSize = data.size();
	size_t charStart = reader.getPeekOffset();
	const SourceId sourceId = reader.getSourceId();
	while (reader.peek(c)) {
		const size_t charEnd = reader.getPeekOffset();
-		const size_t textLength = textHandler.textBuf.size();
-		const size_t textEnd = textHandler.textEnd;
+		const size_t dataStartOffset = data.size();

		// If we do not have a match yet, start a new lookup from the root
-		if (!match.hasMatch()) {
-			TokenLookup{root, charStart, textLength, textEnd}.advance(
-			    c, nextLookups, match, tokens, charEnd, sourceId);
+		if (!bestMatch.hasMatch()) {
+			lookups.emplace_back(root, charStart, dataStartOffset);
		}

		// Try to advance all other lookups with the new character
+		TokenMatch match;
		for (TokenLookup &lookup : lookups) {
-			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+			// Continue with the next lookup if the current lookup did not
+			// result in a match
+			if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+			                    sourceId)) {
+				continue;
+			}
+
+			// If the matched token is primary, check whether it is better than
+			// the current best match; if yes, replace the best match. In any
+			// case, continue
+			if (match.primary) {
+				if (match.size() > bestMatch.size()) {
+					bestMatch = match;
+				}
+				continue;
+			}
+
+			// Otherwise -- if the matched token is a non-primary token (and no
+			// primary token has been found until now) -- mark the match in the
+			// TokenizedData
+			if (!bestMatch.hasMatch()) {
+				data.mark(match.token.id, data.size() - match.size() + 1,
+				          match.size());
+			}
		}

		// We have found a token and there are no more states to advance or the
		// data buffer has grown -- abort to return the new token
-		if (match.hasMatch()) {
-			if ((nextLookups.empty() || textHandler.hasText())) {
+		if (bestMatch.hasMatch()) {
+			if ((nextLookups.empty() || data.size() > initialDataSize)) {
				break;
			}
		} else {
			// Record all incoming characters
-			textHandler.append(c, charStart, charEnd);
+			data.append(c, charStart, charEnd);
		}

		// Swap the lookups and the nextLookups list
@@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token)
		charStart = charEnd;
	}

-	// If we found text, emit that text
-	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
-		buildDataToken(textHandler, match, sourceId);
+	// If we found data, emit a corresponding data token
+	if (data.size() > initialDataSize &&
+	    (!bestMatch.hasMatch() ||
+	     bestMatch.dataStartOffset > initialDataSize)) {
+		// If we have a "bestMatch" which starts after text data has started,
+		// trim the TokenizedData to this offset
+		if (bestMatch.dataStartOffset > initialDataSize) {
+			data.trim(bestMatch.dataStartOffset);
+		}
+
+		// Create a token containing the data location
+		bestMatch.token = Token{data.getLocation()};
	}

	// Move the read/peek cursor to the end of the token, abort if an error
	// happens while doing so
-	if (match.hasMatch()) {
+	if (bestMatch.hasMatch()) {
		// Make sure we have a valid location
-		if (match.token.location.getEnd() == InvalidSourceOffset) {
+		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
			throw OusiaException{"Token end position offset out of range"};
		}

		// Seek to the end of the current token
-		const size_t end = match.token.location.getEnd();
+		const size_t end = bestMatch.token.location.getEnd();
		if (read) {
			reader.seek(end);
		} else {
			reader.seekPeekCursor(end);
		}

-		token = match.token;
+		token = bestMatch.token;
	} else {
		token = Token{};
	}

-	return match.hasMatch();
+	return bestMatch.hasMatch();
}

-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, true>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, true>(reader, token);
-	}
-	return false;
+	return next<true>(reader, token, data);
}

-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, false>(reader, token);
-	}
-	return false;
+	return next<false>(reader, token, data);
}

-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token,
bool primary) { // Abort if an empty token should be registered if (token.empty()) { @@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token) // Search for a new slot in the tokens list TokenId type = Tokens::Empty; for (size_t i = nextTokenId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; + if (!tokens[i].valid()) { + tokens[i] = TokenDescriptor(token, primary); type = i; break; } @@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token) // override the special token type handles if (type == Tokens::Empty) { type = tokens.size(); - if (type == Tokens::Data || type == Tokens::Empty) { + if (type >= Tokens::MaxTokenId) { throw OusiaException{"Token type ids depleted!"}; } - tokens.emplace_back(token); + tokens.emplace_back(token, primary); } nextTokenId = type + 1; - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list + // Try to register the token in the trie -- if this fails, remove it from + // the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; + tokens[type] = TokenDescriptor(); nextTokenId = type; return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id) { // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenId = type; + if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { + tokens[id] = TokenDescriptor(); + nextTokenId = id; return true; } return false; } -std::string Tokenizer::getTokenString(TokenId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const { - whitespaceMode = mode; + if (id < tokens.size()) { + return tokens[id]; + } + return EmptyTokenDescriptor; } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } - /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool Tokenizer::next<PreservingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<TrimmingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<CollapsingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<PreservingWhitespaceHandler, true>( - CharReader &reader, Token &token); -template bool Tokenizer::next<TrimmingWhitespaceHandler, true>( - CharReader &reader, Token &token); -template bool Tokenizer::next<CollapsingWhitespaceHandler, true>( - CharReader &reader, Token &token); +template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &); +template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &); } diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. 
 *
 * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
 */

#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_

-#include <set>
+#include <cstdint>
#include <string>
#include <vector>

#include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>

-#include "Token.hpp"
#include "TokenTrie.hpp"

namespace ousia {

// Forward declarations
class CharReader;
+class TokenizedData;

/**
 * The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows to register and unregister tokens while parsing. Note
+ * that the Tokenizer always tries to extract the longest possible token from
+ * the input. Tokens can be registered as primary or non-primary tokens. If
+ * a Token is registered as a primary token, it is returned as a single Token
+ * instance if it occurs. In the non-primary case the token is returned as part
+ * of a segmented TokenizedData instance.
 */
class Tokenizer {
-private:
+public:
	/**
-	 * Internally used token trie. This object holds all registered tokens.
+	 * Internally used structure describing a registered token.
	 */
-	TokenTrie trie;
+	struct TokenDescriptor {
+		/**
+		 * String describing the token.
+		 */
+		std::string string;
+
+		/**
+		 * Set to true if this token is primary.
+		 */
+		bool primary;
+
+		/**
+		 * Constructor of the TokenDescriptor class.
+		 *
+		 * @param string is the string representation of the registered token.
+		 * @param primary specifies whether the token is a primary token that
+		 * should be returned as a single token, or a secondary token that
+		 * should be returned as part of TokenizedData.
+		 */
+		TokenDescriptor(const std::string &string, bool primary)
+		    : string(string), primary(primary)
+		{
+		}
+
+		/**
+		 * Default constructor.
+		 */
+		TokenDescriptor() : primary(false) {}
+
+		/**
+		 * Returns true if the TokenDescriptor represents a valid token.
+		 */
+		bool valid() { return !string.empty(); }
+	};

private:
	/**
-	 * Flag defining whether whitespaces should be preserved or not.
+	 * Internally used token trie. This object holds all registered tokens.
	 */
-	WhitespaceMode whitespaceMode;
+	TokenTrie trie;

	/**
	 * Vector containing all registered token types.
	 */
-	std::vector<std::string> tokens;
+	std::vector<TokenDescriptor> tokens;

	/**
	 * Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:

	/**
	 * Templated function used internally to read the current token. The
-	 * function is templated in order to force code generation for all six
-	 * combiations of whitespace modes and reading/peeking.
+	 * function is templated in order to force optimized code generation for
+	 * both reading and peeking.
	 *
-	 * @tparam TextHandler is the type to be used for the textHandler instance.
-	 * @tparam read specifies whether the function should start from and advance
-	 * the read pointer of the char reader.
+	 * @tparam read specifies whether the method should read the token or just
+	 * peek.
	 * @param reader is the CharReader instance from which the data should be
	 * read.
	 * @param token is the token structure into which the token information
	 * should be written.
+	 * @param data is a reference at the TokenizedData instance to which the
	 * token information should be appended.
	 * @return false if the end of the stream has been reached, true otherwise.
	 */
-	template <typename TextHandler, bool read>
-	bool next(CharReader &reader, Token &token);
+	template <bool read>
+	bool next(CharReader &reader, Token &token, TokenizedData &data);

public:
	/**
	 * Constructor of the Tokenizer class.
-	 *
-	 * @param whitespaceMode specifies how whitespace should be handled.
	 */
-	Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	Tokenizer();

	/**
-	 * Registers the given string as a token. Returns a const pointer at a
-	 * TokenDescriptor that will be used to reference the newly created token.
+	 * Registers the given string as a token. Returns a unique identifier
+	 * describing the registered token.
	 *
	 * @param token is the token string that should be registered.
-	 * @return a unique identifier for the registered token or EmptyToken if
+	 * @param primary specifies whether the token is a primary token -- if true,
+	 * the token will be returned as a single, standalone token. Otherwise the
+	 * token will be returned as part of a "TokenizedData" structure.
+	 * @return a unique identifier for the registered token or Tokens::Empty if
	 * an error occurred.
	 */
-	TokenId registerToken(const std::string &token);
+	TokenId registerToken(const std::string &token, bool primary = true);

	/**
	 * Unregisters the token belonging to the given TokenId.
	 *
	 * @param id is the token type that should be unregistered. The
-	 *TokenId
-	 * must have been returned by registerToken.
+	 * TokenId must have been returned by registerToken.
	 * @return true if the operation was successful, false otherwise (e.g.
-	 * because the given TokenDescriptor was already unregistered).
+	 * because the token with the given TokenId was already unregistered).
	 */
-	bool unregisterToken(TokenId type);
+	bool unregisterToken(TokenId id);

	/**
-	 * Returns the token that was registered under the given TokenId id or
-	 *an
-	 * empty string if an invalid TokenId id is given.
+	 * Returns the TokenDescriptor that was registered under the given TokenId
+	 * or an invalid TokenDescriptor if an invalid TokenId is given.
	 *
-	 * @param type is the TokenId id for which the corresponding token
-	 *string
+	 * @param id is the TokenId for which the corresponding TokenDescriptor
	 * should be returned.
-	 * @return the registered token string or an empty string if the given type
-	 * was invalid.
-	 */
-	std::string getTokenString(TokenId type);
-
-	/**
-	 * Sets the whitespace mode.
-	 *
-	 * @param whitespaceMode defines how whitespace should be treated in text
-	 * tokens.
-	 */
-	void setWhitespaceMode(WhitespaceMode mode);
-
-	/**
-	 * Returns the current value of the whitespace mode.
-	 *
-	 * @return the whitespace mode.
+	 * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+	 * the given TokenId is invalid.
	 */
-	WhitespaceMode getWhitespaceMode();
+	const TokenDescriptor &lookupToken(TokenId id) const;

	/**
	 * Reads a new token from the CharReader and stores it in the given
-	 * Token instance.
+	 * Token instance. If the token has the id Tokens::Data, use the "getData"
+	 * method to fetch a reference at the underlying TokenizedData instance
+	 * storing the data.
	 *
	 * @param reader is the CharReader instance from which the data should be
	 * read.
	 * @param token is a reference at the token instance into which the Token
	 * information should be written.
+	 * @param data is a reference at the TokenizedData instance to which the
	 * token information should be appended.
	 * @return true if a token could be read, false if the end of the stream
	 * has been reached.
*/ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..d4cdbf8 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -94,92 +94,11 @@ public: static const PlainFormatTokens OsmlTokens; -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector<char> buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. - */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. - */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. 
- */
-	Variant toVariant(SourceId sourceId)
-	{
-		Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
-		res.setLocation({sourceId, start, end});
-		return res;
-	}
-};
-
OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
-    : reader(reader), logger(logger), tokenizer(OsmlTokens)
+    : reader(reader),
+      logger(logger),
+      tokenizer(OsmlTokens),
+      data(reader.getSourceId())
{
	// Place an initial command representing the complete file on the stack
	commands.push(Command{"", Variant::mapType{}, true, true, true, false});
@@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
	bool first = true;
-	bool hasCharSiceNSSep = false;
+	bool hasCharSinceNSSep = false;
	std::vector<char> identifier;
	size_t end = reader.getPeekOffset();
	char c, c2;
@@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
		if ((first && Utils::isIdentifierStartCharacter(c)) ||
		    (!first && Utils::isIdentifierCharacter(c))) {
			identifier.push_back(c);
-		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+		} else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
		           Utils::isIdentifierStartCharacter(c2)) {
			identifier.push_back(c);
		} else {
@@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
		// This is no longer the first character
		first = false;

-		// Advance the hasCharSiceNSSep flag
-		hasCharSiceNSSep = allowNSSep && (c != ':');
+		// Advance the hasCharSinceNSSep flag
+		hasCharSinceNSSep = allowNSSep && (c != ':');

		end = reader.getPeekOffset();
		reader.consumePeek();
@@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()
{
	Token token;
	size_t depth = 1;
-	while (tokenizer.read(reader, token)) {
+	while (tokenizer.read(reader, token, data)) {
+		// Throw the comment data away
+		data.clear();
+
		if (token.id == OsmlTokens.BlockCommentEnd) {
			depth--;
			if (depth == 0) {
@@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()
	}
}

-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData()
{
-	if (!handler.isEmpty()) {
-		data = handler.toVariant(reader.getSourceId());
+	if (!data.empty()) {
		location = data.getLocation();
		reader.resetPeek();
		return true;
	}
@@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()

OsmlStreamParser::State OsmlStreamParser::parse()
{
-	// Handler for incoming data
-	DataHandler handler;
+	// Reset the internal TokenizedData buffer
+	data.clear();

	// Read tokens until the outer loop should be left
	Token token;
-	while (tokenizer.peek(reader, token)) {
+	while (tokenizer.peek(reader, token, data)) {
		const TokenId type = token.id;

		// Special handling for Backslash and Text
@@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()

			// Try to parse a command
			if (Utils::isIdentifierStartCharacter(c)) {
				// Make sure to issue any data before it is too late
-				if (checkIssueData(handler)) {
+				if (checkIssueData()) {
					return State::DATA;
				}

@@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
			// If this was an annotation start token, add the parsed < to the
			// output
			if (type == OsmlTokens.AnnotationStart) {
-				handler.append('<', token.location.getStart(),
-				               token.location.getStart() + 1);
+				data.append('<', token.location.getStart(),
+				            token.location.getStart() + 1);
			}

-			handler.append(c, token.location.getStart(),
-			               reader.getPeekOffset());
+			data.append(c, token.location.getStart(), reader.getPeekOffset());
reader.consumePeek(); continue; } else if (type == Tokens::Data) { @@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() location = token.location; return State::FIELD_START; } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - reader.consumePeek(); continue; } // A non-text token was reached, make sure all pending data commands // have been issued - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse() Command &cmd = commands.top(); if (!cmd.inField) { cmd.inField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got field start token \"{\", but no command for which to " "start the field. Write \"\\{\" to insert this sequence as " "text.", - token); + token);*/ } else if (token.id == OsmlTokens.FieldEnd) { - if (closeField()) { + closeField(); + return State::FIELD_END; +/* if (closeField()) { return State::FIELD_END; } logger.error( "Got field end token \"}\", but there is no field to end. " "Write \"\\}\" to insert this sequence as text.", - token); + token);*/ } else if (token.id == OsmlTokens.DefaultFieldStart) { // Try to start a default field the first time the token is reached Command &topCmd = commands.top(); if (!topCmd.inField) { topCmd.inField = true; topCmd.inDefaultField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got default field start token \"{!\", but no command for " "which to start the field. Write \"\\{!\" to insert this " "sequence as text", - token); + token);*/ } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Issue available data - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ + TokenizedData dataFork = data; + Variant text = dataFork.text(mode); + location = text.getLocation(); + return text; +} + const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <memory> #include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp> #include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp> namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public: Variant arguments; /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). 
+	 * Vector used as stack for holding the number of opening/closing braces
+	 * and the corresponding "isDefaultField" flag.
	 */
-	bool inRangeField : 1;
+	std::vector<bool> fields;

	/**
-	 * Set to true if we are currently in a field that has been especially
-	 * marked as default field (using the "|") syntax.
+	 * Set to true if this is a command with clear begin and end.
	 */
-	bool inDefaultField : 1;
+	bool hasRange;

	/**
	 * Default constructor.
	 */
@@ -164,7 +155,6 @@ public:
-	Command()
-	    : hasRange(false), inField(false),
-	      inRangeField(false), inDefaultField()
-	{
-	}
+	Command() : hasRange(false) {}
@@ -178,15 +168,10 @@ public:
	 * command.
	 * @param hasRange should be set to true if this is a command with
	 * explicit range.
-	 * @param inField is set to true if we currently are inside a field
-	 * of this command.
-	 * @param inRangeField is set to true if we currently are inside the
-	 * outer field of a ranged command.
-	 * @param inDefaultField is set to true if we currently are in a
-	 * specially marked default field.
	 */
-	Command(Variant name, Variant arguments, bool hasRange,
-	        bool inField, bool inRangeField, bool inDefaultField)
+	Command(Variant name, Variant arguments, bool hasRange)
	    : name(std::move(name)), arguments(std::move(arguments)),
	      hasRange(hasRange)
	{
	}
@@ -215,25 +200,20 @@ private:
	Tokenizer tokenizer;

	/**
-	 * Stack containing the current commands.
-	 */
-	std::stack<Command> commands;
-
-	/**
-	 * Variant containing the data that has been read (always is a string,
-	 * contains the exact location of the data in the source file).
+	 * TokenizedData instance containing the tokenized data that was returned
+	 * from the tokenizer as data.
	 */
-	Variant data;
+	TokenizedData data;

	/**
-	 * Contains the location of the last token.
+	 * Stack containing the current commands.
	 */
-	SourceLocation location;
+	std::stack<Command> commands;

	/**
-	 * Contains the field index of the current command.
+	 * Pointer at the internal implementation of the OsmlStreamParser.
	 */
-	size_t fieldIdx;
+	std::unique_ptr<OsmlStreamParserImpl> impl;

	/**
	 * Function used internally to parse an identifier.
@@ -291,12 +271,10 @@ private:

	/**
	 * Checks whether there is any data pending to be issued, if yes, issues it.
	 *
-	 * @param handler is the data handler that contains the data that may be
-	 * returned to the user.
	 * @return true if there was any data and DATA should be returned by the
	 * parse function, false otherwise.
	 */
-	bool checkIssueData(DataHandler &handler);
+	bool checkIssueData();

	/**
	 * Called before any data is appended to the internal data handler. Checks
@@ -328,6 +306,12 @@ public:
	OsmlStreamParser(CharReader &reader, Logger &logger);

	/**
+	 * Destructor of the OsmlStreamParser, needed to destroy the incomplete
+	 * OsmlStreamParserImpl type.
+	 */
+	~OsmlStreamParser();
+
+	/**
	 * Continues parsing. Returns one of the states defined in the State enum.
	 * Callers should stop once the State::END state is reached. Use the getter
	 * functions to get more information about the current state, such as the
@@ -344,7 +328,19 @@ public:
	 * @return a reference at the TokenizedData instance containing the data
	 * parsed by the "parse" function.
	 */
-	const Variant &getData() const { return data; }
+	const TokenizedData &getData() const { return data; }
+
+	/**
+	 * Returns the complete content of the internal TokenizedData instance as
+	 * a single string Variant. This method is mainly used in the unit tests
+	 * for this class; it simply calls the text() method of TokenizedData.
+	 *
+	 * @param mode is the WhitespaceMode that should be used for returning the
+	 * text.
+ * @return a string variant containing the text content of the internal + * TokenizedData instance or a nullptr variant if there is no text. + */ + Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Returns a reference at the internally stored command name. Only valid if @@ -371,13 +367,6 @@ public: * syntax). */ bool inDefaultField() const; - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - const SourceLocation &getLocation() const { return location; } }; } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..855f80d 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,6 @@ #include <core/common/Variant.hpp> #include <core/common/VariantReader.hpp> #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -57,17 +56,6 @@ public: std::vector<char> textBuf; /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector<char> whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - - /** * Current character data start. */ size_t textStart; @@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) SourceLocation loc = xmlSyncLoggerPosition(p, ulen); // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); OsxmlEventParserData &data = parser->getData(); std::vector<char> &textBuf = data.textBuf; - std::vector<char> &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } + + // Update start and end position + if (textBuf.empty()) { + data.textStart = loc.getStart(); } + data.textEnd = loc.getEnd(); + + // Insert the data into the text buffer + textBuf.insert(textBuf.end(), &s[0], &s[ulen]); } /* Class OsxmlEvents */ @@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) + : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0) { } @@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId) // Reset the text buffers textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; textStart = 0; textEnd = 0; @@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), data(new OsxmlEventParserData()) { } @@ -532,16 +497,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - 
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..e3fd5d4 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include <memory> #include <string> -#include <core/common/Whitespace.hpp> - namespace ousia { // Forward declarations @@ -99,13 +97,10 @@ public: virtual void fieldEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. * - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a Variant containing the string data that was found in the + * XML file. */ virtual void data(const Variant &data) = 0; }; @@ -135,11 +130,6 @@ private: Logger &logger; /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - - /** * Data to be used by the internal functions. */ std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +161,6 @@ public: void parse(); /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - - /** * Returns the internal CharReader reference. * * @return the CharReader reference. 
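To illustrate how the reworked pieces of this patch fit together, consider the
following usage sketch. It is not part of the patch itself: the token strings
and the surrounding setup are invented for illustration, while all types and
member functions are taken from the headers in this diff as documented there.

#include <core/common/CharReader.hpp>
#include <core/parser/utils/Tokenizer.hpp>
#include <core/parser/utils/TokenizedData.hpp>

using namespace ousia;

void processTokens(CharReader &reader)
{
	Tokenizer tokenizer;

	// Primary tokens are returned by read() as standalone Token instances
	const TokenId tComment = tokenizer.registerToken("%{", true);

	// Non-primary tokens are merely marked inside the TokenizedData stream
	const TokenId tEmph = tokenizer.registerToken("**", false);

	Token token;
	TokenizedData data(reader.getSourceId());
	while (tokenizer.read(reader, token, data)) {
		if (token.id == Tokens::Data) {
			// Token filtering and whitespace handling happen lazily, while
			// reading from the TokenizedData instance
			TokenizedDataReader dataReader = data.reader();
			Token t;
			while (dataReader.read(t, TokenSet{tEmph},
			                       WhitespaceMode::COLLAPSE)) {
				// ... t is either a data token or an "**" token ...
			}
		} else if (token.id == tComment) {
			// ... handle the primary "%{" token ...
		}
		data.clear();
	}
}

The design point worth noting here is that whitespace handling moves out of
the Tokenizer entirely: the WhitespaceMode is now passed when reading from
TokenizedData rather than being a property of the Tokenizer, which is why the
WhitespaceHandler-based code paths are removed throughout this commit.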
diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index a93f14a..83966d5 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -24,6 +24,7 @@ #include <core/parser/stack/Handler.hpp> #include <core/parser/stack/Stack.hpp> #include <core/parser/stack/State.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/StandaloneEnvironment.hpp> @@ -53,7 +54,7 @@ struct Tracker { Variant::mapType annotationStartArgs; Variant annotationEndClassName; Variant annotationEndElementName; - Variant dataData; + TokenizedData dataData; bool startResult; bool fieldStartSetIsDefault; @@ -81,7 +82,7 @@ struct Tracker { annotationStartArgs = Variant::mapType{}; annotationEndClassName = Variant::fromString(std::string{}); annotationEndElementName = Variant::fromString(std::string{}); - dataData = Variant::fromString(std::string{}); + dataData = TokenizedData(); startResult = true; fieldStartSetIsDefault = false; @@ -157,7 +158,7 @@ public: return tracker.annotationEndResult; } - bool data(Variant &data) override + bool data(TokenizedData &data) override { tracker.dataCount++; tracker.dataData = data; @@ -363,7 +364,7 @@ TEST(Stack, multipleFields) s.data("test"); tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test", tracker.dataData); + EXPECT_EQ("test", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc @@ -375,7 +376,7 @@ TEST(Stack, multipleFields) s.data("test2"); tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test2", tracker.dataData); + EXPECT_EQ("test2", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc @@ -387,7 +388,7 @@ TEST(Stack, multipleFields) s.data("test3"); tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test3", tracker.dataData); + EXPECT_EQ("test3", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc @@ -744,4 +745,4 @@ TEST(Stack, fieldAfterDefaultField) ASSERT_FALSE(logger.hasError()); } } -}
\ No newline at end of file +} diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index 231bad9..dfe2526 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -22,6 +22,43 @@ namespace ousia { +void assertToken(TokenizedDataReader &reader, TokenId id, + const std::string &text, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId) +{ + Token token; + ASSERT_TRUE(reader.read(token, tokens, mode)); + EXPECT_EQ(id, token.id); + EXPECT_EQ(text, token.content); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, token.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, token.getLocation().getEnd()); + } + EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +void assertText(TokenizedDataReader &reader, const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId id = InvalidSourceId) +{ + assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); +} + +void assertEnd(TokenizedDataReader &reader) +{ + Token token; + ASSERT_TRUE(reader.atEnd()); + ASSERT_FALSE(reader.read(token)); +} + TEST(TokenizedData, dataWhitespacePreserve) { TokenizedData data; @@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test1 test2 ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(16U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE, + 0, 16); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceTrim) @@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceCollapse) @@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, singleToken) @@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken) 
ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, singleDisabledToken) @@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken) ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, dualToken) @@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(5); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0, + 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenShorterEnabled) @@ -142,385 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(1U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenLongerEnabled) { TokenizedData data; ASSERT_EQ(2U, data.append("$$")); + data.mark(6, 0, 1); data.mark(5, 0, 2); + data.mark(6, 1, 1); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = 
data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataPreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, " test text ", TokenSet{5}, WhitespaceMode::PRESERVE, + 2, 16); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::TRIM, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, 
token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$      $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$    $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("      ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, "    ", TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$      $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$    $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8); + assertEnd(reader); }
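The assertToken, assertText and assertEnd helpers invoked throughout these tests are defined in an earlier hunk of TokenizedDataTest.cpp that is not part of this excerpt. Judging purely from the call sites above, their shape is roughly the following sketch; the TokenizedDataReader::read and atEnd calls are assumptions reconstructed from usage, not the committed signatures:

// Rough sketch of the test helpers, reconstructed from their call sites.
static void assertToken(TokenizedDataReader &reader, TokenId id,
                        const std::string &content, const TokenSet &tokens,
                        WhitespaceMode mode, SourceOffset start,
                        SourceOffset end)
{
	Token token;
	ASSERT_TRUE(reader.read(token, tokens, mode)); // assumed signature
	EXPECT_EQ(id, token.id);
	EXPECT_EQ(content, token.content);
	EXPECT_EQ(start, token.getLocation().getStart());
	EXPECT_EQ(end, token.getLocation().getEnd());
}

static void assertText(TokenizedDataReader &reader, const std::string &content,
                       const TokenSet &tokens, WhitespaceMode mode,
                       SourceOffset start, SourceOffset end)
{
	// Character data is reported as a pseudo-token with the id Tokens::Data
	assertToken(reader, Tokens::Data, content, tokens, mode, start, end);
}

static void assertEnd(TokenizedDataReader &reader)
{
	EXPECT_TRUE(reader.atEnd()); // assumed accessor
}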
TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$      $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$    $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8); + assertEnd(reader); } -TEST(TokenizedData, textPreserveWhitespace) +TEST(TokenizedData, appendChars) { TokenizedData data; - ASSERT_EQ(6U, data.append("  $$  ")); - // 012345 - data.mark(5, 2, 2); - - data.enableToken(5); + ASSERT_EQ(1U, data.append('t', 5, 7)); + ASSERT_EQ(2U, data.append('e', 7, 8)); + ASSERT_EQ(3U, data.append('s', 8, 10)); + ASSERT_EQ(4U, data.append('t', 10, 12)); - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("  ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("  ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12); + assertEnd(reader); } -TEST(TokenizedData, textTrimWhitespace) +TEST(TokenizedData, protectedWhitespace) { TokenizedData data; - ASSERT_EQ(6U, data.append("  $$  ")); - // 012345 - data.mark(5, 2, 2); + ASSERT_EQ(4U, data.append("test", 10)); + ASSERT_EQ(11U, data.append("   test", 14, true)); - data.enableToken(5); - - Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test   test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10, + 21); + assertEnd(reader); +} - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
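The appendChars and protectedWhitespace tests above pin down two details of the TokenizedData::append overloads: appending single characters with explicit start/end offsets returns the running buffer size, and the boolean third argument of the string overload presumably marks the appended whitespace as protected, so that it survives WhitespaceMode::COLLAPSE instead of being merged into a single space. A hedged usage sketch, assuming exactly the semantics the tests imply:

// Sketch: protected whitespace survives collapsing (assumed semantics).
TokenizedData data;
data.append("a", 0);          // plain character data at source offset 0
data.append("   b", 1, true); // the three spaces are marked as protected
// Under WhitespaceMode::COLLAPSE an unprotected run of spaces is collapsed
// to a single space; a protected run should be kept verbatim, so a reader
// over this buffer is expected to yield "a   b" rather than "a b".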
data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 + + const TokenSet tokens{Tokens::Newline}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 3, 4); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 7, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 9, 10); + assertEnd(reader); } -TEST(TokenizedData, textCollapseWhitespace) +TEST(TokenizedData, specialParagraphToken) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); + data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 - data.enableToken(5); + const TokenSet tokens{Tokens::Paragraph}; - Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Paragraph, "\n \n", tokens, + WhitespaceMode::COLLAPSE, 3, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + assertEnd(reader); +} - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); +TEST(TokenizedData, specialSectionToken) +{ + TokenizedData data; + data.append("a\nb\n \n \t \n"); + // 0 12 3456 789 01 2 + // 0 1 + + const TokenSet tokens{Tokens::Section}; - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Section, "\n \n \t \n", tokens, + WhitespaceMode::COLLAPSE, 3, 13); + assertEnd(reader); } -TEST(TokenizedData, appendChars) +TEST(TokenizedData, specialTokenPrecedence) { TokenizedData data; - ASSERT_EQ(1U, data.append('t', 5, 7)); - ASSERT_EQ(2U, data.append('e', 7, 8)); - ASSERT_EQ(3U, data.append('s', 8, 10)); - ASSERT_EQ(4U, data.append('t', 10, 12)); + data.append("a\nb\n\nc\n\n\nd"); + // 0 12 3 45 6 7 89 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 3, 5); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 6, 9); + assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10); + assertEnd(reader); +} - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(12U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - 
ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); +TEST(TokenizedData, specialTokenPrecedence2) +{ + TokenizedData data; + data.append("\nb\n\nc\n\n\n"); + // 0 12 3 45 6 7 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 0, 1); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 2, 4); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 5, 8); + assertEnd(reader); } -TEST(TokenizedData, copy) +TEST(TokenizedData, specialTokenIndent) { TokenizedData data; - ASSERT_EQ(7U, data.append(" a $ b ")); - // 0123456 - data.mark(6, 3, 1); - data.enableToken(6); + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent}; + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); + assertEnd(reader); +} - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("a", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - - TokenizedData dataCopy = data; - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" b ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(data.next(token)); - - ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("b", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(dataCopy.next(token)); +TEST(TokenizedData, specialTokenIndentOverlap) +{ + TokenizedData data; + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 
3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5}; + + data.mark(5, 4, 4); + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); + assertEnd(reader); } + } diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 3809a12..0f2bfb7 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -20,6 +20,7 @@ #include <core/common/CharReader.hpp> #include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp> namespace ousia { @@ -31,23 +32,40 @@ TEST(Tokenizer, tokenRegistration) ASSERT_EQ(0U, tokenizer.registerToken("a")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); + ASSERT_EQ("a", tokenizer.lookupToken(0U).string); ASSERT_EQ(1U, tokenizer.registerToken("b")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); + ASSERT_EQ("b", tokenizer.lookupToken(1U).string); ASSERT_EQ(2U, tokenizer.registerToken("c")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); + ASSERT_EQ("c", tokenizer.lookupToken(2U).string); ASSERT_TRUE(tokenizer.unregisterToken(1U)); ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); + ASSERT_EQ("", tokenizer.lookupToken(1U).string); ASSERT_EQ(1U, tokenizer.registerToken("d")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); + ASSERT_EQ("d", tokenizer.lookupToken(1U).string); +} + +void expectData(const std::string &expected, SourceOffset tokenStart, + SourceOffset tokenEnd, SourceOffset textStart, + SourceOffset textEnd, const Token &token, TokenizedData &data, + WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ + ASSERT_EQ(Tokens::Data, token.id); + + Variant text = data.text(mode); + ASSERT_TRUE(text.isString()); + + EXPECT_EQ(expected, text.asString()); + EXPECT_EQ(tokenStart, token.location.getStart()); + EXPECT_EQ(tokenEnd, token.location.getEnd()); + EXPECT_EQ(textStart, text.getLocation().getStart()); + EXPECT_EQ(textEnd, text.getLocation().getEnd()); } TEST(Tokenizer, textTokenPreserveWhitespace) @@ -56,36 +74,34 @@ TEST(Tokenizer, textTokenPreserveWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); + expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader 
reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -95,36 +111,34 @@ TEST(Tokenizer, textTokenTrimWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token, + data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -134,36 +148,34 @@ TEST(Tokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this is only a test text", 0, 36, 1, 33, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, 
loc.getEnd()); + expectData("this is only a test text", 0, 32, 0, 32, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -177,14 +189,12 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + expectData("test1", 0, 5, 0, 5, token, data); char c; ASSERT_TRUE(reader.peek(c)); @@ -193,7 +203,8 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -209,14 +220,10 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + expectData("test2", 6, 11, 6, 11, token, data); char c; ASSERT_FALSE(reader.peek(c)); @@ -233,21 +240,17 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -261,35 +264,26 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(5U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -303,14 +297,9 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + 
ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(11U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } @@ -320,6 +309,7 @@ TEST(Tokenizer, ambiguousTokens) { CharReader reader{"abc"}; Tokenizer tokenizer; + TokenizedData data; TokenId t1 = tokenizer.registerToken("abd"); TokenId t2 = tokenizer.registerToken("bc"); @@ -328,16 +318,17 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_TRUE(tokenizer.read(reader, token, data)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("a", token.content); + expectData("a", 0, 1, 0, 1, token, data); SourceLocation loc = token.location; ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(t2, token.id); ASSERT_EQ("bc", token.content); @@ -346,7 +337,8 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } TEST(Tokenizer, commentTestWhitespacePreserve) @@ -354,7 +346,7 @@ TEST(Tokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::PRESERVE); + Tokenizer tokenizer; const TokenId t1 = tokenizer.registerToken("/"); const TokenId t2 = tokenizer.registerToken("/*"); @@ -370,45 +362,23 @@ TEST(Tokenizer, commentTestWhitespacePreserve) Token t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); + TokenizedData data(0); + EXPECT_TRUE(tokenizer.read(reader, t, data)); EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); + if (te.id != Tokens::Data) { + EXPECT_EQ(te.content, t.content); + } else { + Variant text = data.text(WhitespaceMode::PRESERVE); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(te.content, text.asString()); + } EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(Tokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::COLLAPSE); - const TokenId t1 = tokenizer.registerToken("/"); - const TokenId t2 = tokenizer.registerToken("/*"); - const TokenId t3 = tokenizer.registerToken("*/"); - - std::vector<Token> expected = { - {Tokens::Data, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {Tokens::Data, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {Tokens::Data, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, t, data)); } } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 
d52fa5b..3d01007 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -30,11 +30,21 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); // static ConcreteLogger logger; +static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader) +{ + OsmlStreamParser::State res = reader.parse(); + if (res == OsmlStreamParser::State::DATA) { + EXPECT_FALSE(reader.getData().hasNonWhitespaceText()); + res = reader.parse(); + } + return res; +} + static void assertCommand(OsmlStreamParser &reader, const std::string &name, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -57,16 +67,19 @@ static void assertCommand(OsmlStreamParser &reader, const std::string &name, static void assertData(OsmlStreamParser &reader, const std::string &data, SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) + SourceOffset end = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); + Variant text = reader.getText(mode); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(data, text.asString()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, text.getLocation().getStart()); EXPECT_EQ(start, reader.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, text.getLocation().getEnd()); EXPECT_EQ(end, reader.getLocation().getEnd()); } } @@ -75,7 +88,7 @@ static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader)); EXPECT_EQ(defaultField, reader.inDefaultField()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); @@ -89,7 +102,7 @@ static void assertFieldEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); } @@ -103,7 +116,7 @@ static void assertAnnotationStart(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -131,7 +144,7 @@ static void assertAnnotationEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader)); ASSERT_EQ(name, 
reader.getCommandName().asString()); if (!elementName.empty()) { ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); @@ -152,7 +165,7 @@ static void assertEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); } @@ -205,26 +218,14 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) assertData(reader, "hello world", 1, 14); } -TEST(OsmlStreamParser, escapeWhitespace) -{ - const char *testString = " hello\\ \\ world "; - // 012345 67 89012345 - // 0 1 - CharReader charReader(testString); - - OsmlStreamParser reader(charReader, logger); - - assertData(reader, "hello world", 1, 15); -} - static void testEscapeSpecialCharacter(const std::string &c) { CharReader charReader(std::string("\\") + c); OsmlStreamParser reader(charReader, logger); EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getData().asString()); + EXPECT_EQ(c, reader.getText().asString()); - SourceLocation loc = reader.getData().getLocation(); + SourceLocation loc = reader.getText().getLocation(); EXPECT_EQ(0U, loc.getStart()); EXPECT_EQ(1U + c.size(), loc.getEnd()); } @@ -253,16 +254,16 @@ TEST(OsmlStreamParser, singleLineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(33U, loc.getStart()); ASSERT_EQ(34U, loc.getEnd()); } @@ -279,16 +280,16 @@ TEST(OsmlStreamParser, multilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -305,16 +306,16 @@ TEST(OsmlStreamParser, nestedMultilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + 
SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -569,8 +570,11 @@ TEST(OsmlStreamParser, multipleCommands) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertCommand(reader, "b", 3, 5); + assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE); assertCommand(reader, "c", 6, 8); + assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE); assertCommand(reader, "d", 9, 11); assertEnd(reader, 11, 11); } @@ -584,10 +588,13 @@ TEST(OsmlStreamParser, fieldsWithSpaces) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 3, 4); assertCommand(reader, "b", 4, 6); + assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE); assertCommand(reader, "c", 7, 9); assertFieldEnd(reader, 9, 10); + assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 16, 17); assertCommand(reader, "d", 17, 19); assertFieldEnd(reader, 19, 20); diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 3293370..6942166 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -21,6 +21,7 @@ #include <core/frontend/TerminalLogger.hpp> #include <core/common/CharReader.hpp> #include <core/common/Variant.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <formats/osxml/OsxmlEventParser.hpp> @@ -74,13 +75,11 @@ public: }; static std::vector<std::pair<OsxmlEvent, Variant>> parseXml( - const char *testString, - WhitespaceMode whitespaceMode = WhitespaceMode::TRIM) + const char *testString) { TestOsxmlEventListener listener; CharReader reader(testString); OsxmlEventParser parser(reader, listener, logger); - parser.setWhitespaceMode(whitespaceMode); parser.parse(); return listener.events; } @@ -157,7 +156,7 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) ASSERT_EQ(expectedEvents, events); } -TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) +TEST(OsxmlEventParser, commandWithData) { const char *testString = "<a> hello \n world </a>"; // 012345678901 234567890123 @@ -168,50 +167,12 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - auto events = parseXml(testString, WhitespaceMode::PRESERVE); + auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); // Check the location of the text ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart()); ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd()); } - -TEST(OsxmlEventParser, commandWithDataTrimWhitespace) -{ - const char *testString = "<a> hello \n world </a>"; - // 012345678901 234567890123 - // 0 1 2 - - std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::TRIM); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} - -TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) -{ - const 
char *testString = "<a> hello \n world </a>"; - // 012345678901 234567890123 - // 0 1 2 - - std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::COLLAPSE); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} } |