-rw-r--r--  CMakeLists.txt                                                             110
-rw-r--r--  src/core/common/SourceContextReader.cpp                                      5
-rw-r--r--  src/core/common/Token.cpp (renamed from src/core/parser/utils/Token.cpp)     0
-rw-r--r--  src/core/common/Token.hpp (renamed from src/core/parser/utils/Token.hpp)    45
-rw-r--r--  src/core/common/Utils.cpp                                                    6
-rw-r--r--  src/core/common/Utils.hpp                                                   53
-rw-r--r--  src/core/common/WhitespaceHandler.hpp                                      284
-rw-r--r--  src/core/parser/stack/DocumentHandler.cpp                                   24
-rw-r--r--  src/core/parser/stack/DocumentHandler.hpp                                    4
-rw-r--r--  src/core/parser/stack/Handler.cpp                                           25
-rw-r--r--  src/core/parser/stack/Handler.hpp                                           74
-rw-r--r--  src/core/parser/stack/Stack.cpp                                             55
-rw-r--r--  src/core/parser/stack/Stack.hpp                                             18
-rw-r--r--  src/core/parser/utils/SourceOffsetVector.hpp                                28
-rw-r--r--  src/core/parser/utils/TokenTrie.cpp                                         16
-rw-r--r--  src/core/parser/utils/TokenTrie.hpp                                         11
-rw-r--r--  src/core/parser/utils/TokenizedData.cpp                                    353
-rw-r--r--  src/core/parser/utils/TokenizedData.hpp                                    234
-rw-r--r--  src/core/parser/utils/Tokenizer.cpp                                        264
-rw-r--r--  src/core/parser/utils/Tokenizer.hpp                                        142
-rw-r--r--  src/formats/osml/OsmlStreamParser.cpp                                      157
-rw-r--r--  src/formats/osml/OsmlStreamParser.hpp                                       85
-rw-r--r--  src/formats/osxml/OsxmlEventParser.cpp                                      63
-rw-r--r--  src/formats/osxml/OsxmlEventParser.hpp                                      31
-rw-r--r--  test/core/parser/stack/StackTest.cpp                                        15
-rw-r--r--  test/core/parser/utils/TokenizedDataTest.cpp                               602
-rw-r--r--  test/core/parser/utils/TokenizerTest.cpp                                   248
-rw-r--r--  test/formats/osml/OsmlStreamParserTest.cpp                                  79
-rw-r--r--  test/formats/osxml/OsxmlEventParserTest.cpp                                 47
29 files changed, 1501 insertions, 1577 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea5c3aa..225e63d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,6 +158,7 @@ ADD_LIBRARY(ousia_core
src/core/common/Rtti
src/core/common/RttiBuilder
src/core/common/SourceContextReader
+ src/core/common/Token
src/core/common/Utils
src/core/common/Variant
src/core/common/VariantConverter
@@ -180,16 +181,15 @@ ADD_LIBRARY(ousia_core
src/core/parser/ParserContext
src/core/parser/ParserScope
src/core/parser/stack/Callbacks
- src/core/parser/stack/DocumentHandler
- src/core/parser/stack/DomainHandler
- src/core/parser/stack/GenericParserStates
- src/core/parser/stack/Handler
- src/core/parser/stack/ImportIncludeHandler
+# src/core/parser/stack/DocumentHandler
+# src/core/parser/stack/DomainHandler
+# src/core/parser/stack/GenericParserStates
+# src/core/parser/stack/Handler
+# src/core/parser/stack/ImportIncludeHandler
src/core/parser/stack/State
- src/core/parser/stack/Stack
- src/core/parser/stack/TypesystemHandler
+# src/core/parser/stack/Stack
+# src/core/parser/stack/TypesystemHandler
src/core/parser/utils/SourceOffsetVector
- src/core/parser/utils/Token
src/core/parser/utils/TokenizedData
src/core/parser/utils/Tokenizer
src/core/parser/utils/TokenTrie
@@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core
# ousia_core
#)
-ADD_LIBRARY(ousia_osml
- src/formats/osml/OsmlParser
- src/formats/osml/OsmlStreamParser
-)
+#ADD_LIBRARY(ousia_osml
+# src/formats/osml/OsmlParser
+# src/formats/osml/OsmlStreamParser
+#)
-TARGET_LINK_LIBRARIES(ousia_osml
- ousia_core
-)
+#TARGET_LINK_LIBRARIES(ousia_osml
+# ousia_core
+#)
ADD_LIBRARY(ousia_osxml
src/formats/osxml/OsxmlAttributeLocator
src/formats/osxml/OsxmlEventParser
- src/formats/osxml/OsxmlParser
+# src/formats/osxml/OsxmlParser
)
TARGET_LINK_LIBRARIES(ousia_osxml
@@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml
# Command line interface
-ADD_EXECUTABLE(ousia
- src/cli/Main
-)
+#ADD_EXECUTABLE(ousia
+# src/cli/Main
+#)
-TARGET_LINK_LIBRARIES(ousia
- ousia_core
- ousia_filesystem
- ousia_html
- ousia_xml
- ousia_osml
- ousia_osxml
- ${Boost_LIBRARIES}
-)
+#TARGET_LINK_LIBRARIES(ousia
+# ousia_core
+# ousia_filesystem
+# ousia_html
+# ousia_xml
+# ousia_osml
+# ousia_osxml
+# ${Boost_LIBRARIES}
+#)
# If testing is enabled, build the unit tests
IF(TEST)
@@ -323,11 +323,11 @@ IF(TEST)
test/core/model/StyleTest
test/core/model/TypesystemTest
test/core/parser/ParserScopeTest
- test/core/parser/stack/StackTest
+# test/core/parser/stack/StackTest
test/core/parser/stack/StateTest
test/core/parser/utils/SourceOffsetVectorTest
test/core/parser/utils/TokenizedDataTest
- test/core/parser/utils/TokenizerTest
+# test/core/parser/utils/TokenizerTest
test/core/parser/utils/TokenTrieTest
test/core/resource/ResourceLocatorTest
test/core/resource/ResourceRequestTest
@@ -383,29 +383,29 @@ IF(TEST)
# ousia_mozjs
# )
- ADD_EXECUTABLE(ousia_test_osml
- test/formats/osml/OsmlParserTest
- test/formats/osml/OsmlStreamParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osml
+# test/formats/osml/OsmlParserTest
+# test/formats/osml/OsmlStreamParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osml
+# ousia_filesystem
+# )
- ADD_EXECUTABLE(ousia_test_osxml
- test/formats/osxml/OsxmlEventParserTest
- test/formats/osxml/OsxmlParserTest
- )
+# ADD_EXECUTABLE(ousia_test_osxml
+# test/formats/osxml/OsxmlEventParserTest
+# test/formats/osxml/OsxmlParserTest
+# )
- TARGET_LINK_LIBRARIES(ousia_test_osxml
- ${GTEST_LIBRARIES}
- ousia_core
- ousia_osxml
- ousia_filesystem
- )
+# TARGET_LINK_LIBRARIES(ousia_test_osxml
+# ${GTEST_LIBRARIES}
+# ousia_core
+# ousia_osxml
+# ousia_filesystem
+# )
ADD_EXECUTABLE(ousia_test_xml
test/plugins/xml/XmlOutputTest
@@ -423,8 +423,8 @@ IF(TEST)
ADD_TEST(ousia_test_filesystem ousia_test_filesystem)
ADD_TEST(ousia_test_html ousia_test_html)
# ADD_TEST(ousia_test_mozjs ousia_test_mozjs)
- ADD_TEST(ousia_test_osml ousia_test_osml)
- ADD_TEST(ousia_test_osxml ousia_test_osxml)
+# ADD_TEST(ousia_test_osml ousia_test_osml)
+# ADD_TEST(ousia_test_osxml ousia_test_osxml)
ADD_TEST(ousia_test_xml ousia_test_xml)
ENDIF()
@@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION share/ousia
OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE
)
-INSTALL(TARGETS ousia
- RUNTIME DESTINATION bin
-)
+#INSTALL(TARGETS ousia
+# RUNTIME DESTINATION bin
+#)
IF(INSTALL_GEDIT_HIGHLIGHTER)
INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
index d5d379c..f7dbdf3 100644
--- a/src/core/common/SourceContextReader.cpp
+++ b/src/core/common/SourceContextReader.cpp
@@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader,
ctx.relLen = end - start; // end >= start (I2)
// Remove linebreaks at the beginning and the end
- const std::pair<size_t, size_t> b =
- Utils::trim(lineBuf, Utils::isLinebreak);
+ const std::pair<size_t, size_t> b = Utils::trim(
+ lineBuf,
+ [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
ssize_t s = b.first, e = b.second;
s = std::min(s, static_cast<ssize_t>(ctx.relPos));
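
A minimal sketch of the index-based predicate convention introduced here
(the buffer content is illustrative): the predicate now receives an index
into the container rather than the character itself, so callers can consult
parallel per-character data.

    std::string lineBuf = "\n  some context line\n";
    std::pair<size_t, size_t> b = Utils::trim(
        lineBuf, [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
    // b.first and b.second delimit the line without the surrounding linebreaks
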
diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp
index 8bcdbb5..8bcdbb5 100644
--- a/src/core/parser/utils/Token.cpp
+++ b/src/core/common/Token.cpp
diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp
index f907450..0cf56b0 100644
--- a/src/core/parser/utils/Token.hpp
+++ b/src/core/common/Token.hpp
@@ -30,6 +30,7 @@
#include <cstdint>
#include <limits>
#include <string>
+#include <unordered_set>
#include <core/common/Location.hpp>
@@ -46,6 +47,11 @@ using TokenId = uint32_t;
using TokenLength = uint16_t;
/**
+ * Type used for storing token sets.
+ */
+using TokenSet = std::unordered_set<TokenId>;
+
+/**
* Namespace containing constants for TokenId instances with special meaning.
*/
namespace Tokens {
@@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;
/**
* Token which represents a paragraph token -- issued if two consecutive
- * newlines occur with optionally any amout of whitespace between them.
+ * newlines occur, optionally with any amount of whitespace between them. The
+ * paragraph token is not repeated until more text is reached.
*/
constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;
/**
+ * Token which represents a section token -- issued if three or more
+ * consecutive newlines occur, optionally with any amount of whitespace between
+ * them. The section token is not repeated until more text is reached.
+ */
+constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
+
+/**
* Token which represents an indentation token -- issued if the indentation of
- * this line is larget than the indentation of the previous line.
+ * this line is larger than the indentation of the previous line.
*/
-constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4;
+constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
+
+/**
+ * Token which represents a dedentation token -- issued if the indentation of
+ * this line is smaller than the indentation of the previous line.
+ */
+constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
/**
* Maximum token id to be used. Tokens allocated for users should not surpass
@@ -109,6 +129,17 @@ struct Token {
Token() : id(Tokens::Empty) {}
/**
+ * Constructor of a "data" token with no explicit content.
+ *
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ Token(SourceLocation location)
+ : id(Tokens::Data), location(location)
+ {
+ }
+
+ /**
* Constructor of the Token struct.
*
* @param id represents the token id.
@@ -129,6 +160,14 @@ struct Token {
Token(TokenId id) : id(id) {}
/**
+ * Returns true if this token is special.
+ *
+ * @return true if the TokenId indicates that this token is a "special"
+ * token.
+ */
+ bool isSpecial() const { return id > Tokens::MaxTokenId; }
+
+ /**
* The getLocation function allows the tokens to be directly passed as
* parameter to Logger or LoggableException instances.
*
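
A short sketch of how the special token ids above are meant to be consumed
(the concrete token is illustrative): all whitespace-derived tokens lie above
Tokens::MaxTokenId, which is exactly what Token::isSpecial() tests.

    Token token(Tokens::Indent);
    if (token.isSpecial()) {
        // structural token (Newline, Paragraph, Section, Indent, Dedent, ...)
    } else if (token.id == Tokens::Data) {
        // plain character data
    }
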
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index a77951e..85d2c28 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename)
return std::string{};
}
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::startsWith(const std::string &s, const std::string &prefix)
{
return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 7d96562..82a8f8c 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -124,14 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
* Trims the given string or vector of chars by returning the start and end
* index.
*
@@ -153,8 +145,8 @@ public:
*
* @param s is the container that should be trimmed.
* @param len is the number of elements in the container.
- * @param f is a function that returns true for values that should be
- * removed.
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
*/
@@ -163,7 +155,7 @@ public:
{
size_t start = 0;
for (size_t i = 0; i < len; i++) {
- if (!f(s[i])) {
+ if (!f(i)) {
start = i;
break;
}
@@ -171,7 +163,7 @@ public:
size_t end = 0;
for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
+ if (!f(i)) {
end = i + 1;
break;
}
@@ -198,17 +190,33 @@ public:
* the collapsed version of the string ends.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
*/
- template <class T>
- static std::string trim(const T &s, size_t len, size_t &start, size_t &end)
+ template <class T, class Filter>
+ static std::string trim(const T &s, size_t len, size_t &start, size_t &end,
+ Filter f)
{
- auto res = trim(s, len, isWhitespace);
+ auto res = trim(s, len, f);
start = res.first;
end = res.second;
return std::string(&s[start], end - start);
}
/**
+ * Removes whitespace at the beginning and the end of the given string.
+ *
+ * @param s is the string that should be trimmed.
+ * @return a trimmed copy of s.
+ */
+ static std::string trim(const std::string &s)
+ {
+ std::pair<size_t, size_t> bounds =
+ trim(s, [&s](size_t i) { return isWhitespace(s[i]); });
+ return s.substr(bounds.first, bounds.second - bounds.first);
+ }
+
+ /**
* Collapses the whitespaces in the given string (trims the string and
* replaces all whitespace characters by a single one).
*
@@ -219,7 +227,8 @@ public:
{
size_t start;
size_t end;
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -236,7 +245,8 @@ public:
static std::string collapse(const std::string &s, size_t &start,
size_t &end)
{
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -244,6 +254,8 @@ public:
* replaces all whitespace characters by a single one).
*
* @tparam T is the string type that should be used.
+ * @tparam Filter is a filter function used for detecting the character
+ * indices that might be removed.
* @param s is the string in which the whitespace should be collapsed.
* @param len is the length of the input string
* @param start is an output parameter which is set to the offset at which
@@ -252,9 +264,9 @@ public:
* the collapsed version of the string ends.
* @return a copy of s with collapsed whitespace.
*/
- template <class T>
+ template <class T, class Filter>
static std::string collapse(const T &s, size_t len, size_t &start,
- size_t &end)
+ size_t &end, Filter f)
{
// Result vector
std::vector<char> res;
@@ -268,8 +280,7 @@ public:
bool hadWhitespace = false;
for (size_t i = 0; i < len; i++) {
const char c = s[i];
- const bool whitespace = isWhitespace(c);
- if (whitespace) {
+ if (f(i)) {
hadWhitespace = !res.empty();
} else {
// Adapt the start and end position
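
A usage sketch of the generalized collapse() signature (the input string is
illustrative):

    std::string s = "  hello   world  ";
    size_t start, end;
    std::string res = Utils::collapse(
        s, s.size(), start, end,
        [&s](size_t i) { return Utils::isWhitespace(s[i]); });
    // res == "hello world"; start/end delimit the collapsed range within s
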
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
deleted file mode 100644
index ed52ea3..0000000
--- a/src/core/common/WhitespaceHandler.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file WhitespaceHandler.hpp
- *
- * Contains the WhitespaceHandler classes which are used in multiple places to
- * trim, compact or preserve whitespaces while at the same time maintaining the
- * position information associated with the input strings.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
-#define _OUSIA_WHITESPACE_HANDLER_HPP_
-
-#include <string>
-#include <vector>
-
-#include "Utils.hpp"
-
-namespace ousia {
-
-/**
- * WhitespaceHandler is a based class that can be used to collect text on a
- * character-by-character basis. Note that this class and its descendants are
- * hoped to be inlined by the compiler (and used in conjunction with templates),
- * thus they are fully defined inside this header.
- */
-class WhitespaceHandler {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- WhitespaceHandler() : textStart(0), textEnd(0) {}
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-
- /**
- * Returns the content of the WhitespaceHandler as string.
- */
- std::string toString() const
- {
- return std::string(textBuf.data(), textBuf.size());
- }
-};
-
-/**
- * The PreservingWhitespaceHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd);
- }
-
- /**
- * Static version of PreservingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
- }
-
- /**
- * Static version of TrimmingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param whitespaceBuf is a reference at the buffer for storing whitespace
- * characters.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, std::vector<char> &whitespaceBuf)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
- }
-
- /**
- * Static version of CollapsingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param hasWhitespace is a reference at the "hasWhitespace" flag.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, bool &hasWhitespace)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * Function that can be used to append the given buffer (e.g. a string or a
- * vector) to the whitespace handler.
- *
- * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
- * @tparam Buffer is an iterable type.
- * @param handler is the handler to which the characters of the Buffer should be
- * appended.
- * @param buf is the buffer from which the characters should be read.
- * @param start is the start byte offset. Each character is counted as one byte.
- */
-template <typename WhitespaceHandler, typename Buffer>
-inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
- size_t start)
-{
- for (auto elem : buf) {
- handler.append(elem, start, start + 1);
- start++;
- }
-}
-}
-
-#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
-
diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp
index bb04bd3..d44176a 100644
--- a/src/core/parser/stack/DocumentHandler.cpp
+++ b/src/core/parser/stack/DocumentHandler.cpp
@@ -25,6 +25,7 @@
#include <core/model/Domain.hpp>
#include <core/model/Project.hpp>
#include <core/model/Typesystem.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
@@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field,
return valid && scope().resolveValue(data, type, logger);
}
-bool DocumentChildHandler::data(Variant &data)
+bool DocumentChildHandler::data(TokenizedData &data)
{
+ // TODO: Handle this correctly
+ Variant text = data.text(WhitespaceMode::TRIM);
+ if (text == nullptr) {
+ // For now, accept "no data" as success
+ return true;
+ }
+
// We're past the region in which explicit fields can be defined in the
// parent structure element
scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true);
@@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data)
// If it is a primitive field directly, try to parse the content.
if (field->isPrimitive()) {
// Add it as primitive content.
- if (!convertData(field, data, logger())) {
+ if (!convertData(field, text, logger())) {
return false;
}
- parent->createChildDocumentPrimitive(data, fieldIdx);
+ parent->createChildDocumentPrimitive(text, fieldIdx);
return true;
}
@@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data)
for (auto primitiveField : defaultFields) {
// Then try to parse the content using the type specification.
forks.emplace_back(logger().fork());
- if (!convertData(primitiveField, data, forks.back())) {
+ if (!convertData(primitiveField, text, forks.back())) {
continue;
}
@@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data)
createPath(fieldIdx, path, parent);
// Then create the primitive element
- parent->createChildDocumentPrimitive(data);
+ parent->createChildDocumentPrimitive(text);
return true;
}
@@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data)
if (defaultFields.empty()) {
logger().error("Got data, but structure \"" + name() +
"\" does not have any primitive field",
- data);
+ text);
} else {
logger().error("Could not read data with any of the possible fields:",
- data);
+ text);
size_t f = 0;
for (auto field : defaultFields) {
logger().note(std::string("Field ") +
@@ -471,4 +479,4 @@ namespace RttiTypes {
const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>(
"DocumentField").parent(&Node);
}
-} \ No newline at end of file
+}
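
The pattern used in DocumentChildHandler::data above is the general contract
of the new handler interface; a sketch with a hypothetical handler class:

    bool MyHandler::data(TokenizedData &data)
    {
        // Extract a plain string variant; nullptr means the segment
        // contained nothing but (unprotected) whitespace
        Variant text = data.text(WhitespaceMode::TRIM);
        if (text == nullptr) {
            return true;
        }
        // ... work with the string variant "text" ...
        return true;
    }
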
diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp
index 862081c..dda7d8b 100644
--- a/src/core/parser/stack/DocumentHandler.hpp
+++ b/src/core/parser/stack/DocumentHandler.hpp
@@ -167,7 +167,7 @@ public:
bool start(Variant::mapType &args) override;
void end() override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
bool fieldStart(bool &isDefault, size_t fieldIdx) override;
@@ -213,4 +213,4 @@ extern const Rtti DocumentField;
}
}
-#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file
+#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp
index bf5d4ea..3d413e8 100644
--- a/src/core/parser/stack/Handler.cpp
+++ b/src/core/parser/stack/Handler.cpp
@@ -18,6 +18,7 @@
#include <core/common/Exceptions.hpp>
#include <core/common/Logger.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserContext.hpp>
#include "Callbacks.hpp"
@@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className,
return true;
}
-bool EmptyHandler::data(Variant &data)
+bool EmptyHandler::data(TokenizedData &data)
{
// Support any data
return true;
@@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className,
return false;
}
-bool StaticHandler::data(Variant &data)
+bool StaticHandler::data(TokenizedData &data)
{
- logger().error("Did not expect any data here", data);
- return false;
+ if (data.text(WhitespaceMode::TRIM) != nullptr) {
+ logger().error("Did not expect any data here", data);
+ return false;
+ }
+ return true;
}
/* Class StaticFieldHandler */
@@ -227,12 +231,19 @@ void StaticFieldHandler::end()
}
}
-bool StaticFieldHandler::data(Variant &data)
+bool StaticFieldHandler::data(TokenizedData &data)
{
+ Variant text = data.text(WhitespaceMode::TRIM);
+ if (text == nullptr) {
+ // Providing no data here is ok as long as the "doHandle" callback
+ // function has already been called
+ return handled;
+ }
+
// Call the doHandle function if this has not been done before
if (!handled) {
handled = true;
- doHandle(data, args);
+ doHandle(text, args);
return true;
}
@@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data)
logger().error(
std::string("Found data, but the corresponding argument \"") + argName +
std::string("\" was already specified"),
- data);
+ text);
// Print the location at which the attribute was originally specified
auto it = args.find(argName);
diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp
index 7cda7a4..929466d 100644
--- a/src/core/parser/stack/Handler.hpp
+++ b/src/core/parser/stack/Handler.hpp
@@ -31,6 +31,7 @@ namespace ousia {
class ParserScope;
class ParserContext;
class Logger;
+class TokenizedData;
namespace parser_stack {
@@ -158,40 +159,63 @@ protected:
*/
const std::string &name() const;
-public:
- /**
- * Virtual destructor.
- */
- virtual ~Handler();
-
/**
* Calls the corresponding function in the Callbacks instance. Sets the
* whitespace mode that specifies how string data should be processed. The
* calls to this function are placed on a stack by the underlying Stack
- * class.
+ * class. This function should be called from the "fieldStart" callback and
+ * the "start" callback. If no whitespace mode is pushed in the "start"
+ * method, the whitespace mode "TRIM" is implicitly assumed.
*
* @param whitespaceMode specifies one of the three WhitespaceMode constants
* PRESERVE, TRIM or COLLAPSE.
*/
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
+ void pushWhitespaceMode(WhitespaceMode whitespaceMode);
/**
- * Calls the corresponding function in the Callbacks instance.
- * Registers the given token as token that should be reported to the handler
- * using the "token" function.
- *
- * @param token is the token string that should be reported.
+ * Pops a previously pushed whitespace mode. Calls to this function should
+ * occur in the "end" callback and the "fieldEnd" callback. This function
+ * can only undo pushes that were performed by the pushWhitespaceMode()
+ * method of the same handler.
*/
- void registerToken(const std::string &token);
+ void popWhitespaceMode();
/**
- * Calls the corresponding function in the Callbacks instance.
- * Unregisters the given token, it will no longer be reported to the handler
- * using the "token" function.
+ * Calls the corresponding function in the Callbacks instance. Pushes a list
+ * of tokens that should be reported to this handler via the "token" method.
+ * The calls to this function are placed on a stack by the underlying Stack
+ * class. This function should be called from the "fieldStart" callback and
+ * the "start" callback.
*
- * @param token is the token string that should be unregistered.
+ * @param tokens is a list of tokens that should be reported to this handler
+ * instance via the "token" method.
*/
- void unregisterToken(const std::string &token);
+ void pushTokens(const std::vector<std::string> &tokens);
+
+ /**
+ * Pops a previously pushed token list. Calls to this function should
+ * occur in the "end" callback and the "fieldEnd" callback. This function
+ * can only undo pushes that were performed by the pushTokens()
+ * method of the same handler.
+ */
+ void popTokens();
+
+ /**
+ * Calls the corresponding function in the Callbacks instance. This method
+ * registers the given tokens as tokens that are generally available; these
+ * tokens must be explicitly enabled using the "pushTokens" and "popTokens"
+ * methods. Tokens that have not been registered are not guaranteed to be
+ * reported, even if they are enabled.
+ */
+ void registerTokens(const std::vector<std::string> &tokens);
+
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~Handler();
/**
* Returns the command name for which the handler was created.
@@ -299,11 +323,11 @@ public:
* Handler instance. Should return true if the data could be handled, false
* otherwise.
*
- * @param data is a string variant containing the character data and its
- * location.
+ * @param data is an instance of TokenizedData containing the segmented
+ * character data and its location.
* @return true if the data could be handled, false otherwise.
*/
- virtual bool data(Variant &data) = 0;
+ virtual bool data(TokenizedData &data) = 0;
};
/**
@@ -333,7 +357,7 @@ public:
Variant::mapType &args) override;
bool annotationEnd(const Variant &className,
const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
/**
* Creates an instance of the EmptyHandler class.
@@ -359,7 +383,7 @@ public:
Variant::mapType &args) override;
bool annotationEnd(const Variant &className,
const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
};
/**
@@ -412,7 +436,7 @@ protected:
public:
bool start(Variant::mapType &args) override;
void end() override;
- bool data(Variant &data) override;
+ bool data(TokenizedData &data) override;
};
}
}
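
A sketch of the intended pairing of the new push/pop calls, using a
hypothetical handler (the token strings are illustrative):

    bool MyHandler::fieldStart(bool &isDefault, size_t fieldIdx)
    {
        // Pushed in "start"/"fieldStart" ...
        pushWhitespaceMode(WhitespaceMode::PRESERVE);
        pushTokens({"*", "`"});
        return true;
    }

    void MyHandler::fieldEnd()
    {
        // ... and popped symmetrically in "fieldEnd"/"end"
        popTokens();
        popWhitespaceMode();
    }
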
diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp
index 5b67248..309c9a0 100644
--- a/src/core/parser/stack/Stack.cpp
+++ b/src/core/parser/stack/Stack.cpp
@@ -19,6 +19,7 @@
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
#include <core/common/Exceptions.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
@@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
}
}
-void Stack::data(const Variant &data)
+void Stack::data(TokenizedData data)
{
- // End handlers that already had a default field and are currently not
- // active.
- endOverdueHandlers();
+ // TODO: Rewrite this function for token handling
+ // TODO: This loop needs to be refactored out
+ while (!data.atEnd()) {
+ // End handlers that already had a default field and are currently not
+ // active.
+ endOverdueHandlers();
- while (true) {
- // Check whether there is any command the data can be sent to
+ const bool hasNonWhitespaceText = data.hasNonWhitespaceText();
+
+ // Check whether there is any command the data can be sent to -- if not,
+ // make sure the data actually is data
if (stack.empty()) {
- throw LoggableException("No command here to receive data.", data);
+ if (hasNonWhitespaceText) {
+ throw LoggableException("No command here to receive data.", data);
+ }
+ return;
}
// Fetch the current command handler information
@@ -440,7 +449,10 @@ void Stack::data(const Variant &data)
// If the "hadDefaultField" flag is set, we already issued an error
// message
if (!info.hadDefaultField) {
- logger().error("Did not expect any data here", data);
+ if (hasNonWhitespaceText) {
+ logger().error("Did not expect any data here", data);
+ }
+ return;
}
}
@@ -454,8 +466,16 @@ void Stack::data(const Variant &data)
// Pass the data to the current Handler instance
bool valid = false;
try {
- Variant dataCopy = data;
- valid = info.handler->data(dataCopy);
+ // Create a fork of the TokenizedData and let the handler work
+ // on it
+ TokenizedData dataFork = data;
+ valid = info.handler->data(dataFork);
+
+ // If the data was validly handled by the handler, commit the
+ // change
+ if (valid) {
+ data = dataFork;
+ }
}
catch (LoggableException ex) {
loggerFork.log(ex);
@@ -482,6 +502,19 @@ void Stack::data(const Variant &data)
}
}
+void Stack::data(const Variant &stringData)
+{
+ // Fetch the SourceLocation of the given stringData variant
+ SourceLocation loc = stringData.getLocation();
+
+ // Create a TokenizedData instance and feed the given string data into it
+ TokenizedData tokenizedData(loc.getSourceId());
+ tokenizedData.append(stringData.asString(), loc.getStart());
+
+ // Call the actual "data" method
+ data(tokenizedData);
+}
+
void Stack::fieldStart(bool isDefault)
{
// Make sure the current handler stack is not empty
@@ -584,4 +617,4 @@ void Stack::token(Variant token)
// TODO
}
}
-} \ No newline at end of file
+}
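
For untokenized text the two data() entry points are equivalent; a brief
sketch (sourceId, stack and textVariant are assumed to exist in the caller):

    TokenizedData data(sourceId);
    data.append("some text", 0);
    stack.data(data);              // explicit TokenizedData overload
    // or: stack.data(textVariant) -- string variant, wrapped internally
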
diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp
index b67ce82..cd29b28 100644
--- a/src/core/parser/stack/Stack.hpp
+++ b/src/core/parser/stack/Stack.hpp
@@ -44,6 +44,7 @@ namespace ousia {
// Forward declarations
class ParserContext;
class Logger;
+class TokenizedData;
namespace parser_stack {
@@ -292,13 +293,24 @@ public:
void command(const Variant &name, const Variant::mapType &args);
/**
- * Function that shuold be called whenever character data is found in the
+ * Function that should be called whenever character data is found in the
 * input stream. May only be called if there currently is a command on the
* stack.
*
- * @param data is a string variant containing the data that has been found.
+ * @param data is a TokenizedData instance containing the pre-segmented data
+ * that should be read.
+ */
+ void data(TokenizedData data);
+
+ /**
+ * Function that should be called whenever character data is found in the
+ * input stream. The given string variant is converted into a TokenizedData
+ * instance internally.
+ *
+ * @param stringData is a string variant containing the data that has been
+ * found.
*/
- void data(const Variant &data);
+ void data(const Variant &stringData);
/**
* Function that should be called whenever a new field starts. Fields of the
diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index d15055a..aaebe7d 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -127,7 +127,7 @@ public:
* read.
* @return a pair containing start and end source offset.
*/
- std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx)
+ std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const
{
// Special treatment for the last character
const size_t count = lens.size();
@@ -157,7 +157,31 @@ public:
/**
* Returns the number of characters for which offsets are stored.
*/
- size_t size() { return lens.size(); }
+ size_t size() const { return lens.size(); }
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length.
+ * Removes all token matches that lie within the trimmed region.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length) {
+ if (length < size()) {
+ lens.resize(length);
+ offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
+ }
+ }
+
+ /**
+ * Resets the SourceOffsetVector to the state it had when it was
+ * constructed.
+ */
+ void clear() {
+ lens.clear();
+ offsets.clear();
+ lastEnd = 0;
+ }
};
}
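
A note on the trim() arithmetic above, assuming LOG2_OFFSET_INTERVAL is the
binary logarithm of the interpolation interval (e.g. 6 for one anchor offset
per 64 characters):

    // trim(100) keeps 100 length entries in "lens" and
    // (100 >> 6) + 1 == 2 anchor offsets in "offsets"
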
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 80cc945..a45d3ff 100644
--- a/src/core/parser/utils/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -22,12 +22,12 @@ namespace ousia {
/* Class DynamicTokenTree::Node */
-TokenTrie::Node::Node() : type(Tokens::Empty) {}
+TokenTrie::Node::Node() : id(Tokens::Empty) {}
/* Class DynamicTokenTree */
bool TokenTrie::registerToken(const std::string &token,
- TokenId type) noexcept
+ TokenId id) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token,
}
// If the resulting node already has a type set, we're screwed.
- if (node->type != Tokens::Empty) {
+ if (node->id != Tokens::Empty) {
return false;
}
// Otherwise just set the type to the given type.
- node->type = type;
+ node->id = id;
return true;
}
@@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
// Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->type != Tokens::Empty || node->children.size() > 1) &&
+ if ((node->id != Tokens::Empty || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
@@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
}
// If the node type is already Tokens::Empty, we cannot do anything here
- if (node->type == Tokens::Empty) {
+ if (node->id == Tokens::Empty) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
// type to Tokens::Empty instead
if (!node->children.empty()) {
- node->type = Tokens::Empty;
+ node->id = Tokens::Empty;
return true;
}
@@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept
}
node = it->second.get();
}
- return node->type;
+ return node->id;
}
}
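
A small sketch of the trie API after the "type" to "id" rename (the ids are
illustrative):

    TokenTrie trie;
    trie.registerToken("<<", 1);  // returns true
    trie.registerToken("<", 2);   // prefixes are allowed, returns true
    trie.hasToken("<<");          // == 1
    trie.hasToken("<=");          // == Tokens::Empty (not registered)
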
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index b2d1539..c470acc 100644
--- a/src/core/parser/utils/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -33,7 +33,7 @@
#include <limits>
#include <unordered_map>
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
@@ -75,10 +75,9 @@ public:
ChildMap children;
/**
- * Reference at the corresponding token descriptor. Set to nullptr if
- * no token is attached to this node.
+ * Id of the token represented by this node.
*/
- TokenId type;
+ TokenId id;
/**
 * Default constructor, initializes the id with Tokens::Empty.
@@ -99,10 +98,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param type is the descriptor that should be set for this token.
+ * @param id is the TokenId that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token, TokenId type) noexcept;
+ bool registerToken(const std::string &token, TokenId id) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index fc7bfaf..aeefa26 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -48,6 +48,17 @@ struct TokenMark {
TokenLength len;
/**
+ * Specifies whether the token is special or not.
+ */
+ bool special;
+
+ /**
+ * Maximum token length.
+ */
+ static constexpr TokenLength MaxTokenLength =
+ std::numeric_limits<TokenLength>::max();
+
+ /**
* Constructor of the TokenMark structure, initializes all members with the
* given values.
*
@@ -55,9 +66,10 @@ struct TokenMark {
* @param bufStart is the start position of the TokenMark in the internal
* character buffer.
* @param len is the length of the token.
+ * @param special modifies the sort order, special tokens are preferred.
*/
- TokenMark(TokenId id, size_t bufStart, TokenLength len)
- : bufStart(bufStart), id(id), len(len)
+ TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+ : bufStart(bufStart), id(id), len(len), special(special)
{
}
@@ -72,7 +84,8 @@ struct TokenMark {
TokenMark(size_t bufStart)
: bufStart(bufStart),
id(Tokens::Empty),
- len(std::numeric_limits<TokenLength>::max())
+ len(MaxTokenLength),
+ special(true)
{
}
@@ -86,8 +99,22 @@ struct TokenMark {
*/
friend bool operator<(const TokenMark &m1, const TokenMark &m2)
{
- return (m1.bufStart < m2.bufStart) ||
- (m1.bufStart == m2.bufStart && m1.len > m2.len);
+ // Prefer the mark with the smaller bufStart
+ if (m1.bufStart < m2.bufStart) {
+ return true;
+ }
+
+ // Special handling for marks with the same bufStart
+ if (m1.bufStart == m2.bufStart) {
+ // If exactly one of the two marks is special, return true if this
+ // one is special
+ if (m1.special != m2.special) {
+ return m1.special;
+ }
+ // Otherwise prefer longer marks
+ return m1.len > m2.len;
+ }
+ return false;
}
};
}
@@ -110,9 +137,9 @@ private:
std::vector<char> buf;
/**
- * Vector containing all token marks.
+ * Buffer storing the "protected" flag of the character data.
*/
- std::vector<TokenMark> marks;
+ std::vector<bool> protectedChars;
/**
* Vector storing all the character offsets efficiently.
@@ -120,9 +147,34 @@ private:
SourceOffsetVector offsets;
/**
+ * Vector containing all token marks.
+ */
+ mutable std::vector<TokenMark> marks;
+
+ /**
+ * Position of the first linebreak in a sequence of linebreaks.
+ */
+ size_t firstLinebreak;
+
+ /**
+ * Current indentation level.
+ */
+ uint16_t currentIndentation;
+
+ /**
+ * Last indentation level.
+ */
+ uint16_t lastIndentation;
+
+ /**
+ * Number of linebreaks without any content between them.
+ */
+ uint16_t numLinebreaks;
+
+ /**
* Flag indicating whether the internal "marks" vector is sorted.
*/
- bool sorted;
+ mutable bool sorted;
public:
/**
@@ -132,7 +184,7 @@ public:
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
- TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+ TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }
/**
* Appends a complete string to the internal character buffer and extends
@@ -140,22 +192,22 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart)
- { // Append the data to the internal buffer
- buf.insert(buf.end(), data.begin(), data.end());
-
- // Extend the text regions, interpolate the source position (this may
- // yield incorrect results)
- const size_t size = buf.size();
- for (SourceOffset offs = offsStart; offs < offsStart + data.size();
- offs++) {
- offsets.storeOffset(offs, offs + 1);
+ size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+ {
+ for (size_t i = 0; i < data.size(); i++) {
+ if (offsStart != InvalidSourceOffset) {
+ append(data[i], offsStart + i, offsStart + i + 1, protect);
+ } else {
+ append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+ protect);
+ }
}
-
- return size;
+ return size();
}
/**
@@ -165,16 +217,86 @@ public:
* @param c is the character that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param offsEnd is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect)
{
// Add the character to the list and store the location of the character
// in the source file
buf.push_back(c);
+ protectedChars.push_back(protect);
offsets.storeOffset(offsStart, offsEnd);
- return buf.size();
+
+ // Insert special tokens
+ const size_t size = buf.size();
+ const bool isWhitespace = Utils::isWhitespace(c);
+ const bool isLinebreak = Utils::isLinebreak(c);
+
+ // Handle linebreaks
+ if (isLinebreak) {
+ // Mark linebreaks as linebreak
+ mark(Tokens::Newline, size - 1, 1, false);
+
+ // The linebreak sequence started at the previous character
+ if (numLinebreaks == 0) {
+ firstLinebreak = size - 1;
+ }
+
+ // Reset the indentation
+ currentIndentation = 0;
+
+ // Increment the number of linebreaks
+ numLinebreaks++;
+
+ const size_t markStart = firstLinebreak;
+ const size_t markLength = size - firstLinebreak;
+
+ // Issue two consecutive linebreaks as paragraph token
+ if (numLinebreaks == 2) {
+ mark(Tokens::Paragraph, markStart, markLength, false);
+ }
+
+ // Issue three or more consecutive linebreaks as a section token
+ if (numLinebreaks >= 3) {
+ mark(Tokens::Section, markStart, markLength, false);
+ }
+ } else if (isWhitespace) {
+ // Count the whitespace characters at the beginning of the line
+ if (numLinebreaks > 0) {
+ // Implement the UNIX/Python rule for tabs: tabs extend to the
+ // next multiple of eight.
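+ // E.g. an indentation of 3 becomes (3 + 8) & ~7 = 8, and an
+ // indentation of 8 becomes (8 + 8) & ~7 = 16.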
+ if (c == '\t') {
+ currentIndentation = (currentIndentation + 8) & ~7;
+ } else {
+ currentIndentation++;
+ }
+ }
+ }
+
+ // Issue indent and dedent tokens
+ if (!isWhitespace && numLinebreaks > 0) {
+ // Issue a larger indentation than that in the previous line as
+ // "Indent" token
+ if (currentIndentation > lastIndentation) {
+ mark(Tokens::Indent, size - 1, 0, true);
+ }
+
+ // Issue a smaller indentation than that in the previous line as
+ // "Dedent" token
+ if (currentIndentation < lastIndentation) {
+ mark(Tokens::Dedent, size - 1, 0, true);
+ }
+
+ // Reset the internal state machine
+ lastIndentation = currentIndentation;
+ numLinebreaks = 0;
+ }
+
+ return size;
}
/**
@@ -184,11 +306,12 @@ public:
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
+ * @param special tags the mark as "special", preferring it in the sort order
*/
- void mark(TokenId id, size_t bufStart, TokenLength len)
+ void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
{
// Push the new instance back onto the list
- marks.emplace_back(id, bufStart, len);
+ marks.emplace_back(id, bufStart, len, special);
// Update the sorted flag as soon as more than one element is in the
// list
@@ -212,9 +335,13 @@ public:
* @return true if a token was returned, false if no more tokens are
* available.
*/
- bool next(Token &token, WhitespaceMode mode,
- const std::unordered_set<TokenId> &tokens, size_t &cursor)
+ bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+ TokenizedDataCursor &cursor) const
{
+ // Some variables for convenient access
+ size_t &bufPos = cursor.bufPos;
+ size_t &markPos = cursor.markPos;
+
// Sort the "marks" vector if it has not been sorted yet.
if (!sorted) {
std::sort(marks.begin(), marks.end());
@@ -222,10 +349,11 @@ public:
}
// Fetch the next larger TokenMark instance, make sure the token is in
- // the "enabled" list
- auto it =
- std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
- while (it != marks.end() && tokens.count(it->id) == 0) {
+ // the "enabled" list and within the buffer range
+ auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+ TokenMark(bufPos));
+ while (it != marks.end() && (tokens.count(it->id) == 0 ||
+ it->bufStart + it->len > buf.size())) {
it++;
}
@@ -236,15 +364,15 @@ public:
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
- if (cursor < end && cursor < buf.size()) {
+ if (bufPos < end && bufPos < buf.size()) {
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
- Tokens::Data, std::string(&buf[cursor], end - cursor),
+ Tokens::Data, std::string(&buf[bufPos], end - bufPos),
SourceLocation(sourceId,
- offsets.loadOffset(cursor).first,
+ offsets.loadOffset(bufPos).first,
offsets.loadOffset(end).first));
- cursor = end;
+ bufPos = end;
return true;
}
case WhitespaceMode::TRIM:
@@ -254,30 +382,35 @@ public:
size_t stringStart;
size_t stringEnd;
std::string content;
+ const char *cBuf = &buf[bufPos];
+ auto filter = [cBuf, bufPos, this](size_t i) -> bool {
+ // protectedChars is indexed relative to the whole buffer,
+ // hence the bufPos offset
+ return Utils::isWhitespace(cBuf[i]) &&
+ !protectedChars[bufPos + i];
+ };
if (mode == WhitespaceMode::TRIM) {
- content = Utils::trim(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::trim(cBuf, end - bufPos, stringStart,
+ stringEnd, filter);
} else {
- content = Utils::collapse(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::collapse(
+ cBuf, end - bufPos, stringStart, stringEnd, filter);
}
// If the resulting string is empty (only whitespaces),
// abort
if (content.empty()) {
- cursor = end;
+ bufPos = end;
break;
}
// Calculate the absolute positions and return the token
- stringStart += cursor;
- stringEnd += cursor;
+ stringStart += bufPos;
+ stringEnd += bufPos;
token = Token(
Tokens::Data, content,
SourceLocation(sourceId,
offsets.loadOffset(stringStart).first,
offsets.loadOffset(stringEnd).first));
- cursor = end;
+ bufPos = end;
return true;
}
}
@@ -286,14 +419,18 @@ public:
// If start equals end, we're currently directly at a token
// instance. Return this token and advance the cursor to the end of
// the token.
- if (cursor == end && it != marks.end()) {
+ if (bufPos == end && it != marks.end()) {
const size_t tokenStart = it->bufStart;
const size_t tokenEnd = it->bufStart + it->len;
token = Token(
it->id, std::string(&buf[tokenStart], it->len),
SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
offsets.loadOffset(tokenEnd).first));
- cursor = tokenEnd;
+
+ // Update the cursor, consume the token by incrementing the marks
+ // pos counter
+ bufPos = tokenEnd;
+ markPos = it - marks.begin() + 1;
return true;
}
@@ -304,11 +441,62 @@ public:
}
/**
+ * Resets the TokenizedDataImpl instance to the state it had when it was
+ * constructed.
+ */
+ void clear()
+ {
+ buf.clear();
+ protectedChars.clear();
+ offsets.clear();
+ marks.clear();
+ currentIndentation = 0;
+ lastIndentation = 0;
+ numLinebreaks = 1; // Assume the stream starts with a linebreak
+ sorted = true;
+ }
+
+ /**
+ * Trims the length of the TokenizedDataImpl instance to the given length.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length)
+ {
+ if (length < size()) {
+ buf.resize(length);
+ offsets.trim(length);
+ }
+ }
+
+ /**
* Returns the current size of the internal buffer.
*
* @return the size of the internal character buffer.
*/
- size_t getSize() { return buf.size(); }
+ size_t size() const { return buf.size(); }
+
+ /**
+ * Returns true if no data is in the data buffer.
+ *
+ * @return true if the "buf" instance has no data.
+ */
+ bool empty() const { return buf.empty(); }
+
+ /**
+ * Returns the current location of all data in the buffer.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const
+ {
+ if (empty()) {
+ return SourceLocation{sourceId};
+ }
+ return SourceLocation{sourceId, offsets.loadOffset(0).first,
+ offsets.loadOffset(size()).second};
+ }
};
/* Class TokenizedData */
@@ -316,50 +504,83 @@ public:
TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
TokenizedData::TokenizedData(SourceId sourceId)
- : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+ : impl(std::make_shared<TokenizedDataImpl>(sourceId))
{
}
TokenizedData::~TokenizedData() {}
-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+ bool protect)
{
- return impl->append(data, offsStart);
+ return impl->append(data, offsStart, protect);
}
size_t TokenizedData::append(char c, SourceOffset offsStart,
- SourceOffset offsEnd)
+ SourceOffset offsEnd, bool protect)
{
- return impl->append(c, offsStart, offsEnd);
+ return impl->append(c, offsStart, offsEnd, protect);
}
void TokenizedData::mark(TokenId id, TokenLength len)
{
- impl->mark(id, impl->getSize() - len, len);
+ impl->mark(id, impl->size() - len, len, false);
}
void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
- impl->mark(id, bufStart, len);
+ impl->mark(id, bufStart, len, false);
}
-bool TokenizedData::next(Token &token, WhitespaceMode mode)
+void TokenizedData::clear() { impl->clear(); }
+
+void TokenizedData::trim(size_t length) { impl->trim(length); }
+
+size_t TokenizedData::size() const { return impl->size(); }
+
+bool TokenizedData::empty() const { return impl->empty(); }
+
+SourceLocation TokenizedData::getLocation() const
{
- return impl->next(token, mode, tokens, cursor);
+ return impl->getLocation();
}
-bool TokenizedData::text(Token &token, WhitespaceMode mode)
+TokenizedDataReader TokenizedData::reader() const
{
- // Copy the current cursor position to not update the actual cursor position
- // if the operation was not successful
- size_t cursorCopy = cursor;
- if (!impl->next(token, mode, tokens, cursorCopy) ||
- token.id != Tokens::Data) {
- return false;
- }
+ return TokenizedDataReader(impl, TokenizedDataCursor(),
+ TokenizedDataCursor());
+}
+
+/* Class TokenizedDataReader */
- // There is indeed a text token, update the internal cursor position
- cursor = cursorCopy;
- return true;
+TokenizedDataReader::TokenizedDataReader(
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
+TokenizedDataReaderFork TokenizedDataReader::fork()
+{
+ return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
+}
+
+bool TokenizedDataReader::atEnd() const
+{
+ return readCursor.bufPos >= impl->size();
+}
+
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ peekCursor = readCursor;
+ return impl->next(token, mode, tokens, readCursor);
+}
+
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ return impl->next(token, mode, tokens, peekCursor);
}
}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..b72ca02 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -37,40 +37,48 @@
#include <core/common/Location.hpp>
#include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
/**
- * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
*/
-class TokenizedData {
-private:
+struct TokenizedDataCursor {
/**
- * Shared pointer pointing at the internal data. This data is shared when
- * copying TokenizedData instances, which corresponds to forking a
- * TokenizedData instance.
+ * Position within the byte buffer.
*/
- std::shared_ptr<TokenizedDataImpl> impl;
+ size_t bufPos;
/**
- * Contains all currently enabled token ids.
+ * Position within the token mark buffer.
*/
- std::unordered_set<TokenId> tokens;
+ size_t markPos;
/**
- * Position from which the last element was read from the internal buffer.
- * This information is not shared with the other instances of TokenizedData
- * pointing at the same location.
+ * Default constructor. The resulting cursor points at the beginning of the
+ * stream.
+ */
+ TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
+ * The TokenizedData class stores data extracted from a user-defined document.
+ * The data stored in a TokenizedData instance can be accessed through the
+ * TokenizedDataReader instances returned by the "reader" method.
+ */
+class TokenizedData {
+private:
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
*/
- size_t cursor;
+ std::shared_ptr<TokenizedDataImpl> impl;
public:
/**
@@ -101,10 +109,13 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling; they will be returned as-is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart = 0);
+ size_t append(const std::string &data, SourceOffset offsStart = 0,
+ bool protect = false);
/**
* Appends a single character to the internal character buffer.
@@ -112,10 +123,13 @@ public:
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling; it will be returned as-is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect = false);
/**
* Stores a token ending at the last character of the current buffer.
@@ -136,54 +150,194 @@ public:
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
- * Enables a single token id. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Resets the TokenizedData instance to the state it had when it was
+ * constructed.
+ */
+ void clear();
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length. Note
+ * that this function does not remove any token matches for performance
+ * reasons; it merely renders them inaccessible. Appending new data after
+ * calling trim will make the token marks accessible again. Thus this method
+ * should be the last function called to modify the data buffer and the
+ * token marks.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length);
+
+ /**
+ * Returns the number of characters currently represented by this
+ * TokenizedData instance.
+ */
+ size_t size() const;
+
+ /**
+ * Returns true if the TokenizedData instance is empty, false otherwise.
*
- * @param id is the TokenId of the token that should be enabled.
+ * @return true if no data is stored inside the TokenizedData instance.
*/
- void enableToken(TokenId id) { tokens.insert(id); }
+ bool empty() const;
/**
- * Enables a set of token ids. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Returns the location of the entire TokenizedData instance.
*
- * @param ids is the TokenId of the token that should be enabled.
+ * @return the location of the entire data represented by this instance.
*/
- void enableToken(const std::unordered_set<TokenId> &ids)
- {
- tokens.insert(ids.begin(), ids.end());
- }
+ SourceLocation getLocation() const;
+
+ /**
+ * Returns a TokenizedDataReader instance that can be used to access the
+ * data.
+ *
+ * @return a new TokenizedDataReader instance pointing at the beginning of
+ * the internal buffer.
+ */
+ TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read and peek access to the tokens
+ * and text stored in a TokenizedData instance without modifying that data.
+ */
+class TokenizedDataReader {
+private:
+ friend TokenizedData;
+
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
+ */
+ std::shared_ptr<const TokenizedDataImpl> impl;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ */
+ TokenizedDataCursor readCursor;
+
+ /**
+ * Position from which the last element was peeked from the internal buffer.
+ */
+ TokenizedDataCursor peekCursor;
+
+protected:
+ /**
+ * Protected constructor of TokenizedDataReader, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader.
+ *
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor);
+
+public:
+ /**
+ * Returns a new TokenizedDataReaderFork from which tokens and text can be
+ * read without advancing this reader instance.
+ */
+ TokenizedDataReaderFork fork();
+
+ /**
+ * Returns true if this reader has reached the end of the underlying data.
+ *
+ * @return true if the end of the TokenizedData instance has been reached.
+ */
+ bool atEnd() const;
/**
* Stores the next token in the given token reference, returns true if the
- * operation was successful, false if there are no more tokens.
+ * operation was successful, false if there are no more tokens. Advances the
+ * internal read cursor and resets the peek cursor to the new read position.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool read(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
- * Stores the next text token in the given token reference, returns true if
- * the operation was successful (there was indeed a text token), false if
- * the next token is not a text token or there were no more tokens.
+ * Stores the next token in the given token reference without advancing the
+ * read cursor; returns true if the operation was successful, false if there
+ * are no more tokens.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
+
+ /**
+ * Consumes the peeked tokens, the read cursor will now be at the position
+ * of the peek cursor.
+ */
+ void consumePeek() { readCursor = peekCursor; }
+
+ /**
+ * Resets the peek cursor to the position of the read cursor.
+ */
+ void resetPeek() { peekCursor = readCursor; }
+};
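A minimal sketch of the peek/consume protocol declared above, assuming a populated TokenizedData instance "data" and an enabled-token set "tokens":

    TokenizedDataReader reader = data.reader();
    Token token;
    while (reader.peek(token, tokens, WhitespaceMode::TRIM)) {
        if (token.id == Tokens::Data) {
            reader.consumePeek();  // accept the peeked text token
        } else {
            reader.resetPeek();    // rewind; read() will see this token again
            break;
        }
    }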
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader. It allows reading ahead without advancing the parent
+ * reader and committing the read progress back to it on demand.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+ friend TokenizedDataReader;
+
+ /**
+ * Reference pointing at the parent TokenizedDataReader to which changes may
+ * be committed.
+ */
+ TokenizedDataReader &parent;
+
+ /**
+ * Private constructor of TokenizedDataReaderFork, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader and a reference at the parent TokenizedDataReader.
+ *
+ * @param parent is the TokenizedDataReader instance to which the current
+ * read/peek progress may be committed.
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReaderFork(TokenizedDataReader &parent,
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+ {
+ }
+
+public:
+ /**
+ * Commits the read/peek progress to the underlying parent.
+ */
+ void commit() { parent = *this; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
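The fork/commit pair makes speculative parsing cheap: only the two cursors are copied while the underlying buffer stays shared. Continuing the sketch above:

    TokenizedDataReaderFork probe = reader.fork();
    Token token;
    if (probe.read(token, tokens) && token.id == Tokens::Data) {
        probe.commit();  // "reader" now continues after the consumed text
    }
    // without commit(), "reader" is left untouched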
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include "TokenizedData.hpp"
#include "Tokenizer.hpp"
namespace ousia {
@@ -42,26 +42,33 @@ struct TokenMatch {
Token token;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
+ size_t dataStartOffset;
/**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
+ * Set to true if the matched token is a primary token.
*/
- size_t textEnd;
+ bool primary;
/**
* Constructor of the TokenMatch class.
*/
- TokenMatch() : textLength(0), textEnd(0) {}
+ TokenMatch() : dataStartOffset(0), primary(false) {}
/**
* Returns true if this TokenMatch instance actually represents a match.
+ *
+ * @return true if the TokenMatch actually has a match.
+ */
+ bool hasMatch() const { return token.id != Tokens::Empty; }
+
+ /**
+ * Returns the length of the matched token.
+ *
+ * @return the length of the token string.
*/
- bool hasMatch() { return token.id != Tokens::Empty; }
+ size_t size() const { return token.content.size(); }
};
/* Internal class TokenLookup */
@@ -83,36 +90,28 @@ private:
size_t start;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
+ size_t dataStartOffset;
public:
/**
* Constructor of the TokenLookup class.
*
* @param node is the current node.
- * @param start is the start position.
- * @param textLength is the text buffer length of the previous text token.
- * @param textEnd is the current end location of the previous text token.
+ * @param start is the start position in the source file.
+ * @param dataStartOffset is the current length of the TokenizedData buffer.
*/
- TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
- size_t textEnd)
- : node(node), start(start), textLength(textLength), textEnd(textEnd)
+ TokenLookup(const TokenTrie::Node *node, size_t start,
+ size_t dataStartOffset)
+ : node(node), start(start), dataStartOffset(dataStartOffset)
{
}
/**
* Tries to extend the current path in the token trie with the given
- * character. If a complete token is matched, stores this match in the
- * tokens list (in case it is longer than any previous token).
+ * character. If a complete token is matched, stores the match in the given
+ * TokenMatch reference and returns true.
*
* @param c is the character that should be appended to the current prefix.
* @param lookups is a list to which new TokenLookup instances are added --
@@ -123,73 +122,49 @@ public:
* Tokenizer.
* @param end is the end byte offset of the current character.
* @param sourceId is the source id of this file.
+ * @return true if a token was matched, false otherwise.
*/
- void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
- const std::vector<std::string> &tokens, SourceOffset end,
- SourceId sourceId)
+ bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+ const std::vector<Tokenizer::TokenDescriptor> &tokens,
+ SourceOffset end, SourceId sourceId)
{
- // Check whether we can continue the current token path with the given
- // character without visiting an already visited node
+ // Set to true once a token has been matched
+ bool res = false;
+
+ // Check whether we can continue the current token path, if not, abort
auto it = node->children.find(c);
if (it == node->children.end()) {
- return;
+ return res;
}
// Check whether the new node represents a complete token and whether it
// is longer than the current token. If yes, replace the current token.
node = it->second.get();
- if (node->type != Tokens::Empty) {
- const std::string &str = tokens[node->type];
- size_t len = str.size();
- if (len > match.token.content.size()) {
- match.token =
- Token{node->type, str, {sourceId, start, end}};
- match.textLength = textLength;
- match.textEnd = textEnd;
- }
+ if (node->id != Tokens::Empty) {
+ const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+ match.token = Token(node->id, descr.string,
+ SourceLocation(sourceId, start, end));
+ match.dataStartOffset = dataStartOffset;
+ match.primary = descr.primary;
+ res = true;
}
// If this state can possibly be advanced, store it in the states list.
if (!node->children.empty()) {
lookups.emplace_back(*this);
}
+ return res;
}
};
-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
- SourceId sourceId)
-{
- if (match.hasMatch()) {
- match.token.content =
- std::string{handler.textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, handler.textStart, match.textEnd};
- } else {
- match.token.content = handler.toString();
- match.token.location =
- SourceLocation{sourceId, handler.textStart, handler.textEnd};
- }
- match.token.id = Tokens::Data;
-}
}
/* Class Tokenizer */
-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}
-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
// If we're in the read mode, reset the char reader peek position to the
// current read position
@@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token)
// Prepare the lookups in the token trie
const TokenTrie::Node *root = trie.getRoot();
- TokenMatch match;
+ TokenMatch bestMatch;
std::vector<TokenLookup> lookups;
std::vector<TokenLookup> nextLookups;
- // Instantiate the text handler
- TextHandler textHandler;
-
// Peek characters from the reader and try to advance the current token tree
// cursor
char c;
+ const size_t initialDataSize = data.size();
size_t charStart = reader.getPeekOffset();
const SourceId sourceId = reader.getSourceId();
while (reader.peek(c)) {
const size_t charEnd = reader.getPeekOffset();
- const size_t textLength = textHandler.textBuf.size();
- const size_t textEnd = textHandler.textEnd;
+ const size_t dataStartOffset = data.size();
// If we do not have a match yet, start a new lookup from the root
- if (!match.hasMatch()) {
- TokenLookup{root, charStart, textLength, textEnd}.advance(
- c, nextLookups, match, tokens, charEnd, sourceId);
+ if (!bestMatch.hasMatch()) {
+ lookups.emplace_back(root, charStart, dataStartOffset);
}
// Try to advance all other lookups with the new character
+ TokenMatch match;
for (TokenLookup &lookup : lookups) {
- lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+ // Continue if the current lookup did not result in a token match
+ if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+ sourceId)) {
+ continue;
+ }
+
+ // If the matched token is primary, check whether it is better than
+ // the current best match, if yes, replace the best match. In any
+ // case just continue
+ if (match.primary) {
+ if (match.size() > bestMatch.size()) {
+ bestMatch = match;
+ }
+ continue;
+ }
+
+ // Otherwise -- if the matched token is a non-primary token (and no
+ // primary token has been found until now) -- mark the match in the
+ // TokenizedData
+ if (!bestMatch.hasMatch()) {
+ data.mark(match.token.id, data.size() - match.size() + 1,
+ match.size());
+ }
}
// We have found a token and there are no more states to advance or the
// data buffer has received content -- abort to return the new token
- if (match.hasMatch()) {
- if ((nextLookups.empty() || textHandler.hasText())) {
+ if (bestMatch.hasMatch()) {
+ if ((nextLookups.empty() || data.size() > initialDataSize)) {
break;
}
} else {
// Record all incoming characters
- textHandler.append(c, charStart, charEnd);
+ data.append(c, charStart, charEnd);
}
// Swap the lookups and the nextLookups list
@@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token)
charStart = charEnd;
}
- // If we found text, emit that text
- if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
- buildDataToken(textHandler, match, sourceId);
+ // If we found data, emit a corresponding data token
+ if (data.size() > initialDataSize &&
+ (!bestMatch.hasMatch() ||
+ bestMatch.dataStartOffset > initialDataSize)) {
+ // If we have a "bestMatch" wich starts after text data has started,
+ // trim the TokenizedData to this offset
+ if (bestMatch.dataStartOffset > initialDataSize) {
+ data.trim(bestMatch.dataStartOffset);
+ }
+
+ // Create a token containing the data location
+ bestMatch.token = Token{data.getLocation()};
}
// Move the read/peek cursor to the end of the token, abort if an error
// happens while doing so
- if (match.hasMatch()) {
+ if (bestMatch.hasMatch()) {
// Make sure we have a valid location
- if (match.token.location.getEnd() == InvalidSourceOffset) {
+ if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
throw OusiaException{"Token end position offset out of range"};
}
// Seek to the end of the current token
- const size_t end = match.token.location.getEnd();
+ const size_t end = bestMatch.token.location.getEnd();
if (read) {
reader.seek(end);
} else {
reader.seekPeekCursor(end);
}
- token = match.token;
+ token = bestMatch.token;
} else {
token = Token{};
}
- return match.hasMatch();
+ return bestMatch.hasMatch();
}
-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, true>(reader, token);
- }
- return false;
+ return next<true>(reader, token, data);
}
-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, false>(reader, token);
- }
- return false;
+ return next<false>(reader, token, data);
}
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
{
// Abort if an empty token should be registered
if (token.empty()) {
@@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
// Search for a new slot in the tokens list
TokenId type = Tokens::Empty;
for (size_t i = nextTokenId; i < tokens.size(); i++) {
- if (tokens[i].empty()) {
- tokens[i] = token;
+ if (!tokens[i].valid()) {
+ tokens[i] = TokenDescriptor(token, primary);
type = i;
break;
}
@@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
// override the special token type handles
if (type == Tokens::Empty) {
type = tokens.size();
- if (type == Tokens::Data || type == Tokens::Empty) {
+ if (type >= Tokens::MaxTokenId) {
throw OusiaException{"Token type ids depleted!"};
}
- tokens.emplace_back(token);
+ tokens.emplace_back(token, primary);
}
nextTokenId = type + 1;
- // Try to register the token in the trie -- if this fails, remove it
- // from the tokens list
+ // Try to register the token in the trie -- if this fails, remove it from
+ // the tokens list
if (!trie.registerToken(token, type)) {
- tokens[type] = std::string{};
+ tokens[type] = TokenDescriptor();
nextTokenId = type;
return Tokens::Empty;
}
return type;
}
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
{
// Unregister the token from the trie, abort if an invalid type is given
- if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
- tokens[type] = std::string{};
- nextTokenId = type;
+ if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+ tokens[id] = TokenDescriptor();
+ nextTokenId = id;
return true;
}
return false;
}
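Since unregisterToken() rewinds nextTokenId, freed token ids are recycled by the next registration. A minimal sketch of the expected behavior (token strings are illustrative):

    Tokenizer tokenizer;
    TokenId a = tokenizer.registerToken("<<");         // primary by default
    TokenId b = tokenizer.registerToken("--", false);  // non-primary
    tokenizer.unregisterToken(a);                      // frees the slot of "a"
    TokenId c = tokenizer.registerToken("::");         // expected to reuse that slot, c == a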
-std::string Tokenizer::getTokenString(TokenId type)
-{
- if (type < tokens.size()) {
- return tokens[type];
- }
- return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
{
- whitespaceMode = mode;
+ if (id < tokens.size()) {
+ return tokens[id];
+ }
+ return EmptyTokenDescriptor;
}
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
/* Explicitly instantiate all possible instantiations of the "next" member
function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
}
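The two explicit instantiations are required because next() is defined in this .cpp file: without them no object code would be emitted for the template and read()/peek() would fail to link. A generic sketch of the idiom (names hypothetical, not this codebase's API):

    // Widget.hpp -- declaration only
    struct Widget {
        template <bool Read>
        bool next();
    };

    // Widget.cpp -- definition plus explicit instantiations
    template <bool Read>
    bool Widget::next() { return Read; }

    // Emit object code for both variants so other translation units can link
    template bool Widget::next<true>();
    template bool Widget::next<false>();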
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
/**
* @file Tokenizer.hpp
*
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
@@ -28,44 +28,80 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#include <set>
+#include <cstdint>
#include <string>
#include <vector>
#include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
-#include "Token.hpp"
#include "TokenTrie.hpp"
namespace ousia {
// Forward declarations
class CharReader;
+class TokenizedData;
/**
* The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows registering and unregistering tokens while parsing.
+ * Note that the Tokenizer always tries to extract the longest possible token
+ * from the input. Tokens can be registered as primary or non-primary tokens.
+ * If a token is registered as a primary token, it is returned as a single
+ * Token instance when it occurs. In the non-primary case the token is
+ * returned as part of a segmented TokenizedData instance.
*/
class Tokenizer {
-private:
+public:
/**
- * Internally used token trie. This object holds all registered tokens.
+ * Internally used structure describing a registered token.
*/
- TokenTrie trie;
+ struct TokenDescriptor {
+ /**
+ * String describing the token.
+ */
+ std::string string;
+
+ /**
+ * Set to true if this token is primary.
+ */
+ bool primary;
+
+ /**
+ * Constructor of the TokenDescriptor class.
+ *
+ * @param string is the string representation of the registered token.
+ * @param primary specifies whether the token is a primary token that
+ * should be returned as a single token, or a secondary token, that
+ * should be returned as part of TokenizedData.
+ */
+ TokenDescriptor(const std::string &string, bool primary)
+ : string(string), primary(primary)
+ {
+ }
+
+ /**
+ * Default constructor.
+ */
+ TokenDescriptor() : primary(false) {}
+
+ /**
+ * Returns true if the TokenDescriptor represents a valid token.
+ */
+ bool valid() const { return !string.empty(); }
+ };
+private:
/**
- * Flag defining whether whitespaces should be preserved or not.
+ * Internally used token trie. This object holds all registered tokens.
*/
- WhitespaceMode whitespaceMode;
+ TokenTrie trie;
/**
* Vector containing all registered token types.
*/
- std::vector<std::string> tokens;
+ std::vector<TokenDescriptor> tokens;
/**
* Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:
/**
* Templated function used internally to read the current token. The
- * function is templated in order to force code generation for all six
- * combiations of whitespace modes and reading/peeking.
+ * function is templated in order to force optimized code generation for
+ * both reading and peeking.
*
- * @tparam TextHandler is the type to be used for the textHandler instance.
- * @tparam read specifies whether the function should start from and advance
- * the read pointer of the char reader.
+ * @tparam read specifies whether the method should read the token or just
+ * peek.
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is the token structure into which the token information
* should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return false if the end of the stream has been reached, true otherwise.
*/
- template <typename TextHandler, bool read>
- bool next(CharReader &reader, Token &token);
+ template <bool read>
+ bool next(CharReader &reader, Token &token, TokenizedData &data);
public:
/**
* Constructor of the Tokenizer class.
- *
- * @param whitespaceMode specifies how whitespace should be handled.
*/
- Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+ Tokenizer();
/**
- * Registers the given string as a token. Returns a const pointer at a
- * TokenDescriptor that will be used to reference the newly created token.
+ * Registers the given string as a token. Returns a unique identifier
+ * describing the registered token.
*
* @param token is the token string that should be registered.
- * @return a unique identifier for the registered token or EmptyToken if
+ * @param primary specifies whether the token is a primary token -- if true,
+ * the token will be returned as a single, standalone token. Otherwise the
+ * token will be returned as part of a "TokenizedData" structure.
+ * @return a unique identifier for the registered token or Tokens::Empty if
* an error occured.
*/
- TokenId registerToken(const std::string &token);
+ TokenId registerToken(const std::string &token, bool primary = true);
/**
* Unregisters the token belonging to the given TokenId.
*
* @param type is the token type that should be unregistered. The
- *TokenId
- * must have been returned by registerToken.
+ * TokenId must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
- * because the given TokenDescriptor was already unregistered).
+ * because the token with the given TokenId was already unregistered).
*/
- bool unregisterToken(TokenId type);
+ bool unregisterToken(TokenId id);
/**
* Returns the token that was registered under the given TokenId id or
- *an
- * empty string if an invalid TokenId id is given.
+ * an empty string if an invalid TokenId id is given.
*
- * @param type is the TokenId id for which the corresponding token
- *string
+ * @param id is the TokenId for which the corresponding TokenDescriptor
* should be returned.
- * @return the registered token string or an empty string if the given type
- * was invalid.
- */
- std::string getTokenString(TokenId type);
-
- /**
- * Sets the whitespace mode.
- *
- * @param whitespaceMode defines how whitespace should be treated in text
- * tokens.
- */
- void setWhitespaceMode(WhitespaceMode mode);
-
- /**
- * Returns the current value of the whitespace mode.
- *
- * @return the whitespace mode.
+ * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+ * the given TokenId is invalid.
*/
- WhitespaceMode getWhitespaceMode();
+ const TokenDescriptor& lookupToken(TokenId id) const;
/**
* Reads a new token from the CharReader and stores it in the given
- * Token instance.
+ * Token instance. If the token has the id Tokens::Data, use the "getData"
+ * method to fetch a reference at the underlying TokenizedData instance
+ * storing the data.
*
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(CharReader &reader, Token &token);
+ bool read(CharReader &reader, Token &token, TokenizedData &data);
/**
* The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool peek(CharReader &reader, Token &token);
+ bool peek(CharReader &reader, Token &token, TokenizedData &data);
};
}
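Putting the pieces together, a hedged usage sketch of the new read() interface (assuming, as in the unit tests, that CharReader is constructible from a string, and that data tokens carry the id Tokens::Data):

    #include <core/common/CharReader.hpp>
    #include <core/parser/utils/TokenizedData.hpp>
    #include <core/parser/utils/Tokenizer.hpp>

    void tokenizeExample()
    {
        CharReader reader("a**b");
        Tokenizer tokenizer;
        TokenId star = tokenizer.registerToken("**", true);  // primary token
        Token token;
        TokenizedData data(reader.getSourceId());
        while (tokenizer.read(reader, token, data)) {
            if (token.id == Tokens::Data) {
                // character data accumulated in "data"; extract it via a
                // TokenizedDataReader with the desired WhitespaceMode
            } else if (token.id == star) {
                // "**" was matched as a standalone token
            }
            data.clear();  // reuse the buffer for the next call
        }
    }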
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..d4cdbf8 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -94,92 +94,11 @@ public:
static const PlainFormatTokens OsmlTokens;
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
- /**
- * Internal character buffer.
- */
- std::vector<char> buf;
-
- /**
- * Start location of the character data.
- */
- SourceOffset start;
-
- /**
- * End location of the character data.
- */
- SourceOffset end;
-
-public:
- /**
- * Default constructor, initializes start and end with zeros.
- */
- DataHandler() : start(0), end(0) {}
-
- /**
- * Returns true if the internal buffer is empty.
- *
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
- */
- bool isEmpty() { return buf.empty(); }
-
- /**
- * Appends a single character to the internal buffer.
- *
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
- */
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
- {
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
- }
-
- /**
- * Appends a string to the internal buffer.
- *
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
- */
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
- {
- if (isEmpty()) {
- start = stringStart;
- }
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
- }
-
- /**
- * Converts the internal buffer to a variant with attached location
- * information.
- *
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
- */
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
-};
-
OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
- : reader(reader), logger(logger), tokenizer(OsmlTokens)
+ : reader(reader),
+ logger(logger),
+ tokenizer(OsmlTokens),
+ data(reader.getSourceId())
{
// Place an intial command representing the complete file on the stack
commands.push(Command{"", Variant::mapType{}, true, true, true, false});
@@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
- bool hasCharSiceNSSep = false;
+ bool hasCharSinceNSSep = false;
std::vector<char> identifier;
size_t end = reader.getPeekOffset();
char c, c2;
@@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+ } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
identifier.push_back(c);
} else {
@@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
// This is no longer the first character
first = false;
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
+ // Advance the hasCharSinceNSSep flag
+ hasCharSinceNSSep = allowNSSep && (c != ':');
end = reader.getPeekOffset();
reader.consumePeek();
@@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()
{
Token token;
size_t depth = 1;
- while (tokenizer.read(reader, token)) {
+ while (tokenizer.read(reader, token, data)) {
+ // Throw the comment data away
+ data.clear();
+
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
@@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()
}
}
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData()
{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
+ if (!data.empty()) {
location = data.getLocation();
reader.resetPeek();
return true;
@@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()
OsmlStreamParser::State OsmlStreamParser::parse()
{
- // Handler for incomming data
- DataHandler handler;
+ // Reset the data handler
+ data.clear();
// Read tokens until the outer loop should be left
Token token;
- while (tokenizer.peek(reader, token)) {
+ while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
@@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// If this was an annotation start token, add the parsed < to the
// output
if (type == OsmlTokens.AnnotationStart) {
- handler.append('<', token.location.getStart(),
- token.location.getStart() + 1);
+ data.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
}
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
+ data.append(c, token.location.getStart(), reader.getPeekOffset());
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
@@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse()
location = token.location;
return State::FIELD_START;
}
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
reader.consumePeek();
continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse()
Command &cmd = commands.top();
if (!cmd.inField) {
cmd.inField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got field start token \"{\", but no command for which to "
"start the field. Write \"\\{\" to insert this sequence as "
"text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.FieldEnd) {
- if (closeField()) {
+ closeField();
+ return State::FIELD_END;
+/* if (closeField()) {
return State::FIELD_END;
}
logger.error(
"Got field end token \"}\", but there is no field to end. "
"Write \"\\}\" to insert this sequence as text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.DefaultFieldStart) {
// Try to start a default field the first time the token is reached
Command &topCmd = commands.top();
if (!topCmd.inField) {
topCmd.inField = true;
topCmd.inDefaultField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got default field start token \"{!\", but no command for "
"which to start the field. Write \"\\{!\" to insert this "
"sequence as text",
- token);
+ token);*/
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event
@@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
}
// Issue available data
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()
return State::END;
}
+Variant OsmlStreamParser::getText(WhitespaceMode mode)
+{
+ TokenizedData dataFork = data;
+ Variant text = dataFork.text(mode);
+ location = text.getLocation();
+ return text;
+}
+
const Variant &OsmlStreamParser::getCommandName() const
{
return commands.top().name;
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..453a2bb 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,17 +29,19 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <memory>
#include <core/common/Variant.hpp>
+#include <core/common/Whitespace.hpp>
#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
@@ -137,26 +139,15 @@ public:
Variant arguments;
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
+ * Vector used as a stack holding one entry per currently open field,
+ * storing the corresponding "isDefaultField" flag.
*/
- bool inRangeField : 1;
+ std::vector<bool> fields;
/**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
+ * Set to true if this is a command with clear begin and end.
*/
- bool inDefaultField : 1;
+ bool hasRange;
/**
* Default constructor.
@@ -164,7 +155,6 @@ public:
Command()
: hasRange(false),
inField(false),
- inRangeField(false),
inDefaultField()
{
}
@@ -178,15 +168,10 @@ public:
* command.
* @param hasRange should be set to true if this is a command with
* explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
* @param inDefaultField is set to true if we currently are in a
* specially marked default field.
*/
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
+ Command(Variant name, Variant arguments, bool hasRange)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange),
@@ -215,25 +200,20 @@ private:
Tokenizer tokenizer;
/**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
+ * TokenizedData instance containing the data that was returned by the
+ * tokenizer.
*/
- Variant data;
+ TokenizedData data;
/**
- * Contains the location of the last token.
+ * Stack containing the current commands.
*/
- SourceLocation location;
+ std::stack<Command> commands;
/**
- * Contains the field index of the current command.
+ * Pointer at
*/
- size_t fieldIdx;
+ std::unique_ptr<OsmlStreamParserImpl> impl;
/**
* Function used internall to parse an identifier.
@@ -291,12 +271,10 @@ private:
/**
* Checks whether there is any data pending to be issued, if yes, issues it.
*
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
* @return true if there was any data and DATA should be returned by the
* parse function, false otherwise.
*/
- bool checkIssueData(DataHandler &handler);
+ bool checkIssueData();
/**
* Called before any data is appended to the internal data handler. Checks
@@ -328,6 +306,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed since OsmlStreamParserImpl is
+ * an incomplete type at this point.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -344,7 +328,19 @@ public:
* @return a reference at the TokenizedData instance containing the data
* parsed by the "parse" function.
*/
- const Variant &getData() const { return data; }
+ const TokenizedData &getData() const { return data; }
+
+ /**
+ * Returns the complete content of the internal TokenizedData instance as
+ * a single string Variant. This method is mainly used in the unit tests for
+ * this class, it simply calls the text() method of TokenizedData.
+ *
+ * @param mode is the WhitespaceMode that should be used for returning the
+ * text.
+ * @return a string variant containing the text content of the internal
+ * TokenizedData instance or a nullptr variant if there is no text.
+ */
+ Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -371,13 +367,6 @@ public:
* syntax).
*/
bool inDefaultField() const;
-
- /**
- * Returns a reference at the char reader.
- *
- * @return the last internal token location.
- */
- const SourceLocation &getLocation() const { return location; }
};
}
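A sketch of the intended event loop around parse() and the new getData()/getText() accessors (the logger type and any states beyond those visible in this diff are assumptions):

    #include <core/common/CharReader.hpp>
    #include <formats/osml/OsmlStreamParser.hpp>

    void parseExample()
    {
        CharReader reader("\\book{...}");
        TerminalLogger logger(std::cerr);  // logger type assumed; any concrete Logger works
        OsmlStreamParser parser(reader, logger);
        for (OsmlStreamParser::State state = parser.parse();
             state != OsmlStreamParser::State::END; state = parser.parse()) {
            if (state == OsmlStreamParser::State::DATA) {
                Variant text = parser.getText(WhitespaceMode::COLLAPSE);
                // ... handle the collapsed text ...
            }
            // ... handle FIELD_START, FIELD_END, ANNOTATION_END, ...
        }
    }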
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..855f80d 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,6 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -57,17 +56,6 @@ public:
std::vector<char> textBuf;
/**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
* Current character data start.
*/
size_t textStart;
@@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
// Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
OsxmlEventParserData &data = parser->getData();
std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
+
+ // Update start and end position
+ if (textBuf.empty()) {
+ data.textStart = loc.getStart();
}
+ data.textEnd = loc.getEnd();
+
+ // Insert the data into the text buffer
+ textBuf.insert(textBuf.end(), &s[0], &s[ulen]);
}
/* Class OsxmlEvents */
@@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+ : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0)
{
}
@@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId)
// Reset the text buffers
textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
textStart = 0;
textEnd = 0;
@@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
data(new OsxmlEventParserData())
{
}
@@ -532,16 +497,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..e3fd5d4 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -99,13 +97,10 @@ public:
virtual void fieldEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a Variant containing the string data that was found in the
+ * XML file.
*/
virtual void data(const Variant &data) = 0;
};
@@ -135,11 +130,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +161,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp
index a93f14a..83966d5 100644
--- a/test/core/parser/stack/StackTest.cpp
+++ b/test/core/parser/stack/StackTest.cpp
@@ -24,6 +24,7 @@
#include <core/parser/stack/Handler.hpp>
#include <core/parser/stack/Stack.hpp>
#include <core/parser/stack/State.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/StandaloneEnvironment.hpp>
@@ -53,7 +54,7 @@ struct Tracker {
Variant::mapType annotationStartArgs;
Variant annotationEndClassName;
Variant annotationEndElementName;
- Variant dataData;
+ TokenizedData dataData;
bool startResult;
bool fieldStartSetIsDefault;
@@ -81,7 +82,7 @@ struct Tracker {
annotationStartArgs = Variant::mapType{};
annotationEndClassName = Variant::fromString(std::string{});
annotationEndElementName = Variant::fromString(std::string{});
- dataData = Variant::fromString(std::string{});
+ dataData = TokenizedData();
startResult = true;
fieldStartSetIsDefault = false;
@@ -157,7 +158,7 @@ public:
return tracker.annotationEndResult;
}
- bool data(Variant &data) override
+ bool data(TokenizedData &data) override
{
tracker.dataCount++;
tracker.dataData = data;
@@ -363,7 +364,7 @@ TEST(Stack, multipleFields)
s.data("test");
tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test", tracker.dataData);
+ EXPECT_EQ("test", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc
@@ -375,7 +376,7 @@ TEST(Stack, multipleFields)
s.data("test2");
tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test2", tracker.dataData);
+ EXPECT_EQ("test2", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc
@@ -387,7 +388,7 @@ TEST(Stack, multipleFields)
s.data("test3");
tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc
- EXPECT_EQ("test3", tracker.dataData);
+ EXPECT_EQ("test3", tracker.dataData.text().asString());
s.fieldEnd();
tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc
@@ -744,4 +745,4 @@ TEST(Stack, fieldAfterDefaultField)
ASSERT_FALSE(logger.hasError());
}
}
-} \ No newline at end of file
+}
diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp
index 231bad9..dfe2526 100644
--- a/test/core/parser/utils/TokenizedDataTest.cpp
+++ b/test/core/parser/utils/TokenizedDataTest.cpp
@@ -22,6 +22,43 @@
namespace ousia {
+void assertToken(TokenizedDataReader &reader, TokenId id,
+ const std::string &text, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId sourceId = InvalidSourceId)
+{
+ Token token;
+ ASSERT_TRUE(reader.read(token, tokens, mode));
+ EXPECT_EQ(id, token.id);
+ EXPECT_EQ(text, token.content);
+ if (start != InvalidSourceOffset) {
+ EXPECT_EQ(start, token.getLocation().getStart());
+ }
+ if (end != InvalidSourceOffset) {
+ EXPECT_EQ(end, token.getLocation().getEnd());
+ }
+ EXPECT_EQ(sourceId, token.getLocation().getSourceId());
+}
+
+void assertText(TokenizedDataReader &reader, const std::string &text,
+ const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM,
+ SourceOffset start = InvalidSourceOffset,
+ SourceOffset end = InvalidSourceOffset,
+ SourceId id = InvalidSourceId)
+{
+ assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id);
+}
+
+void assertEnd(TokenizedDataReader &reader)
+{
+ Token token;
+ ASSERT_TRUE(reader.atEnd());
+ ASSERT_FALSE(reader.read(token));
+}
+
TEST(TokenizedData, dataWhitespacePreserve)
{
TokenizedData data;
@@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test1 test2 ", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(16U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE,
+ 0, 16);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceTrim)
@@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, dataWhitespaceCollapse)
@@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse)
// 0123456789012345
// 0 1
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test1 test2", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(14U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1,
+ 14);
+ assertEnd(reader);
}
TEST(TokenizedData, singleToken)
@@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, singleDisabledToken)
@@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken)
ASSERT_EQ(2U, data.append("$$"));
data.mark(5, 0, 2);
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualToken)
@@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(5);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0,
+ 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenShorterEnabled)
@@ -142,385 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled)
data.mark(5, 0, 2);
data.mark(6, 1, 1);
- data.enableToken(6);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(1U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, dualTokenLongerEnabled)
{
TokenizedData data;
ASSERT_EQ(2U, data.append("$$"));
+ data.mark(6, 0, 1);
data.mark(5, 0, 2);
+ data.mark(6, 1, 1);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataPreserveWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" test ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+ assertText(reader, " test text ", TokenSet{5}, WhitespaceMode::PRESERVE,
+ 2, 16);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataTrimWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+ assertText(reader, "test text", TokenSet{5}, WhitespaceMode::TRIM, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndDataCollapseWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$ test $$"));
- // 0123456789
+ ASSERT_EQ(18U, data.append("$$ test    text $$"));
+ // 012345678901234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3,
+ 15);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(8U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2);
+ assertText(reader, " ", TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8);
+ assertEnd(reader);
}
TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace)
{
TokenizedData data;
- ASSERT_EQ(10U, data.append("$$      $$"));
- // 0123456789
+ ASSERT_EQ(8U, data.append("$$    $$"));
+ // 01234567
data.mark(5, 0, 2);
data.mark(5, 2);
- data.enableToken(5);
-
- Token token;
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(8U, token.getLocation().getStart());
- EXPECT_EQ(10U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2);
+ assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8);
+ assertEnd(reader);
}
-TEST(TokenizedData, textPreserveWhitespace)
+TEST(TokenizedData, appendChars)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
-
- data.enableToken(5);
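+ // Characters can be appended one at a time with explicit start/end source
+ // offsets; the return value apparently is the resulting buffer size.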
+ ASSERT_EQ(1U, data.append('t', 5, 7));
+ ASSERT_EQ(2U, data.append('e', 7, 8));
+ ASSERT_EQ(3U, data.append('s', 8, 10));
+ ASSERT_EQ(4U, data.append('t', 10, 12));
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(0U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" ", token.content);
- EXPECT_EQ(4U, token.getLocation().getStart());
- EXPECT_EQ(6U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12);
+ assertEnd(reader);
}
-TEST(TokenizedData, textTrimWhitespace)
+TEST(TokenizedData, protectedWhitespace)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
+ ASSERT_EQ(4U, data.append("test", 10));
+ ASSERT_EQ(11U, data.append("   test", 14, true));
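+ // The third argument presumably marks the appended whitespace as
+ // protected, so the gap survives the COLLAPSE mode used below.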
- data.enableToken(5);
-
- Token token;
- ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "test test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10,
+ 21);
+ assertEnd(reader);
+}
- ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM));
- ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM));
+TEST(TokenizedData, specialNewlineToken)
+{
+ TokenizedData data;
+ data.append("a\nb\n \nc\n");
+ // 0 12 3456 78 9
+
+ const TokenSet tokens{Tokens::Newline};
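+ // With Tokens::Newline enabled, every line break is reported as a token of
+ // its own instead of being folded into the surrounding data.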
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 3, 4);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 7, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 9, 10);
+ assertEnd(reader);
}
-TEST(TokenizedData, textCollapseWhitespace)
+TEST(TokenizedData, specialParagraphToken)
{
TokenizedData data;
- ASSERT_EQ(6U, data.append("  $$  "));
- // 012345
- data.mark(5, 2, 2);
+ data.append("a\nb\n \nc\n");
+ // 0 12 3456 78 9
- data.enableToken(5);
+ const TokenSet tokens{Tokens::Paragraph};
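+ // A Tokens::Paragraph token covers a single blank line, i.e. two line
+ // breaks and any whitespace between them.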
- Token token;
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+ assertToken(reader, Tokens::Paragraph, "\n   \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 8);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9);
+ assertEnd(reader);
+}
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(5U, token.id);
- EXPECT_EQ("$$", token.content);
- EXPECT_EQ(2U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
+TEST(TokenizedData, specialSectionToken)
+{
+ TokenizedData data;
+ data.append("a\nb\n \n \t \n");
+ // 0 12 3456 789 01 2
+ // 0 1
+
+ const TokenSet tokens{Tokens::Section};
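+ // A Tokens::Section token covers at least two consecutive blank lines.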
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3);
+ assertToken(reader, Tokens::Section, "\n   \n  \t \n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 13);
+ assertEnd(reader);
}
-TEST(TokenizedData, appendChars)
+TEST(TokenizedData, specialTokenPrecedence)
{
TokenizedData data;
- ASSERT_EQ(1U, data.append('t', 5, 7));
- ASSERT_EQ(2U, data.append('e', 7, 8));
- ASSERT_EQ(3U, data.append('s', 8, 10));
- ASSERT_EQ(4U, data.append('t', 10, 12));
+ data.append("a\nb\n\nc\n\n\nd");
+ // 0 12 3 45 6 7 89
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
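+ // If several special tokens are enabled at once, the widest applicable one
+ // wins: a single break yields Newline, one blank line Paragraph and two
+ // blank lines Section.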
+
+ TokenizedDataReader reader = data.reader();
+ assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1);
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 1, 2);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 3, 5);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 6, 9);
+ assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("test", token.content);
- EXPECT_EQ(5U, token.getLocation().getStart());
- EXPECT_EQ(12U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
- ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE));
+TEST(TokenizedData, specialTokenPrecedence2)
+{
+ TokenizedData data;
+ data.append("\nb\n\nc\n\n\n");
+ // 0 12 3 45 6 7
+
+ const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section};
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE,
+ 0, 1);
+ assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2);
+ assertToken(reader, Tokens::Paragraph, "\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 2, 4);
+ assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5);
+ assertToken(reader, Tokens::Section, "\n\n\n", tokens,
+ WhitespaceMode::COLLAPSE, 5, 8);
+ assertEnd(reader);
}
-TEST(TokenizedData, copy)
+TEST(TokenizedData, specialTokenIndent)
{
TokenizedData data;
- ASSERT_EQ(7U, data.append(" a $ b "));
- // 0123456
- data.mark(6, 3, 1);
- data.enableToken(6);
+ data.append(" test\n\ttest2\n test3 \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent};
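+ // Indent and Dedent are synthetic, zero-length tokens: note that start and
+ // end offset coincide in the assertions below.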
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
+}
- Token token;
- ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("a", token.content);
- EXPECT_EQ(1U, token.getLocation().getStart());
- EXPECT_EQ(2U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE));
-
- TokenizedData dataCopy = data;
-
- ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(6U, token.id);
- EXPECT_EQ("$", token.content);
- EXPECT_EQ(3U, token.getLocation().getStart());
- EXPECT_EQ(4U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
-
- ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ(" b ", token.content);
- EXPECT_EQ(4U, token.getLocation().getStart());
- EXPECT_EQ(7U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
- ASSERT_FALSE(data.next(token));
-
- ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE));
- EXPECT_EQ(Tokens::Data, token.id);
- EXPECT_EQ("b", token.content);
- EXPECT_EQ(5U, token.getLocation().getStart());
- EXPECT_EQ(6U, token.getLocation().getEnd());
- EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId());
- ASSERT_FALSE(dataCopy.next(token));
+TEST(TokenizedData, specialTokenIndentOverlap)
+{
+ TokenizedData data;
+ data.append(" test\n\ttest2\n test3 \ttest4\ntest5");
+ // 01234567 8 901234 5678901234567890 123456 789012
+ // 0 1 2 3 4
+ const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5};
+
+ data.mark(5, 4, 4);
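+ // Token 5 is marked over the same range as the first word, so it replaces
+ // the data token while the zero-length Indent is still emitted before it.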
+
+ TokenizedDataReader reader = data.reader();
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 4, 4);
+ assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8);
+ assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE,
+ 10, 10);
+ assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37);
+ assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE,
+ 38, 38);
+ assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43);
+ assertEnd(reader);
}
+
}
diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
index 3809a12..0f2bfb7 100644
--- a/test/core/parser/utils/TokenizerTest.cpp
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -20,6 +20,7 @@
#include <core/common/CharReader.hpp>
#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
namespace ousia {
@@ -31,23 +32,40 @@ TEST(Tokenizer, tokenRegistration)
ASSERT_EQ(0U, tokenizer.registerToken("a"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a"));
- ASSERT_EQ("a", tokenizer.getTokenString(0U));
+ ASSERT_EQ("a", tokenizer.lookupToken(0U).string);
ASSERT_EQ(1U, tokenizer.registerToken("b"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b"));
- ASSERT_EQ("b", tokenizer.getTokenString(1U));
+ ASSERT_EQ("b", tokenizer.lookupToken(1U).string);
ASSERT_EQ(2U, tokenizer.registerToken("c"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("c"));
- ASSERT_EQ("c", tokenizer.getTokenString(2U));
+ ASSERT_EQ("c", tokenizer.lookupToken(2U).string);
ASSERT_TRUE(tokenizer.unregisterToken(1U));
ASSERT_FALSE(tokenizer.unregisterToken(1U));
- ASSERT_EQ("", tokenizer.getTokenString(1U));
+ ASSERT_EQ("", tokenizer.lookupToken(1U).string);
ASSERT_EQ(1U, tokenizer.registerToken("d"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d"));
- ASSERT_EQ("d", tokenizer.getTokenString(1U));
+ ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
+}
+
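+// Checks that the given token is a data token and that the text stored in
+// the TokenizedData instance matches the expected string and source
+// locations under the given whitespace mode.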
+void expectData(const std::string &expected, SourceOffset tokenStart,
+ SourceOffset tokenEnd, SourceOffset textStart,
+ SourceOffset textEnd, const Token &token, TokenizedData &data,
+ WhitespaceMode mode = WhitespaceMode::PRESERVE)
+{
+ ASSERT_EQ(Tokens::Data, token.id);
+
+ Variant text = data.text(mode);
+ ASSERT_TRUE(text.isString());
+
+ EXPECT_EQ(expected, text.asString());
+ EXPECT_EQ(tokenStart, token.location.getStart());
+ EXPECT_EQ(tokenEnd, token.location.getEnd());
+ EXPECT_EQ(textStart, text.getLocation().getStart());
+ EXPECT_EQ(textEnd, text.getLocation().getEnd());
}
TEST(Tokenizer, textTokenPreserveWhitespace)
@@ -56,36 +74,34 @@ TEST(Tokenizer, textTokenPreserveWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::PRESERVE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ(" this \t is only a \n\n test text ", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(36U, loc.getEnd());
+ expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36,
+ token, data, WhitespaceMode::PRESERVE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::PRESERVE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 32, 0, 32,
+ token, data, WhitespaceMode::PRESERVE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -95,36 +111,34 @@ TEST(Tokenizer, textTokenTrimWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::TRIM};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(1U, loc.getStart());
- ASSERT_EQ(33U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token,
+ data, WhitespaceMode::TRIM);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::TRIM};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this \t is only a \n\n test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this \t is only a \n\n test text", 0, 32, 0, 32,
+ token, data, WhitespaceMode::TRIM);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -134,36 +148,34 @@ TEST(Tokenizer, textTokenCollapseWhitespace)
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::COLLAPSE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this is only a test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(1U, loc.getStart());
- ASSERT_EQ(33U, loc.getEnd());
+ expectData("this is only a test text", 0, 36, 1, 33, token, data,
+ WhitespaceMode::COLLAPSE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
- Tokenizer tokenizer{WhitespaceMode::COLLAPSE};
+ Tokenizer tokenizer;
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("this is only a test text", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(32U, loc.getEnd());
+ expectData("this is only a test text", 0, 32, 0, 32, token, data,
+ WhitespaceMode::COLLAPSE);
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
@@ -177,14 +189,12 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ expectData("test1", 0, 5, 0, 5, token, data);
char c;
ASSERT_TRUE(reader.peek(c));
@@ -193,7 +203,8 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -209,14 +220,10 @@ TEST(Tokenizer, simpleReadToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ expectData("test2", 6, 11, 6, 11, token, data);
char c;
ASSERT_FALSE(reader.peek(c));
@@ -233,21 +240,17 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
+ expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -261,35 +264,26 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.peek(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.peek(reader, token, data));
+ expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test1", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(0U, loc.getStart());
- ASSERT_EQ(5U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(5U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
@@ -303,14 +297,9 @@ TEST(Tokenizer, simplePeekToken)
{
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
-
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("test2", token.content);
-
- SourceLocation loc = token.location;
- ASSERT_EQ(6U, loc.getStart());
- ASSERT_EQ(11U, loc.getEnd());
+ TokenizedData data;
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
+ expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(11U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
@@ -320,6 +309,7 @@ TEST(Tokenizer, ambiguousTokens)
{
CharReader reader{"abc"};
Tokenizer tokenizer;
+ TokenizedData data;
TokenId t1 = tokenizer.registerToken("abd");
TokenId t2 = tokenizer.registerToken("bc");
@@ -328,16 +318,17 @@ TEST(Tokenizer, ambiguousTokens)
ASSERT_EQ(1U, t2);
Token token;
- ASSERT_TRUE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
- ASSERT_EQ(Tokens::Data, token.id);
- ASSERT_EQ("a", token.content);
+ expectData("a", 0, 1, 0, 1, token, data);
SourceLocation loc = token.location;
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
- ASSERT_TRUE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(t2, token.id);
ASSERT_EQ("bc", token.content);
@@ -346,7 +337,8 @@ TEST(Tokenizer, ambiguousTokens)
ASSERT_EQ(1U, loc.getStart());
ASSERT_EQ(3U, loc.getEnd());
- ASSERT_FALSE(tokenizer.read(reader, token));
+ data.clear();
+ ASSERT_FALSE(tokenizer.read(reader, token, data));
}
TEST(Tokenizer, commentTestWhitespacePreserve)
@@ -354,7 +346,7 @@ TEST(Tokenizer, commentTestWhitespacePreserve)
CharReader reader{"Test/Test /* Block Comment */", 0};
// 012345678901234567890123456789
// 0 1 2
- Tokenizer tokenizer(WhitespaceMode::PRESERVE);
+ Tokenizer tokenizer;
const TokenId t1 = tokenizer.registerToken("/");
const TokenId t2 = tokenizer.registerToken("/*");
@@ -370,45 +362,23 @@ TEST(Tokenizer, commentTestWhitespacePreserve)
Token t;
for (auto &te : expected) {
- EXPECT_TRUE(tokenizer.read(reader, t));
+ TokenizedData data(0);
+ EXPECT_TRUE(tokenizer.read(reader, t, data));
EXPECT_EQ(te.id, t.id);
- EXPECT_EQ(te.content, t.content);
+ if (te.id != Tokens::Data) {
+ EXPECT_EQ(te.content, t.content);
+ } else {
+ Variant text = data.text(WhitespaceMode::PRESERVE);
+ ASSERT_TRUE(text.isString());
+ EXPECT_EQ(te.content, text.asString());
+ }
EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
EXPECT_EQ(te.location.getStart(), t.location.getStart());
EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
}
- ASSERT_FALSE(tokenizer.read(reader, t));
-}
-
-TEST(Tokenizer, commentTestWhitespaceCollapse)
-{
- CharReader reader{"Test/Test /* Block Comment */", 0};
- // 012345678901234567890123456789
- // 0 1 2
- Tokenizer tokenizer(WhitespaceMode::COLLAPSE);
- const TokenId t1 = tokenizer.registerToken("/");
- const TokenId t2 = tokenizer.registerToken("/*");
- const TokenId t3 = tokenizer.registerToken("*/");
-
- std::vector<Token> expected = {
- {Tokens::Data, "Test", SourceLocation{0, 0, 4}},
- {t1, "/", SourceLocation{0, 4, 5}},
- {Tokens::Data, "Test", SourceLocation{0, 5, 9}},
- {t2, "/*", SourceLocation{0, 10, 12}},
- {Tokens::Data, "Block Comment", SourceLocation{0, 13, 26}},
- {t3, "*/", SourceLocation{0, 27, 29}}};
-
- Token t;
- for (auto &te : expected) {
- EXPECT_TRUE(tokenizer.read(reader, t));
- EXPECT_EQ(te.id, t.id);
- EXPECT_EQ(te.content, t.content);
- EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
- EXPECT_EQ(te.location.getStart(), t.location.getStart());
- EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
- }
- ASSERT_FALSE(tokenizer.read(reader, t));
+ TokenizedData data;
+ ASSERT_FALSE(tokenizer.read(reader, t, data));
}
}
diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp
index d52fa5b..3d01007 100644
--- a/test/formats/osml/OsmlStreamParserTest.cpp
+++ b/test/formats/osml/OsmlStreamParserTest.cpp
@@ -30,11 +30,21 @@ namespace ousia {
static TerminalLogger logger(std::cerr, true);
// static ConcreteLogger logger;
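+// Skips a single whitespace-only DATA event, if present, and returns the
+// state that follows it; the parser now reports inter-command whitespace as
+// data, which most of the assertions below want to ignore.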
+static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader)
+{
+ OsmlStreamParser::State res = reader.parse();
+ if (res == OsmlStreamParser::State::DATA) {
+ EXPECT_FALSE(reader.getData().hasNonWhitespaceText());
+ res = reader.parse();
+ }
+ return res;
+}
+
static void assertCommand(OsmlStreamParser &reader, const std::string &name,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader));
EXPECT_EQ(name, reader.getCommandName().asString());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getCommandName().getLocation().getStart());
@@ -57,16 +67,19 @@ static void assertCommand(OsmlStreamParser &reader, const std::string &name,
static void assertData(OsmlStreamParser &reader, const std::string &data,
SourceOffset start = InvalidSourceOffset,
- SourceOffset end = InvalidSourceOffset)
+ SourceOffset end = InvalidSourceOffset,
+ WhitespaceMode mode = WhitespaceMode::COLLAPSE)
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- EXPECT_EQ(data, reader.getData().asString());
+ Variant text = reader.getText(mode);
+ ASSERT_TRUE(text.isString());
+ EXPECT_EQ(data, text.asString());
if (start != InvalidSourceOffset) {
- EXPECT_EQ(start, reader.getData().getLocation().getStart());
+ EXPECT_EQ(start, text.getLocation().getStart());
EXPECT_EQ(start, reader.getLocation().getStart());
}
if (end != InvalidSourceOffset) {
- EXPECT_EQ(end, reader.getData().getLocation().getEnd());
+ EXPECT_EQ(end, text.getLocation().getEnd());
EXPECT_EQ(end, reader.getLocation().getEnd());
}
}
@@ -75,7 +88,7 @@ static void assertFieldStart(OsmlStreamParser &reader, bool defaultField,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader));
EXPECT_EQ(defaultField, reader.inDefaultField());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
@@ -89,7 +102,7 @@ static void assertFieldEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader));
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
}
@@ -103,7 +116,7 @@ static void assertAnnotationStart(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader));
EXPECT_EQ(name, reader.getCommandName().asString());
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getCommandName().getLocation().getStart());
@@ -131,7 +144,7 @@ static void assertAnnotationEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader));
ASSERT_EQ(name, reader.getCommandName().asString());
if (!elementName.empty()) {
ASSERT_EQ(1U, reader.getCommandArguments().asMap().size());
@@ -152,7 +165,7 @@ static void assertEnd(OsmlStreamParser &reader,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset)
{
- ASSERT_EQ(OsmlStreamParser::State::END, reader.parse());
+ ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader));
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, reader.getLocation().getStart());
}
@@ -205,26 +218,14 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak)
assertData(reader, "hello world", 1, 14);
}
-TEST(OsmlStreamParser, escapeWhitespace)
-{
- const char *testString = " hello\\ \\ world ";
- // 012345 67 89012345
- // 0 1
- CharReader charReader(testString);
-
- OsmlStreamParser reader(charReader, logger);
-
- assertData(reader, "hello world", 1, 15);
-}
-
static void testEscapeSpecialCharacter(const std::string &c)
{
CharReader charReader(std::string("\\") + c);
OsmlStreamParser reader(charReader, logger);
EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- EXPECT_EQ(c, reader.getData().asString());
+ EXPECT_EQ(c, reader.getText().asString());
- SourceLocation loc = reader.getData().getLocation();
+ SourceLocation loc = reader.getText().getLocation();
EXPECT_EQ(0U, loc.getStart());
EXPECT_EQ(1U + c.size(), loc.getEnd());
}
@@ -253,16 +254,16 @@ TEST(OsmlStreamParser, singleLineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(33U, loc.getStart());
ASSERT_EQ(34U, loc.getEnd());
}
@@ -279,16 +280,16 @@ TEST(OsmlStreamParser, multilineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(40U, loc.getStart());
ASSERT_EQ(41U, loc.getEnd());
}
@@ -305,16 +306,16 @@ TEST(OsmlStreamParser, nestedMultilineComment)
OsmlStreamParser reader(charReader, logger);
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("a", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("a", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
}
{
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse());
- ASSERT_EQ("b", reader.getData().asString());
- SourceLocation loc = reader.getData().getLocation();
+ ASSERT_EQ("b", reader.getText().asString());
+ SourceLocation loc = reader.getText().getLocation();
ASSERT_EQ(40U, loc.getStart());
ASSERT_EQ(41U, loc.getEnd());
}
@@ -569,8 +570,11 @@ TEST(OsmlStreamParser, multipleCommands)
OsmlStreamParser reader(charReader, logger);
assertCommand(reader, "a", 0, 2);
+ assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE);
assertCommand(reader, "b", 3, 5);
+ assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE);
assertCommand(reader, "c", 6, 8);
+ assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE);
assertCommand(reader, "d", 9, 11);
assertEnd(reader, 11, 11);
}
@@ -584,10 +588,13 @@ TEST(OsmlStreamParser, fieldsWithSpaces)
OsmlStreamParser reader(charReader, logger);
assertCommand(reader, "a", 0, 2);
+ assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE);
assertFieldStart(reader, false, 3, 4);
assertCommand(reader, "b", 4, 6);
+ assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE);
assertCommand(reader, "c", 7, 9);
assertFieldEnd(reader, 9, 10);
+ assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE);
assertFieldStart(reader, false, 16, 17);
assertCommand(reader, "d", 17, 19);
assertFieldEnd(reader, 19, 20);
diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp
index 3293370..6942166 100644
--- a/test/formats/osxml/OsxmlEventParserTest.cpp
+++ b/test/formats/osxml/OsxmlEventParserTest.cpp
@@ -21,6 +21,7 @@
#include <core/frontend/TerminalLogger.hpp>
#include <core/common/CharReader.hpp>
#include <core/common/Variant.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <formats/osxml/OsxmlEventParser.hpp>
@@ -74,13 +75,11 @@ public:
};
static std::vector<std::pair<OsxmlEvent, Variant>> parseXml(
- const char *testString,
- WhitespaceMode whitespaceMode = WhitespaceMode::TRIM)
+ const char *testString)
{
TestOsxmlEventListener listener;
CharReader reader(testString);
OsxmlEventParser parser(reader, listener, logger);
- parser.setWhitespaceMode(whitespaceMode);
parser.parse();
return listener.events;
}
@@ -157,7 +156,7 @@ TEST(OsxmlEventParser, magicTopLevelTagInside)
ASSERT_EQ(expectedEvents, events);
}
-TEST(OsxmlEventParser, commandWithDataPreserveWhitespace)
+TEST(OsxmlEventParser, commandWithData)
{
const char *testString = "<a> hello \n world </a>";
// 012345678901 234567890123
@@ -168,50 +167,12 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace)
{OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}},
{OsxmlEvent::FIELD_END, Variant::arrayType{}}};
- auto events = parseXml(testString, WhitespaceMode::PRESERVE);
+ auto events = parseXml(testString);
ASSERT_EQ(expectedEvents, events);
// Check the location of the text
ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart());
ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd());
}
-
-TEST(OsxmlEventParser, commandWithDataTrimWhitespace)
-{
- const char *testString = "<a>  hello  \n world </a>";
- // 012345678901 234567890123
- // 0 1 2
-
- std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{
- {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}},
- {OsxmlEvent::DATA, Variant::arrayType{"hello  \n world"}},
- {OsxmlEvent::FIELD_END, Variant::arrayType{}}};
-
- auto events = parseXml(testString, WhitespaceMode::TRIM);
- ASSERT_EQ(expectedEvents, events);
-
- // Check the location of the text
- ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart());
- ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd());
-}
-
-TEST(OsxmlEventParser, commandWithDataCollapseWhitespace)
-{
- const char *testString = "<a>  hello  \n world </a>";
- // 012345678901 234567890123
- // 0 1 2
-
- std::vector<std::pair<OsxmlEvent, Variant>> expectedEvents{
- {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}},
- {OsxmlEvent::DATA, Variant::arrayType{"hello world"}},
- {OsxmlEvent::FIELD_END, Variant::arrayType{}}};
-
- auto events = parseXml(testString, WhitespaceMode::COLLAPSE);
- ASSERT_EQ(expectedEvents, events);
-
- // Check the location of the text
- ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart());
- ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd());
-}
}