From 5a67fc7d682ddba6a862aacf616d02cd20b727eb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Tue, 24 Feb 2015 02:13:46 +0100 Subject: start of branch, commit log will be rewritten --- CMakeLists.txt | 2 +- src/core/common/Token.cpp | 24 +++ src/core/common/Token.hpp | 181 +++++++++++++++++ src/core/common/WhitespaceHandler.hpp | 284 --------------------------- src/core/parser/stack/DocumentHandler.cpp | 24 ++- src/core/parser/stack/DocumentHandler.hpp | 4 +- src/core/parser/stack/Handler.cpp | 25 ++- src/core/parser/stack/Handler.hpp | 74 ++++--- src/core/parser/stack/Stack.cpp | 55 ++++-- src/core/parser/stack/Stack.hpp | 18 +- src/core/parser/utils/SourceOffsetVector.hpp | 28 ++- src/core/parser/utils/Token.cpp | 24 --- src/core/parser/utils/Token.hpp | 142 -------------- src/core/parser/utils/TokenTrie.cpp | 16 +- src/core/parser/utils/TokenTrie.hpp | 11 +- src/core/parser/utils/TokenizedData.cpp | 133 +++++++++++-- src/core/parser/utils/TokenizedData.hpp | 214 ++++++++++++++++---- src/core/parser/utils/Tokenizer.cpp | 271 ++++++++++++------------- src/core/parser/utils/Tokenizer.hpp | 140 +++++++------ src/formats/osml/OsmlStreamParser.cpp | 157 ++++----------- src/formats/osml/OsmlStreamParser.hpp | 85 ++++---- src/formats/osxml/OsxmlEventParser.cpp | 63 +----- src/formats/osxml/OsxmlEventParser.hpp | 31 +-- test/core/parser/stack/StackTest.cpp | 15 +- test/core/parser/utils/TokenizedDataTest.cpp | 90 +++++---- test/core/parser/utils/TokenizerTest.cpp | 248 ++++++++++------------- test/formats/osml/OsmlStreamParserTest.cpp | 79 ++++---- test/formats/osxml/OsxmlEventParserTest.cpp | 47 +---- 28 files changed, 1184 insertions(+), 1301 deletions(-) create mode 100644 src/core/common/Token.cpp create mode 100644 src/core/common/Token.hpp delete mode 100644 src/core/common/WhitespaceHandler.hpp delete mode 100644 src/core/parser/utils/Token.cpp delete mode 100644 src/core/parser/utils/Token.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 
ea5c3aa..54f971c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ ADD_LIBRARY(ousia_core src/core/common/Rtti src/core/common/RttiBuilder src/core/common/SourceContextReader + src/core/common/Token src/core/common/Utils src/core/common/Variant src/core/common/VariantConverter @@ -189,7 +190,6 @@ ADD_LIBRARY(ousia_core src/core/parser/stack/Stack src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector - src/core/parser/utils/Token src/core/parser/utils/TokenizedData src/core/parser/utils/Tokenizer src/core/parser/utils/TokenTrie diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp new file mode 100644 index 0000000..8bcdbb5 --- /dev/null +++ b/src/core/common/Token.cpp @@ -0,0 +1,24 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "Token.hpp" + +namespace ousia { +// Stub to make sure Tokens.hpp is valid +} + diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp new file mode 100644 index 0000000..07d7c8f --- /dev/null +++ b/src/core/common/Token.hpp @@ -0,0 +1,181 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Token.hpp + * + * Definition of the TokenId id and constants for some special tokens. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_HPP_ +#define _OUSIA_TOKEN_HPP_ + +#include +#include +#include +#include + +#include + +namespace ousia { + +/** + * The TokenId is used to give each token id a unique id. + */ +using TokenId = uint32_t; + +/** + * Type used for storing token lengths. + */ +using TokenLength = uint16_t; + +/** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set; + +/** + * Namespace containing constants for TokenId instances with special meaning. + */ +namespace Tokens { +/** + * Token which is not a token. + */ +constexpr TokenId Empty = std::numeric_limits::max(); + +/** + * Token which represents data (represented as TokenizedData). + */ +constexpr TokenId Data = std::numeric_limits::max() - 1; + +/** + * Token which represents a newline token. 
+ */ +constexpr TokenId Newline = std::numeric_limits::max() - 2; + +/** + * Token which represents a paragraph token -- issued if two consecutive + * newlines occur with optionally any amount of whitespace between them. The + * paragraph token is not repeated until more text is reached. + */ +constexpr TokenId Paragraph = std::numeric_limits::max() - 3; + +/** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amount of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits::max() - 4; + +/** + * Token which represents an indentation token -- issued if the indentation of + * this line is larger than the indentation of the previous line. + */ +constexpr TokenId Indent = std::numeric_limits::max() - 5; + +/** + * Token which represents an unindentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Unindent = std::numeric_limits::max() - 6; + +/** + * Maximum token id to be used. Tokens allocated for users should not surpass + * this value. + */ +constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; +} + +/** + * The Token structure describes a token discovered by the Tokenizer or read + * from the TokenizedData struct. + */ +struct Token { + /** + * Id of this token. + */ + TokenId id; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : id(Tokens::Empty) {} + + /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(SourceLocation location) + : id(Tokens::Data), location(location) + { + } + + /** + * Constructor of the Token struct. 
+ * + * @param id represents the token id. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenId id, const std::string &content, SourceLocation location) + : id(id), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token id + * + * @param id is the id corresponding to the id of the token. + */ + Token(TokenId id) : id(id) {} + + /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_TOKEN_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include -#include - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. 
- * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. 
- */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, std::vector &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. 
- * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. 
- */ -template -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index bb04bd3..d44176a 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data(TokenizedData &data) { + // TODO: Handle this correctly + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // For now, accept "no data" as success + return true; + } + // We're past the region in which explicit fields can be defined in the // parent structure element scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. - if (!convertData(field, data, logger())) { + if (!convertData(field, text, logger())) { return false; } - parent->createChildDocumentPrimitive(data, fieldIdx); + parent->createChildDocumentPrimitive(text, fieldIdx); return true; } @@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification. 
forks.emplace_back(logger().fork()); - if (!convertData(primitiveField, data, forks.back())) { + if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data) createPath(fieldIdx, path, parent); // Then create the primitive element - parent->createChildDocumentPrimitive(data); + parent->createChildDocumentPrimitive(text); return true; } @@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data) if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", - data); + text); } else { logger().error("Could not read data with any of the possible fields:", - data); + text); size_t f = 0; for (auto field : defaultFields) { logger().note(std::string("Field ") + @@ -471,4 +479,4 @@ namespace RttiTypes { const Rtti DocumentField = RttiBuilder( "DocumentField").parent(&Node); } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 862081c..dda7d8b 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -167,7 +167,7 @@ public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; @@ -213,4 +213,4 @@ extern const Rtti DocumentField; } } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..3d413e8 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "Callbacks.hpp" @@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool 
EmptyHandler::data(Variant &data) +bool EmptyHandler::data(TokenizedData &data) { // Support any data return true; @@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(Variant &data) +bool StaticHandler::data(TokenizedData &data) { - logger().error("Did not expect any data here", data); - return false; + if (data.text(WhitespaceMode::TRIM) != nullptr) { + logger().error("Did not expect any data here", data); + return false; + } + return true; } /* Class StaticFieldHandler */ @@ -227,12 +231,19 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data(TokenizedData &data) { + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // Providing no data here is ok as long as the "doHandle" callback + // function has already been called + return handled; + } + // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(data, args); + doHandle(text, args); return true; } @@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - data); + text); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..929466d 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -31,6 +31,7 @@ namespace ousia { class ParserScope; class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -158,40 +159,63 @@ protected: */ const std::string &name() const; -public: - /** - * Virtual destructor. - */ - virtual ~Handler(); - /** * Calls the corresponding function in the Callbacks instance. Sets the * whitespace mode that specifies how string data should be processed. 
The * calls to this function are placed on a stack by the underlying Stack - * class. + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * * @param whitespaceMode specifies one of the three WhitespaceMode constants * PRESERVE, TRIM or COLLAPSE. */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); + void pushWhitespaceMode(WhitespaceMode whitespaceMode); /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushs that were performed by the pushWhitespaceMode() + * method of the same handler. */ - void registerToken(const std::string &token); + void popWhitespaceMode(); /** - * Calls the corresponding function in the Callbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * - * @param token is the token string that should be unregistered. + * @param tokens is a list of tokens that should be reported to this handler + * instance via the "token" method. 
*/ - void unregisterToken(const std::string &token); + void pushTokens(const std::vector &tokens); + + /** + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushs that were performed by the pushWhitespaceMode() + * method of the same handler. + */ + void popWhitespaceMode(); + + + /** + * Calls the corresponding function in the Callbacks instance. This method + * registers the given tokens as tokens that are generally available, tokens + * must be explicitly enabled using the "pushTokens" and "popTokens" method. + * Tokens that have not been registered are not guaranteed to be reported, + * even though they are + */ + void registerTokens(const std::vector &tokens); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); /** * Returns the command name for which the handler was created. @@ -299,11 +323,11 @@ public: * Handler instance. Should return true if the data could be handled, false * otherwise. * - * @param data is a string variant containing the character data and its - * location. + * @param data is an instance of TokenizedData containing the segmented + * character data and its location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(Variant &data) = 0; + virtual bool data(TokenizedData &data) = 0; }; /** @@ -333,7 +357,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; /** * Creates an instance of the EmptyHandler class. 
@@ -359,7 +383,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; /** @@ -412,7 +436,7 @@ protected: public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; } } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..309c9a0 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } -void Stack::data(const Variant &data) +void Stack::data(TokenizedData data) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); + // TODO: Rewrite this function for token handling + // TODO: This loop needs to be refactored out + while (!data.atEnd()) { + // End handlers that already had a default field and are currently not + // active. 
+ endOverdueHandlers(); - while (true) { - // Check whether there is any command the data can be sent to + const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + + // Check whether there is any command the data can be sent to -- if not, + // make sure the data actually is data if (stack.empty()) { - throw LoggableException("No command here to receive data.", data); + if (hasNonWhitespaceText) { + throw LoggableException("No command here to receive data.", data); + } + return; } // Fetch the current command handler information @@ -440,7 +449,10 @@ void Stack::data(const Variant &data) // If the "hadDefaultField" flag is set, we already issued an error // message if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); + if (hasNonWhitespaceText) { + logger().error("Did not expect any data here", data); + } + return; } } @@ -454,8 +466,16 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - Variant dataCopy = data; - valid = info.handler->data(dataCopy); + // Create a fork of the TokenizedData and let the handler work + // on it + TokenizedData dataFork = data; + valid = info.handler->data(dataFork); + + // If the data was validly handled by the handler, commit the + // change + if (valid) { + data = dataFork; + } } catch (LoggableException ex) { loggerFork.log(ex); @@ -482,6 +502,19 @@ void Stack::data(const Variant &data) } } +void Stack::data(const Variant &stringData) +{ + // Fetch the SourceLocation of the given stringData variant + SourceLocation loc = stringData.getLocation(); + + // Create a TokenizedData instance and feed the given string data into it + TokenizedData tokenizedData(loc.getSourceId()); + tokenizedData.append(stringData.asString(), loc.getStart()); + + // Call the actual "data" method + data(tokenizedData); +} + void Stack::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty @@ -584,4 +617,4 @@ void 
Stack::token(Variant token) // TODO } } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..cd29b28 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -44,6 +44,7 @@ namespace ousia { // Forward declarations class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -292,13 +293,24 @@ public: void command(const Variant &name, const Variant::mapType &args); /** - * Function that shuold be called whenever character data is found in the + * Function that should be called whenever character data is found in the * input stream. May only be called if the currently is a command on the * stack. * - * @param data is a string variant containing the data that has been found. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. + */ + void data(TokenizedData data); + + /** + * Function that should be called whenever character data is found in the + * input stream. The given string variant is converted into a TokenizedData + * instance internally. + * + * @param stringData is a string variant containing the data that has been + * found. */ - void data(const Variant &data); + void data(const Variant &stringData); /** * Function that should be called whenever a new field starts. Fields of the diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..aaebe7d 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -127,7 +127,7 @@ public: * read. * @return a pair containing start and end source offset. */ - std::pair loadOffset(size_t idx) + std::pair loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); @@ -157,7 +157,31 @@ public: /** * Returns the number of characters for which offsets are stored. 
*/ - size_t size() { return lens.size(); } + size_t size() const { return lens.size(); } + + /** + * Trims the length of the TokenizedData instance to the given length. + * Removes all token matches that lie within the trimmed region. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) { + if (length < size()) { + lens.resize(length); + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + } + } + + /** + * Resets the SourceOffsetVector to the state it had when it was + * constructed. + */ + void clear() { + lens.clear(); + offsets.clear(); + lastEnd = 0; + } }; } diff --git a/src/core/parser/utils/Token.cpp b/src/core/parser/utils/Token.cpp deleted file mode 100644 index 8bcdbb5..0000000 --- a/src/core/parser/utils/Token.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "Token.hpp" - -namespace ousia { -// Stub to make sure Tokens.hpp is valid -} - diff --git a/src/core/parser/utils/Token.hpp b/src/core/parser/utils/Token.hpp deleted file mode 100644 index f907450..0000000 --- a/src/core/parser/utils/Token.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file Token.hpp - * - * Definition of the TokenId id and constants for some special tokens. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_HPP_ -#define _OUSIA_TOKEN_HPP_ - -#include -#include -#include - -#include - -namespace ousia { - -/** - * The TokenId is used to give each token id a unique id. - */ -using TokenId = uint32_t; - -/** - * Type used for storing token lengths. - */ -using TokenLength = uint16_t; - -/** - * Namespace containing constants for TokenId instances with special meaning. - */ -namespace Tokens { -/** - * Token which is not a token. - */ -constexpr TokenId Empty = std::numeric_limits::max(); - -/** - * Token which represents data (represented as TokenizedData). - */ -constexpr TokenId Data = std::numeric_limits::max() - 1; - -/** - * Token which represents a newline token. 
- */ -constexpr TokenId Newline = std::numeric_limits::max() - 2; - -/** - * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. - */ -constexpr TokenId Paragraph = std::numeric_limits::max() - 3; - -/** - * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. - */ -constexpr TokenId Indentation = std::numeric_limits::max() - 4; - -/** - * Maximum token id to be used. Tokens allocated for users should not surpass - * this value. - */ -constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; -} - -/** - * The Token structure describes a token discovered by the Tokenizer or read - * from the TokenizedData struct. - */ -struct Token { - /** - * Id of the id of this token. - */ - TokenId id; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - Token() : id(Tokens::Empty) {} - - /** - * Constructor of the Token struct. - * - * @param id represents the token id. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - Token(TokenId id, const std::string &content, SourceLocation location) - : id(id), content(content), location(location) - { - } - - /** - * Constructor of the Token struct, only initializes the token id - * - * @param id is the id corresponding to the id of the token. - */ - Token(TokenId id) : id(id) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. 
- * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; -} - -#endif /* _OUSIA_TOKENS_HPP_ */ - diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenId type) noexcept + TokenId id) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != Tokens::Empty) { + if (node->id != Tokens::Empty) { return false; } // Otherwise just set the type to the given type. - node->type = type; + node->id = id; return true; } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != Tokens::Empty || node->children.size() > 1) && + if ((node->id != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept } // If the node type is already Tokens::Empty, we cannot do anything here - if (node->type == Tokens::Empty) { + if (node->id == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. 
Set the // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = Tokens::Empty; + node->id = Tokens::Empty; return true; } @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept } node = it->second.get(); } - return node->type; + return node->id; } } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@ #include #include -#include "Token.hpp" +#include namespace ousia { @@ -75,10 +75,9 @@ public: ChildMap children; /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. + * Id of the token represented by this node. */ - TokenId type; + TokenId id; /** * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param type is the descriptor that should be set for this token. + * @param id is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, TokenId type) noexcept; + bool registerToken(const std::string &token, TokenId id) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..0ec56af 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -110,19 +110,19 @@ private: std::vector buf; /** - * Vector containing all token marks. + * Vector storing all the character offsets efficiently. */ - std::vector marks; + SourceOffsetVector offsets; /** - * Vector storing all the character offsets efficiently. + * Vector containing all token marks. 
*/ - SourceOffsetVector offsets; + mutable std::vector marks; /** * Flag indicating whether the internal "marks" vector is sorted. */ - bool sorted; + mutable bool sorted; public: /** @@ -150,9 +150,12 @@ public: // Extend the text regions, interpolate the source position (this may // yield incorrect results) const size_t size = buf.size(); - for (SourceOffset offs = offsStart; offs < offsStart + data.size(); - offs++) { - offsets.storeOffset(offs, offs + 1); + for (size_t i = 0; i < data.size(); i++) { + if (offsStart != InvalidSourceOffset) { + offsets.storeOffset(offsStart + i, offsStart + i + 1); + } else { + offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); + } } return size; @@ -213,7 +216,7 @@ public: * available. */ bool next(Token &token, WhitespaceMode mode, - const std::unordered_set &tokens, size_t &cursor) + const std::unordered_set &tokens, size_t &cursor) const { // Sort the "marks" vector if it has not been sorted yet. if (!sorted) { @@ -222,10 +225,11 @@ public: } // Fetch the next larger TokenMark instance, make sure the token is in - // the "enabled" list + // the "enabled" list and within the buffer range auto it = std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); - while (it != marks.end() && tokens.count(it->id) == 0) { + while (it != marks.end() && (tokens.count(it->id) == 0 || + it->bufStart + it->len > buf.size())) { it++; } @@ -303,12 +307,59 @@ public: return false; } + /** + * Resets the TokenizedDataImpl instance to the state it had when it was + * constructred. + */ + void clear() + { + buf.clear(); + marks.clear(); + offsets.clear(); + sorted = true; + } + + /** + * Trims the length of the TokenizedDataImpl instance to the given length. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. 
+ */ + void trim(size_t length) + { + if (length < size()) { + buf.resize(length); + offsets.trim(length); + } + } + /** * Returns the current size of the internal buffer. * * @return the size of the internal character buffer. */ - size_t getSize() { return buf.size(); } + size_t size() const { return buf.size(); } + + /** + * Returns true if no data is in the data buffer. + * + * @return true if the "buf" instance has no data. + */ + bool empty() const { return buf.empty(); } + + /** + * Returns the current location of all data in the buffer. + * + * @return the location of the entire data represented by this instance. + */ + SourceLocation getLocation() const + { + if (empty()) { + return SourceLocation{sourceId}; + } + return SourceLocation{sourceId, offsets.loadOffset(0).first, + offsets.loadOffset(size()).second}; + } }; /* Class TokenizedData */ @@ -335,7 +386,7 @@ size_t TokenizedData::append(char c, SourceOffset offsStart, void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->getSize() - len, len); + impl->mark(id, impl->size() - len, len); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) @@ -343,23 +394,67 @@ void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) impl->mark(id, bufStart, len); } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { - return impl->next(token, mode, tokens, cursor); + impl->clear(); + tokens.clear(); + cursor = 0; } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const +{ + return impl->getLocation(); +} + +TokenizedDataReader reader() const +{ + return TokenizedDataReader(impl, std::unordered_set{}, 0, 0); +} + +/* Class TokenizedDataReader */ + +TokenizedDataReaderFork 
TokenizedDataReader::fork() +{ + return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const { return readCursor >= size(); } + +bool TokenizedData::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + peekCursor = readCursor; + return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedData::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, peekCursor); +} + +Variant TokenizedData::text(WhitespaceMode mode) { // Copy the current cursor position to not update the actual cursor position // if the operation was not successful size_t cursorCopy = cursor; + Token token; if (!impl->next(token, mode, tokens, cursorCopy) || token.id != Tokens::Data) { - return false; + return Variant{nullptr}; } - // There is indeed a text token, update the internal cursor position + // There is indeed a text token, update the internal cursor position and + // return the token as variant. cursor = cursorCopy; - return true; + Variant res = Variant::fromString(token.content); + res.setLocation(token.getLocation()); + return res; } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..85b80ae 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,42 +36,29 @@ #include #include +#include #include - -#include "Token.hpp" +#include namespace ousia { // Forward declaration class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork; /** * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. 
+ * The data stored in TokenizedData */ class TokenizedData { private: /** - * Shared pointer pointing at the internal data. This data is shared when - * copying TokenizedData instances, which corresponds to forking a - * TokenizedData instance. + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. */ std::shared_ptr impl; - /** - * Contains all currently enabled token ids. - */ - std::unordered_set tokens; - - /** - * Position from which the last element was read from the internal buffer. - * This information is not shared with the other instances of TokenizedData - * pointing at the same location. - */ - size_t cursor; - public: /** * Default constructor, creates a new instance of TokenizedData, sets the @@ -136,25 +123,121 @@ public: void mark(TokenId id, size_t bufStart, TokenLength len); /** - * Enables a single token id. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Resets the TokenizedData instance to the state it had when it was + * constructred. + */ + void clear(); + + /** + * Trims the length of the TokenizedData instance to the given length. Note + * that this function does not remove any token matches for performance + * reasons, it merely renders them incaccessible. Appending new data after + * calling trim will make the token marks accessible again. Thus this method + * should be the last function called to modify the data buffer and the + * token marks. * - * @param id is the TokenId of the token that should be enabled. + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. */ - void enableToken(TokenId id) { tokens.insert(id); } + void trim(size_t length); /** - * Enables a set of token ids. Enabled tokens will no longer be returned as - * text. 
Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Returns the number of characters currently represented by this + * TokenizedData instance. + */ + size_t size() const; + + /** + * Returns true if the TokenizedData instance is empty, false otherwise. * - * @param ids is the TokenId of the token that should be enabled. + * @return true if not data is stored inside the TokenizedData instance. */ - void enableToken(const std::unordered_set &ids) - { - tokens.insert(ids.begin(), ids.end()); - } + bool empty() const; + + /** + * Returns the location of the entire TokenizedData instance. + * + * @return the location of the entire data represented by this instance. + */ + SourceLocation getLocation() const; + + /** + * Returns a TokenizedDataReader instance that can be used to access the + * data. + * + * @return a new TokenizedDataReader instance pointing at the beginning of + * the internal buffer. + */ + TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader + */ +class TokenizedDataReader { +private: + friend TokenizedData; + + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. + */ + std::shared_ptr impl; + + /** + * Position from which the last element was read from the internal buffer. + */ + size_t readCursor; + + /** + * Position from which the last element was peeked from the internal buffer. + */ + size_t peekCursor; + + /** + * Private constructor of TokenizedDataReader, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader. + * + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. 
+ */ + TokenizedDataReader(std::shared_ptr impl, + size_t readCursor, size_t peekCursor); + +public: + /** + * Returns a new TokenizedDataReaderFork from which tokens and text can be + * read without advancing this reader instance. + */ + TokenizedDataReaderFork fork(); + + /** + * Returns true if this TokenizedData instance is at the end. + * + * @return true if the end of the TokenizedData instance has been reached. + */ + bool atEnd() const; + + /** + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. Advances the + * internal cursor and re + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool read(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Stores the next token in the given token reference, returns true if the @@ -162,12 +245,26 @@ public: * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. 
*/ - bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool peek(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::COLLAPSE); + + /** + * Consumes the peeked tokens, the read cursor will now be at the position + * of the peek cursor. + */ + void consumePeek() { readCursor = peekCursor; } + + /** + * Resets the peek cursor to the position of the read cursor. + */ + void resetPeek() { peekCursor = readCursor; } /** * Stores the next text token in the given token reference, returns true if @@ -178,12 +275,53 @@ public: * stored. The TokenId is set to Tokens::Empty if there are no more tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. - * @return true if the operation was successful and there is a next token, - * false if there are no more tokens. + * @return a string variant with the data if there is any data or a nullptr + * variant if there is no text. */ - bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE); }; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader + */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: + friend TokenizedDataReader; + + /** + * Reference pointing at the parent TokenizedDataReader to which changes may + * be commited. + */ + TokenizedDataReader &parent; + + /** + * Private constructor of TokenizedDataReaderFork, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader and a reference at the parent TokenizedDataReader. + * + * @param parent is the TokenizedDataReader instance to which the current + * read/peek progress may be commited. + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. 
+ * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReaderFork(TokenizedDataReader &parent, + std::shared_ptr impl, + size_t readCursor, size_t peekCursor) + : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) + { + } + +public: + /** + * Commits the read/peek progress to the underlying parent. + */ + void commit() { parent = *this; } +} } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..51787cd 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@ #include #include #include -#include +#include "TokenizedData.hpp" #include "Tokenizer.hpp" namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch { Token token; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; + size_t dataStartOffset; /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. + * Set to true if the matched token is a primary token. */ - size_t textEnd; + bool primary; /** * Constructor of the TokenMatch class. */ - TokenMatch() : textLength(0), textEnd(0) {} + TokenMatch() : dataStartOffset(0), primary(false) {} /** * Returns true if this TokenMatch instance actually represents a match. + * + * @return true if the TokenMatch actually has a match. + */ + bool hasMatch() const { return token.id != Tokens::Empty; } + + /** + * Returns the length of the matched token. + * + * @return the length of the token string. 
*/ - bool hasMatch() { return token.id != Tokens::Empty; } + size_t size() const { return token.content.size(); } }; /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private: size_t start; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; + size_t dataStartOffset; public: /** * Constructor of the TokenLookup class. * * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. + * @param start is the start position in the source file. + * @param dataStartOffset is the current length of the TokenizedData buffer. */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t dataStartOffset) + : node(node), start(start), dataStartOffset(dataStartOffset) { } /** * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). + * character. If a complete token is matched, stores the match in the given + * TokenMatch reference and returns true. * * @param c is the character that should be appended to the current prefix. * @param lookups is a list to which new TokeLookup instances are added -- @@ -123,73 +122,48 @@ public: * Tokenizer. * @param end is the end byte offset of the current character. * @param sourceId is the source if of this file. 
+ * @return true if a token was matched, false otherwise. */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) + bool advance(char c, std::vector &lookups, TokenMatch &match, + const std::vector &tokens, + SourceOffset end, SourceId sourceId) { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node + // Set to true once a token has been matched + bool res = false; + + // Check whether we can continue the current token path, if not, abort auto it = node->children.find(c); if (it == node->children.end()) { - return; + return res; } // Check whether the new node represents a complete token a whether it // is longer than the current token. If yes, replace the current token. node = it->second.get(); - if (node->type != Tokens::Empty) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - Token{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } + if (node->id != Tokens::Empty) { + const Tokenizer::TokenDescriptor &descr = tokens[node->id]; + match.token = Token(node->id, descr.string, + SourceLocation(sourceId, start, end)); + match.dataStartOffset = dataStartOffset; + match.primary = descr.primary; + res = true; } // If this state can possibly be advanced, store it in the states list. if (!node->children.empty()) { lookups.emplace_back(*this); } + return res; } }; - -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. 
- */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.id = Tokens::Data; -} } /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template -bool Tokenizer::next(CharReader &reader, Token &token) +template +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -199,43 +173,68 @@ bool Tokenizer::next(CharReader &reader, Token &token) // Prepare the lookups in the token trie const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; + TokenMatch bestMatch; std::vector lookups; std::vector nextLookups; - // Instantiate the text handler - TextHandler textHandler; - // Peek characters from the reader and try to advance the current token tree // cursor char c; + const size_t initialDataSize = data.size(); size_t charStart = reader.getPeekOffset(); const SourceId sourceId = reader.getSourceId(); while (reader.peek(c)) { const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; + const size_t dataStartOffset = data.size(); // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); + if (!bestMatch.hasMatch()) { + lookups.emplace_back(root, charStart, dataStartOffset); } // Try to 
advance all other lookups with the new character + TokenMatch match; for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + // Continue if the current lookup + if (!lookup.advance(c, nextLookups, match, tokens, charEnd, + sourceId)) { + continue; + } + + // If the matched token is primary, check whether it is better than + // the current best match, if yes, replace the best match. In any + // case just continue + if (match.primary) { + if (match.size() > bestMatch.size()) { + bestMatch = match; + } + continue; + } + + // Otherwise -- if the matched token is a non-primary token (and no + // primary token has been found until now) -- mark the match in the + // TokenizedData + if (!bestMatch.hasMatch()) { + data.mark(match.token.id, data.size() - match.size() + 1, + match.size()); + } } // We have found a token and there are no more states to advance or the // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { + if (bestMatch.hasMatch()) { + if ((nextLookups.empty() || data.size() > initialDataSize)) { break; } } else { // Record all incomming characters - textHandler.append(c, charStart, charEnd); + data.append(c, charStart, charEnd); + + // Special token processing + // TODO: Build a special state machine for this in another class + if (c == '\n') { + data.mark(Tokens::Newline, 1); + } } // Swap the lookups and the nextLookups list @@ -246,60 +245,53 @@ bool Tokenizer::next(CharReader &reader, Token &token) charStart = charEnd; } - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildDataToken(textHandler, match, sourceId); + // If we found data, emit a corresponding data token + if (data.size() > initialDataSize && + (!bestMatch.hasMatch() || + bestMatch.dataStartOffset > initialDataSize)) { + // If we have a "bestMatch" wich starts after text data 
has started, + // trim the TokenizedData to this offset + if (bestMatch.dataStartOffset > initialDataSize) { + data.trim(bestMatch.dataStartOffset); + } + + // Create a token containing the data location + bestMatch.token = Token{data.getLocation()}; } // Move the read/peek cursor to the end of the token, abort if an error // happens while doing so - if (match.hasMatch()) { + if (bestMatch.hasMatch()) { // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { + if (bestMatch.token.location.getEnd() == InvalidSourceOffset) { throw OusiaException{"Token end position offset out of range"}; } // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); + const size_t end = bestMatch.token.location.getEnd(); if (read) { reader.seek(end); } else { reader.seekPeekCursor(end); } - token = match.token; + token = bestMatch.token; } else { token = Token{}; } - return match.hasMatch(); + return bestMatch.hasMatch(); } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next(reader, token, data); } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next(reader, token, data); } -TokenId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token, bool primary) { // Abort if an empty token should be registered if 
(token.empty()) { @@ -309,8 +301,8 @@ TokenId Tokenizer::registerToken(const std::string &token) // Search for a new slot in the tokens list TokenId type = Tokens::Empty; for (size_t i = nextTokenId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; + if (!tokens[i].valid()) { + tokens[i] = TokenDescriptor(token, primary); type = i; break; } @@ -320,62 +312,47 @@ TokenId Tokenizer::registerToken(const std::string &token) // override the special token type handles if (type == Tokens::Empty) { type = tokens.size(); - if (type == Tokens::Data || type == Tokens::Empty) { + if (type >= Tokens::MaxTokenId) { throw OusiaException{"Token type ids depleted!"}; } - tokens.emplace_back(token); + tokens.emplace_back(token, primary); } nextTokenId = type + 1; - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list + // Try to register the token in the trie -- if this fails, remove it from + // the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; + tokens[type] = TokenDescriptor(); nextTokenId = type; return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id) { // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenId = type; + if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { + tokens[id] = TokenDescriptor(); + nextTokenId = id; return true; } return false; } -std::string Tokenizer::getTokenString(TokenId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const { - whitespaceMode = mode; + if (id < tokens.size()) { + return tokens[id]; + } + 
return EmptyTokenDescriptor; } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } - /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); } diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..2ddb9c9 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -33,39 +33,75 @@ #include #include -#include +#include -#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; +class TokenizedData; /** * The Tokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * Tokenizer always tries to extract the longest possible token from the - * tokenizer. + * CharReader. It allows to register and unregister tokens while parsing. Note + * that the Tokenizer always tries to extract the longest possible token from + * the tokenizer. Tokens can be registered as primary or non-primary token. 
If + * a Token is registered as a primary token, it is returned as a single Token + * instance if it occurs. In the non-primary case the token is returned as part + * of a segmented TokenizedData instance. */ class Tokenizer { -private: +public: /** - * Internally used token trie. This object holds all registered tokens. + * Internally used structure describing a registered token. */ - TokenTrie trie; + struct TokenDescriptor { + /** + * String describing the token. + */ + std::string string; + + /** + * Set to true if this token is primary. + */ + bool primary; + + /** + * Constructor of the TokenDescriptor class. + * + * @param string is the string representation of the registered token. + * @param primary specifies whether the token is a primary token that + * should be returned as a single token, or a secondary token, that + * should be returned as part of TokenizedData. + */ + TokenDescriptor(const std::string &string, bool primary) + : string(string), primary(primary) + { + } + + /** + * Default constructor. + */ + TokenDescriptor() : primary(false) {} + + /** + * Returns true if the TokenDescriptor represents a valid token. + */ + bool valid() { return !string.empty(); } + }; +private: /** - * Flag defining whether whitespaces should be preserved or not. + * Internally used token trie. This object holds all registered tokens. */ - WhitespaceMode whitespaceMode; + TokenTrie trie; /** * Vector containing all registered token types. */ - std::vector tokens; + std::vector tokens; /** * Next index in the tokens list where to search for a new token id. @@ -74,90 +110,78 @@ private: /** * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. + * function is templated in order to force optimized code generation for + * both reading and peeking. * - * @tparam TextHandler is the type to be used for the textHandler instance. 
- * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. + * @tparam read specifies whether the method should read the token or just + * peek. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return false if the end of the stream has been reached, true otherwise. */ - template - bool next(CharReader &reader, Token &token); + template + bool next(CharReader &reader, Token &token, TokenizedData &data); public: /** * Constructor of the Tokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. */ - Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(); /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. + * Registers the given string as a token. Returns a unique identifier + * describing the registered token. * * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if + * @param primary specifies whether the token is a primary token -- if true, + * the token will be returned as a single, standalone token. Otherwise the + * token will be returned as part of a "TokenizedData" structure. + * @return a unique identifier for the registered token or Tokens::Empty if * an error occured. */ - TokenId registerToken(const std::string &token); + TokenId registerToken(const std::string &token, bool primary = true); /** * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The - *TokenId - * must have been returned by registerToken. 
+ * TokenId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). + * because the token with the given TokenId was already unregistered). */ - bool unregisterToken(TokenId type); + bool unregisterToken(TokenId id); /** * Returns the token that was registered under the given TokenId id or - *an - * empty string if an invalid TokenId id is given. + * an empty string if an invalid TokenId id is given. * - * @param type is the TokenId id for which the corresponding token - *string + * @param id is the TokenId for which the corresponding TokenDescriptor * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. + * @return the registered TokenDescriptor or an invalid TokenDescriptor if + * the given TokenId is invalid. */ - WhitespaceMode getWhitespaceMode(); + const TokenDescriptor& lookupToken(TokenId id) const; /** * Reads a new token from the CharReader and stores it in the given - * Token instance. + * Token instance. If the token has the id Tokens::Data, use the "getData" + * method to fetch a reference at the underlying TokenizedData instance + * storing the data. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. 
*/ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..d4cdbf8 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -94,92 +94,11 @@ public: static const PlainFormatTokens OsmlTokens; -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. - */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. 
- */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. - */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) - : reader(reader), logger(logger), tokenizer(OsmlTokens) + : reader(reader), + logger(logger), + tokenizer(OsmlTokens), + data(reader.getSourceId()) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true, false}); @@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; - bool hasCharSiceNSSep = false; + bool hasCharSinceNSSep = false; std::vector identifier; size_t end = reader.getPeekOffset(); char c, c2; @@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) if ((first && Utils::isIdentifierStartCharacter(c)) || (!first && 
Utils::isIdentifierCharacter(c))) { identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { identifier.push_back(c); } else { @@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) // This is no longer the first character first = false; - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); + // Advance the hasCharSinceNSSep flag + hasCharSinceNSSep = allowNSSep && (c != ':'); end = reader.getPeekOffset(); reader.consumePeek(); @@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; - while (tokenizer.read(reader, token)) { + while (tokenizer.read(reader, token, data)) { + // Throw the comment data away + data.clear(); + if (token.id == OsmlTokens.BlockCommentEnd) { depth--; if (depth == 0) { @@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment() } } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData() { - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); + if (!data.empty()) { location = data.getLocation(); reader.resetPeek(); return true; @@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField() OsmlStreamParser::State OsmlStreamParser::parse() { - // Handler for incomming data - DataHandler handler; + // Reset the data handler + data.clear(); // Read tokens until the outer loop should be left Token token; - while (tokenizer.peek(reader, token)) { + while (tokenizer.peek(reader, token, data)) { const TokenId type = token.id; // Special handling for Backslash and Text @@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Try to parse a command if (Utils::isIdentifierStartCharacter(c)) { // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { + if (checkIssueData()) { return 
State::DATA; } @@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse() // If this was an annotation start token, add the parsed < to the // output if (type == OsmlTokens.AnnotationStart) { - handler.append('<', token.location.getStart(), - token.location.getStart() + 1); + data.append('<', token.location.getStart(), + token.location.getStart() + 1); } - handler.append(c, token.location.getStart(), - reader.getPeekOffset()); + data.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); continue; } else if (type == Tokens::Data) { @@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() location = token.location; return State::FIELD_START; } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - reader.consumePeek(); continue; } // A non-text token was reached, make sure all pending data commands // have been issued - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse() Command &cmd = commands.top(); if (!cmd.inField) { cmd.inField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got field start token \"{\", but no command for which to " "start the field. Write \"\\{\" to insert this sequence as " "text.", - token); + token);*/ } else if (token.id == OsmlTokens.FieldEnd) { - if (closeField()) { + closeField(); + return State::FIELD_END; +/* if (closeField()) { return State::FIELD_END; } logger.error( "Got field end token \"}\", but there is no field to end. 
" "Write \"\\}\" to insert this sequence as text.", - token); + token);*/ } else if (token.id == OsmlTokens.DefaultFieldStart) { // Try to start a default field the first time the token is reached Command &topCmd = commands.top(); if (!topCmd.inField) { topCmd.inField = true; topCmd.inDefaultField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got default field start token \"{!\", but no command for " "which to start the field. Write \"\\{!\" to insert this " "sequence as text", - token); + token);*/ } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Issue available data - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ + TokenizedData dataFork = data; + Variant text = dataFork.text(mode); + location = text.getLocation(); + return text; +} + const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include +#include #include +#include #include +#include namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public: Variant arguments; /** - * Set to true if this is a command with clear begin and end. 
- */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). + * Vector used as stack for holding the number of opening/closing braces + * and the corresponding "isDefaultField" flag. */ - bool inRangeField : 1; + std::vector fields; /** - * Set to true if we are currently in a field that has been especially - * marked as default field (using the "|") syntax. + * Set to true if this is a command with clear begin and end. */ - bool inDefaultField : 1; + bool hasRange; /** * Default constructor. @@ -164,7 +155,6 @@ public: Command() : hasRange(false), inField(false), - inRangeField(false), inDefaultField() { } @@ -178,15 +168,10 @@ public: * command. * @param hasRange should be set to true if this is a command with * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently are inside the - * outer field of a ranged command. * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField, bool inDefaultField) + Command(Variant name, Variant arguments, bool hasRange) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), @@ -215,25 +200,20 @@ private: Tokenizer tokenizer; /** - * Stack containing the current commands. - */ - std::stack commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). + * Variant containing the tokenized data that was returned from the + * tokenizer as data. */ - Variant data; + TokenizedData data; /** - * Contains the location of the last token. + * Stack containing the current commands. 
*/ - SourceLocation location; + std::stack commands; /** - * Contains the field index of the current command. + * Pointer at */ - size_t fieldIdx; + std::unique_ptr impl; /** * Function used internall to parse an identifier. @@ -291,12 +271,10 @@ private: /** * Checks whether there is any data pending to be issued, if yes, issues it. * - * @param handler is the data handler that contains the data that may be - * returned to the user. * @return true if there was any data and DATA should be returned by the * parse function, false otherwise. */ - bool checkIssueData(DataHandler &handler); + bool checkIssueData(); /** * Called before any data is appended to the internal data handler. Checks @@ -327,6 +305,12 @@ public: */ OsmlStreamParser(CharReader &reader, Logger &logger); + /** + * Destructor of the OsmlStreamParser, needed to destroy the incomplete + * OsmlStreamParserImpl. + */ + ~OsmlStreamParser(); + /** * Continues parsing. Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter @@ -344,7 +328,19 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() const { return data; } + const TokenizedData &getData() const { return data; } + + /** + * Returns the complete content of the internal TokenizedData instance as + * a single string Variant. This method is mainly used in the unit tests for + * this class, it simply calls the text() method of TokenizedData. + * + * @param mode is the WhitespaceMode that should be used for returning the + * text. + * @return a string variant containing the text content of the internal + * TokenizedData instance or a nullptr variant if there is no text. + */ + Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Returns a reference at the internally stored command name. Only valid if @@ -371,13 +367,6 @@ public: * syntax). 
*/ bool inDefaultField() const; - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - const SourceLocation &getLocation() const { return location; } }; } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..855f80d 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -56,17 +55,6 @@ public: */ std::vector textBuf; - /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - /** * Current character data start. */ @@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) SourceLocation loc = xmlSyncLoggerPosition(p, ulen); // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); OsxmlEventParserData &data = parser->getData(); std::vector &textBuf = data.textBuf; - std::vector &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } + + // Update start and end position + if (textBuf.empty()) { + data.textStart = 
loc.getStart(); } + data.textEnd = loc.getEnd(); + + // Insert the data into the text buffer + textBuf.insert(textBuf.end(), &s[0], &s[ulen]); } /* Class OsxmlEvents */ @@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) + : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0) { } @@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId) // Reset the text buffers textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; textStart = 0; textEnd = 0; @@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), data(new OsxmlEventParserData()) { } @@ -532,16 +497,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..e3fd5d4 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include #include -#include - namespace ousia { // Forward declarations @@ -99,13 +97,10 @@ public: virtual void fieldEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. 
* - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a Variant containing the string data that was found in the + * XML file. */ virtual void data(const Variant &data) = 0; }; @@ -134,11 +129,6 @@ private: */ Logger &logger; - /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - /** * Data to be used by the internal functions. */ @@ -170,21 +160,6 @@ public: */ void parse(); - /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - /** * Returns the internal CharReader reference. * diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index a93f14a..83966d5 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -53,7 +54,7 @@ struct Tracker { Variant::mapType annotationStartArgs; Variant annotationEndClassName; Variant annotationEndElementName; - Variant dataData; + TokenizedData dataData; bool startResult; bool fieldStartSetIsDefault; @@ -81,7 +82,7 @@ struct Tracker { annotationStartArgs = Variant::mapType{}; annotationEndClassName = Variant::fromString(std::string{}); annotationEndElementName = Variant::fromString(std::string{}); - dataData = Variant::fromString(std::string{}); + dataData = TokenizedData(); startResult = true; fieldStartSetIsDefault = false; @@ -157,7 +158,7 @@ public: return tracker.annotationEndResult; } - bool data(Variant &data) override + bool data(TokenizedData &data) override { tracker.dataCount++; tracker.dataData = data; @@ -363,7 +364,7 @@ TEST(Stack, multipleFields) s.data("test"); tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, 
ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test", tracker.dataData); + EXPECT_EQ("test", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc @@ -375,7 +376,7 @@ TEST(Stack, multipleFields) s.data("test2"); tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test2", tracker.dataData); + EXPECT_EQ("test2", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc @@ -387,7 +388,7 @@ TEST(Stack, multipleFields) s.data("test3"); tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test3", tracker.dataData); + EXPECT_EQ("test3", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc @@ -744,4 +745,4 @@ TEST(Stack, fieldAfterDefaultField) ASSERT_FALSE(logger.hasError()); } } -} \ No newline at end of file +} diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index 231bad9..6bd7234 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -380,14 +380,14 @@ TEST(TokenizedData, textPreserveWhitespace) data.enableToken(5); - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + Variant text; + text = data.text(WhitespaceMode::PRESERVE); + EXPECT_EQ(" ", text.asString()); + EXPECT_EQ(0U, text.getLocation().getStart()); + EXPECT_EQ(2U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + Token token; ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); EXPECT_EQ(5U, token.id); EXPECT_EQ("$$", token.content); @@ -395,14 
+395,13 @@ TEST(TokenizedData, textPreserveWhitespace) EXPECT_EQ(4U, token.getLocation().getEnd()); EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + text = data.text(WhitespaceMode::PRESERVE); + EXPECT_EQ(" ", text.asString()); + EXPECT_EQ(4U, text.getLocation().getStart()); + EXPECT_EQ(6U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); - ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE)); + ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE)); ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); } @@ -416,7 +415,7 @@ TEST(TokenizedData, textTrimWhitespace) data.enableToken(5); Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); + ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM)); ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); EXPECT_EQ(5U, token.id); @@ -425,7 +424,7 @@ TEST(TokenizedData, textTrimWhitespace) EXPECT_EQ(4U, token.getLocation().getEnd()); EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); + ASSERT_EQ(nullptr, data.text(WhitespaceMode::TRIM)); ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); } @@ -439,7 +438,7 @@ TEST(TokenizedData, textCollapseWhitespace) data.enableToken(5); Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); + ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); EXPECT_EQ(5U, token.id); @@ -448,7 +447,7 @@ TEST(TokenizedData, textCollapseWhitespace) EXPECT_EQ(4U, token.getLocation().getEnd()); EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(data.text(token, 
WhitespaceMode::COLLAPSE)); + ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); } @@ -460,15 +459,15 @@ TEST(TokenizedData, appendChars) ASSERT_EQ(3U, data.append('s', 8, 10)); ASSERT_EQ(4U, data.append('t', 10, 12)); - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(12U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + Variant text = data.text(WhitespaceMode::COLLAPSE); + ASSERT_EQ("test", text.asString()); + EXPECT_EQ(5U, text.getLocation().getStart()); + EXPECT_EQ(12U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + + ASSERT_EQ(nullptr, data.text(WhitespaceMode::PRESERVE)); - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); + Token token; ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); } @@ -480,15 +479,16 @@ TEST(TokenizedData, copy) data.mark(6, 3, 1); data.enableToken(6); + Variant text; Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("a", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); + text = data.text(WhitespaceMode::COLLAPSE); + ASSERT_EQ("a", text.asString()); + EXPECT_EQ(1U, text.getLocation().getStart()); + EXPECT_EQ(2U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + + ASSERT_EQ(nullptr, data.text(WhitespaceMode::COLLAPSE)); TokenizedData dataCopy = data; @@ -506,21 +506,19 @@ TEST(TokenizedData, copy) EXPECT_EQ(4U, token.getLocation().getEnd()); EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - 
ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" b ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + text = data.text(WhitespaceMode::PRESERVE); + ASSERT_EQ(" b ", text.asString()); + EXPECT_EQ(4U, text.getLocation().getStart()); + EXPECT_EQ(7U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); ASSERT_FALSE(data.next(token)); - ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("b", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(dataCopy.next(token)); + text = dataCopy.text(WhitespaceMode::COLLAPSE); + ASSERT_EQ("b", text.asString()); + EXPECT_EQ(5U, text.getLocation().getStart()); + EXPECT_EQ(6U, text.getLocation().getEnd()); + EXPECT_EQ(InvalidSourceId, text.getLocation().getSourceId()); + ASSERT_FALSE(data.next(token)); } } diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 3809a12..0f2bfb7 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace ousia { @@ -31,23 +32,40 @@ TEST(Tokenizer, tokenRegistration) ASSERT_EQ(0U, tokenizer.registerToken("a")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); + ASSERT_EQ("a", tokenizer.lookupToken(0U).string); ASSERT_EQ(1U, tokenizer.registerToken("b")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); + ASSERT_EQ("b", tokenizer.lookupToken(1U).string); ASSERT_EQ(2U, tokenizer.registerToken("c")); ASSERT_EQ(Tokens::Empty, 
tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); + ASSERT_EQ("c", tokenizer.lookupToken(2U).string); ASSERT_TRUE(tokenizer.unregisterToken(1U)); ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); + ASSERT_EQ("", tokenizer.lookupToken(1U).string); ASSERT_EQ(1U, tokenizer.registerToken("d")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); + ASSERT_EQ("d", tokenizer.lookupToken(1U).string); +} + +void expectData(const std::string &expected, SourceOffset tokenStart, + SourceOffset tokenEnd, SourceOffset textStart, + SourceOffset textEnd, const Token &token, TokenizedData &data, + WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ + ASSERT_EQ(Tokens::Data, token.id); + + Variant text = data.text(mode); + ASSERT_TRUE(text.isString()); + + EXPECT_EQ(expected, text.asString()); + EXPECT_EQ(tokenStart, token.location.getStart()); + EXPECT_EQ(tokenEnd, token.location.getEnd()); + EXPECT_EQ(textStart, text.getLocation().getStart()); + EXPECT_EQ(textEnd, text.getLocation().getEnd()); } TEST(Tokenizer, textTokenPreserveWhitespace) @@ -56,36 +74,34 @@ TEST(Tokenizer, textTokenPreserveWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); + expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader 
reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -95,36 +111,34 @@ TEST(Tokenizer, textTokenTrimWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token, + data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, 
loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -134,36 +148,34 @@ TEST(Tokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this is only a test text", 0, 36, 1, 33, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this is only a test text", 0, 32, 0, 32, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -177,14 +189,12 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); 
ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + expectData("test1", 0, 5, 0, 5, token, data); char c; ASSERT_TRUE(reader.peek(c)); @@ -193,7 +203,8 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -209,14 +220,10 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + expectData("test2", 6, 11, 6, 11, token, data); char c; ASSERT_FALSE(reader.peek(c)); @@ -233,21 +240,17 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -261,35 +264,26 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + 
ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(5U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -303,14 +297,9 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(11U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } @@ -320,6 +309,7 @@ TEST(Tokenizer, ambiguousTokens) { CharReader reader{"abc"}; Tokenizer tokenizer; + TokenizedData data; TokenId t1 = tokenizer.registerToken("abd"); TokenId t2 = tokenizer.registerToken("bc"); @@ -328,16 +318,17 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_TRUE(tokenizer.read(reader, token, data)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("a", token.content); + expectData("a", 0, 1, 0, 1, token, data); SourceLocation loc = token.location; ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + 
ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(t2, token.id); ASSERT_EQ("bc", token.content); @@ -346,7 +337,8 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } TEST(Tokenizer, commentTestWhitespacePreserve) @@ -354,7 +346,7 @@ TEST(Tokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::PRESERVE); + Tokenizer tokenizer; const TokenId t1 = tokenizer.registerToken("/"); const TokenId t2 = tokenizer.registerToken("/*"); @@ -370,45 +362,23 @@ TEST(Tokenizer, commentTestWhitespacePreserve) Token t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); + TokenizedData data(0); + EXPECT_TRUE(tokenizer.read(reader, t, data)); EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); + if (te.id != Tokens::Data) { + EXPECT_EQ(te.content, t.content); + } else { + Variant text = data.text(WhitespaceMode::PRESERVE); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(te.content, text.asString()); + } EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(Tokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::COLLAPSE); - const TokenId t1 = tokenizer.registerToken("/"); - const TokenId t2 = tokenizer.registerToken("/*"); - const TokenId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {Tokens::Data, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {Tokens::Data, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 
10, 12}}, - {Tokens::Data, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, t, data)); } } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index d52fa5b..3d01007 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -30,11 +30,21 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); // static ConcreteLogger logger; +static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader) +{ + OsmlStreamParser::State res = reader.parse(); + if (res == OsmlStreamParser::State::DATA) { + EXPECT_FALSE(reader.getData().hasNonWhitespaceText()); + res = reader.parse(); + } + return res; +} + static void assertCommand(OsmlStreamParser &reader, const std::string &name, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -57,16 +67,19 @@ static void assertCommand(OsmlStreamParser &reader, const std::string &name, static void assertData(OsmlStreamParser &reader, const std::string &data, SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) + SourceOffset end = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) { 
ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); + Variant text = reader.getText(mode); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(data, text.asString()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, text.getLocation().getStart()); EXPECT_EQ(start, reader.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, text.getLocation().getEnd()); EXPECT_EQ(end, reader.getLocation().getEnd()); } } @@ -75,7 +88,7 @@ static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader)); EXPECT_EQ(defaultField, reader.inDefaultField()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); @@ -89,7 +102,7 @@ static void assertFieldEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); } @@ -103,7 +116,7 @@ static void assertAnnotationStart(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -131,7 +144,7 @@ static void assertAnnotationEnd(OsmlStreamParser &reader, 
SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader)); ASSERT_EQ(name, reader.getCommandName().asString()); if (!elementName.empty()) { ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); @@ -152,7 +165,7 @@ static void assertEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); } @@ -205,26 +218,14 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) assertData(reader, "hello world", 1, 14); } -TEST(OsmlStreamParser, escapeWhitespace) -{ - const char *testString = " hello\\ \\ world "; - // 012345 67 89012345 - // 0 1 - CharReader charReader(testString); - - OsmlStreamParser reader(charReader, logger); - - assertData(reader, "hello world", 1, 15); -} - static void testEscapeSpecialCharacter(const std::string &c) { CharReader charReader(std::string("\\") + c); OsmlStreamParser reader(charReader, logger); EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getData().asString()); + EXPECT_EQ(c, reader.getText().asString()); - SourceLocation loc = reader.getData().getLocation(); + SourceLocation loc = reader.getText().getLocation(); EXPECT_EQ(0U, loc.getStart()); EXPECT_EQ(1U + c.size(), loc.getEnd()); } @@ -253,16 +254,16 @@ TEST(OsmlStreamParser, singleLineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, 
loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(33U, loc.getStart()); ASSERT_EQ(34U, loc.getEnd()); } @@ -279,16 +280,16 @@ TEST(OsmlStreamParser, multilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -305,16 +306,16 @@ TEST(OsmlStreamParser, nestedMultilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -569,8 +570,11 @@ TEST(OsmlStreamParser, multipleCommands) OsmlStreamParser 
reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertCommand(reader, "b", 3, 5); + assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE); assertCommand(reader, "c", 6, 8); + assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE); assertCommand(reader, "d", 9, 11); assertEnd(reader, 11, 11); } @@ -584,10 +588,13 @@ TEST(OsmlStreamParser, fieldsWithSpaces) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 3, 4); assertCommand(reader, "b", 4, 6); + assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE); assertCommand(reader, "c", 7, 9); assertFieldEnd(reader, 9, 10); + assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 16, 17); assertCommand(reader, "d", 17, 19); assertFieldEnd(reader, 19, 20); diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 3293370..6942166 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -74,13 +75,11 @@ public: }; static std::vector> parseXml( - const char *testString, - WhitespaceMode whitespaceMode = WhitespaceMode::TRIM) + const char *testString) { TestOsxmlEventListener listener; CharReader reader(testString); OsxmlEventParser parser(reader, listener, logger); - parser.setWhitespaceMode(whitespaceMode); parser.parse(); return listener.events; } @@ -157,7 +156,7 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) ASSERT_EQ(expectedEvents, events); } -TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) +TEST(OsxmlEventParser, commandWithData) { const char *testString = " hello \n world "; // 012345678901 234567890123 @@ -168,50 +167,12 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) {OsxmlEvent::DATA, 
Variant::arrayType{" hello \n world "}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - auto events = parseXml(testString, WhitespaceMode::PRESERVE); + auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); // Check the location of the text ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart()); ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd()); } - -TEST(OsxmlEventParser, commandWithDataTrimWhitespace) -{ - const char *testString = " hello \n world "; - // 012345678901 234567890123 - // 0 1 2 - - std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::TRIM); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} - -TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) -{ - const char *testString = " hello \n world "; - // 012345678901 234567890123 - // 0 1 2 - - std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::COLLAPSE); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} } -- cgit v1.2.3 From 84c9abc3e9762c4486ddc5ca0352a5d697a51987 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Wed, 25 Feb 2015 23:09:26 +0100 Subject: start of branch, commit log will be rewritten --- CMakeLists.txt | 110 ++--- src/core/common/SourceContextReader.cpp | 5 +- 
src/core/common/Token.cpp | 24 ++ src/core/common/Token.hpp | 181 ++++++++ src/core/common/Utils.cpp | 6 - src/core/common/Utils.hpp | 53 ++- src/core/common/WhitespaceHandler.hpp | 284 ------------- src/core/parser/stack/DocumentHandler.cpp | 24 +- src/core/parser/stack/DocumentHandler.hpp | 4 +- src/core/parser/stack/Handler.cpp | 25 +- src/core/parser/stack/Handler.hpp | 74 ++-- src/core/parser/stack/Stack.cpp | 55 ++- src/core/parser/stack/Stack.hpp | 18 +- src/core/parser/utils/SourceOffsetVector.hpp | 28 +- src/core/parser/utils/Token.cpp | 24 -- src/core/parser/utils/Token.hpp | 142 ------- src/core/parser/utils/TokenTrie.cpp | 16 +- src/core/parser/utils/TokenTrie.hpp | 11 +- src/core/parser/utils/TokenizedData.cpp | 353 +++++++++++++--- src/core/parser/utils/TokenizedData.hpp | 234 +++++++++-- src/core/parser/utils/Tokenizer.cpp | 264 ++++++------ src/core/parser/utils/Tokenizer.hpp | 142 ++++--- src/formats/osml/OsmlStreamParser.cpp | 157 ++----- src/formats/osml/OsmlStreamParser.hpp | 85 ++-- src/formats/osxml/OsxmlEventParser.cpp | 63 +-- src/formats/osxml/OsxmlEventParser.hpp | 31 +- test/core/parser/stack/StackTest.cpp | 15 +- test/core/parser/utils/TokenizedDataTest.cpp | 602 +++++++++++---------------- test/core/parser/utils/TokenizerTest.cpp | 248 +++++------ test/formats/osml/OsmlStreamParserTest.cpp | 79 ++-- test/formats/osxml/OsxmlEventParserTest.cpp | 47 +-- 31 files changed, 1664 insertions(+), 1740 deletions(-) create mode 100644 src/core/common/Token.cpp create mode 100644 src/core/common/Token.hpp delete mode 100644 src/core/common/WhitespaceHandler.hpp delete mode 100644 src/core/parser/utils/Token.cpp delete mode 100644 src/core/parser/utils/Token.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ea5c3aa..225e63d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ ADD_LIBRARY(ousia_core src/core/common/Rtti src/core/common/RttiBuilder src/core/common/SourceContextReader + src/core/common/Token src/core/common/Utils 
src/core/common/Variant src/core/common/VariantConverter @@ -180,16 +181,15 @@ ADD_LIBRARY(ousia_core src/core/parser/ParserContext src/core/parser/ParserScope src/core/parser/stack/Callbacks - src/core/parser/stack/DocumentHandler - src/core/parser/stack/DomainHandler - src/core/parser/stack/GenericParserStates - src/core/parser/stack/Handler - src/core/parser/stack/ImportIncludeHandler +# src/core/parser/stack/DocumentHandler +# src/core/parser/stack/DomainHandler +# src/core/parser/stack/GenericParserStates +# src/core/parser/stack/Handler +# src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State - src/core/parser/stack/Stack - src/core/parser/stack/TypesystemHandler +# src/core/parser/stack/Stack +# src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector - src/core/parser/utils/Token src/core/parser/utils/TokenizedData src/core/parser/utils/Tokenizer src/core/parser/utils/TokenTrie @@ -212,19 +212,19 @@ ADD_LIBRARY(ousia_core # ousia_core #) -ADD_LIBRARY(ousia_osml - src/formats/osml/OsmlParser - src/formats/osml/OsmlStreamParser -) +#ADD_LIBRARY(ousia_osml +# src/formats/osml/OsmlParser +# src/formats/osml/OsmlStreamParser +#) -TARGET_LINK_LIBRARIES(ousia_osml - ousia_core -) +#TARGET_LINK_LIBRARIES(ousia_osml +# ousia_core +#) ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser - src/formats/osxml/OsxmlParser +# src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -273,19 +273,19 @@ TARGET_LINK_LIBRARIES(ousia_xml # Command line interface -ADD_EXECUTABLE(ousia - src/cli/Main -) +#ADD_EXECUTABLE(ousia +# src/cli/Main +#) -TARGET_LINK_LIBRARIES(ousia - ousia_core - ousia_filesystem - ousia_html - ousia_xml - ousia_osml - ousia_osxml - ${Boost_LIBRARIES} -) +#TARGET_LINK_LIBRARIES(ousia +# ousia_core +# ousia_filesystem +# ousia_html +# ousia_xml +# ousia_osml +# ousia_osxml +# ${Boost_LIBRARIES} +#) # If testing is enabled, build the unit tests IF(TEST) @@ 
-323,11 +323,11 @@ IF(TEST) test/core/model/StyleTest test/core/model/TypesystemTest test/core/parser/ParserScopeTest - test/core/parser/stack/StackTest +# test/core/parser/stack/StackTest test/core/parser/stack/StateTest test/core/parser/utils/SourceOffsetVectorTest test/core/parser/utils/TokenizedDataTest - test/core/parser/utils/TokenizerTest +# test/core/parser/utils/TokenizerTest test/core/parser/utils/TokenTrieTest test/core/resource/ResourceLocatorTest test/core/resource/ResourceRequestTest @@ -383,29 +383,29 @@ IF(TEST) # ousia_mozjs # ) - ADD_EXECUTABLE(ousia_test_osml - test/formats/osml/OsmlParserTest - test/formats/osml/OsmlStreamParserTest - ) +# ADD_EXECUTABLE(ousia_test_osml +# test/formats/osml/OsmlParserTest +# test/formats/osml/OsmlStreamParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osml - ${GTEST_LIBRARIES} - ousia_core - ousia_osml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osml +# ousia_filesystem +# ) - ADD_EXECUTABLE(ousia_test_osxml - test/formats/osxml/OsxmlEventParserTest - test/formats/osxml/OsxmlParserTest - ) +# ADD_EXECUTABLE(ousia_test_osxml +# test/formats/osxml/OsxmlEventParserTest +# test/formats/osxml/OsxmlParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osxml - ${GTEST_LIBRARIES} - ousia_core - ousia_osxml - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_osxml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osxml +# ousia_filesystem +# ) ADD_EXECUTABLE(ousia_test_xml test/plugins/xml/XmlOutputTest @@ -423,8 +423,8 @@ IF(TEST) ADD_TEST(ousia_test_filesystem ousia_test_filesystem) ADD_TEST(ousia_test_html ousia_test_html) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) - ADD_TEST(ousia_test_osml ousia_test_osml) - ADD_TEST(ousia_test_osxml ousia_test_osxml) +# ADD_TEST(ousia_test_osml ousia_test_osml) +# ADD_TEST(ousia_test_osxml ousia_test_osxml) ADD_TEST(ousia_test_xml ousia_test_xml) ENDIF() @@ -442,9 +442,9 @@ INSTALL(DIRECTORY data/ DESTINATION 
share/ousia OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE ) -INSTALL(TARGETS ousia - RUNTIME DESTINATION bin -) +#INSTALL(TARGETS ousia +# RUNTIME DESTINATION bin +#) IF(INSTALL_GEDIT_HIGHLIGHTER) INSTALL(FILES contrib/gtksourceview-3.0/language-specs/ousia.lang diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp index d5d379c..f7dbdf3 100644 --- a/src/core/common/SourceContextReader.cpp +++ b/src/core/common/SourceContextReader.cpp @@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader, ctx.relLen = end - start; // end >= start (I2) // Remove linebreaks at the beginning and the end - const std::pair b = - Utils::trim(lineBuf, Utils::isLinebreak); + const std::pair b = Utils::trim( + lineBuf, + [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); }); ssize_t s = b.first, e = b.second; s = std::min(s, static_cast(ctx.relPos)); diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp new file mode 100644 index 0000000..8bcdbb5 --- /dev/null +++ b/src/core/common/Token.cpp @@ -0,0 +1,24 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "Token.hpp" + +namespace ousia { +// Stub to make sure Tokens.hpp is valid +} + diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp new file mode 100644 index 0000000..0cf56b0 --- /dev/null +++ b/src/core/common/Token.hpp @@ -0,0 +1,181 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Token.hpp + * + * Definition of the TokenId id and constants for some special tokens. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_HPP_ +#define _OUSIA_TOKEN_HPP_ + +#include +#include +#include +#include + +#include + +namespace ousia { + +/** + * The TokenId is used to give each token id a unique id. + */ +using TokenId = uint32_t; + +/** + * Type used for storing token lengths. + */ +using TokenLength = uint16_t; + +/** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set; + +/** + * Namespace containing constants for TokenId instances with special meaning. + */ +namespace Tokens { +/** + * Token which is not a token. + */ +constexpr TokenId Empty = std::numeric_limits::max(); + +/** + * Token which represents data (represented as TokenizedData). + */ +constexpr TokenId Data = std::numeric_limits::max() - 1; + +/** + * Token which represents a newline token. 
+ */ +constexpr TokenId Newline = std::numeric_limits::max() - 2; + +/** + * Token which represents a paragraph token -- issued if two consecutive + * newlines occur with optionally any amout of whitespace between them. The + * paragraph token is not repeated until more text is reached. + */ +constexpr TokenId Paragraph = std::numeric_limits::max() - 3; + +/** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amout of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits::max() - 4; + +/** + * Token which represents an indentation token -- issued if the indentation of + * this line is larger than the indentation of the previous line. + */ +constexpr TokenId Indent = std::numeric_limits::max() - 5; + +/** + * Token which represents an dedentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Dedent = std::numeric_limits::max() - 6; + +/** + * Maximum token id to be used. Tokens allocated for users should not surpass + * this value. + */ +constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; +} + +/** + * The Token structure describes a token discovered by the Tokenizer or read + * from the TokenizedData struct. + */ +struct Token { + /** + * Id of the id of this token. + */ + TokenId id; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : id(Tokens::Empty) {} + + /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(SourceLocation location) + : id(Tokens::Data), location(location) + { + } + + /** + * Constructor of the Token struct. 
+ * + * @param id represents the token id. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenId id, const std::string &content, SourceLocation location) + : id(id), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token id + * + * @param id is the id corresponding to the id of the token. + */ + Token(TokenId id) : id(id) {} + + /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + bool isSpecial() const {return id > Tokens::MaxTokenId;} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_TOKENS_HPP_ */ + diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index a77951e..85d2c28 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename) return std::string{}; } -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::startsWith(const std::string &s, const std::string &prefix) { return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 7d96562..82a8f8c 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -123,14 +123,6 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. 
- * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - /** * Trims the given string or vector of chars by returning the start and end * index. @@ -153,8 +145,8 @@ public: * * @param s is the container that should be trimmed. * @param len is the number of elements in the container. - * @param f is a function that returns true for values that should be - * removed. + * @param f is a function that returns true for values at a certain index + * that should be removed. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" */ @@ -163,7 +155,7 @@ public: { size_t start = 0; for (size_t i = 0; i < len; i++) { - if (!f(s[i])) { + if (!f(i)) { start = i; break; } @@ -171,7 +163,7 @@ public: size_t end = 0; for (ssize_t i = len - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { + if (!f(i)) { end = i + 1; break; } @@ -198,16 +190,32 @@ public: * the collapsed version of the string ends. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" + * @param f is a function that returns true for values at a certain index + * that should be removed. */ - template - static std::string trim(const T &s, size_t len, size_t &start, size_t &end) + template + static std::string trim(const T &s, size_t len, size_t &start, size_t &end, + Filter f) { - auto res = trim(s, len, isWhitespace); + auto res = trim(s, len, f); start = res.first; end = res.second; return std::string(&s[start], end - start); } + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. 
+ */ + static std::string trim(const std::string &s) + { + std::pair bounds = + trim(s, [&s](size_t i) { return isWhitespace(s[i]); }); + return s.substr(bounds.first, bounds.second - bounds.first); + } + /** * Collapses the whitespaces in the given string (trims the string and * replaces all whitespace characters by a single one). @@ -219,7 +227,8 @@ public: { size_t start; size_t end; - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -236,7 +245,8 @@ public: static std::string collapse(const std::string &s, size_t &start, size_t &end) { - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -244,6 +254,8 @@ public: * replaces all whitespace characters by a single one). * * @tparam T is the string type that should be used. + * @tparam Filter is a filter function used for detecting the character + * indices that might be removed. * @param s is the string in which the whitespace should be collapsed. * @param len is the length of the input string * @param start is an output parameter which is set to the offset at which @@ -252,9 +264,9 @@ public: * the collapsed version of the string ends. * @return a copy of s with collapsed whitespace. 
*/ - template + template static std::string collapse(const T &s, size_t len, size_t &start, - size_t &end) + size_t &end, Filter f) { // Result vector std::vector res; @@ -268,8 +280,7 @@ public: bool hadWhitespace = false; for (size_t i = 0; i < len; i++) { const char c = s[i]; - const bool whitespace = isWhitespace(c); - if (whitespace) { + if (f(i)) { hadWhitespace = !res.empty(); } else { // Adapt the start and end position diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include -#include - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. 
Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. 
- * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. 
- */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, std::vector &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. 
- */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. 
- */ -template -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index bb04bd3..d44176a 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data(TokenizedData &data) { + // TODO: Handle this correctly + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // For now, except "no data" as success + return true; + } + // We're past the region in which explicit fields can be defined in the // parent structure element scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. - if (!convertData(field, data, logger())) { + if (!convertData(field, text, logger())) { return false; } - parent->createChildDocumentPrimitive(data, fieldIdx); + parent->createChildDocumentPrimitive(text, fieldIdx); return true; } @@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification. 
forks.emplace_back(logger().fork()); - if (!convertData(primitiveField, data, forks.back())) { + if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data) createPath(fieldIdx, path, parent); // Then create the primitive element - parent->createChildDocumentPrimitive(data); + parent->createChildDocumentPrimitive(text); return true; } @@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data) if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", - data); + text); } else { logger().error("Could not read data with any of the possible fields:", - data); + text); size_t f = 0; for (auto field : defaultFields) { logger().note(std::string("Field ") + @@ -471,4 +479,4 @@ namespace RttiTypes { const Rtti DocumentField = RttiBuilder( "DocumentField").parent(&Node); } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 862081c..dda7d8b 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -167,7 +167,7 @@ public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; @@ -213,4 +213,4 @@ extern const Rtti DocumentField; } } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..3d413e8 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "Callbacks.hpp" @@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool 
EmptyHandler::data(Variant &data) +bool EmptyHandler::data(TokenizedData &data) { // Support any data return true; @@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(Variant &data) +bool StaticHandler::data(TokenizedData &data) { - logger().error("Did not expect any data here", data); - return false; + if (data.text(WhitespaceMode::TRIM) != nullptr) { + logger().error("Did not expect any data here", data); + return false; + } + return true; } /* Class StaticFieldHandler */ @@ -227,12 +231,19 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data(TokenizedData &data) { + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // Providing no data here is ok as long as the "doHandle" callback + // function has already been called + return handled; + } + // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(data, args); + doHandle(text, args); return true; } @@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - data); + text); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..929466d 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -31,6 +31,7 @@ namespace ousia { class ParserScope; class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -158,40 +159,63 @@ protected: */ const std::string &name() const; -public: - /** - * Virtual destructor. - */ - virtual ~Handler(); - /** * Calls the corresponding function in the Callbacks instance. Sets the * whitespace mode that specifies how string data should be processed. 
The * calls to this function are placed on a stack by the underlying Stack - * class. + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * * @param whitespaceMode specifies one of the three WhitespaceMode constants * PRESERVE, TRIM or COLLAPSE. */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); + void pushWhitespaceMode(WhitespaceMode whitespaceMode); /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushs that were performed by the pushWhitespaceMode() + * method of the same handler. */ - void registerToken(const std::string &token); + void popWhitespaceMode(); /** - * Calls the corresponding function in the Callbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * - * @param token is the token string that should be unregistered. + * @param tokens is a list of tokens that should be reported to this handler + * instance via the "token" method. 
*/ - void unregisterToken(const std::string &token); + void pushTokens(const std::vector &tokens); + + /** + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushs that were performed by the pushWhitespaceMode() + * method of the same handler. + */ + void popWhitespaceMode(); + + + /** + * Calls the corresponding function in the Callbacks instance. This method + * registers the given tokens as tokens that are generally available, tokens + * must be explicitly enabled using the "pushTokens" and "popTokens" method. + * Tokens that have not been registered are not guaranteed to be reported, + * even though they are + */ + void registerTokens(const std::vector &tokens); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); /** * Returns the command name for which the handler was created. @@ -299,11 +323,11 @@ public: * Handler instance. Should return true if the data could be handled, false * otherwise. * - * @param data is a string variant containing the character data and its - * location. + * @param data is an instance of TokenizedData containing the segmented + * character data and its location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(Variant &data) = 0; + virtual bool data(TokenizedData &data) = 0; }; /** @@ -333,7 +357,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; /** * Creates an instance of the EmptyHandler class. 
@@ -359,7 +383,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; /** @@ -412,7 +436,7 @@ protected: public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; } } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..309c9a0 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } -void Stack::data(const Variant &data) +void Stack::data(TokenizedData data) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); + // TODO: Rewrite this function for token handling + // TODO: This loop needs to be refactored out + while (!data.atEnd()) { + // End handlers that already had a default field and are currently not + // active. 
+ endOverdueHandlers(); - while (true) { - // Check whether there is any command the data can be sent to + const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + + // Check whether there is any command the data can be sent to -- if not, + // make sure the data actually is data if (stack.empty()) { - throw LoggableException("No command here to receive data.", data); + if (hasNonWhitespaceText) { + throw LoggableException("No command here to receive data.", data); + } + return; } // Fetch the current command handler information @@ -440,7 +449,10 @@ void Stack::data(const Variant &data) // If the "hadDefaultField" flag is set, we already issued an error // message if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); + if (hasNonWhitespaceText) { + logger().error("Did not expect any data here", data); + } + return; } } @@ -454,8 +466,16 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - Variant dataCopy = data; - valid = info.handler->data(dataCopy); + // Create a fork of the TokenizedData and let the handler work + // on it + TokenizedData dataFork = data; + valid = info.handler->data(dataFork); + + // If the data was validly handled by the handler, commit the + // change + if (valid) { + data = dataFork; + } } catch (LoggableException ex) { loggerFork.log(ex); @@ -482,6 +502,19 @@ void Stack::data(const Variant &data) } } +void Stack::data(const Variant &stringData) +{ + // Fetch the SourceLocation of the given stringData variant + SourceLocation loc = stringData.getLocation(); + + // Create a TokenizedData instance and feed the given string data into it + TokenizedData tokenizedData(loc.getSourceId()); + tokenizedData.append(stringData.asString(), loc.getStart()); + + // Call the actual "data" method + data(tokenizedData); +} + void Stack::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty @@ -584,4 +617,4 @@ void 
Stack::token(Variant token) // TODO } } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..cd29b28 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -44,6 +44,7 @@ namespace ousia { // Forward declarations class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -292,13 +293,24 @@ public: void command(const Variant &name, const Variant::mapType &args); /** - * Function that shuold be called whenever character data is found in the + * Function that should be called whenever character data is found in the * input stream. May only be called if the currently is a command on the * stack. * - * @param data is a string variant containing the data that has been found. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. + */ + void data(TokenizedData data); + + /** + * Function that shuold be called whenever character data is found in the + * input stream. The given string variant is converted into a TokenizedData + * instance internally. + * + * @param stringData is a string variant containing the data that has been + * found. */ - void data(const Variant &data); + void data(const Variant &stringData); /** * Function that should be called whenever a new field starts. Fields of the diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..aaebe7d 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -127,7 +127,7 @@ public: * read. * @return a pair containing start and end source offset. */ - std::pair loadOffset(size_t idx) + std::pair loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); @@ -157,7 +157,31 @@ public: /** * Returns the number of characters for which offsets are stored. 
*/ - size_t size() { return lens.size(); } + size_t size() const { return lens.size(); } + + /** + * Trims the length of the TokenizedData instance to the given length. + * Removes all token matches that lie within the trimmed region. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) { + if (length < size()) { + lens.resize(length); + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + } + } + + /** + * Resets the SourceOffsetVector to the state it had when it was + * constructed. + */ + void clear() { + lens.clear(); + offsets.clear(); + lastEnd = 0; + } }; } diff --git a/src/core/parser/utils/Token.cpp b/src/core/parser/utils/Token.cpp deleted file mode 100644 index 8bcdbb5..0000000 --- a/src/core/parser/utils/Token.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "Token.hpp" - -namespace ousia { -// Stub to make sure Tokens.hpp is valid -} - diff --git a/src/core/parser/utils/Token.hpp b/src/core/parser/utils/Token.hpp deleted file mode 100644 index f907450..0000000 --- a/src/core/parser/utils/Token.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file Token.hpp - * - * Definition of the TokenId id and constants for some special tokens. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_HPP_ -#define _OUSIA_TOKEN_HPP_ - -#include -#include -#include - -#include - -namespace ousia { - -/** - * The TokenId is used to give each token id a unique id. - */ -using TokenId = uint32_t; - -/** - * Type used for storing token lengths. - */ -using TokenLength = uint16_t; - -/** - * Namespace containing constants for TokenId instances with special meaning. - */ -namespace Tokens { -/** - * Token which is not a token. - */ -constexpr TokenId Empty = std::numeric_limits::max(); - -/** - * Token which represents data (represented as TokenizedData). - */ -constexpr TokenId Data = std::numeric_limits::max() - 1; - -/** - * Token which represents a newline token. 
- */ -constexpr TokenId Newline = std::numeric_limits::max() - 2; - -/** - * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. - */ -constexpr TokenId Paragraph = std::numeric_limits::max() - 3; - -/** - * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. - */ -constexpr TokenId Indentation = std::numeric_limits::max() - 4; - -/** - * Maximum token id to be used. Tokens allocated for users should not surpass - * this value. - */ -constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; -} - -/** - * The Token structure describes a token discovered by the Tokenizer or read - * from the TokenizedData struct. - */ -struct Token { - /** - * Id of the id of this token. - */ - TokenId id; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - Token() : id(Tokens::Empty) {} - - /** - * Constructor of the Token struct. - * - * @param id represents the token id. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - Token(TokenId id, const std::string &content, SourceLocation location) - : id(id), content(content), location(location) - { - } - - /** - * Constructor of the Token struct, only initializes the token id - * - * @param id is the id corresponding to the id of the token. - */ - Token(TokenId id) : id(id) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. 
- * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; -} - -#endif /* _OUSIA_TOKENS_HPP_ */ - diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenId type) noexcept + TokenId id) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != Tokens::Empty) { + if (node->id != Tokens::Empty) { return false; } // Otherwise just set the type to the given type. - node->type = type; + node->id = id; return true; } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != Tokens::Empty || node->children.size() > 1) && + if ((node->id != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept } // If the node type is already Tokens::Empty, we cannot do anything here - if (node->type == Tokens::Empty) { + if (node->id == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. 
Set the // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = Tokens::Empty; + node->id = Tokens::Empty; return true; } @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept } node = it->second.get(); } - return node->type; + return node->id; } } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@ #include #include -#include "Token.hpp" +#include namespace ousia { @@ -75,10 +75,9 @@ public: ChildMap children; /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. + * Id of the token represented by this node. */ - TokenId type; + TokenId id; /** * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param type is the descriptor that should be set for this token. + * @param id is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, TokenId type) noexcept; + bool registerToken(const std::string &token, TokenId id) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..aeefa26 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -47,6 +47,17 @@ struct TokenMark { */ TokenLength len; + /** + * Specifies whether the token is special or not. + */ + bool special; + + /** + * Maximum token length. + */ + static constexpr TokenLength MaxTokenLength = + std::numeric_limits::max(); + /** * Constructor of the TokenMark structure, initializes all members with the * given values. 
@@ -55,9 +66,10 @@ struct TokenMark { * @param bufStart is the start position of the TokenMark in the internal * character buffer. * @param len is the length of the token. + * @param special modifies the sort order, special tokens are prefered. */ - TokenMark(TokenId id, size_t bufStart, TokenLength len) - : bufStart(bufStart), id(id), len(len) + TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) + : bufStart(bufStart), id(id), len(len), special(special) { } @@ -72,7 +84,8 @@ struct TokenMark { TokenMark(size_t bufStart) : bufStart(bufStart), id(Tokens::Empty), - len(std::numeric_limits::max()) + len(MaxTokenLength), + special(true) { } @@ -86,8 +99,22 @@ struct TokenMark { */ friend bool operator<(const TokenMark &m1, const TokenMark &m2) { - return (m1.bufStart < m2.bufStart) || - (m1.bufStart == m2.bufStart && m1.len > m2.len); + // Prefer the mark with the smaller bufStart + if (m1.bufStart < m2.bufStart) { + return true; + } + + // Special handling for marks with the same bufStart + if (m1.bufStart == m2.bufStart) { + // If exactly one of the two marks is special, return true if this + // one is special + if (m1.special != m2.special) { + return m1.special; + } + // Otherwise prefer longer marks + return m1.len > m2.len; + } + return false; } }; } @@ -110,19 +137,44 @@ private: std::vector buf; /** - * Vector containing all token marks. + * Buffset storing the "protected" flag of the character data. */ - std::vector marks; + std::vector protectedChars; /** * Vector storing all the character offsets efficiently. */ SourceOffsetVector offsets; + /** + * Vector containing all token marks. + */ + mutable std::vector marks; + + /** + * Position of the first linebreak in a sequence of linebreaks. + */ + size_t firstLinebreak; + + /** + * Current indentation level. + */ + uint16_t currentIndentation; + + /** + * Last indentation level. + */ + uint16_t lastIndentation; + + /** + * Number of linebreaks without any content between them. 
+ */ + uint16_t numLinebreaks; + /** * Flag indicating whether the internal "marks" vector is sorted. */ - bool sorted; + mutable bool sorted; public: /** @@ -132,7 +184,7 @@ public: * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ - TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); } /** * Appends a complete string to the internal character buffer and extends @@ -140,22 +192,22 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart) - { // Append the data to the internal buffer - buf.insert(buf.end(), data.begin(), data.end()); - - // Extend the text regions, interpolate the source position (this may - // yield incorrect results) - const size_t size = buf.size(); - for (SourceOffset offs = offsStart; offs < offsStart + data.size(); - offs++) { - offsets.storeOffset(offs, offs + 1); + size_t append(const std::string &data, SourceOffset offsStart, bool protect) + { + for (size_t i = 0; i < data.size(); i++) { + if (offsStart != InvalidSourceOffset) { + append(data[i], offsStart + i, offsStart + i + 1, protect); + } else { + append(data[i], InvalidSourceOffset, InvalidSourceOffset, + protect); + } } - - return size; + return size(); } /** @@ -165,16 +217,86 @@ public: * @param c is the character that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. * @param offsEnd is the end offset in bytes in the input file. 
+ * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect) { // Add the character to the list and store the location of the character // in the source file buf.push_back(c); + protectedChars.push_back(protect); offsets.storeOffset(offsStart, offsEnd); - return buf.size(); + + // Insert special tokens + const size_t size = buf.size(); + const bool isWhitespace = Utils::isWhitespace(c); + const bool isLinebreak = Utils::isLinebreak(c); + + // Handle linebreaks + if (isLinebreak) { + // Mark linebreaks as linebreak + mark(Tokens::Newline, size - 1, 1, false); + + // The linebreak sequence started at the previous character + if (numLinebreaks == 0) { + firstLinebreak = size - 1; + } + + // Reset the indentation + currentIndentation = 0; + + // Increment the number of linebreaks + numLinebreaks++; + + const size_t markStart = firstLinebreak; + const size_t markLength = size - firstLinebreak; + + // Issue two consecutive linebreaks as paragraph token + if (numLinebreaks == 2) { + mark(Tokens::Paragraph, markStart, markLength, false); + } + + // Issue three consecutive linebreaks as paragraph token + if (numLinebreaks >= 3) { + mark(Tokens::Section, markStart, markLength, false); + } + } else if (isWhitespace) { + // Count the whitespace characters at the beginning of the line + if (numLinebreaks > 0) { + // Implement the UNIX/Pyhton rule for tabs: Tabs extend to the + // next multiple of eight. 
+ if (c == '\t') { + currentIndentation = (currentIndentation + 8) & ~7; + } else { + currentIndentation++; + } + } + } + + // Issue indent and unindent tokens + if (!isWhitespace && numLinebreaks > 0) { + // Issue a larger indentation than that in the previous line as + // "Indent" token + if (currentIndentation > lastIndentation) { + mark(Tokens::Indent, size - 1, 0, true); + } + + // Issue a smaller indentation than that in the previous line as + // "Dedent" token + if (currentIndentation < lastIndentation) { + mark(Tokens::Dedent, size - 1, 0, true); + } + + // Reset the internal state machine + lastIndentation = currentIndentation; + numLinebreaks = 0; + } + + return size; } /** @@ -184,11 +306,12 @@ public: * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. + * @param special tags the mark as "special", prefering it in the sort order */ - void mark(TokenId id, size_t bufStart, TokenLength len) + void mark(TokenId id, size_t bufStart, TokenLength len, bool special) { // Push the new instance back onto the list - marks.emplace_back(id, bufStart, len); + marks.emplace_back(id, bufStart, len, special); // Update the sorted flag as soon as more than one element is in the // list @@ -212,9 +335,13 @@ public: * @return true if a token was returned, false if no more tokens are * available. */ - bool next(Token &token, WhitespaceMode mode, - const std::unordered_set &tokens, size_t &cursor) + bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, + TokenizedDataCursor &cursor) const { + // Some variables for convenient access + size_t &bufPos = cursor.bufPos; + size_t &markPos = cursor.markPos; + // Sort the "marks" vector if it has not been sorted yet. 
if (!sorted) { std::sort(marks.begin(), marks.end()); @@ -222,10 +349,11 @@ public: } // Fetch the next larger TokenMark instance, make sure the token is in - // the "enabled" list - auto it = - std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); - while (it != marks.end() && tokens.count(it->id) == 0) { + // the "enabled" list and within the buffer range + auto it = std::lower_bound(marks.begin() + markPos, marks.end(), + TokenMark(bufPos)); + while (it != marks.end() && (tokens.count(it->id) == 0 || + it->bufStart + it->len > buf.size())) { it++; } @@ -236,15 +364,15 @@ public: // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data. - if (cursor < end && cursor < buf.size()) { + if (bufPos < end && bufPos < buf.size()) { switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( - Tokens::Data, std::string(&buf[cursor], end - cursor), + Tokens::Data, std::string(&buf[bufPos], end - bufPos), SourceLocation(sourceId, - offsets.loadOffset(cursor).first, + offsets.loadOffset(bufPos).first, offsets.loadOffset(end).first)); - cursor = end; + bufPos = end; return true; } case WhitespaceMode::TRIM: @@ -254,30 +382,35 @@ public: size_t stringStart; size_t stringEnd; std::string content; + const char *cBuf = &buf[bufPos]; + auto filter = [cBuf, this](size_t i) -> bool { + return Utils::isWhitespace(cBuf[i]) && + !protectedChars[i]; + }; if (mode == WhitespaceMode::TRIM) { - content = Utils::trim(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::trim(cBuf, end - bufPos, stringStart, + stringEnd, filter); } else { - content = Utils::collapse(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::collapse( + cBuf, end - bufPos, stringStart, stringEnd, filter); } // If the resulting string is empty (only whitespaces), // abort if (content.empty()) { - cursor = end; + bufPos = end; break; } // Calculate the 
absolute positions and return the token - stringStart += cursor; - stringEnd += cursor; + stringStart += bufPos; + stringEnd += bufPos; token = Token( Tokens::Data, content, SourceLocation(sourceId, offsets.loadOffset(stringStart).first, offsets.loadOffset(stringEnd).first)); - cursor = end; + bufPos = end; return true; } } @@ -286,14 +419,18 @@ public: // If start equals end, we're currently directly at a token // instance. Return this token and advance the cursor to the end of // the token. - if (cursor == end && it != marks.end()) { + if (bufPos == end && it != marks.end()) { const size_t tokenStart = it->bufStart; const size_t tokenEnd = it->bufStart + it->len; token = Token( it->id, std::string(&buf[tokenStart], it->len), SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, offsets.loadOffset(tokenEnd).first)); - cursor = tokenEnd; + + // Update the cursor, consume the token by incrementing the marks + // pos counter + bufPos = tokenEnd; + markPos = it - marks.begin() + 1; return true; } @@ -303,12 +440,63 @@ public: return false; } + /** + * Resets the TokenizedDataImpl instance to the state it had when it was + * constructred. + */ + void clear() + { + buf.clear(); + protectedChars.clear(); + offsets.clear(); + marks.clear(); + currentIndentation = 0; + lastIndentation = 0; + numLinebreaks = 1; // Assume the stream starts with a linebreak + sorted = true; + } + + /** + * Trims the length of the TokenizedDataImpl instance to the given length. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) + { + if (length < size()) { + buf.resize(length); + offsets.trim(length); + } + } + /** * Returns the current size of the internal buffer. * * @return the size of the internal character buffer. */ - size_t getSize() { return buf.size(); } + size_t size() const { return buf.size(); } + + /** + * Returns true if no data is in the data buffer. 
+ * + * @return true if the "buf" instance has no data. + */ + bool empty() const { return buf.empty(); } + + /** + * Returns the current location of all data in the buffer. + * + * @return the location of the entire data represented by this instance. + */ + SourceLocation getLocation() const + { + if (empty()) { + return SourceLocation{sourceId}; + } + return SourceLocation{sourceId, offsets.loadOffset(0).first, + offsets.loadOffset(size()).second}; + } }; /* Class TokenizedData */ @@ -316,50 +504,83 @@ public: TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} TokenizedData::TokenizedData(SourceId sourceId) - : impl(std::make_shared(sourceId)), cursor(0) + : impl(std::make_shared(sourceId)) { } TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, + bool protect) { - return impl->append(data, offsStart); + return impl->append(data, offsStart, protect); } size_t TokenizedData::append(char c, SourceOffset offsStart, - SourceOffset offsEnd) + SourceOffset offsEnd, bool protect) { - return impl->append(c, offsStart, offsEnd); + return impl->append(c, offsStart, offsEnd, protect); } void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->getSize() - len, len); + impl->mark(id, impl->size() - len, len, false); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) { - impl->mark(id, bufStart, len); + impl->mark(id, bufStart, len, false); } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { impl->clear(); } + +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const { - return impl->next(token, mode, tokens, cursor); + return impl->getLocation(); } -bool 
TokenizedData::text(Token &token, WhitespaceMode mode) +TokenizedDataReader TokenizedData::reader() const { - // Copy the current cursor position to not update the actual cursor position - // if the operation was not successful - size_t cursorCopy = cursor; - if (!impl->next(token, mode, tokens, cursorCopy) || - token.id != Tokens::Data) { - return false; - } + return TokenizedDataReader(impl, TokenizedDataCursor(), + TokenizedDataCursor()); +} + +/* Class TokenizedDataReader */ - // There is indeed a text token, update the internal cursor position - cursor = cursorCopy; - return true; +TokenizedDataReader::TokenizedDataReader( + std::shared_ptr impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const +{ + return readCursor.bufPos >= impl->size(); +} + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + peekCursor = readCursor; + return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, peekCursor); } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..b72ca02 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -37,40 +37,48 @@ #include #include - -#include "Token.hpp" +#include namespace ousia { // Forward declaration class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork; /** - * The TokenizedData class stores data extracted from a user defined document. 
- * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. + * Internally used structure representing a cursor within the TokenizedData + * stream. */ -class TokenizedData { -private: +struct TokenizedDataCursor { /** - * Shared pointer pointing at the internal data. This data is shared when - * copying TokenizedData instances, which corresponds to forking a - * TokenizedData instance. + * Position within the byte buffer. */ - std::shared_ptr impl; + size_t bufPos; /** - * Contains all currently enabled token ids. + * Position within the token mark buffer. */ - std::unordered_set tokens; + size_t markPos; /** - * Position from which the last element was read from the internal buffer. - * This information is not shared with the other instances of TokenizedData - * pointing at the same location. + * Default constructor. The resulting cursor points at the beginning of the + * stream. + */ + TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * The data stored in TokenizedData + */ +class TokenizedData { +private: + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. */ - size_t cursor; + std::shared_ptr impl; public: /** @@ -101,10 +109,13 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. 
*/ - size_t append(const std::string &data, SourceOffset offsStart = 0); + size_t append(const std::string &data, SourceOffset offsStart = 0, + bool protect = false); /** * Appends a single character to the internal character buffer. @@ -112,10 +123,13 @@ public: * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect = false); /** * Stores a token ending at the last character of the current buffer. @@ -136,54 +150,194 @@ public: void mark(TokenId id, size_t bufStart, TokenLength len); /** - * Enables a single token id. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Resets the TokenizedData instance to the state it had when it was + * constructred. + */ + void clear(); + + /** + * Trims the length of the TokenizedData instance to the given length. Note + * that this function does not remove any token matches for performance + * reasons, it merely renders them incaccessible. Appending new data after + * calling trim will make the token marks accessible again. Thus this method + * should be the last function called to modify the data buffer and the + * token marks. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. 
+ */ + void trim(size_t length); + + /** + * Returns the number of characters currently represented by this + * TokenizedData instance. + */ + size_t size() const; + + /** + * Returns true if the TokenizedData instance is empty, false otherwise. * - * @param id is the TokenId of the token that should be enabled. + * @return true if not data is stored inside the TokenizedData instance. */ - void enableToken(TokenId id) { tokens.insert(id); } + bool empty() const; /** - * Enables a set of token ids. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Returns the location of the entire TokenizedData instance. * - * @param ids is the TokenId of the token that should be enabled. + * @return the location of the entire data represented by this instance. */ - void enableToken(const std::unordered_set &ids) - { - tokens.insert(ids.begin(), ids.end()); - } + SourceLocation getLocation() const; + + /** + * Returns a TokenizedDataReader instance that can be used to access the + * data. + * + * @return a new TokenizedDataReader instance pointing at the beginning of + * the internal buffer. + */ + TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader + */ +class TokenizedDataReader { +private: + friend TokenizedData; + + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. + */ + std::shared_ptr impl; + + /** + * Position from which the last element was read from the internal buffer. + */ + TokenizedDataCursor readCursor; + + /** + * Position from which the last element was peeked from the internal buffer. + */ + TokenizedDataCursor peekCursor; + +protected: + /** + * Protected constructor of TokenizedDataReader, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader. 
+ * + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReader(std::shared_ptr impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor); + +public: + /** + * Returns a new TokenizedDataReaderFork from which tokens and text can be + * read without advancing this reader instance. + */ + TokenizedDataReaderFork fork(); + + /** + * Returns true if this TokenizedData instance is at the end. + * + * @return true if the end of the TokenizedData instance has been reached. + */ + bool atEnd() const; /** * Stores the next token in the given token reference, returns true if the - * operation was successful, false if there are no more tokens. + * operation was successful, false if there are no more tokens. Advances the + * internal cursor and re * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ - bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool read(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM); /** - * Stores the next text token in the given token reference, returns true if - * the operation was successful (there was indeed a text token), false if - * the next token is not a text token or there were no more tokens. + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. 
* * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ - bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool peek(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM); + + /** + * Consumes the peeked tokens, the read cursor will now be at the position + * of the peek cursor. + */ + void consumePeek() { readCursor = peekCursor; } + + /** + * Resets the peek cursor to the position of the read cursor. + */ + void resetPeek() { peekCursor = readCursor; } +}; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader + */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: + friend TokenizedDataReader; + + /** + * Reference pointing at the parent TokenizedDataReader to which changes may + * be commited. + */ + TokenizedDataReader &parent; + + /** + * Private constructor of TokenizedDataReaderFork, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader and a reference at the parent TokenizedDataReader. + * + * @param parent is the TokenizedDataReader instance to which the current + * read/peek progress may be commited. + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. 
+ */ + TokenizedDataReaderFork(TokenizedDataReader &parent, + std::shared_ptr impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) + { + } + +public: + /** + * Commits the read/peek progress to the underlying parent. + */ + void commit() { parent = *this; } }; } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..e78b0f4 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@ #include #include #include -#include +#include "TokenizedData.hpp" #include "Tokenizer.hpp" namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch { Token token; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; + size_t dataStartOffset; /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. + * Set to true if the matched token is a primary token. */ - size_t textEnd; + bool primary; /** * Constructor of the TokenMatch class. */ - TokenMatch() : textLength(0), textEnd(0) {} + TokenMatch() : dataStartOffset(0), primary(false) {} /** * Returns true if this TokenMatch instance actually represents a match. + * + * @return true if the TokenMatch actually has a match. + */ + bool hasMatch() const { return token.id != Tokens::Empty; } + + /** + * Returns the length of the matched token. + * + * @return the length of the token string. 
*/ - bool hasMatch() { return token.id != Tokens::Empty; } + size_t size() const { return token.content.size(); } }; /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private: size_t start; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; + size_t dataStartOffset; public: /** * Constructor of the TokenLookup class. * * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. + * @param start is the start position in the source file. + * @param dataStartOffset is the current length of the TokenizedData buffer. */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t dataStartOffset) + : node(node), start(start), dataStartOffset(dataStartOffset) { } /** * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). + * character. If a complete token is matched, stores the match in the given + * TokenMatch reference and returns true. * * @param c is the character that should be appended to the current prefix. * @param lookups is a list to which new TokeLookup instances are added -- @@ -123,73 +122,49 @@ public: * Tokenizer. * @param end is the end byte offset of the current character. * @param sourceId is the source if of this file. 
+ * @return true if a token was matched, false otherwise. */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) + bool advance(char c, std::vector &lookups, TokenMatch &match, + const std::vector &tokens, + SourceOffset end, SourceId sourceId) { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node + // Set to true once a token has been matched + bool res = false; + + // Check whether we can continue the current token path, if not, abort auto it = node->children.find(c); if (it == node->children.end()) { - return; + return res; } // Check whether the new node represents a complete token a whether it // is longer than the current token. If yes, replace the current token. node = it->second.get(); - if (node->type != Tokens::Empty) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - Token{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } + if (node->id != Tokens::Empty) { + const Tokenizer::TokenDescriptor &descr = tokens[node->id]; + match.token = Token(node->id, descr.string, + SourceLocation(sourceId, start, end)); + match.dataStartOffset = dataStartOffset; + match.primary = descr.primary; + res = true; } // If this state can possibly be advanced, store it in the states list. if (!node->children.empty()) { lookups.emplace_back(*this); } + return res; } }; -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. 
- */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.id = Tokens::Data; -} } /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template -bool Tokenizer::next(CharReader &reader, Token &token) +template +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -199,43 +174,62 @@ bool Tokenizer::next(CharReader &reader, Token &token) // Prepare the lookups in the token trie const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; + TokenMatch bestMatch; std::vector lookups; std::vector nextLookups; - // Instantiate the text handler - TextHandler textHandler; - // Peek characters from the reader and try to advance the current token tree // cursor char c; + const size_t initialDataSize = data.size(); size_t charStart = reader.getPeekOffset(); const SourceId sourceId = reader.getSourceId(); while (reader.peek(c)) { const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; + const size_t dataStartOffset = data.size(); // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); + if (!bestMatch.hasMatch()) { + lookups.emplace_back(root, charStart, dataStartOffset); } // Try to 
advance all other lookups with the new character + TokenMatch match; for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + // Continue if the current lookup + if (!lookup.advance(c, nextLookups, match, tokens, charEnd, + sourceId)) { + continue; + } + + // If the matched token is primary, check whether it is better than + // the current best match, if yes, replace the best match. In any + // case just continue + if (match.primary) { + if (match.size() > bestMatch.size()) { + bestMatch = match; + } + continue; + } + + // Otherwise -- if the matched token is a non-primary token (and no + // primary token has been found until now) -- mark the match in the + // TokenizedData + if (!bestMatch.hasMatch()) { + data.mark(match.token.id, data.size() - match.size() + 1, + match.size()); + } } // We have found a token and there are no more states to advance or the // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { + if (bestMatch.hasMatch()) { + if ((nextLookups.empty() || data.size() > initialDataSize)) { break; } } else { // Record all incomming characters - textHandler.append(c, charStart, charEnd); + data.append(c, charStart, charEnd); } // Swap the lookups and the nextLookups list @@ -246,60 +240,53 @@ bool Tokenizer::next(CharReader &reader, Token &token) charStart = charEnd; } - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildDataToken(textHandler, match, sourceId); + // If we found data, emit a corresponding data token + if (data.size() > initialDataSize && + (!bestMatch.hasMatch() || + bestMatch.dataStartOffset > initialDataSize)) { + // If we have a "bestMatch" wich starts after text data has started, + // trim the TokenizedData to this offset + if (bestMatch.dataStartOffset > initialDataSize) { + data.trim(bestMatch.dataStartOffset); + } + + 
// Create a token containing the data location + bestMatch.token = Token{data.getLocation()}; } // Move the read/peek cursor to the end of the token, abort if an error // happens while doing so - if (match.hasMatch()) { + if (bestMatch.hasMatch()) { // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { + if (bestMatch.token.location.getEnd() == InvalidSourceOffset) { throw OusiaException{"Token end position offset out of range"}; } // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); + const size_t end = bestMatch.token.location.getEnd(); if (read) { reader.seek(end); } else { reader.seekPeekCursor(end); } - token = match.token; + token = bestMatch.token; } else { token = Token{}; } - return match.hasMatch(); + return bestMatch.hasMatch(); } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next(reader, token, data); } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next(reader, token, data); } -TokenId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token, bool primary) { // Abort if an empty token should be registered if (token.empty()) { @@ -309,8 +296,8 @@ TokenId Tokenizer::registerToken(const std::string &token) // Search for a new slot in the tokens list TokenId type = 
Tokens::Empty; for (size_t i = nextTokenId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; + if (!tokens[i].valid()) { + tokens[i] = TokenDescriptor(token, primary); type = i; break; } @@ -320,62 +307,47 @@ TokenId Tokenizer::registerToken(const std::string &token) // override the special token type handles if (type == Tokens::Empty) { type = tokens.size(); - if (type == Tokens::Data || type == Tokens::Empty) { + if (type >= Tokens::MaxTokenId) { throw OusiaException{"Token type ids depleted!"}; } - tokens.emplace_back(token); + tokens.emplace_back(token, primary); } nextTokenId = type + 1; - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list + // Try to register the token in the trie -- if this fails, remove it from + // the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; + tokens[type] = TokenDescriptor(); nextTokenId = type; return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id) { // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenId = type; + if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { + tokens[id] = TokenDescriptor(); + nextTokenId = id; return true; } return false; } -std::string Tokenizer::getTokenString(TokenId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const { - whitespaceMode = mode; + if (id < tokens.size()) { + return tokens[id]; + } + return EmptyTokenDescriptor; } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } - /* Explicitly instantiate all possible 
instantiations of the "next" member function */ -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); } diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -28,44 +28,80 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include +#include #include #include #include -#include +#include -#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; +class TokenizedData; /** * The Tokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * Tokenizer always tries to extract the longest possible token from the - * tokenizer. + * CharReader. It allows to register and unregister tokens while parsing. Note + * that the Tokenizer always tries to extract the longest possible token from + * the tokenizer. Tokens can be registered as primary or non-primary token. 
If + * a Token is registered as a primary token, it is returned as a single Token + * instance if it occurs. In the non-primary case the token is returned as part + * of a segmented TokenizedData instance. */ class Tokenizer { -private: +public: /** - * Internally used token trie. This object holds all registered tokens. + * Internally used structure describing a registered token. */ - TokenTrie trie; + struct TokenDescriptor { + /** + * String describing the token. + */ + std::string string; + + /** + * Set to true if this token is primary. + */ + bool primary; + + /** + * Constructor of the TokenDescriptor class. + * + * @param string is the string representation of the registered token. + * @param primary specifies whether the token is a primary token that + * should be returned as a single token, or a secondary token, that + * should be returned as part of TokenizedData. + */ + TokenDescriptor(const std::string &string, bool primary) + : string(string), primary(primary) + { + } + + /** + * Default constructor. + */ + TokenDescriptor() : primary(false) {} + + /** + * Returns true if the TokenDescriptor represents a valid token. + */ + bool valid() { return !string.empty(); } + }; +private: /** - * Flag defining whether whitespaces should be preserved or not. + * Internally used token trie. This object holds all registered tokens. */ - WhitespaceMode whitespaceMode; + TokenTrie trie; /** * Vector containing all registered token types. */ - std::vector tokens; + std::vector tokens; /** * Next index in the tokens list where to search for a new token id. @@ -74,90 +110,78 @@ private: /** * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. + * function is templated in order to force optimized code generation for + * both reading and peeking. * - * @tparam TextHandler is the type to be used for the textHandler instance. 
- * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. + * @tparam read specifies whether the method should read the token or just + * peek. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return false if the end of the stream has been reached, true otherwise. */ - template - bool next(CharReader &reader, Token &token); + template + bool next(CharReader &reader, Token &token, TokenizedData &data); public: /** * Constructor of the Tokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. */ - Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(); /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. + * Registers the given string as a token. Returns a unique identifier + * describing the registered token. * * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if + * @param primary specifies whether the token is a primary token -- if true, + * the token will be returned as a single, standalone token. Otherwise the + * token will be returned as part of a "TokenizedData" structure. + * @return a unique identifier for the registered token or Tokens::Empty if * an error occured. */ - TokenId registerToken(const std::string &token); + TokenId registerToken(const std::string &token, bool primary = true); /** * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The - *TokenId - * must have been returned by registerToken. 
+ * TokenId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). + * because the token with the given TokenId was already unregistered). */ - bool unregisterToken(TokenId type); + bool unregisterToken(TokenId id); /** * Returns the token that was registered under the given TokenId id or - *an - * empty string if an invalid TokenId id is given. + * an empty string if an invalid TokenId id is given. * - * @param type is the TokenId id for which the corresponding token - *string + * @param id is the TokenId for which the corresponding TokenDescriptor * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. + * @return the registered TokenDescriptor or an invalid TokenDescriptor if + * the given TokenId is invalid. */ - WhitespaceMode getWhitespaceMode(); + const TokenDescriptor& lookupToken(TokenId id) const; /** * Reads a new token from the CharReader and stores it in the given - * Token instance. + * Token instance. If the token has the id Tokens::Data, use the "getData" + * method to fetch a reference at the underlying TokenizedData instance + * storing the data. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. 
*/ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..d4cdbf8 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -94,92 +94,11 @@ public: static const PlainFormatTokens OsmlTokens; -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. - */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. 
- */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. - */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) - : reader(reader), logger(logger), tokenizer(OsmlTokens) + : reader(reader), + logger(logger), + tokenizer(OsmlTokens), + data(reader.getSourceId()) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true, false}); @@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; - bool hasCharSiceNSSep = false; + bool hasCharSinceNSSep = false; std::vector identifier; size_t end = reader.getPeekOffset(); char c, c2; @@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) if ((first && Utils::isIdentifierStartCharacter(c)) || (!first && 
Utils::isIdentifierCharacter(c))) { identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { identifier.push_back(c); } else { @@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) // This is no longer the first character first = false; - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); + // Advance the hasCharSinceNSSep flag + hasCharSinceNSSep = allowNSSep && (c != ':'); end = reader.getPeekOffset(); reader.consumePeek(); @@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; - while (tokenizer.read(reader, token)) { + while (tokenizer.read(reader, token, data)) { + // Throw the comment data away + data.clear(); + if (token.id == OsmlTokens.BlockCommentEnd) { depth--; if (depth == 0) { @@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment() } } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData() { - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); + if (!data.empty()) { location = data.getLocation(); reader.resetPeek(); return true; @@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField() OsmlStreamParser::State OsmlStreamParser::parse() { - // Handler for incomming data - DataHandler handler; + // Reset the data handler + data.clear(); // Read tokens until the outer loop should be left Token token; - while (tokenizer.peek(reader, token)) { + while (tokenizer.peek(reader, token, data)) { const TokenId type = token.id; // Special handling for Backslash and Text @@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Try to parse a command if (Utils::isIdentifierStartCharacter(c)) { // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { + if (checkIssueData()) { return 
State::DATA; } @@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse() // If this was an annotation start token, add the parsed < to the // output if (type == OsmlTokens.AnnotationStart) { - handler.append('<', token.location.getStart(), - token.location.getStart() + 1); + data.append('<', token.location.getStart(), + token.location.getStart() + 1); } - handler.append(c, token.location.getStart(), - reader.getPeekOffset()); + data.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); continue; } else if (type == Tokens::Data) { @@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() location = token.location; return State::FIELD_START; } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - reader.consumePeek(); continue; } // A non-text token was reached, make sure all pending data commands // have been issued - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse() Command &cmd = commands.top(); if (!cmd.inField) { cmd.inField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got field start token \"{\", but no command for which to " "start the field. Write \"\\{\" to insert this sequence as " "text.", - token); + token);*/ } else if (token.id == OsmlTokens.FieldEnd) { - if (closeField()) { + closeField(); + return State::FIELD_END; +/* if (closeField()) { return State::FIELD_END; } logger.error( "Got field end token \"}\", but there is no field to end. 
" "Write \"\\}\" to insert this sequence as text.", - token); + token);*/ } else if (token.id == OsmlTokens.DefaultFieldStart) { // Try to start a default field the first time the token is reached Command &topCmd = commands.top(); if (!topCmd.inField) { topCmd.inField = true; topCmd.inDefaultField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got default field start token \"{!\", but no command for " "which to start the field. Write \"\\{!\" to insert this " "sequence as text", - token); + token);*/ } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Issue available data - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ + TokenizedData dataFork = data; + Variant text = dataFork.text(mode); + location = text.getLocation(); + return text; +} + const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include +#include #include +#include #include +#include namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public: Variant arguments; /** - * Set to true if this is a command with clear begin and end. 
- */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). + * Vector used as stack for holding the number of opening/closing braces + * and the corresponding "isDefaultField" flag. */ - bool inRangeField : 1; + std::vector fields; /** - * Set to true if we are currently in a field that has been especially - * marked as default field (using the "|") syntax. + * Set to true if this is a command with clear begin and end. */ - bool inDefaultField : 1; + bool hasRange; /** * Default constructor. @@ -164,7 +155,6 @@ public: Command() : hasRange(false), inField(false), - inRangeField(false), inDefaultField() { } @@ -178,15 +168,10 @@ public: * command. * @param hasRange should be set to true if this is a command with * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently are inside the - * outer field of a ranged command. * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField, bool inDefaultField) + Command(Variant name, Variant arguments, bool hasRange) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), @@ -215,25 +200,20 @@ private: Tokenizer tokenizer; /** - * Stack containing the current commands. - */ - std::stack commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). + * Variant containing the tokenized data that was returned from the + * tokenizer as data. */ - Variant data; + TokenizedData data; /** - * Contains the location of the last token. + * Stack containing the current commands. 
*/ - SourceLocation location; + std::stack commands; /** - * Contains the field index of the current command. + * Pointer at */ - size_t fieldIdx; + std::unique_ptr impl; /** * Function used internall to parse an identifier. @@ -291,12 +271,10 @@ private: /** * Checks whether there is any data pending to be issued, if yes, issues it. * - * @param handler is the data handler that contains the data that may be - * returned to the user. * @return true if there was any data and DATA should be returned by the * parse function, false otherwise. */ - bool checkIssueData(DataHandler &handler); + bool checkIssueData(); /** * Called before any data is appended to the internal data handler. Checks @@ -327,6 +305,12 @@ public: */ OsmlStreamParser(CharReader &reader, Logger &logger); + /** + * Destructor of the OsmlStreamParser, needed to destroy the incomplete + * OsmlStreamParserImpl. + */ + ~OsmlStreamParser(); + /** * Continues parsing. Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter @@ -344,7 +328,19 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() const { return data; } + const TokenizedData &getData() const { return data; } + + /** + * Returns the complete content of the internal TokenizedData instance as + * a single string Variant. This method is mainly used in the unit tests for + * this class, it simply calls the text() method of TokenizedData. + * + * @param mode is the WhitespaceMode that should be used for returning the + * text. + * @return a string variant containing the text content of the internal + * TokenizedData instance or a nullptr variant if there is no text. + */ + Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Returns a reference at the internally stored command name. Only valid if @@ -371,13 +367,6 @@ public: * syntax). 
*/ bool inDefaultField() const; - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - const SourceLocation &getLocation() const { return location; } }; } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..855f80d 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -56,17 +55,6 @@ public: */ std::vector textBuf; - /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - /** * Current character data start. */ @@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) SourceLocation loc = xmlSyncLoggerPosition(p, ulen); // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); OsxmlEventParserData &data = parser->getData(); std::vector &textBuf = data.textBuf; - std::vector &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } + + // Update start and end position + if (textBuf.empty()) { + data.textStart = 
loc.getStart(); } + data.textEnd = loc.getEnd(); + + // Insert the data into the text buffer + textBuf.insert(textBuf.end(), &s[0], &s[ulen]); } /* Class OsxmlEvents */ @@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) + : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0) { } @@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId) // Reset the text buffers textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; textStart = 0; textEnd = 0; @@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), data(new OsxmlEventParserData()) { } @@ -532,16 +497,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..e3fd5d4 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include #include -#include - namespace ousia { // Forward declarations @@ -99,13 +97,10 @@ public: virtual void fieldEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. 
* - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a Variant containing the string data that was found in the + * XML file. */ virtual void data(const Variant &data) = 0; }; @@ -134,11 +129,6 @@ private: */ Logger &logger; - /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - /** * Data to be used by the internal functions. */ @@ -170,21 +160,6 @@ public: */ void parse(); - /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - /** * Returns the internal CharReader reference. * diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index a93f14a..83966d5 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -53,7 +54,7 @@ struct Tracker { Variant::mapType annotationStartArgs; Variant annotationEndClassName; Variant annotationEndElementName; - Variant dataData; + TokenizedData dataData; bool startResult; bool fieldStartSetIsDefault; @@ -81,7 +82,7 @@ struct Tracker { annotationStartArgs = Variant::mapType{}; annotationEndClassName = Variant::fromString(std::string{}); annotationEndElementName = Variant::fromString(std::string{}); - dataData = Variant::fromString(std::string{}); + dataData = TokenizedData(); startResult = true; fieldStartSetIsDefault = false; @@ -157,7 +158,7 @@ public: return tracker.annotationEndResult; } - bool data(Variant &data) override + bool data(TokenizedData &data) override { tracker.dataCount++; tracker.dataData = data; @@ -363,7 +364,7 @@ TEST(Stack, multipleFields) s.data("test"); tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, 
ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test", tracker.dataData); + EXPECT_EQ("test", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc @@ -375,7 +376,7 @@ TEST(Stack, multipleFields) s.data("test2"); tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test2", tracker.dataData); + EXPECT_EQ("test2", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc @@ -387,7 +388,7 @@ TEST(Stack, multipleFields) s.data("test3"); tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test3", tracker.dataData); + EXPECT_EQ("test3", tracker.dataData.text().asString()); s.fieldEnd(); tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc @@ -744,4 +745,4 @@ TEST(Stack, fieldAfterDefaultField) ASSERT_FALSE(logger.hasError()); } } -} \ No newline at end of file +} diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index 231bad9..dfe2526 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -22,6 +22,43 @@ namespace ousia { +void assertToken(TokenizedDataReader &reader, TokenId id, + const std::string &text, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId) +{ + Token token; + ASSERT_TRUE(reader.read(token, tokens, mode)); + EXPECT_EQ(id, token.id); + EXPECT_EQ(text, token.content); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, token.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, token.getLocation().getEnd()); + } + EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +void assertText(TokenizedDataReader &reader, const std::string &text, + const 
TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId id = InvalidSourceId) +{ + assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); +} + +void assertEnd(TokenizedDataReader &reader) +{ + Token token; + ASSERT_TRUE(reader.atEnd()); + ASSERT_FALSE(reader.read(token)); +} + TEST(TokenizedData, dataWhitespacePreserve) { TokenizedData data; @@ -29,15 +66,10 @@ TEST(TokenizedData, dataWhitespacePreserve) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test1 test2 ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(16U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, " test1 test2 ", TokenSet{}, WhitespaceMode::PRESERVE, + 0, 16); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceTrim) @@ -47,15 +79,10 @@ TEST(TokenizedData, dataWhitespaceTrim) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::TRIM, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, dataWhitespaceCollapse) @@ -65,15 +92,10 @@ TEST(TokenizedData, dataWhitespaceCollapse) // 0123456789012345 // 0 1 - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - 
EXPECT_EQ("test1 test2", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(14U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test1 test2", TokenSet{}, WhitespaceMode::COLLAPSE, 1, + 14); + assertEnd(reader); } TEST(TokenizedData, singleToken) @@ -82,17 +104,9 @@ TEST(TokenizedData, singleToken) ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, singleDisabledToken) @@ -101,15 +115,9 @@ TEST(TokenizedData, singleDisabledToken) ASSERT_EQ(2U, data.append("$$")); data.mark(5, 0, 2); - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "$$", TokenSet{}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, dualToken) @@ -120,18 +128,10 @@ TEST(TokenizedData, dualToken) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(5); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); 
- EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5, 6}, WhitespaceMode::COLLAPSE, 0, + 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenShorterEnabled) @@ -142,385 +142,281 @@ TEST(TokenizedData, dualTokenShorterEnabled) data.mark(5, 0, 2); data.mark(6, 1, 1); - data.enableToken(6); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(1U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, 6, "$", TokenSet{6}, WhitespaceMode::COLLAPSE, 1, 2); + assertEnd(reader); } TEST(TokenizedData, dualTokenLongerEnabled) { TokenizedData data; ASSERT_EQ(2U, data.append("$$")); + data.mark(6, 0, 1); data.mark(5, 0, 2); + data.mark(6, 1, 1); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, 
WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataPreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" test ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, " test text ", TokenSet{5}, WhitespaceMode::PRESERVE, + 2, 16); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - 
ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::TRIM, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndDataCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ test $$")); - // 0123456789 + ASSERT_EQ(18U, data.append("$$ test text $$")); + // 012345678901234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - 
EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertText(reader, "test text", TokenSet{5}, WhitespaceMode::COLLAPSE, 3, + 15); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 16, 18); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespacePreserveWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(8U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, " ", 
TokenSet{5}, WhitespaceMode::PRESERVE, 2, 6); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::PRESERVE, 6, 8); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespaceTrimWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::TRIM, 6, 8); + assertEnd(reader); } TEST(TokenizedData, tokensAndWhitespaceCollapseWhitespace) { TokenizedData data; - ASSERT_EQ(10U, data.append("$$ $$")); - // 0123456789 + ASSERT_EQ(8U, data.append("$$ $$")); + // 01234567 data.mark(5, 0, 2); data.mark(5, 2); - data.enableToken(5); - - Token token; - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(8U, token.getLocation().getStart()); - EXPECT_EQ(10U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.next(token, 
WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 0, 2); + assertToken(reader, 5, "$$", TokenSet{5}, WhitespaceMode::COLLAPSE, 6, 8); + assertEnd(reader); } -TEST(TokenizedData, textPreserveWhitespace) +TEST(TokenizedData, appendChars) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); - - data.enableToken(5); + ASSERT_EQ(1U, data.append('t', 5, 7)); + ASSERT_EQ(2U, data.append('e', 7, 8)); + ASSERT_EQ(3U, data.append('s', 8, 10)); + ASSERT_EQ(4U, data.append('t', 10, 12)); - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(0U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.next(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.text(token, WhitespaceMode::PRESERVE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::PRESERVE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test", TokenSet{5}, WhitespaceMode::COLLAPSE, 5, 12); + assertEnd(reader); } -TEST(TokenizedData, textTrimWhitespace) +TEST(TokenizedData, protectedWhitespace) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); + ASSERT_EQ(4U, data.append("test", 10)); + ASSERT_EQ(11U, 
data.append(" test", 14, true)); - data.enableToken(5); - - Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); - - ASSERT_TRUE(data.next(token, WhitespaceMode::TRIM)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); + TokenizedDataReader reader = data.reader(); + assertText(reader, "test test", TokenSet{5}, WhitespaceMode::COLLAPSE, 10, + 21); + assertEnd(reader); +} - ASSERT_FALSE(data.text(token, WhitespaceMode::TRIM)); - ASSERT_FALSE(data.next(token, WhitespaceMode::TRIM)); +TEST(TokenizedData, specialNewlineToken) +{ + TokenizedData data; + data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 + + const TokenSet tokens{Tokens::Newline}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 3, 4); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 7, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 9, 10); + assertEnd(reader); } -TEST(TokenizedData, textCollapseWhitespace) +TEST(TokenizedData, specialParagraphToken) { TokenizedData data; - ASSERT_EQ(6U, data.append(" $$ ")); - // 012345 - data.mark(5, 2, 2); + data.append("a\nb\n \nc\n"); + // 0 12 3456 78 9 - data.enableToken(5); + const TokenSet tokens{Tokens::Paragraph}; - Token token; - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Paragraph, "\n 
\n", tokens, + WhitespaceMode::COLLAPSE, 3, 8); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 8, 9); + assertEnd(reader); +} - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(5U, token.id); - EXPECT_EQ("$$", token.content); - EXPECT_EQ(2U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); +TEST(TokenizedData, specialSectionToken) +{ + TokenizedData data; + data.append("a\nb\n \n \t \n"); + // 0 12 3456 789 01 2 + // 0 1 + + const TokenSet tokens{Tokens::Section}; - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); + TokenizedDataReader reader = data.reader(); + assertText(reader, "a b", tokens, WhitespaceMode::COLLAPSE, 0, 3); + assertToken(reader, Tokens::Section, "\n \n \t \n", tokens, + WhitespaceMode::COLLAPSE, 3, 13); + assertEnd(reader); } -TEST(TokenizedData, appendChars) +TEST(TokenizedData, specialTokenPrecedence) { TokenizedData data; - ASSERT_EQ(1U, data.append('t', 5, 7)); - ASSERT_EQ(2U, data.append('e', 7, 8)); - ASSERT_EQ(3U, data.append('s', 8, 10)); - ASSERT_EQ(4U, data.append('t', 10, 12)); + data.append("a\nb\n\nc\n\n\nd"); + // 0 12 3 45 6 7 89 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertText(reader, "a", tokens, WhitespaceMode::COLLAPSE, 0, 1); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 1, 2); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 2, 3); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 3, 5); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 5, 6); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 6, 9); + assertText(reader, "d", tokens, WhitespaceMode::COLLAPSE, 9, 10); + assertEnd(reader); +} - Token token; - 
ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("test", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(12U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - ASSERT_FALSE(data.next(token, WhitespaceMode::COLLAPSE)); +TEST(TokenizedData, specialTokenPrecedence2) +{ + TokenizedData data; + data.append("\nb\n\nc\n\n\n"); + // 0 12 3 45 6 7 + + const TokenSet tokens{Tokens::Newline, Tokens::Paragraph, Tokens::Section}; + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Newline, "\n", tokens, WhitespaceMode::COLLAPSE, + 0, 1); + assertText(reader, "b", tokens, WhitespaceMode::COLLAPSE, 1, 2); + assertToken(reader, Tokens::Paragraph, "\n\n", tokens, + WhitespaceMode::COLLAPSE, 2, 4); + assertText(reader, "c", tokens, WhitespaceMode::COLLAPSE, 4, 5); + assertToken(reader, Tokens::Section, "\n\n\n", tokens, + WhitespaceMode::COLLAPSE, 5, 8); + assertEnd(reader); } -TEST(TokenizedData, copy) +TEST(TokenizedData, specialTokenIndent) { TokenizedData data; - ASSERT_EQ(7U, data.append(" a $ b ")); - // 0123456 - data.mark(6, 3, 1); - data.enableToken(6); + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent}; + + TokenizedDataReader reader = data.reader(); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertText(reader, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); 
+ assertEnd(reader); +} - Token token; - ASSERT_TRUE(data.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("a", token.content); - EXPECT_EQ(1U, token.getLocation().getStart()); - EXPECT_EQ(2U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_FALSE(data.text(token, WhitespaceMode::COLLAPSE)); - - TokenizedData dataCopy = data; - - ASSERT_TRUE(data.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(dataCopy.next(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(6U, token.id); - EXPECT_EQ("$", token.content); - EXPECT_EQ(3U, token.getLocation().getStart()); - EXPECT_EQ(4U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - - ASSERT_TRUE(data.text(token, WhitespaceMode::PRESERVE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ(" b ", token.content); - EXPECT_EQ(4U, token.getLocation().getStart()); - EXPECT_EQ(7U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(data.next(token)); - - ASSERT_TRUE(dataCopy.text(token, WhitespaceMode::COLLAPSE)); - EXPECT_EQ(Tokens::Data, token.id); - EXPECT_EQ("b", token.content); - EXPECT_EQ(5U, token.getLocation().getStart()); - EXPECT_EQ(6U, token.getLocation().getEnd()); - EXPECT_EQ(InvalidSourceId, token.getLocation().getSourceId()); - ASSERT_FALSE(dataCopy.next(token)); +TEST(TokenizedData, specialTokenIndentOverlap) +{ + TokenizedData data; + data.append(" test\n\ttest2\n test3 \ttest4\ntest5"); + // 01234567 8 901234 5678901234567890 123456 789012 + // 0 1 2 3 4 + const TokenSet tokens{Tokens::Indent, Tokens::Dedent, 5}; + + data.mark(5, 4, 4); + + TokenizedDataReader reader = data.reader(); + 
assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 4, 4); + assertToken(reader, 5, "test", tokens, WhitespaceMode::COLLAPSE, 4, 8); + assertToken(reader, Tokens::Indent, "", tokens, WhitespaceMode::COLLAPSE, + 10, 10); + assertText(reader, "test2 test3 test4", tokens, WhitespaceMode::COLLAPSE, 10, 37); + assertToken(reader, Tokens::Dedent, "", tokens, WhitespaceMode::COLLAPSE, + 38, 38); + assertText(reader, "test5", tokens, WhitespaceMode::COLLAPSE, 38, 43); + assertEnd(reader); } + } diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 3809a12..0f2bfb7 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace ousia { @@ -31,23 +32,40 @@ TEST(Tokenizer, tokenRegistration) ASSERT_EQ(0U, tokenizer.registerToken("a")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); + ASSERT_EQ("a", tokenizer.lookupToken(0U).string); ASSERT_EQ(1U, tokenizer.registerToken("b")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); + ASSERT_EQ("b", tokenizer.lookupToken(1U).string); ASSERT_EQ(2U, tokenizer.registerToken("c")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); + ASSERT_EQ("c", tokenizer.lookupToken(2U).string); ASSERT_TRUE(tokenizer.unregisterToken(1U)); ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); + ASSERT_EQ("", tokenizer.lookupToken(1U).string); ASSERT_EQ(1U, tokenizer.registerToken("d")); ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); + ASSERT_EQ("d", tokenizer.lookupToken(1U).string); +} + +void expectData(const std::string &expected, SourceOffset tokenStart, + SourceOffset tokenEnd, SourceOffset textStart, + SourceOffset textEnd, 
const Token &token, TokenizedData &data, + WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ + ASSERT_EQ(Tokens::Data, token.id); + + Variant text = data.text(mode); + ASSERT_TRUE(text.isString()); + + EXPECT_EQ(expected, text.asString()); + EXPECT_EQ(tokenStart, token.location.getStart()); + EXPECT_EQ(tokenEnd, token.location.getEnd()); + EXPECT_EQ(textStart, text.getLocation().getStart()); + EXPECT_EQ(textEnd, text.getLocation().getEnd()); } TEST(Tokenizer, textTokenPreserveWhitespace) @@ -56,36 +74,34 @@ TEST(Tokenizer, textTokenPreserveWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); + expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::PRESERVE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + 
ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -95,36 +111,34 @@ TEST(Tokenizer, textTokenTrimWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token, + data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this \t is only a \n\n test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, + token, data, WhitespaceMode::TRIM); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -134,36 +148,34 @@ TEST(Tokenizer, textTokenCollapseWhitespace) CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + 
TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); + expectData("this is only a test text", 0, 36, 1, 33, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } { CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer; Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("this is only a test text", token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); + expectData("this is only a test text", 0, 32, 0, 32, token, data, + WhitespaceMode::COLLAPSE); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } } @@ -177,14 +189,12 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + expectData("test1", 0, 5, 0, 5, token, data); char c; ASSERT_TRUE(reader.peek(c)); @@ -193,7 +203,8 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -209,14 +220,10 @@ TEST(Tokenizer, simpleReadToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", 
token.content); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + expectData("test2", 6, 11, 6, 11, token, data); char c; ASSERT_FALSE(reader.peek(c)); @@ -233,21 +240,17 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -261,35 +264,26 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.peek(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(0U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test1", 0, 5, 0, 5, token, data); ASSERT_EQ(5U, reader.getOffset()); ASSERT_EQ(5U, reader.getPeekOffset()); } { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + TokenizedData data; + 
ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(tid, token.id); ASSERT_EQ(":", token.content); @@ -303,14 +297,9 @@ TEST(Tokenizer, simplePeekToken) { Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + expectData("test2", 6, 11, 6, 11, token, data); ASSERT_EQ(11U, reader.getOffset()); ASSERT_EQ(11U, reader.getPeekOffset()); } @@ -320,6 +309,7 @@ TEST(Tokenizer, ambiguousTokens) { CharReader reader{"abc"}; Tokenizer tokenizer; + TokenizedData data; TokenId t1 = tokenizer.registerToken("abd"); TokenId t2 = tokenizer.registerToken("bc"); @@ -328,16 +318,17 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, t2); Token token; - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_TRUE(tokenizer.read(reader, token, data)); - ASSERT_EQ(Tokens::Data, token.id); - ASSERT_EQ("a", token.content); + expectData("a", 0, 1, 0, 1, token, data); SourceLocation loc = token.location; ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); - ASSERT_TRUE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_TRUE(tokenizer.read(reader, token, data)); ASSERT_EQ(t2, token.id); ASSERT_EQ("bc", token.content); @@ -346,7 +337,8 @@ TEST(Tokenizer, ambiguousTokens) ASSERT_EQ(1U, loc.getStart()); ASSERT_EQ(3U, loc.getEnd()); - ASSERT_FALSE(tokenizer.read(reader, token)); + data.clear(); + ASSERT_FALSE(tokenizer.read(reader, token, data)); } TEST(Tokenizer, commentTestWhitespacePreserve) @@ -354,7 +346,7 @@ TEST(Tokenizer, commentTestWhitespacePreserve) CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::PRESERVE); + Tokenizer tokenizer; const TokenId t1 = tokenizer.registerToken("/"); const TokenId t2 = 
tokenizer.registerToken("/*"); @@ -370,45 +362,23 @@ TEST(Tokenizer, commentTestWhitespacePreserve) Token t; for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); + TokenizedData data(0); + EXPECT_TRUE(tokenizer.read(reader, t, data)); EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); + if (te.id != Tokens::Data) { + EXPECT_EQ(te.content, t.content); + } else { + Variant text = data.text(WhitespaceMode::PRESERVE); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(te.content, text.asString()); + } EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(Tokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - Tokenizer tokenizer(WhitespaceMode::COLLAPSE); - const TokenId t1 = tokenizer.registerToken("/"); - const TokenId t2 = tokenizer.registerToken("/*"); - const TokenId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {Tokens::Data, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {Tokens::Data, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {Tokens::Data, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.id, t.id); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, t, data)); } } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index d52fa5b..3d01007 
100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -30,11 +30,21 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); // static ConcreteLogger logger; +static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader) +{ + OsmlStreamParser::State res = reader.parse(); + if (res == OsmlStreamParser::State::DATA) { + EXPECT_FALSE(reader.getData().hasNonWhitespaceText()); + res = reader.parse(); + } + return res; +} + static void assertCommand(OsmlStreamParser &reader, const std::string &name, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -57,16 +67,19 @@ static void assertCommand(OsmlStreamParser &reader, const std::string &name, static void assertData(OsmlStreamParser &reader, const std::string &data, SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) + SourceOffset end = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); + Variant text = reader.getText(mode); + ASSERT_TRUE(text.isString()); + EXPECT_EQ(data, text.asString()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, text.getLocation().getStart()); EXPECT_EQ(start, reader.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, text.getLocation().getEnd()); EXPECT_EQ(end, reader.getLocation().getEnd()); } } @@ -75,7 +88,7 @@ static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, 
SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader)); EXPECT_EQ(defaultField, reader.inDefaultField()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); @@ -89,7 +102,7 @@ static void assertFieldEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getLocation().getStart()); } @@ -103,7 +116,7 @@ static void assertAnnotationStart(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader)); EXPECT_EQ(name, reader.getCommandName().asString()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); @@ -131,7 +144,7 @@ static void assertAnnotationEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader)); ASSERT_EQ(name, reader.getCommandName().asString()); if (!elementName.empty()) { ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); @@ -152,7 +165,7 @@ static void assertEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader)); if (start != InvalidSourceOffset) { EXPECT_EQ(start, 
reader.getLocation().getStart()); } @@ -205,26 +218,14 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) assertData(reader, "hello world", 1, 14); } -TEST(OsmlStreamParser, escapeWhitespace) -{ - const char *testString = " hello\\ \\ world "; - // 012345 67 89012345 - // 0 1 - CharReader charReader(testString); - - OsmlStreamParser reader(charReader, logger); - - assertData(reader, "hello world", 1, 15); -} - static void testEscapeSpecialCharacter(const std::string &c) { CharReader charReader(std::string("\\") + c); OsmlStreamParser reader(charReader, logger); EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getData().asString()); + EXPECT_EQ(c, reader.getText().asString()); - SourceLocation loc = reader.getData().getLocation(); + SourceLocation loc = reader.getText().getLocation(); EXPECT_EQ(0U, loc.getStart()); EXPECT_EQ(1U + c.size(), loc.getEnd()); } @@ -253,16 +254,16 @@ TEST(OsmlStreamParser, singleLineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(33U, loc.getStart()); ASSERT_EQ(34U, loc.getEnd()); } @@ -279,16 +280,16 @@ TEST(OsmlStreamParser, multilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + 
SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -305,16 +306,16 @@ TEST(OsmlStreamParser, nestedMultilineComment) OsmlStreamParser reader(charReader, logger); { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("a", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(0U, loc.getStart()); ASSERT_EQ(1U, loc.getEnd()); } { ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ("b", reader.getText().asString()); + SourceLocation loc = reader.getText().getLocation(); ASSERT_EQ(40U, loc.getStart()); ASSERT_EQ(41U, loc.getEnd()); } @@ -569,8 +570,11 @@ TEST(OsmlStreamParser, multipleCommands) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertCommand(reader, "b", 3, 5); + assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE); assertCommand(reader, "c", 6, 8); + assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE); assertCommand(reader, "d", 9, 11); assertEnd(reader, 11, 11); } @@ -584,10 +588,13 @@ TEST(OsmlStreamParser, fieldsWithSpaces) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); + assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 3, 4); assertCommand(reader, "b", 4, 6); + assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE); 
assertCommand(reader, "c", 7, 9); assertFieldEnd(reader, 9, 10); + assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE); assertFieldStart(reader, false, 16, 17); assertCommand(reader, "d", 17, 19); assertFieldEnd(reader, 19, 20); diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 3293370..6942166 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -74,13 +75,11 @@ public: }; static std::vector> parseXml( - const char *testString, - WhitespaceMode whitespaceMode = WhitespaceMode::TRIM) + const char *testString) { TestOsxmlEventListener listener; CharReader reader(testString); OsxmlEventParser parser(reader, listener, logger); - parser.setWhitespaceMode(whitespaceMode); parser.parse(); return listener.events; } @@ -157,7 +156,7 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) ASSERT_EQ(expectedEvents, events); } -TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) +TEST(OsxmlEventParser, commandWithData) { const char *testString = " hello \n world "; // 012345678901 234567890123 @@ -168,50 +167,12 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - auto events = parseXml(testString, WhitespaceMode::PRESERVE); + auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); // Check the location of the text ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart()); ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd()); } - -TEST(OsxmlEventParser, commandWithDataTrimWhitespace) -{ - const char *testString = " hello \n world "; - // 012345678901 234567890123 - // 0 1 2 - - std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello \n 
world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::TRIM); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} - -TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) -{ - const char *testString = " hello \n world "; - // 012345678901 234567890123 - // 0 1 2 - - std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, - {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; - - auto events = parseXml(testString, WhitespaceMode::COLLAPSE); - ASSERT_EQ(expectedEvents, events); - - // Check the location of the text - ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); - ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); -} } -- cgit v1.2.3 From b95cf0ddd1aee517ed948155d43da4e2b64cfcdf Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Thu, 26 Feb 2015 00:21:33 +0100 Subject: Fixed non-initialized variable --- src/core/parser/utils/TokenizedData.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index aeefa26..bcbbe43 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -26,6 +26,12 @@ #include "TokenizedData.hpp" namespace ousia { +/** + * Maximum token length. + */ +constexpr TokenLength MaxTokenLength = + std::numeric_limits::max(); + namespace { /** * Structure used to represent the position of a token in the internal @@ -52,12 +58,6 @@ struct TokenMark { */ bool special; - /** - * Maximum token length. 
- */ - static constexpr TokenLength MaxTokenLength = - std::numeric_limits::max(); - /** * Constructor of the TokenMark structure, initializes all members with the * given values. @@ -450,6 +450,7 @@ public: protectedChars.clear(); offsets.clear(); marks.clear(); + firstLinebreak = 0; currentIndentation = 0; lastIndentation = 0; numLinebreaks = 1; // Assume the stream starts with a linebreak -- cgit v1.2.3 From 041a2dd18050e9e26ca1ee00851461dff1e1f90c Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Thu, 26 Feb 2015 00:22:12 +0100 Subject: Moved "assert" functions to own header --- test/core/parser/utils/TokenizedDataTest.cpp | 39 +------------- test/core/parser/utils/TokenizedDataTestUtils.hpp | 64 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 37 deletions(-) create mode 100644 test/core/parser/utils/TokenizedDataTestUtils.hpp diff --git a/test/core/parser/utils/TokenizedDataTest.cpp b/test/core/parser/utils/TokenizedDataTest.cpp index dfe2526..8488459 100644 --- a/test/core/parser/utils/TokenizedDataTest.cpp +++ b/test/core/parser/utils/TokenizedDataTest.cpp @@ -20,44 +20,9 @@ #include -namespace ousia { - -void assertToken(TokenizedDataReader &reader, TokenId id, - const std::string &text, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset, - SourceId sourceId = InvalidSourceId) -{ - Token token; - ASSERT_TRUE(reader.read(token, tokens, mode)); - EXPECT_EQ(id, token.id); - EXPECT_EQ(text, token.content); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, token.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, token.getLocation().getEnd()); - } - EXPECT_EQ(sourceId, token.getLocation().getSourceId()); -} - -void assertText(TokenizedDataReader &reader, const std::string &text, - const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::TRIM, - SourceOffset start = 
InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset, - SourceId id = InvalidSourceId) -{ - assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); -} +#include "TokenizedDataTestUtils.hpp" -void assertEnd(TokenizedDataReader &reader) -{ - Token token; - ASSERT_TRUE(reader.atEnd()); - ASSERT_FALSE(reader.read(token)); -} +namespace ousia { TEST(TokenizedData, dataWhitespacePreserve) { diff --git a/test/core/parser/utils/TokenizedDataTestUtils.hpp b/test/core/parser/utils/TokenizedDataTestUtils.hpp new file mode 100644 index 0000000..c384f9d --- /dev/null +++ b/test/core/parser/utils/TokenizedDataTestUtils.hpp @@ -0,0 +1,64 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ +#define _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ + +namespace ousia { + +static void assertToken(TokenizedDataReader &reader, TokenId id, + const std::string &text, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId) +{ + Token token; + ASSERT_TRUE(reader.read(token, tokens, mode)); + EXPECT_EQ(id, token.id); + EXPECT_EQ(text, token.content); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, token.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, token.getLocation().getEnd()); + } + EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +static void assertText(TokenizedDataReader &reader, const std::string &text, + const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId id = InvalidSourceId) +{ + assertToken(reader, Tokens::Data, text, tokens, mode, start, end, id); +} + +static void assertEnd(TokenizedDataReader &reader) +{ + Token token; + ASSERT_TRUE(reader.atEnd()); + ASSERT_FALSE(reader.read(token)); +} + +} + +#endif /* _OUSIA_TOKENIZED_DATA_TEST_UTILS_HPP_ */ + -- cgit v1.2.3 From 19dd5946125e90dcbd61966896c9f6cfc4451d80 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Thu, 26 Feb 2015 00:22:23 +0100 Subject: Reactivated TokenizerTest --- CMakeLists.txt | 2 +- test/core/parser/utils/TokenizerTest.cpp | 94 ++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 225e63d..75909e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -327,7 +327,7 @@ IF(TEST) test/core/parser/stack/StateTest test/core/parser/utils/SourceOffsetVectorTest test/core/parser/utils/TokenizedDataTest -# 
test/core/parser/utils/TokenizerTest + test/core/parser/utils/TokenizerTest test/core/parser/utils/TokenTrieTest test/core/resource/ResourceLocatorTest test/core/resource/ResourceRequestTest diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 0f2bfb7..785bd81 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -22,6 +22,8 @@ #include #include +#include "TokenizedDataTestUtils.hpp" + namespace ousia { TEST(Tokenizer, tokenRegistration) @@ -58,14 +60,16 @@ void expectData(const std::string &expected, SourceOffset tokenStart, { ASSERT_EQ(Tokens::Data, token.id); - Variant text = data.text(mode); - ASSERT_TRUE(text.isString()); + Token textToken; + TokenizedDataReader reader = data.reader(); + ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode)); - EXPECT_EQ(expected, text.asString()); + EXPECT_EQ(expected, textToken.content); EXPECT_EQ(tokenStart, token.location.getStart()); EXPECT_EQ(tokenEnd, token.location.getEnd()); - EXPECT_EQ(textStart, text.getLocation().getStart()); - EXPECT_EQ(textEnd, text.getLocation().getEnd()); + EXPECT_EQ(textStart, textToken.getLocation().getStart()); + EXPECT_EQ(textEnd, textToken.getLocation().getEnd()); + EXPECT_TRUE(reader.atEnd()); } TEST(Tokenizer, textTokenPreserveWhitespace) @@ -97,8 +101,8 @@ TEST(Tokenizer, textTokenPreserveWhitespace) TokenizedData data; ASSERT_TRUE(tokenizer.read(reader, token, data)); - expectData("this \t is only a \n\n test text", 0, 32, 0, 32, - token, data, WhitespaceMode::PRESERVE); + expectData("this \t is only a \n\n test text", 0, 32, 0, 32, token, + data, WhitespaceMode::PRESERVE); data.clear(); ASSERT_FALSE(tokenizer.read(reader, token, data)); @@ -134,8 +138,8 @@ TEST(Tokenizer, textTokenTrimWhitespace) TokenizedData data; ASSERT_TRUE(tokenizer.read(reader, token, data)); - expectData("this \t is only a \n\n test text", 0, 32, 0, 32, - token, data, WhitespaceMode::TRIM); + expectData("this \t 
is only a \n\n test text", 0, 32, 0, 32, token, + data, WhitespaceMode::TRIM); data.clear(); ASSERT_FALSE(tokenizer.read(reader, token, data)); @@ -368,9 +372,12 @@ TEST(Tokenizer, commentTestWhitespacePreserve) if (te.id != Tokens::Data) { EXPECT_EQ(te.content, t.content); } else { - Variant text = data.text(WhitespaceMode::PRESERVE); - ASSERT_TRUE(text.isString()); - EXPECT_EQ(te.content, text.asString()); + TokenizedDataReader dataReader = data.reader(); + Token textToken; + ASSERT_TRUE(dataReader.read(textToken, TokenSet{}, + WhitespaceMode::PRESERVE)); + EXPECT_TRUE(dataReader.atEnd()); + EXPECT_EQ(te.content, textToken.content); } EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); EXPECT_EQ(te.location.getStart(), t.location.getStart()); @@ -380,5 +387,68 @@ TEST(Tokenizer, commentTestWhitespacePreserve) TokenizedData data; ASSERT_FALSE(tokenizer.read(reader, t, data)); } + +TEST(Tokenizer, nonPrimaryTokens) +{ + CharReader reader{ + "<>"}; + // 012345678901234567890 12345678901234567890123456789012345678901234567 + // 0 1 2 3 4 5 6 + + Tokenizer tokenizer; + + TokenId tBackslash = tokenizer.registerToken("\\"); + TokenId tDollar = tokenizer.registerToken("$", false); + TokenId tSpeechStart = tokenizer.registerToken("<<", false); + TokenId tSpeechEnd = tokenizer.registerToken(">>", false); + + TokenSet tokens = TokenSet{tDollar, tSpeechStart, tSpeechEnd}; + + Token token, textToken; + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, token.id); + + TokenizedDataReader dataReader = data.reader(); + assertToken(dataReader, tSpeechStart, "<<", tokens, + WhitespaceMode::TRIM, 0, 2); + assertText(dataReader, "switch to", tokens, WhitespaceMode::TRIM, 2, + 11); + assertToken(dataReader, tDollar, "$", tokens, WhitespaceMode::TRIM, 12, + 13); + assertText(dataReader, "inline", tokens, WhitespaceMode::TRIM, 13, 19); + assertEnd(dataReader); + } + + { + TokenizedData data; + 
ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(tBackslash, token.id); + ASSERT_EQ(20U, token.location.getStart()); + ASSERT_EQ(21U, token.location.getEnd()); + } + + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, token.id); + + TokenizedDataReader dataReader = data.reader(); + assertText(dataReader, "math mode", tokens, WhitespaceMode::TRIM, 21, + 30); + assertToken(dataReader, tDollar, "$", tokens, WhitespaceMode::TRIM, 30, + 31); + assertText(dataReader, "they said, see the world they said", tokens, + WhitespaceMode::TRIM, 32, 66); + assertToken(dataReader, tSpeechEnd, ">>", tokens, WhitespaceMode::TRIM, + 66, 68); + assertEnd(dataReader); + } + + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, token, data)); +} } -- cgit v1.2.3 From 12e10d18810b7ea4ce142d76e846b4faf0c33488 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Fri, 27 Feb 2015 18:52:43 +0100 Subject: Made OsmlStreamParser ready for user defined tokens, started to adapt unit tests. 
--- CMakeLists.txt | 32 +- src/formats/osml/OsmlStreamParser.cpp | 701 +++++++++---- src/formats/osml/OsmlStreamParser.hpp | 298 ++---- test/formats/osml/OsmlStreamParserTest.cpp | 1542 ++++++++++++++-------------- 4 files changed, 1355 insertions(+), 1218 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 75909e9..4e2d7f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,14 +212,14 @@ ADD_LIBRARY(ousia_core # ousia_core #) -#ADD_LIBRARY(ousia_osml +ADD_LIBRARY(ousia_osml # src/formats/osml/OsmlParser -# src/formats/osml/OsmlStreamParser -#) + src/formats/osml/OsmlStreamParser +) -#TARGET_LINK_LIBRARIES(ousia_osml -# ousia_core -#) +TARGET_LINK_LIBRARIES(ousia_osml + ousia_core +) ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator @@ -383,17 +383,17 @@ IF(TEST) # ousia_mozjs # ) -# ADD_EXECUTABLE(ousia_test_osml + ADD_EXECUTABLE(ousia_test_osml # test/formats/osml/OsmlParserTest -# test/formats/osml/OsmlStreamParserTest -# ) + test/formats/osml/OsmlStreamParserTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_osml -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_osml -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_osml + ${GTEST_LIBRARIES} + ousia_core + ousia_osml + ousia_filesystem + ) # ADD_EXECUTABLE(ousia_test_osxml # test/formats/osxml/OsxmlEventParserTest @@ -423,7 +423,7 @@ IF(TEST) ADD_TEST(ousia_test_filesystem ousia_test_filesystem) ADD_TEST(ousia_test_html ousia_test_html) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) -# ADD_TEST(ousia_test_osml ousia_test_osml) + ADD_TEST(ousia_test_osml ousia_test_osml) # ADD_TEST(ousia_test_osxml ousia_test_osxml) ADD_TEST(ousia_test_xml ousia_test_xml) ENDIF() diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index d4cdbf8..7e01a3c 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -19,92 +19,411 @@ #include #include #include +#include #include +#include +#include + #include 
"OsmlStreamParser.hpp" +#include +#include + namespace ousia { +namespace { /** - * Plain format default tokenizer. + * Osml format default tokenizer. Registers the primary tokens in its + * constructor. A single, static instance of this class is created as + * "OsmlTokens", which is copied to the Tokenizer instance of + * OsmlStreamParserImpl. */ -class PlainFormatTokens : public Tokenizer { +class OsmlFormatTokens : public Tokenizer { public: + TokenId Backslash; + TokenId LineComment; + TokenId BlockCommentStart; + TokenId BlockCommentEnd; + TokenId FieldStart; + TokenId FieldEnd; + TokenId DefaultFieldStart; + TokenId AnnotationStart; + TokenId AnnotationEnd; + /** - * Id of the backslash token. + * Registers the plain format tokens in the internal tokenizer. */ - TokenId Backslash; + OsmlFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + DefaultFieldStart = registerToken("{!"); + AnnotationStart = registerToken("<\\"); + AnnotationEnd = registerToken("\\>"); + } +}; + +/** + * Instance of OsmlFormatTokens used to initialize the internal tokenizer + * instance of OsmlStreamParserImpl. + */ +static const OsmlFormatTokens OsmlTokens; +/** + * Structure representing a field. + */ +struct Field { /** - * Id of the line comment token. + * Specifies whether this field was marked as default field. */ - TokenId LineComment; + bool defaultField; /** - * Id of the block comment start token. + * Location at which the field was started. */ - TokenId BlockCommentStart; + SourceLocation location; /** - * Id of the block comment end token. + * Constructor of the Field structure, initializes all member variables with + * the given values. + * + * @param defaultField is a flag specifying whether this field is a default + * field. 
+ * @param location specifies the location at which the field was started. */ - TokenId BlockCommentEnd; + Field(bool defaultField = false, + const SourceLocation &location = SourceLocation{}) + : defaultField(defaultField), location(location) + { + } +}; +/** + * Entry used for the command stack. + */ +class Command { +private: /** - * Id of the field start token. + * Name and location of the current command. */ - TokenId FieldStart; + Variant name; /** - * Id of the field end token. + * Arguments that were passed to the command. */ - TokenId FieldEnd; + Variant arguments; /** - * Id of the default field start token. + * Vector used as stack for holding the number of opening/closing braces + * and the corresponding "isDefaultField" flag. */ - TokenId DefaultFieldStart; + std::vector fields; /** - * Id of the annotation start token. + * Set to true if this is a command with clear begin and end. */ - TokenId AnnotationStart; + bool hasRange; +public: /** - * Id of the annotation end token. + * Default constructor, marks this command as normal, non-range command. */ - TokenId AnnotationEnd; + Command() : hasRange(false) {} /** - * Registers the plain format tokens in the internal tokenizer. + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. 
*/ - PlainFormatTokens() + Command(Variant name, Variant arguments, bool hasRange) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange) { - Backslash = registerToken("\\"); - LineComment = registerToken("%"); - BlockCommentStart = registerToken("%{"); - BlockCommentEnd = registerToken("}%"); - FieldStart = registerToken("{"); - FieldEnd = registerToken("}"); - DefaultFieldStart = registerToken("{!"); - AnnotationStart = registerToken("<\\"); - AnnotationEnd = registerToken("\\>"); + } + + /** + * Returns a reference at the variant representing name and location of the + * command. + * + * @return a variant containing name and location of the command. + */ + const Variant &getName() const { return name; } + + /** + * Returns a reference at the variant containing name, value and location of + * the arguments. + * + * @return the arguments stored for the command. + */ + const Variant &getArguments() const { return arguments; } + + /** + * Returns a reference at the internal field list. This list should be used + * for printing error messages when fields are still open although the outer + * range field closes. + * + * @return a const reference at the internal field vector. + */ + const std::vector &getFields() const { return fields; } + + /** + * Returns true if this command is currently in a default field. + * + * @return true if the current field on the field stack was explicitly + * marked as default field. If the field stack is empty, true is returned + * if this is a range command. + */ + bool inDefaultField() const + { + return (!fields.empty() && fields.back().defaultField) || + (fields.empty() && hasRange); + } + + /** + * Returns true if this command currently is in any field. + * + * @return true if a field is on the stack or this is a range commands. + * Range commands always are in a field. 
+ */ + bool inField() const { return !fields.empty() || hasRange; } + + /** + * Returns true if this command currently is in a range field. + * + * @return true if the command has a range and no other ranges are on the + * stack. + */ + bool inRangeField() const { return fields.empty() && hasRange; } + + /** + * Returns true if this command currently is in a non-range field. + * + * @return true if the command is in a field, but the field is not the field + * constructed by the "range" + */ + bool inNonRangeField() const { return !fields.empty(); } + + /** + * Pushes another field onto the field stack of this command. + * + * @param defaultField if true, explicitly marks this field as default + * field. + * @param location is the source location at which the field was started. + * Used for error messages in which the user is notified about an error with + * too few closing fields. + */ + void pushField(bool defaultField = false, + const SourceLocation &location = SourceLocation{}) + { + fields.emplace_back(defaultField, location); + } + + /** + * Removes another field from the field stack of this command, returns true + * if the operation was successful. + * + * @return true if there was a field to pop on the stack, false otherwise. + */ + bool popField() + { + if (!fields.empty()) { + fields.pop_back(); + return true; + } + return false; } }; +} -static const PlainFormatTokens OsmlTokens; +/* Class OsmlStreamParserImpl */ -OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) - : reader(reader), - logger(logger), - tokenizer(OsmlTokens), - data(reader.getSourceId()) +/** + * Internal implementation of OsmlStreamParser. + */ +class OsmlStreamParserImpl { +public: + /** + * State enum compatible with OsmlStreamParserState but extended by two more + * entries (END and NONE). 
+ */ + enum class State : uint8_t { + COMMAND_START = 0, + COMMAND_END = 1, + FIELD_START = 2, + FIELD_END = 3, + ANNOTATION_START = 4, + ANNOTATION_END = 5, + DATA = 6, + END = 7, + RECOVERABLE_ERROR = 8, + IRRECOVERABLE_ERROR = 9 + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + Tokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack commands; + + /** + * Variant containing the tokenized data that was returned from the + * tokenizer as data. + */ + TokenizedData data; + + /** + * Variable containing the current location of the parser. + */ + SourceLocation location; + + /** + * Function used internally to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + * + * @return an internal State specifying whether an error occured (return + * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a + * command was actually started (return value State::COMMAND_START). + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + * + * @return an internal State specifying whether an error occured (return + * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a + * command was actually ended (return value State::COMMAND_END). + */ + State parseEndCommand(); + + /** + * Parses the command arguments. 
Handles errors if the name of the command + * was given using the hash notation and as a name field. + * + * @param commandArgName is the name argument that was given using the hash + * notation. + * @return a map variant containing the arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. + * + * @param start is the start byte offset of the command (including the + * backslash) + * @param isAnnotation if true, the command is not returned as command, but + * as annotation start. + * @return true if a command was actuall parsed, false otherwise. + */ + State parseCommand(size_t start, bool isAnnotation); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseLineComment(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Checks whether there is any data pending to be issued, if yes, resets the + * currently peeked characters and returns true. + * + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(); + + /** + * Returns a reference at the current command at the top of the command + * stack. + * + * @return a reference at the top command in the command stack. + */ + Command &cmd() { return commands.top(); } + + /** + * Returns a reference at the current command at the top of the command + * stack. + * + * @return a reference at the top command in the command stack. + */ + const Command &cmd() const { return commands.top(); } + +public: + /** + * Constructor of the OsmlStreamParserImpl class. Attaches the new + * OsmlStreamParserImpl to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. 
+ * @param logger is the logger instance to which errors should be written. + */ + OsmlStreamParserImpl(CharReader &reader, Logger &logger); + + State parse(); + + const TokenizedData &getData() const { return data; } + const Variant &getCommandName() const { return cmd().getName(); } + const Variant &getCommandArguments() const { return cmd().getArguments(); } + const SourceLocation &getLocation() const { return location; } + bool inRangeCommand() const { return cmd().inRangeField(); }; + bool inDefaultField() const { return cmd().inDefaultField(); } +}; + +/* Class OsmlStreamParserImpl */ + +OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger) + : reader(reader), logger(logger), tokenizer(OsmlTokens) { - // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true, false}); + commands.emplace("", Variant::mapType{}, true); } -Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; bool hasCharSinceNSSep = false; @@ -147,20 +466,20 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) return res; } -OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); if (!reader.expect('{')) { logger.error("Expected \"{\" after \\begin", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Parse the name of the command that should be opened Variant commandName = parseIdentifier(reader.getOffset(), true); if (commandName.asString().empty()) { logger.error("Expected identifier", commandName); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Check whether the next character is a '#', indicating the start of the @@ -176,7 +495,7 @@ OsmlStreamParser::State 
OsmlStreamParser::parseBeginCommand() if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Parse the arguments @@ -185,28 +504,15 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() // Push the command onto the command stack pushCommand(std::move(commandName), std::move(commandArguments), true); - return State::COMMAND; -} - -static bool checkStillInField(const OsmlStreamParser::Command &cmd, - const Variant &endName, Logger &logger) -{ - if (cmd.inField && !cmd.inRangeField) { - logger.error(std::string("\\end in open field of command \"") + - cmd.name.asString() + std::string("\""), - endName); - logger.note(std::string("Open command started here:"), cmd.name); - return true; - } - return false; + return State::COMMAND_START; } -OsmlStreamParser::State OsmlStreamParser::parseEndCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { logger.error("Expected \"{\" after \\end", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Fetch the name of the command that should be ended here @@ -215,56 +521,58 @@ OsmlStreamParser::State OsmlStreamParser::parseEndCommand() // Make sure the given command name is not empty if (name.asString().empty()) { logger.error("Expected identifier", name); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Make sure the command name is terminated with a '}' if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } - // Unroll the command stack up to the last range command - while (!commands.top().hasRange) { - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; + // Unroll the command stack up to the last range command, make sure we do + // not intersect with any open field + while (!cmd().inRangeField()) { + if (cmd().inField()) { + 
logger.error(std::string("\\end in open field of command \"") + + cmd().getName().asString() + std::string("\""), + name); + const std::vector &fields = cmd().getFields(); + for (const Field &field : fields) { + logger.note(std::string("Still open field started here: "), + field.location); + } + return State::IRRECOVERABLE_ERROR; } commands.pop(); } - // Make sure we're not in an open field of this command - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - // Special error message if the top-level command is reached if (commands.size() == 1) { logger.error(std::string("Cannot end command \"") + name.asString() + std::string("\" here, no command open"), name); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } - // Inform the about command mismatches - const Command &cmd = commands.top(); - if (commands.top().name.asString() != name.asString()) { - logger.error(std::string("Trying to end command \"") + - cmd.name.asString() + + // Inform the user about command mismatches, copy the current command + // descriptor before popping it from the stack + if (getCommandName().asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + name.asString() + std::string("\", but open command is \"") + - name.asString() + std::string("\""), + getCommandName().asString() + std::string("\""), name); - logger.note("Last command was opened here:", cmd.name); - return State::ERROR; + logger.note("Open command started here:", getCommandName()); + return State::IRRECOVERABLE_ERROR; } - // Set the location to the location of the command that was ended, then end - // the current command + // End the current command location = name.getLocation(); commands.pop(); - return cmd.inRangeField ? 
State::FIELD_END : State::NONE; + return State::COMMAND_END; } -Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; @@ -290,29 +598,14 @@ Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) return commandArguments; } -void OsmlStreamParser::pushCommand(Variant commandName, - Variant commandArguments, bool hasRange) -{ - // Store the location on the stack - location = commandName.getLocation(); - - // Place the command on the command stack, remove the last commands if we're - // not currently inside a field of these commands - while (!commands.top().inField) { - commands.pop(); - } - commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false, false}); -} - -OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, - bool isAnnotation) +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand( + size_t start, bool isAnnotation) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); if (commandName.asString().empty()) { logger.error("Empty command name", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Handle the special "begin" and "end" commands @@ -322,7 +615,7 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, const bool isEnd = commandNameComponents[0] == "end"; // Parse the begin or end command - State res = State::COMMAND; + State res = State::COMMAND_START; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( @@ -378,12 +671,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, } else { // Make sure no arguments apart from the "name" argument are given // to an annotation end - Variant::mapType &map = commands.top().arguments.asMap(); + const Variant::mapType &map = 
getCommandArguments().asMap(); if (!map.empty()) { if (map.count("name") == 0 || map.size() > 1U) { logger.error( "An annotation end command may not have any arguments " - "other than \"name\""); + "other than \"name\"", + reader); return res; } } @@ -397,13 +691,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, // If we're starting an annotation, return the command as annotation start // instead of command - if (isAnnotation && res == State::COMMAND) { + if (isAnnotation && res == State::COMMAND_START) { return State::ANNOTATION_START; } return res; } -void OsmlStreamParser::parseBlockComment() +void OsmlStreamParserImpl::parseBlockComment() { Token token; size_t depth = 1; @@ -426,7 +720,7 @@ void OsmlStreamParser::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void OsmlStreamParser::parseLineComment() +void OsmlStreamParserImpl::parseLineComment() { char c; while (reader.read(c)) { @@ -436,65 +730,34 @@ void OsmlStreamParser::parseLineComment() } } -bool OsmlStreamParser::checkIssueData() -{ - if (!data.empty()) { - location = data.getLocation(); - reader.resetPeek(); - return true; - } - return false; -} - -bool OsmlStreamParser::checkIssueFieldStart() +void OsmlStreamParserImpl::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) { - // Fetch the current command, and check whether we're currently inside a - // field of this command - Command &cmd = commands.top(); - if (!cmd.inField) { - // If this is a range command, we're now implicitly inside the field of - // this command -- we'll have to issue a field start command! 
- if (cmd.hasRange) { - cmd.inField = true; - cmd.inRangeField = true; - reader.resetPeek(); - return true; - } + // Store the location of the command + location = commandName.getLocation(); - // This was not a range command, so obviously we're now inside within - // a field of some command -- so unroll the commands stack until a - // command with open field is reached - while (!commands.top().inField) { - commands.pop(); - } + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!cmd().inField()) { + commands.pop(); } - return false; + + // Push the new command onto the command stack + commands.emplace(std::move(commandName), std::move(commandArguments), + hasRange); } -bool OsmlStreamParser::closeField() +bool OsmlStreamParserImpl::checkIssueData() { - // Try to end an open field of the current command -- if the current command - // is not inside an open field, end this command and try to close the next - // one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - if (cmd.inDefaultField) { - commands.pop(); - } - return true; - } - commands.pop(); - } else { - return false; - } + if (!data.empty()) { + location = data.getLocation(); + reader.resetPeek(); + return true; } return false; } -OsmlStreamParser::State OsmlStreamParser::parse() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() { // Reset the data handler data.clear(); @@ -507,14 +770,6 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Special handling for Backslash and Text if (type == OsmlTokens.Backslash || type == OsmlTokens.AnnotationStart) { - // Before appending anything to the output data or starting a new - // command, check whether FIELD_START has to be issued, as the - // current command is a command with range - if (checkIssueFieldStart()) { - location = token.location; - return 
State::FIELD_START; - } - // Check whether a command starts now, without advancing the peek // cursor char c; @@ -535,11 +790,11 @@ OsmlStreamParser::State OsmlStreamParser::parse() State res = parseCommand(token.location.getStart(), type == OsmlTokens.AnnotationStart); switch (res) { - case State::ERROR: + case State::IRRECOVERABLE_ERROR: throw LoggableException( "Last error was irrecoverable, ending parsing " "process"); - case State::NONE: + case State::RECOVERABLE_ERROR: continue; default: return res; @@ -558,15 +813,12 @@ OsmlStreamParser::State OsmlStreamParser::parse() token.location.getStart() + 1); } - data.append(c, token.location.getStart(), reader.getPeekOffset()); + // Append the character to the output data, mark it as protected + data.append(c, token.location.getStart(), reader.getPeekOffset(), + true); reader.consumePeek(); continue; } else if (type == Tokens::Data) { - // Check whether FIELD_START has to be issued before appending text - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } reader.consumePeek(); continue; } @@ -580,7 +832,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() // We will handle the token now, consume the peeked characters reader.consumePeek(); - // Update the location to the current token location + // Synchronize the location with the current token location location = token.location; if (token.id == OsmlTokens.LineComment) { @@ -588,39 +840,27 @@ OsmlStreamParser::State OsmlStreamParser::parse() } else if (token.id == OsmlTokens.BlockCommentStart) { parseBlockComment(); } else if (token.id == OsmlTokens.FieldStart) { - Command &cmd = commands.top(); - if (!cmd.inField) { - cmd.inField = true; - } + cmd().pushField(false, token.location); return State::FIELD_START; -/* logger.error( - "Got field start token \"{\", but no command for which to " - "start the field. 
Write \"\\{\" to insert this sequence as " - "text.", - token);*/ } else if (token.id == OsmlTokens.FieldEnd) { - closeField(); - return State::FIELD_END; -/* if (closeField()) { + // Remove all commands from the list that currently are not in any + // field + while (!cmd().inField()) { + commands.pop(); + } + + // If the remaining command is not in a range field, remove this + // command + if (cmd().inNonRangeField()) { + cmd().popField(); return State::FIELD_END; } logger.error( - "Got field end token \"}\", but there is no field to end. " - "Write \"\\}\" to insert this sequence as text.", - token);*/ + "Got field end token \"}\", but there is no field to end.", + token); } else if (token.id == OsmlTokens.DefaultFieldStart) { - // Try to start a default field the first time the token is reached - Command &topCmd = commands.top(); - if (!topCmd.inField) { - topCmd.inField = true; - topCmd.inDefaultField = true; - } + cmd().pushField(true, token.location); return State::FIELD_START; -/* logger.error( - "Got default field start token \"{!\", but no command for " - "which to start the field. 
Write \"\\{!\" to insert this " - "sequence as text", - token);*/ } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -641,11 +881,25 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Make sure all open commands and fields have been ended at the end of the // stream while (commands.size() > 1) { - Command &cmd = commands.top(); - if (cmd.inField || cmd.hasRange) { - logger.error("Reached end of stream, but command \"" + - cmd.name.asString() + "\" has not been ended", - cmd.name); + if (cmd().inField()) { + // If the stream ended with an open range field, issue information + // about the range field + if (cmd().inRangeField()) { + // Inform about the still open command itself + logger.error("Reached end of stream, but command \"" + + getCommandName().asString() + + "\" has not been ended", + getCommandName()); + } else { + // Issue information about still open fields + const std::vector &fields = cmd().getFields(); + if (!fields.empty()) { + logger.error( + std::string( + "Reached end of stream, but field is still open."), + fields.back().location); + } + } } commands.pop(); } @@ -654,26 +908,45 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } -Variant OsmlStreamParser::getText(WhitespaceMode mode) +/* Class OsmlStreamParser */ + +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) + : impl(new OsmlStreamParserImpl(reader, logger)) +{ +} + +OsmlStreamParser::~OsmlStreamParser() +{ + // Stub needed because OsmlStreamParserImpl is incomplete in header +} + +OsmlStreamParser::State OsmlStreamParser::parse() +{ + return static_cast(impl->parse()); +} + +const TokenizedData &OsmlStreamParser::getData() const { - TokenizedData dataFork = data; - Variant text = dataFork.text(mode); - location = text.getLocation(); - return text; + return impl->getData(); } const Variant &OsmlStreamParser::getCommandName() const { - return 
commands.top().name; + return impl->getCommandName(); } const Variant &OsmlStreamParser::getCommandArguments() const { - return commands.top().arguments; + return impl->getCommandArguments(); } -bool OsmlStreamParser::inDefaultField() const +const SourceLocation &OsmlStreamParser::getLocation() const { - return commands.top().inRangeField || commands.top().inDefaultField; + return impl->getLocation(); } + +bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); } + +bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); } + } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 453a2bb..1fee90b 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,30 +29,29 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ +#include #include -#include -#include -#include -#include - namespace ousia { // Forward declarations class CharReader; class Logger; class OsmlStreamParserImpl; +class TokenizedData; +class Variant; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is + * data from the stream reader. The reader makes sure the incomming stream is * syntactically valid and tries to recorver from most errors. If an error is * irrecoverable (this is the case for errors with wrong nesting of commands or * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. + * LoggableException is thrown. The OsmlStreamParser can be compared to a SAX + * parser for XML. 
*/ class OsmlStreamParser { public: @@ -60,39 +59,21 @@ public: * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ - enum class State { - /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). - */ - COMMAND, - - /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - + enum class State : uint8_t { /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. + * State returned if the start of a command has been read. Use the + * getCommandName(), getCommandArguments() and inRangeCommand() + * functions the retrieve more information about the command that was + * just started. */ - ENTITY, + COMMAND_START = 0, /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). + * State returned if a range command has just ended. This state is not + * returned for non-range commands (as the actual end of a command is + * context dependant). */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. - */ - ANNOTATION_END, + COMMAND_END = 1, /** * State returned if a new field started. The reader assures that the @@ -100,200 +81,47 @@ public: * is not started if data has been given outside of a field. The * field number is set to the current field index. */ - FIELD_START, + FIELD_START = 2, /** * State returned if the current field ends. The reader assures that a * field was actually open. */ - FIELD_END, + FIELD_END = 3, /** - * The end of the stream has been reached. + * State returned if an annotation was started. 
An annotation consists + * of the command name and its arguments (which optionally include the + * name). */ - END, + ANNOTATION_START = 4, /** - * Returned from internal functions if nothing should be done. + * State returned if an annotation ends. The reader indicates which + * annotation ends. */ - NONE, + ANNOTATION_END = 5, /** - * Returned from internal function to indicate irrecoverable errors. + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; - - /** - * Vector used as stack for holding the number of opening/closing braces - * and the corresponding "isDefaultField" flag. - */ - std::vector fields; - - /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange; - - /** - * Default constructor. - */ - Command() - : hasRange(false), - inField(false), - inDefaultField() - { - } + DATA = 6, /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inDefaultField is set to true if we currently are in a - * specially marked default field. - */ - Command(Variant name, Variant arguments, bool hasRange) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField), - inDefaultField(inDefaultField) - { - } + * The end of the stream has been reached. 
+ */ + END = 7 }; private: /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. - */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - Tokenizer tokenizer; - - /** - * Variant containing the tokenized data that was returned from the - * tokenizer as data. - */ - TokenizedData data; - - /** - * Stack containing the current commands. - */ - std::stack commands; - - /** - * Pointer at + * Pointer at the class containing the internal implementation (according + * to the PIMPL idiom). */ std::unique_ptr impl; - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. - */ - void pushCommand(Variant commandName, Variant commandArguments, - bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. - * - * @param start is the start byte offset of the command (including the - * backslash) - * @param isAnnotation if true, the command is not returned as command, but - * as annotation start. - * @return true if a command was actuall parsed, false otherwise. 
- */ - State parseCommand(size_t start, bool isAnnotation); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - - /** - * Closes a currently open field. Note that the command will be removed from - * the internal command stack if the field that is being closed is a - * field marked as default field. - * - * @return true if the field could be closed, false if there was no field - * to close. - */ - bool closeField(); - public: /** * Constructor of the OsmlStreamParser class. Attaches the new @@ -321,30 +149,10 @@ public: */ State parse(); - /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. - * - * @return a reference at a variant containing the data parsed by the - * "parse" function. - */ - const TokenizedData &getData() const { return data; } - - /** - * Returns the complete content of the internal TokenizedData instance as - * a single string Variant. This method is mainly used in the unit tests for - * this class, it simply calls the text() method of TokenizedData. - * - * @param mode is the WhitespaceMode that should be used for returning the - * text. - * @return a string variant containing the text content of the internal - * TokenizedData instance or a nullptr variant if there is no text. 
- */ - Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE); - /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing name and location of the * parsed command. @@ -353,18 +161,46 @@ public: /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing arguments given to the * command. */ const Variant &getCommandArguments() const; + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. + */ + const TokenizedData &getData() const; + + /** + * Returns the location of the current token. + */ + const SourceLocation &getLocation() const; + + /** + * Returns true if the currently started command is a range command, only + * valid if State::COMMAND_START was returned by the "parse" function. + * + * @return true if the command is started is a range command, false + * otherwise. + */ + bool inRangeCommand() const; + /** * Returns true if the current field is the "default" field. This is true if * the parser either is in the outer range of a range command or inside a - * field that has been especially marked as "default" field (using the "|" - * syntax). + * field that has been especially marked as "default" field (using the "{!" + * syntax). Only valid if State::FIELD_START was returned by the "parse" + * function. + * + * @return true if the current field was marked as default field (using the + * "{!" syntax). 
*/ bool inDefaultField() const; }; diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 3d01007..8b64e51 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -21,7 +21,9 @@ #include #include +#include #include +#include #include @@ -30,147 +32,196 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); // static ConcreteLogger logger; -static OsmlStreamParser::State skipEmptyData(OsmlStreamParser &reader) +static void assertCommandStart(OsmlStreamParser &parser, + const std::string &name, + bool rangeCommand, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) { - OsmlStreamParser::State res = reader.parse(); - if (res == OsmlStreamParser::State::DATA) { - EXPECT_FALSE(reader.getData().hasNonWhitespaceText()); - res = reader.parse(); - } - return res; -} - -static void assertCommand(OsmlStreamParser &reader, const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::COMMAND, skipEmptyData(reader)); - EXPECT_EQ(name, reader.getCommandName().asString()); + ASSERT_EQ(OsmlStreamParser::State::COMMAND_START, parser.parse()); + EXPECT_EQ(name, parser.getCommandName().asString()); + EXPECT_EQ(rangeCommand, parser.inRangeCommand()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } -static void assertCommand(OsmlStreamParser &reader, 
const std::string &name, - const Variant::mapType &args, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) +static void assertCommandStart(OsmlStreamParser &parser, + const std::string &name, + bool rangeCommand, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) { - assertCommand(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); + assertCommandStart(parser, name, rangeCommand, start, end); + EXPECT_EQ(args, parser.getCommandArguments()); } -static void assertData(OsmlStreamParser &reader, const std::string &data, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset, - WhitespaceMode mode = WhitespaceMode::COLLAPSE) +static void assertCommand(OsmlStreamParser &parser, + const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - Variant text = reader.getText(mode); - ASSERT_TRUE(text.isString()); - EXPECT_EQ(data, text.asString()); + assertCommandStart(parser, name, false, Variant::mapType{}, start, end); +} + +static void assertCommandEnd(OsmlStreamParser &parser, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::COMMAND_END, parser.parse()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, text.getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, text.getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); + } +} + +static void assertTextData(OsmlStreamParser &parser, const std::string &text, + SourceOffset dataStart = InvalidSourceOffset, + SourceOffset dataEnd = InvalidSourceOffset, + SourceOffset textStart = 
InvalidSourceOffset, + SourceOffset textEnd = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) +{ + ASSERT_EQ(OsmlStreamParser::State::DATA, parser.parse()); + + const TokenizedData &data = parser.getData(); + TokenizedDataReader dataReader = data.reader(); + + Token token; + ASSERT_TRUE(dataReader.read(token, TokenSet{}, mode)); + EXPECT_EQ(Tokens::Data, token.id); + EXPECT_EQ(text, token.content); + if (dataStart != InvalidSourceOffset) { + EXPECT_EQ(dataStart, data.getLocation().getStart()); + EXPECT_EQ(dataStart, parser.getLocation().getStart()); + } + if (dataEnd != InvalidSourceOffset) { + EXPECT_EQ(dataEnd, data.getLocation().getEnd()); + EXPECT_EQ(dataEnd, parser.getLocation().getEnd()); + } + if (textStart != InvalidSourceOffset) { + EXPECT_EQ(textStart, token.getLocation().getStart()); } + if (textEnd != InvalidSourceOffset) { + EXPECT_EQ(textEnd, token.getLocation().getEnd()); + } +} + +static void assertData(OsmlStreamParser &parser, const std::string &text, + SourceOffset textStart = InvalidSourceOffset, + SourceOffset textEnd = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) +{ + assertTextData(parser, text, InvalidSourceOffset, InvalidSourceOffset, textStart, textEnd, mode); +} + +static void assertEmptyData(OsmlStreamParser &parser) +{ + ASSERT_EQ(OsmlStreamParser::State::DATA, parser.parse()); + + const TokenizedData &data = parser.getData(); + TokenizedDataReader dataReader = data.reader(); + + Token token; + EXPECT_FALSE(dataReader.read(token, TokenSet{}, WhitespaceMode::TRIM)); } -static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, + +static void assertFieldStart(OsmlStreamParser &parser, bool defaultField, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, skipEmptyData(reader)); - EXPECT_EQ(defaultField, reader.inDefaultField()); + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, parser.parse()); 
+ EXPECT_EQ(defaultField, parser.inDefaultField()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } -static void assertFieldEnd(OsmlStreamParser &reader, +static void assertFieldEnd(OsmlStreamParser &parser, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, skipEmptyData(reader)); + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, parser.parse()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } -static void assertAnnotationStart(OsmlStreamParser &reader, +static void assertAnnotationStart(OsmlStreamParser &parser, const std::string &name, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, skipEmptyData(reader)); - EXPECT_EQ(name, reader.getCommandName().asString()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, parser.parse()); + EXPECT_EQ(name, parser.getCommandName().asString()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } -static void 
assertAnnotationStart(OsmlStreamParser &reader, +static void assertAnnotationStart(OsmlStreamParser &parser, const std::string &name, const Variant::mapType &args, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - assertAnnotationStart(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); + assertAnnotationStart(parser, name, start, end); + EXPECT_EQ(args, parser.getCommandArguments()); } -static void assertAnnotationEnd(OsmlStreamParser &reader, +static void assertAnnotationEnd(OsmlStreamParser &parser, const std::string &name, const std::string &elementName, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, skipEmptyData(reader)); - ASSERT_EQ(name, reader.getCommandName().asString()); + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, parser.parse()); + ASSERT_EQ(name, parser.getCommandName().asString()); if (!elementName.empty()) { - ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); - ASSERT_EQ(1U, reader.getCommandArguments().asMap().count("name")); + ASSERT_EQ(1U, parser.getCommandArguments().asMap().size()); + ASSERT_EQ(1U, parser.getCommandArguments().asMap().count("name")); - auto it = reader.getCommandArguments().asMap().find("name"); + auto it = parser.getCommandArguments().asMap().find("name"); ASSERT_EQ(elementName, it->second.asString()); } if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } -static void assertEnd(OsmlStreamParser &reader, +static void assertEnd(OsmlStreamParser &parser, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::END, skipEmptyData(reader)); + 
ASSERT_EQ(OsmlStreamParser::State::END, parser.parse()); if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); + EXPECT_EQ(start, parser.getLocation().getStart()); } if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); + EXPECT_EQ(end, parser.getLocation().getEnd()); } } @@ -179,9 +230,9 @@ TEST(OsmlStreamParser, empty) const char *testString = ""; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertEnd(parser, 0, 0); } TEST(OsmlStreamParser, oneCharacter) @@ -189,45 +240,102 @@ TEST(OsmlStreamParser, oneCharacter) const char *testString = "a"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); + + assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::COLLAPSE); + assertEnd(parser, 1, 1); +} + +TEST(OsmlStreamParser, whitespacePreserve) +{ + const char *testString = " hello \t world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + assertTextData(parser, " hello \t world ", 0, 15, 0, 15, + WhitespaceMode::PRESERVE); + assertEnd(parser, 15, 15); +} + +TEST(OsmlStreamParser, whitespaceTrim) +{ + const char *testString = " hello \t world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); - assertData(reader, "a", 0, 1); + assertTextData(parser, "hello \t world", 0, 15, 1, 14, + WhitespaceMode::TRIM); + assertEnd(parser, 15, 15); } -TEST(OsmlStreamParser, whitespaceElimination) +TEST(OsmlStreamParser, whitespaceCollapse) { const char *testString = " hello \t world "; // 0123456 78901234 // 0 1 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertData(reader, 
"hello world", 1, 14); + assertTextData(parser, "hello world", 0, 15, 1, 14, + WhitespaceMode::COLLAPSE); + assertEnd(parser, 15, 15); } -TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) +TEST(OsmlStreamParser, whitespaceCollapseLinebreak) { const char *testString = " hello \n world "; // 0123456 78901234 // 0 1 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); + + assertTextData(parser, "hello world", 0, 15, 1, 14, + WhitespaceMode::COLLAPSE); + assertEnd(parser, 15, 15); +} + +TEST(OsmlStreamParser, whitespaceCollapseProtected) +{ + const char *testString = " hello\\ \\ world "; + // 012345 67 89012345 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + assertTextData(parser, "hello world", 0, 16, 1, 15, + WhitespaceMode::COLLAPSE); + assertEnd(parser, 16, 16); +} + +TEST(OsmlStreamParser, whitespaceCollapseProtected2) +{ + const char *testString = " hello \\ \\ world "; + // 012345 67 89012345 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); - assertData(reader, "hello world", 1, 14); + assertTextData(parser, "hello world", 0, 17, 1, 16, + WhitespaceMode::COLLAPSE); + assertEnd(parser, 17, 17); } static void testEscapeSpecialCharacter(const std::string &c) { CharReader charReader(std::string("\\") + c); - OsmlStreamParser reader(charReader, logger); - EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getText().asString()); - - SourceLocation loc = reader.getText().getLocation(); - EXPECT_EQ(0U, loc.getStart()); - EXPECT_EQ(1U + c.size(), loc.getEnd()); + OsmlStreamParser parser(charReader, logger); + assertTextData(parser, c, 0, 2, 0, 2, WhitespaceMode::PRESERVE); + assertEnd(parser, 2, 2); } TEST(OsmlStreamParser, escapeSpecialCharacters) @@ -240,9 +348,11 @@ TEST(OsmlStreamParser, escapeSpecialCharacters) TEST(OsmlStreamParser, simpleSingleLineComment) 
{ const char *testString = "% This is a single line comment"; + // 0123456789012345678901234567890 + // 0 1 2 3 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + OsmlStreamParser parser(charReader, logger); + assertEnd(parser, 31, 31); } TEST(OsmlStreamParser, singleLineComment) @@ -251,24 +361,11 @@ TEST(OsmlStreamParser, singleLineComment) // 01234567890123456789012345678901 23 // 0 1 2 3 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getText().asString()); - SourceLocation loc = reader.getText().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getText().asString()); - SourceLocation loc = reader.getText().getLocation(); - ASSERT_EQ(33U, loc.getStart()); - ASSERT_EQ(34U, loc.getEnd()); - } + OsmlStreamParser parser(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); + assertTextData(parser, "b", 33, 34, 33, 34, WhitespaceMode::PRESERVE); + assertEnd(parser, 34, 34); } TEST(OsmlStreamParser, multilineComment) @@ -277,24 +374,27 @@ TEST(OsmlStreamParser, multilineComment) // 0123456789012 3 456789012345678901234567890 // 0 1 2 3 4 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getText().asString()); - SourceLocation loc = reader.getText().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } + OsmlStreamParser parser(charReader, logger); - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getText().asString()); - SourceLocation loc = 
reader.getText().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } + assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); + assertTextData(parser, "b", 40, 41, 40, 41, WhitespaceMode::PRESERVE); + assertEnd(parser, 41, 41); +} - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +TEST(OsmlStreamParser, unfinishedMultilineComment) +{ + const char *testString = "a%{ This is a\n\n multiline line comment"; + // 0123456789012 3 456789012345678901234567 + // 0 1 2 3 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + logger.reset(); + + assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 38, 38); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, nestedMultilineComment) @@ -303,24 +403,11 @@ TEST(OsmlStreamParser, nestedMultilineComment) // 0123456789012 3 456789012345678901234567890 // 0 1 2 3 4 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getText().asString()); - SourceLocation loc = reader.getText().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getText().asString()); - SourceLocation loc = reader.getText().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } + OsmlStreamParser parser(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); + assertTextData(parser, "b", 40, 41, 40, 41, WhitespaceMode::PRESERVE); + assertEnd(parser, 41, 41); } TEST(OsmlStreamParser, simpleCommand) @@ -328,45 +415,27 @@ TEST(OsmlStreamParser, simpleCommand) const char *testString = "\\test"; // 0 12345 CharReader charReader(testString); - 
OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + OsmlStreamParser parser(charReader, logger); - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - ASSERT_EQ(0U, reader.getCommandArguments().asMap().size()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertCommand(parser, "test", 0, 5); + assertEnd(parser); } TEST(OsmlStreamParser, simpleCommandWithName) { - const char *testString = "\\test#bla"; - // 0 12345678 + const char *testString = "\\test#foo"; + // 012345678 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + OsmlStreamParser parser(charReader, logger); - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(1U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); + assertCommandStart(parser, "test", false, Variant::mapType{{"name", "foo"}}, + 0, 5); - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); + Variant::mapType args = parser.getCommandArguments().asMap(); + ASSERT_EQ(5U, args["name"].getLocation().getStart()); + ASSERT_EQ(9U, args["name"].getLocation().getEnd()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertEnd(parser); } TEST(OsmlStreamParser, simpleCommandWithArguments) @@ -375,38 +444,21 @@ TEST(OsmlStreamParser, simpleCommandWithArguments) // 0 123456789012345 
678901 2 // 0 1 2 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + OsmlStreamParser parser(charReader, logger); - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); + assertCommandStart(parser, "test", false, + Variant::mapType{{"a", 1}, {"b", 2}, {"c", "test"}}, 0, + 5); - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(3U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); + Variant::mapType args = parser.getCommandArguments().asMap(); + ASSERT_EQ(8U, args["a"].getLocation().getStart()); + ASSERT_EQ(9U, args["a"].getLocation().getEnd()); + ASSERT_EQ(12U, args["b"].getLocation().getStart()); + ASSERT_EQ(13U, args["b"].getLocation().getEnd()); + ASSERT_EQ(16U, args["c"].getLocation().getStart()); + ASSERT_EQ(22U, args["c"].getLocation().getEnd()); - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(8U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(22U, loc.getEnd()); - - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertEnd(parser); } TEST(OsmlStreamParser, simpleCommandWithArgumentsAndName) @@ -415,44 +467,24 @@ TEST(OsmlStreamParser, simpleCommandWithArgumentsAndName) // 0 1234567890123456789 
01234 56 // 0 1 2 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(4U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); - - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(17U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(20U, loc.getStart()); - ASSERT_EQ(26U, loc.getEnd()); - - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + OsmlStreamParser parser(charReader, logger); + + assertCommandStart( + parser, "test", false, + Variant::mapType{{"name", "bla"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 0, + 5); + + Variant::mapType args = parser.getCommandArguments().asMap(); + ASSERT_EQ(5U, args["name"].getLocation().getStart()); + ASSERT_EQ(9U, args["name"].getLocation().getEnd()); + ASSERT_EQ(12U, args["a"].getLocation().getStart()); + ASSERT_EQ(13U, args["a"].getLocation().getEnd()); + 
ASSERT_EQ(16U, args["b"].getLocation().getStart()); + ASSERT_EQ(17U, args["b"].getLocation().getEnd()); + ASSERT_EQ(20U, args["c"].getLocation().getStart()); + ASSERT_EQ(26U, args["c"].getLocation().getEnd()); + + assertEnd(parser); } TEST(OsmlStreamParser, fields) @@ -461,21 +493,21 @@ TEST(OsmlStreamParser, fields) // 01234567890123 // 0 1 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, false, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertTextData(parser, "a", 6, 7, 6, 7, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 7, 8); - assertFieldStart(reader, false, 8, 9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); + assertFieldStart(parser, false, 8, 9); + assertTextData(parser, "b", 9, 10, 9, 10, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 10, 11); - assertFieldStart(reader, false, 11, 12); - assertData(reader, "c", 12, 13); - assertFieldEnd(reader, 13, 14); - assertEnd(reader, 14, 14); + assertFieldStart(parser, false, 11, 12); + assertTextData(parser, "c", 12, 13, 12, 13, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 13, 14); + assertEnd(parser, 14, 14); } TEST(OsmlStreamParser, dataOutsideField) @@ -484,785 +516,781 @@ TEST(OsmlStreamParser, dataOutsideField) // 0123456789012 // 0 1 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, false, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertTextData(parser, "a", 6, 7, 6, 7, WhitespaceMode::COLLAPSE); + assertFieldEnd(parser, 7, 8); - assertFieldStart(reader, false, 8, 
9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); + assertFieldStart(parser, false, 8, 9); + assertTextData(parser, "b", 9, 10, 9, 10, WhitespaceMode::COLLAPSE); + assertFieldEnd(parser, 10, 11); - assertData(reader, "c", 12, 13); - assertEnd(reader, 13, 13); + assertTextData(parser, "c", 11, 13, 12, 13, WhitespaceMode::COLLAPSE); + assertEnd(parser, 13, 13); } TEST(OsmlStreamParser, nestedCommand) { - const char *testString = "\\test{a}{\\test2{b} c} d"; - // 012345678 90123456789012 - // 0 1 2 - CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + const char *testString = "\\test{a}{\\test2{b} c} d"; + // 012345678 90123456789012 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test", 0, 5); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertData(parser, "a", 6, 7); + assertFieldEnd(parser, 7, 8); - assertFieldStart(reader, false, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, false, 8, 9); - { - assertCommand(reader, "test2", 9, 15); - assertFieldStart(reader, false, 15, 16); - assertData(reader, "b", 16, 17); - assertFieldEnd(reader, 17, 18); - } - assertData(reader, "c", 19, 20); - assertFieldEnd(reader, 20, 21); - assertData(reader, "d", 22, 23); - assertEnd(reader, 23, 23); + assertFieldStart(parser, false, 8, 9); + assertCommand(parser, "test2", 9, 15); + assertFieldStart(parser, false, 15, 16); + assertData(parser, "b", 16, 17); + assertFieldEnd(parser, 17, 18); + assertData(parser, "c", 19, 20); + assertFieldEnd(parser, 20, 21); + assertData(parser, "d", 22, 23); + assertEnd(parser, 23, 23); } + TEST(OsmlStreamParser, nestedCommandImmediateEnd) { - const char *testString = "\\test{\\test2{b}} d"; - // 012345 678901234567 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - - assertCommand(reader, 
"test", 0, 5); - assertFieldStart(reader, false, 5, 6); - { - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, false, 12, 13); - assertData(reader, "b", 13, 14); - assertFieldEnd(reader, 14, 15); - } - assertFieldEnd(reader, 15, 16); - assertData(reader, "d", 17, 18); - assertEnd(reader, 18, 18); + const char *testString = "\\test{\\test2{b}} d"; + // 012345 678901234567 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + { + assertCommand(parser, "test2", 6, 12); + assertFieldStart(parser, false, 12, 13); + assertData(parser, "b", 13, 14); + assertFieldEnd(parser, 14, 15); + } + assertFieldEnd(parser, 15, 16); + assertData(parser, "d", 17, 18); + assertEnd(parser, 18, 18); } TEST(OsmlStreamParser, nestedCommandNoData) { - const char *testString = "\\test{\\test2}"; - // 012345 6789012 - CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + const char *testString = "\\test{\\test2}"; + // 012345 6789012 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, false, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldEnd(reader, 12, 13); - assertEnd(reader, 13, 13); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldEnd(parser, 12, 13); + assertEnd(parser, 13, 13); } TEST(OsmlStreamParser, multipleCommands) { - const char *testString = "\\a \\b \\c \\d"; - // 012 345 678 90 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + const char *testString = "\\a \\b \\c \\d"; + // 012 345 678 90 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "a", 0, 2); - assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); 
- assertCommand(reader, "b", 3, 5); - assertData(reader, " ", 5, 6, WhitespaceMode::PRESERVE); - assertCommand(reader, "c", 6, 8); - assertData(reader, " ", 8, 9, WhitespaceMode::PRESERVE); - assertCommand(reader, "d", 9, 11); - assertEnd(reader, 11, 11); + assertCommand(parser, "a", 0, 2); + assertEmptyData(parser); + assertCommand(parser, "b", 3, 5); + assertEmptyData(parser); + assertCommand(parser, "c", 6, 8); + assertEmptyData(parser); + assertCommand(parser, "d", 9, 11); + assertEnd(parser, 11, 11); } TEST(OsmlStreamParser, fieldsWithSpaces) { - const char *testString = "\\a {\\b \\c} \n\n {\\d}"; - // 0123 456 789012 3 456 789 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 0, 2); - assertData(reader, " ", 2, 3, WhitespaceMode::PRESERVE); - assertFieldStart(reader, false, 3, 4); - assertCommand(reader, "b", 4, 6); - assertData(reader, " ", 6, 7, WhitespaceMode::PRESERVE); - assertCommand(reader, "c", 7, 9); - assertFieldEnd(reader, 9, 10); - assertData(reader, " \n\n {", 10, 12, WhitespaceMode::PRESERVE); - assertFieldStart(reader, false, 16, 17); - assertCommand(reader, "d", 17, 19); - assertFieldEnd(reader, 19, 20); - assertEnd(reader, 20, 20); -} - -TEST(OsmlStreamParser, errorNoFieldToStart) -{ - const char *testString = "\\a b {"; - // 012345 - // 0 - CharReader charReader(testString); + const char *testString = "\\a {\\b \\c} \n\n {\\d}"; + // 0123 456 789012 3 456 789 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - OsmlStreamParser reader(charReader, logger); + assertCommand(parser, "a", 0, 2); + assertEmptyData(parser); + assertFieldStart(parser, false, 3, 4); + assertCommand(parser, "b", 4, 6); + assertEmptyData(parser); + assertCommand(parser, "c", 7, 9); + assertFieldEnd(parser, 9, 10); + assertEmptyData(parser); + assertFieldStart(parser, false, 16, 17); + assertCommand(parser, "d", 17, 19); + assertFieldEnd(parser, 19, 20); 
+ assertEnd(parser, 20, 20); +} - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); +TEST(OsmlStreamParser, errorEndButOpenField) +{ + const char *testString = "\\a b {"; + // 012345 + // 0 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + logger.reset(); + assertCommand(parser, "a", 0, 2); + assertData(parser, "b", 3, 4); + assertFieldStart(parser, false, 5, 6); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 6, 6); + ASSERT_TRUE(logger.hasError()); } + TEST(OsmlStreamParser, errorNoFieldToEnd) { - const char *testString = "\\a b }"; - // 012345 - // 0 - CharReader charReader(testString); + const char *testString = "\\a b }"; + // 012345 + // 0 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "a", 0, 2); + assertData(parser, "b", 3, 4); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 6, 6); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorNoFieldEndNested) { - const char *testString = "\\test{\\test2{}}}"; - // 012345 6789012345 - // 0 1 - CharReader charReader(testString); + const char *testString = "\\test{\\test2{}}}"; + // 012345 6789012345 + // 0 1 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, false, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, false, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - ASSERT_FALSE(logger.hasError()); - 
assertEnd(reader, 16, 16); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldStart(parser, false, 12, 13); + assertFieldEnd(parser, 13, 14); + assertFieldEnd(parser, 14, 15); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 16, 16); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorNoFieldEndNestedData) { - const char *testString = "\\test{\\test2{}}a}"; - // 012345 67890123456 - // 0 1 - CharReader charReader(testString); + const char *testString = "\\test{\\test2{}}a}"; + // 012345 67890123456 + // 0 1 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, false, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, false, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - assertData(reader, "a", 15, 16); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 17, 17); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldStart(parser, false, 12, 13); + assertFieldEnd(parser, 13, 14); + assertFieldEnd(parser, 14, 15); + assertData(parser, "a", 15, 16); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 17, 17); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, beginEnd) { - const char *testString = "\\begin{book}\\end{book}"; - // 012345678901 2345678901 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}\\end{book}"; + // 012345678901 2345678901 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "book", 7, 11); - 
assertFieldStart(reader, true, 12, 13); - assertFieldEnd(reader, 17, 21); - assertEnd(reader, 22, 22); + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertCommandEnd(parser, 17, 21); + assertEnd(parser, 22, 22); } TEST(OsmlStreamParser, beginEndWithName) { - const char *testString = "\\begin{book#a}\\end{book}"; - // 01234567890123 4567890123 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book#a}\\end{book}"; + // 01234567890123 4567890123 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "book", {{"name", "a"}}, 7, 11); - assertFieldStart(reader, true, 14, 15); - assertFieldEnd(reader, 19, 23); - assertEnd(reader, 24, 24); + assertCommandStart(parser, "book", true, {{"name", "a"}}, 7, 11); + assertCommandEnd(parser, 19, 23); + assertEnd(parser, 24, 24); } TEST(OsmlStreamParser, beginEndWithNameAndArgs) { - const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; - // 0123456789012345678901234 56789 01 2345678901 - // 0 1 2 3 4 - CharReader charReader(testString); + const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; + // 0123456789012345678901234 56789 01 2345678901 + // 0 1 2 3 4 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, true, 32, 33); - assertFieldEnd(reader, 37, 41); - assertEnd(reader, 42, 42); + assertCommandStart(parser, "book", true, + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); + assertCommandEnd(parser, 37, 41); + assertEnd(parser, 42, 42); } TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) { - const char *testString = - "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; - // 
0123456789012345678901234 56789 01234 567890123 45678901 2345678901 - // 0 1 2 3 4 5 6 - CharReader charReader(testString); - - OsmlStreamParser reader(charReader, logger); - - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, false, 32, 33); - assertData(reader, "a", 33, 34); - assertCommand(reader, "test", Variant::mapType{}, 35, 40); - assertFieldEnd(reader, 40, 41); - assertFieldStart(reader, false, 41, 42); - assertData(reader, "b", 42, 43); - assertCommand(reader, "test", Variant::mapType{}, 44, 49); - assertFieldStart(reader, false, 49, 50); - assertFieldEnd(reader, 50, 51); - assertFieldEnd(reader, 51, 52); - assertFieldStart(reader, true, 52, 53); - assertFieldEnd(reader, 57, 61); - assertEnd(reader, 62, 62); + const char *testString = + "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; + // 0123456789012345678901234 56789 01234 567890123 45678901 2345678901 + // 0 1 2 3 4 5 6 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + assertCommandStart(parser, "book", true, + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); + assertFieldStart(parser, false, 32, 33); + assertData(parser, "a", 33, 34); + assertCommand(parser, "test", 35, 40); + assertFieldEnd(parser, 40, 41); + assertFieldStart(parser, false, 41, 42); + assertData(parser, "b", 42, 43); + assertCommand(parser, "test", 44, 49); + assertFieldStart(parser, false, 49, 50); + assertFieldEnd(parser, 50, 51); + assertFieldEnd(parser, 51, 52); + assertCommandEnd(parser, 57, 61); + assertEnd(parser, 62, 62); } TEST(OsmlStreamParser, beginEndWithData) { - const char *testString = "\\begin{book}a\\end{book}"; - // 0123456789012 3456789012 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}a\\end{book}"; + // 0123456789012 3456789012 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + 
OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, true, 12, 13); - assertData(reader, "a", 12, 13); - assertFieldEnd(reader, 18, 22); - assertEnd(reader, 23, 23); + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertCommandEnd(parser, 18, 22); + assertEnd(parser, 23, 23); } - +/* TEST(OsmlStreamParser, beginEndNested) { - const char *testString = - "\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; - // 012345678901234 5678901234567890 123456 7890123 4567890 - // 0 1 2 3 4 5 - CharReader charReader(testString); - - OsmlStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, false, 9, 10); - assertData(reader, "b", 10, 11); - assertFieldEnd(reader, 11, 12); - assertFieldStart(reader, true, 13, 14); - assertData(reader, "c", 13, 14); - assertCommand(reader, "d", 22, 23); - assertFieldStart(reader, false, 24, 25); - assertData(reader, "e", 25, 26); - assertFieldEnd(reader, 26, 27); - assertFieldStart(reader, false, 27, 28); - assertData(reader, "f", 28, 29); - assertFieldEnd(reader, 29, 30); - assertFieldStart(reader, true, 31, 32); - assertCommand(reader, "g", 31, 33); - assertFieldStart(reader, false, 33, 34); - assertData(reader, "h", 34, 35); - assertFieldEnd(reader, 35, 36); - assertFieldEnd(reader, 42, 43); - assertFieldEnd(reader, 49, 50); - assertEnd(reader, 51, 51); + const char *testString = + "\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; + // 012345678901234 5678901234567890 123456 7890123 4567890 + // 0 1 2 3 4 5 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 7, 8); + assertFieldStart(reader, false, 9, 10); + assertData(reader, "b", 10, 11); + assertFieldEnd(reader, 11, 12); + assertFieldStart(reader, true, 13, 14); + assertData(reader, "c", 13, 14); + assertCommand(reader, "d", 22, 23); + 
assertFieldStart(reader, false, 24, 25); + assertData(reader, "e", 25, 26); + assertFieldEnd(reader, 26, 27); + assertFieldStart(reader, false, 27, 28); + assertData(reader, "f", 28, 29); + assertFieldEnd(reader, 29, 30); + assertFieldStart(reader, true, 31, 32); + assertCommand(reader, "g", 31, 33); + assertFieldStart(reader, false, 33, 34); + assertData(reader, "h", 34, 35); + assertFieldEnd(reader, 35, 36); + assertFieldEnd(reader, 42, 43); + assertFieldEnd(reader, 49, 50); + assertEnd(reader, 51, 51); } TEST(OsmlStreamParser, beginEndWithCommand) { - const char *testString = "\\begin{book}\\a{test}\\end{book}"; - // 012345678901 23456789 0123456789 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}\\a{test}\\end{book}"; + // 012345678901 23456789 0123456789 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, true, 12, 13); - assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, false, 14, 15); - assertData(reader, "test", 15, 19); - assertFieldEnd(reader, 19, 20); - assertFieldEnd(reader, 25, 29); - assertEnd(reader, 30, 30); + assertCommand(reader, "book", 7, 11); + assertFieldStart(reader, true, 12, 13); + assertCommand(reader, "a", 12, 14); + assertFieldStart(reader, false, 14, 15); + assertData(reader, "test", 15, 19); + assertFieldEnd(reader, 19, 20); + assertFieldEnd(reader, 25, 29); + assertEnd(reader, 30, 30); } TEST(OsmlStreamParser, errorBeginNoBraceOpen) { - const char *testString = "\\begin a"; - // 01234567 - CharReader charReader(testString); + const char *testString = "\\begin a"; + // 01234567 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 7, 8); - ASSERT_TRUE(logger.hasError()); + 
 logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "a", 7, 8); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoIdentifier) { - const char *testString = "\\begin{!"; - CharReader charReader(testString); + const char *testString = "\\begin{!"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoBraceClose) { - const char *testString = "\\begin{a"; - CharReader charReader(testString); + const char *testString = "\\begin{a"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoName) { - const char *testString = "\\begin{a#}"; - CharReader charReader(testString); + const char *testString = "\\begin{a#}"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a"); - ASSERT_TRUE(logger.hasError()); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "a"); + ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader); + 
 ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBraceOpen) { - const char *testString = "\\end a"; - // 012345 - CharReader charReader(testString); + const char *testString = "\\end a"; + // 012345 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 5, 6); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "a", 5, 6); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoIdentifier) { - const char *testString = "\\end{!"; - CharReader charReader(testString); + const char *testString = "\\end{!"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBraceClose) { - const char *testString = "\\end{a"; - CharReader charReader(testString); + const char *testString = "\\end{a"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBegin) { - const char *testString = "\\end{a}"; - CharReader charReader(testString); + const char *testString = "\\end{a}"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser 
 reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginEndMismatch) { - const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; - // 0123456789 012345678901234 5678901 - // 0 1 2 3 - CharReader charReader(testString); + const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; + // 0123456789 012345678901234 5678901 + // 0 1 2 3 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, true, 10, 11); - assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, true, 20, 24); - assertData(reader, "test", 20, 24); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(reader, "a", 7, 8); + assertFieldStart(reader, true, 10, 11); + assertCommand(reader, "b", 17, 18); + assertFieldStart(reader, true, 20, 24); + assertData(reader, "test", 20, 24); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, commandWithNSSep) { - const char *testString = "\\test1:test2"; - // 012345678901 - CharReader charReader(testString); + const char *testString = "\\test1:test2"; + // 012345678901 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertCommand(reader, "test1:test2", 0, 12); - assertEnd(reader, 12, 12); + assertCommand(reader, "test1:test2", 0, 12); + assertEnd(reader, 12, 12); } TEST(OsmlStreamParser, beginEndWithNSSep) { - const char *testString = 
"\\begin{test1:test2}\\end{test1:test2}"; - // 0123456789012345678 90123456789012345 - // 0 1 2 3 - CharReader charReader(testString); + const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; + // 0123456789012345678 90123456789012345 + // 0 1 2 3 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, true, 19, 20); - assertFieldEnd(reader, 24, 35); - assertEnd(reader, 36, 36); + assertCommand(reader, "test1:test2", 7, 18); + assertFieldStart(reader, true, 19, 20); + assertFieldEnd(reader, 24, 35); + assertEnd(reader, 36, 36); } TEST(OsmlStreamParser, errorBeginNSSep) { - const char *testString = "\\begin:test{blub}\\end{blub}"; - CharReader charReader(testString); + const char *testString = "\\begin:test{blub}\\end{blub}"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "blub"); - ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader, true); - assertFieldEnd(reader); - assertEnd(reader); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "blub"); + ASSERT_TRUE(logger.hasError()); + assertFieldStart(reader, true); + assertFieldEnd(reader); + assertEnd(reader); } TEST(OsmlStreamParser, errorEndNSSep) { - const char *testString = "\\begin{blub}\\end:test{blub}"; - CharReader charReader(testString); + const char *testString = "\\begin{blub}\\end:test{blub}"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - assertCommand(reader, "blub"); - assertFieldStart(reader, true); - ASSERT_FALSE(logger.hasError()); - assertFieldEnd(reader); - ASSERT_TRUE(logger.hasError()); - assertEnd(reader); + logger.reset(); + 
assertCommand(reader, "blub"); + assertFieldStart(reader, true); + ASSERT_FALSE(logger.hasError()); + assertFieldEnd(reader); + ASSERT_TRUE(logger.hasError()); + assertEnd(reader); } TEST(OsmlStreamParser, errorEmptyNs) { - const char *testString = "\\test:"; - CharReader charReader(testString); + const char *testString = "\\test:"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, ":"); - assertEnd(reader); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(reader, ":"); + assertEnd(reader); } TEST(OsmlStreamParser, errorRepeatedNs) { - const char *testString = "\\test::"; - CharReader charReader(testString); + const char *testString = "\\test::"; + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, "::"); - assertEnd(reader); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(reader, "::"); + assertEnd(reader); } TEST(OsmlStreamParser, explicitDefaultField) { - const char *testString = "\\a{!b}c"; - // 01234567 - CharReader charReader(testString); + const char *testString = "\\a{!b}c"; + // 01234567 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertData(reader, "b", 4, 5); - assertFieldEnd(reader, 5, 6); - assertData(reader, "c", 6, 7); - assertEnd(reader, 7, 7); + assertCommand(reader, "a", 0, 2); + 
assertFieldStart(reader, true, 2, 4); + assertData(reader, "b", 4, 5); + assertFieldEnd(reader, 5, 6); + assertData(reader, "c", 6, 7); + assertEnd(reader, 7, 7); } TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) { - const char *testString = "\\a{!\\b}c"; - // 0123 4567 - CharReader charReader(testString); + const char *testString = "\\a{!\\b}c"; + // 0123 4567 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertCommand(reader, "b", 4, 6); - assertFieldEnd(reader, 6, 7); - assertData(reader, "c", 7, 8); - assertEnd(reader, 8, 8); + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + assertData(reader, "c", 7, 8); + assertEnd(reader, 8, 8); } TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) { - const char *testString = "\\a{!\\b}{c}"; - // 0123 456789 - CharReader charReader(testString); + const char *testString = "\\a{!\\b}{c}"; + // 0123 456789 + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertCommand(reader, "b", 4, 6); - assertFieldEnd(reader, 6, 7); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "c", 8, 9); - ASSERT_TRUE(logger.hasError()); - assertEnd(reader, 10, 10); + logger.reset(); + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "c", 8, 9); + ASSERT_TRUE(logger.hasError()); + assertEnd(reader, 10, 10); } TEST(OsmlStreamParser, annotationStart) { - const char *testString = "<\\a"; - // 0 12 + const char *testString = "<\\a"; + // 0 12 - CharReader 
charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); - assertEnd(reader, 3, 3); + assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertEnd(reader, 3, 3); } TEST(OsmlStreamParser, annotationStartWithName) { - const char *testString = "<\\annotationWithName#aName"; - // 0 1234567890123456789012345 - // 0 1 2 + const char *testString = "<\\annotationWithName#aName"; + // 0 1234567890123456789012345 + // 0 1 2 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationStart(reader, "annotationWithName", - Variant::mapType{{"name", "aName"}}, 0, 20); - assertEnd(reader, 26, 26); + assertAnnotationStart(reader, "annotationWithName", + Variant::mapType{{"name", "aName"}}, 0, 20); + assertEnd(reader, 26, 26); } TEST(OsmlStreamParser, annotationStartWithArguments) { - const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; - // 0 1234567890123456789012345678901234 - // 0 1 2 3 + const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; + // 0 1234567890123456789012345678901234 + // 0 1 2 3 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationStart( - reader, "annotationWithName", - Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); - assertEnd(reader, 35, 35); + assertAnnotationStart( + reader, "annotationWithName", + Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); + assertEnd(reader, 35, 35); } TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) { - const char *testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; - // 0 123456789012345678901234567 89012345 67 - // 0 1 2 3 + const char 
*testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; + // 0 123456789012345678901234567 89012345 67 + // 0 1 2 3 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationStart( - reader, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, - 10); - assertFieldStart(reader, true, 26, 27); - assertData(reader, "a", 26, 27); - assertFieldEnd(reader, 33, 35); - assertAnnotationEnd(reader, "", "", 36, 38); - assertEnd(reader, 38, 38); + assertAnnotationStart( + reader, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, + 10); + assertFieldStart(reader, true, 26, 27); + assertData(reader, "a", 26, 27); + assertFieldEnd(reader, 33, 35); + assertAnnotationEnd(reader, "", "", 36, 38); + assertEnd(reader, 38, 38); } TEST(OsmlStreamParser, annotationEnd) { - const char *testString = "\\a>"; - // 012 + const char *testString = "\\a>"; + // 012 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationEnd(reader, "a", "", 0, 2); - assertEnd(reader, 3, 3); + assertAnnotationEnd(reader, "a", "", 0, 2); + assertEnd(reader, 3, 3); } TEST(OsmlStreamParser, annotationEndWithName) { - const char *testString = "\\a#name>"; - // 01234567 + const char *testString = "\\a#name>"; + // 01234567 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationEnd(reader, "a", "name", 0, 2); - assertEnd(reader, 8, 8); + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 8, 8); } TEST(OsmlStreamParser, annotationEndWithNameAsArgs) { - const char *testString = "\\a[name=name]>"; - // 01234567890123 + const char *testString = "\\a[name=name]>"; + // 01234567890123 - CharReader 
charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationEnd(reader, "a", "name", 0, 2); - assertEnd(reader, 14, 14); + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 14, 14); } TEST(OsmlStreamParser, errorAnnotationEndWithArguments) { - const char *testString = "\\a[foo=bar]>"; - // 012345678901 - // 0 1 + const char *testString = "\\a[foo=bar]>"; + // 012345678901 + // 0 1 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a", Variant::mapType{{"foo", "bar"}}, 0, 2); - ASSERT_TRUE(logger.hasError()); - assertData(reader, ">", 11, 12); - assertEnd(reader, 12, 12); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "a", Variant::mapType{{"foo", "bar"}}, 0, 2); + ASSERT_TRUE(logger.hasError()); + assertData(reader, ">", 11, 12); + assertEnd(reader, 12, 12); } TEST(OsmlStreamParser, closingAnnotation) { - const char *testString = "<\\a>"; - // 0 123 + const char *testString = "<\\a>"; + // 0 123 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); - assertData(reader, ">", 3, 4); - assertEnd(reader, 4, 4); + assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertData(reader, ">", 3, 4); + assertEnd(reader, 4, 4); } TEST(OsmlStreamParser, annotationWithFields) { - const char *testString = "a <\\b{c}{d}{!e} f \\> g"; - // 012 345678901234567 8901 - // 0 1 2 + const char *testString = "a <\\b{c}{d}{!e} f \\> g"; + // 012 345678901234567 8901 + // 0 1 2 - CharReader charReader(testString); + CharReader charReader(testString); - 
OsmlStreamParser reader(charReader, logger); - - assertData(reader, "a", 0, 1); - assertAnnotationStart(reader, "b", Variant::mapType{}, 2, 5); - assertFieldStart(reader, false, 5, 6); - assertData(reader, "c", 6, 7); - assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, false, 8, 9); - assertData(reader, "d", 9, 10); - assertFieldEnd(reader, 10, 11); - assertFieldStart(reader, true, 11, 13); - assertData(reader, "e", 13, 14); - assertFieldEnd(reader, 14, 15); - assertData(reader, "f", 16, 17); - assertAnnotationEnd(reader, "", "", 18, 20); - assertData(reader, "g", 21, 22); - assertEnd(reader, 22, 22); + OsmlStreamParser reader(charReader, logger); + + assertData(reader, "a", 0, 1); + assertAnnotationStart(reader, "b", Variant::mapType{}, 2, 5); + assertFieldStart(reader, false, 5, 6); + assertData(reader, "c", 6, 7); + assertFieldEnd(reader, 7, 8); + assertFieldStart(reader, false, 8, 9); + assertData(reader, "d", 9, 10); + assertFieldEnd(reader, 10, 11); + assertFieldStart(reader, true, 11, 13); + assertData(reader, "e", 13, 14); + assertFieldEnd(reader, 14, 15); + assertData(reader, "f", 16, 17); + assertAnnotationEnd(reader, "", "", 18, 20); + assertData(reader, "g", 21, 22); + assertEnd(reader, 22, 22); } TEST(OsmlStreamParser, annotationStartEscape) { - const char *testString = "<\\%test"; - // 0 123456 - // 0 + const char *testString = "<\\%test"; + // 0 123456 + // 0 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser reader(charReader, logger); - assertData(reader, "<%test", 0, 7); - assertEnd(reader, 7, 7); + assertData(reader, "<%test", 0, 7); + assertEnd(reader, 7, 7); } +*/ } -- cgit v1.2.3 From c18790f70beb5f52b00bc1c2b1ded2b252f1998a Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 28 Feb 2015 15:46:55 +0100 Subject: Fixed potential problem in SourceOffsetVector --- src/core/parser/utils/SourceOffsetVector.hpp | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index aaebe7d..67bacef 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -170,6 +170,11 @@ public: if (length < size()) { lens.resize(length); offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + if (length > 0) { + lastEnd = loadOffset(length - 1).second; + } else { + lastEnd = 0; + } } } -- cgit v1.2.3 From 6776f53b60ade0ece65ab895d23476761c5481d5 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 28 Feb 2015 15:47:13 +0100 Subject: Trimming forgotten protectedChars buffer --- src/core/parser/utils/TokenizedData.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index bcbbe43..c3c4f98 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -467,6 +467,7 @@ public: { if (length < size()) { buf.resize(length); + protectedChars.resize(length); offsets.trim(length); } } -- cgit v1.2.3 From 81e009aa22b5018b055ddda689cd3e78336a164b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 28 Feb 2015 15:47:30 +0100 Subject: Always call trim if a bestMatch has been found --- src/core/parser/utils/Tokenizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index e78b0f4..94d9cb0 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -156,7 +156,6 @@ public: return res; } }; - } /* Class Tokenizer */ @@ -252,6 +251,9 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) // Create a token containing the data location bestMatch.token = Token{data.getLocation()}; + } else if (bestMatch.hasMatch() && + bestMatch.dataStartOffset == initialDataSize) { + data.trim(initialDataSize); } // Move the 
read/peek cursor to the end of the token, abort if an error @@ -269,6 +271,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) } else { reader.seekPeekCursor(end); } + token = bestMatch.token; } else { token = Token{}; -- cgit v1.2.3 From b54760fbd5470032dc716dc870dc08b32dfba5ac Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 28 Feb 2015 15:48:07 +0100 Subject: Test case for data being empty if a token is found --- test/core/parser/utils/TokenizerTest.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 785bd81..9f644c2 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -450,5 +450,32 @@ TEST(Tokenizer, nonPrimaryTokens) TokenizedData data; ASSERT_FALSE(tokenizer.read(reader, token, data)); } + + +TEST(Tokenizer, ambiguousTokens2) +{ + CharReader reader{"<\\"}; + + Tokenizer tokenizer; + + TokenId tBackslash = tokenizer.registerToken("\\"); + TokenId tAnnotationStart = tokenizer.registerToken("<\\"); + + TokenSet tokens = TokenSet{tBackslash, tAnnotationStart}; + Token token; + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ("<\\", token.content); + ASSERT_EQ(tAnnotationStart, token.id); + ASSERT_TRUE(data.empty()); + } + + { + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, token, data)); + } +} + } -- cgit v1.2.3 From fa2a5bdf0152002de520fcc72e48686b9e2657b1 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 28 Feb 2015 15:48:38 +0100 Subject: Adapted all unit tests, renamed COMMAND_END event to RANGE_END event to match ranged annotations --- src/formats/osml/OsmlStreamParser.cpp | 28 +- src/formats/osml/OsmlStreamParser.hpp | 11 +- test/formats/osml/OsmlStreamParserTest.cpp | 438 ++++++++++++++++++----------- 3 files changed, 299 insertions(+), 178 deletions(-) diff --git 
a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 7e01a3c..e467dc5 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -127,7 +127,7 @@ private: /** * Set to true if this is a command with clear begin and end. */ - bool hasRange; + bool hasRange: 1; public: /** @@ -259,7 +259,7 @@ public: */ enum class State : uint8_t { COMMAND_START = 0, - COMMAND_END = 1, + RANGE_END = 1, FIELD_START = 2, FIELD_END = 3, ANNOTATION_START = 4, @@ -328,7 +328,7 @@ private: * * @return an internal State specifying whether an error occured (return * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a - * command was actually ended (return value State::COMMAND_END). + * command was actually ended (return value State::RANGE_END). */ State parseEndCommand(); @@ -569,7 +569,7 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand() // End the current command location = name.getLocation(); commands.pop(); - return State::COMMAND_END; + return State::RANGE_END; } Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName) @@ -808,14 +808,15 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() // If this was an annotation start token, add the parsed < to the // output + SourceOffset charStart = token.location.getStart(); + SourceOffset charEnd = reader.getPeekOffset(); if (type == OsmlTokens.AnnotationStart) { - data.append('<', token.location.getStart(), - token.location.getStart() + 1); + data.append('<', charStart, charStart + 1); + charStart = charStart + 1; } // Append the character to the output data, mark it as protected - data.append(c, token.location.getStart(), reader.getPeekOffset(), - true); + data.append(c, charStart, charEnd, true); reader.consumePeek(); continue; } else if (type == Tokens::Data) { @@ -880,11 +881,12 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() // Make sure all open commands and fields have been ended at the end 
of the // stream - while (commands.size() > 1) { + while (true) { + bool topLevelCommand = commands.size() == 1U; if (cmd().inField()) { // If the stream ended with an open range field, issue information // about the range field - if (cmd().inRangeField()) { + if (cmd().inRangeField() && !topLevelCommand) { // Inform about the still open command itself logger.error("Reached end of stream, but command \"" + getCommandName().asString() + @@ -901,7 +903,11 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() } } } - commands.pop(); + if (!topLevelCommand) { + commands.pop(); + } else { + break; + } } location = SourceLocation{reader.getSourceId(), reader.getOffset()}; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 1fee90b..10d5296 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -69,11 +69,11 @@ public: COMMAND_START = 0, /** - * State returned if a range command has just ended. This state is not - * returned for non-range commands (as the actual end of a command is - * context dependant). + * State returned if a range command or range annotation has just ended. + * This state is not returned for non-range commands (as the actual end + * of a command is context dependent). */ - COMMAND_END = 1, + RANGE_END = 1, /** * State returned if a new field started. The reader assures that the @@ -185,7 +185,8 @@ public: /** * Returns true if the currently started command is a range command, only - * valid if State::COMMAND_START was returned by the "parse" function. + * valid if State::COMMAND_START or State::ANNOTATION_START was returned by + * the "parse" function. * * @return true if the command is started is a range command, false * otherwise. 
diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 8b64e51..3e7f4c1 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -70,11 +70,11 @@ static void assertCommand(OsmlStreamParser &parser, assertCommandStart(parser, name, false, Variant::mapType{}, start, end); } -static void assertCommandEnd(OsmlStreamParser &parser, +static void assertRangeEnd(OsmlStreamParser &parser, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { - ASSERT_EQ(OsmlStreamParser::State::COMMAND_END, parser.parse()); + ASSERT_EQ(OsmlStreamParser::State::RANGE_END, parser.parse()); if (start != InvalidSourceOffset) { EXPECT_EQ(start, parser.getLocation().getStart()); } @@ -510,6 +510,61 @@ TEST(OsmlStreamParser, fields) assertEnd(parser, 14, 14); } +TEST(OsmlStreamParser, fieldsWithoutCommand) +{ + const char *testString = "{a}{b}{c}"; + // 012345678 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + assertFieldStart(parser, false, 0, 1); + assertTextData(parser, "a", 1, 2, 1, 2, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 2, 3); + + assertFieldStart(parser, false, 3, 4); + assertTextData(parser, "b", 4, 5, 4, 5, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 5, 6); + + assertFieldStart(parser, false, 6, 7); + assertTextData(parser, "c", 7, 8, 7, 8, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 8, 9); + assertEnd(parser, 9, 9); +} + +TEST(OsmlStreamParser, nestedField) +{ + const char *testString = "{{a{b}}}"; + // 01234567 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + assertFieldStart(parser, false, 0, 1); + assertFieldStart(parser, false, 1, 2); + assertTextData(parser, "a", 2, 3, 2, 3, WhitespaceMode::PRESERVE); + assertFieldStart(parser, false, 3, 4); + assertTextData(parser, "b", 4, 5, 4, 5, WhitespaceMode::PRESERVE); + assertFieldEnd(parser, 5, 6); + 
assertFieldEnd(parser, 6, 7); + assertFieldEnd(parser, 7, 8); + assertEnd(parser, 8, 8); +} + +TEST(OsmlStreamParser, errorUnbalancedField) +{ + const char *testString = "{a"; + // 01 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + logger.reset(); + + assertFieldStart(parser, false, 0, 1); + assertTextData(parser, "a", 1, 2, 1, 2, WhitespaceMode::PRESERVE); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 2, 2); + ASSERT_TRUE(logger.hasError()); +} + TEST(OsmlStreamParser, dataOutsideField) { const char *testString = "\\test{a}{b} c"; @@ -720,7 +775,7 @@ TEST(OsmlStreamParser, beginEnd) OsmlStreamParser parser(charReader, logger); assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertCommandEnd(parser, 17, 21); + assertRangeEnd(parser, 17, 21); assertEnd(parser, 22, 22); } @@ -734,7 +789,7 @@ TEST(OsmlStreamParser, beginEndWithName) OsmlStreamParser parser(charReader, logger); assertCommandStart(parser, "book", true, {{"name", "a"}}, 7, 11); - assertCommandEnd(parser, 19, 23); + assertRangeEnd(parser, 19, 23); assertEnd(parser, 24, 24); } @@ -749,7 +804,7 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgs) assertCommandStart(parser, "book", true, {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertCommandEnd(parser, 37, 41); + assertRangeEnd(parser, 37, 41); assertEnd(parser, 42, 42); } @@ -775,7 +830,7 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) assertFieldStart(parser, false, 49, 50); assertFieldEnd(parser, 50, 51); assertFieldEnd(parser, 51, 52); - assertCommandEnd(parser, 57, 61); + assertRangeEnd(parser, 57, 61); assertEnd(parser, 62, 62); } @@ -790,10 +845,10 @@ TEST(OsmlStreamParser, beginEndWithData) assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); assertData(parser, "a", 12, 13); - assertCommandEnd(parser, 18, 22); + assertRangeEnd(parser, 18, 22); assertEnd(parser, 23, 23); } -/* + TEST(OsmlStreamParser, beginEndNested) { const char 
*testString = @@ -802,29 +857,32 @@ TEST(OsmlStreamParser, beginEndNested) // 0 1 2 3 4 5 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, false, 9, 10); - assertData(reader, "b", 10, 11); - assertFieldEnd(reader, 11, 12); - assertFieldStart(reader, true, 13, 14); - assertData(reader, "c", 13, 14); - assertCommand(reader, "d", 22, 23); - assertFieldStart(reader, false, 24, 25); - assertData(reader, "e", 25, 26); - assertFieldEnd(reader, 26, 27); - assertFieldStart(reader, false, 27, 28); - assertData(reader, "f", 28, 29); - assertFieldEnd(reader, 29, 30); - assertFieldStart(reader, true, 31, 32); - assertCommand(reader, "g", 31, 33); - assertFieldStart(reader, false, 33, 34); - assertData(reader, "h", 34, 35); - assertFieldEnd(reader, 35, 36); - assertFieldEnd(reader, 42, 43); - assertFieldEnd(reader, 49, 50); - assertEnd(reader, 51, 51); + OsmlStreamParser parser(charReader, logger); + + assertCommandStart(parser, "a", true, Variant::mapType{}, 7, 8); + assertFieldStart(parser, false, 9, 10); + assertData(parser, "b", 10, 11); + assertFieldEnd(parser, 11, 12); + + assertData(parser, "c", 13, 14); + + assertCommandStart(parser, "d", true, Variant::mapType{}, 22, 23); + assertFieldStart(parser, false, 24, 25); + assertData(parser, "e", 25, 26); + assertFieldEnd(parser, 26, 27); + assertFieldStart(parser, false, 27, 28); + assertData(parser, "f", 28, 29); + assertFieldEnd(parser, 29, 30); + + assertEmptyData(parser); + assertCommand(parser, "g", 31, 33); + assertFieldStart(parser, false, 33, 34); + assertData(parser, "h", 34, 35); + assertFieldEnd(parser, 35, 36); + assertEmptyData(parser); + assertRangeEnd(parser, 42, 43); + assertRangeEnd(parser, 49, 50); + assertEnd(parser, 51, 51); } TEST(OsmlStreamParser, beginEndWithCommand) @@ -834,16 +892,75 @@ TEST(OsmlStreamParser, beginEndWithCommand) // 0 1 2 CharReader charReader(testString); - OsmlStreamParser 
reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); + + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertCommand(parser, "a", 12, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "test", 15, 19); + assertFieldEnd(parser, 19, 20); + assertRangeEnd(parser, 25, 29); + assertEnd(parser, 30, 30); +} + +TEST(OsmlStreamParser, beginEndNestedFields) +{ + const char *testString = "\\begin{book}a{{b{c}}}\\end{book}"; + // 012345678901234567890 1234567890 + // 0 1 2 3 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + logger.reset(); + + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertFieldStart(parser, false, 13, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "b", 15, 16); + assertFieldStart(parser, false, 16, 17); + assertData(parser, "c", 17, 18); + assertFieldEnd(parser, 18, 19); + assertFieldEnd(parser, 19, 20); + assertFieldEnd(parser, 20, 21); + assertRangeEnd(parser, 26, 30); + assertEnd(parser, 31, 31); +} + +TEST(OsmlStreamParser, errorBeginEndUnbalancedNestedFields) +{ + const char *testString = "\\begin{book}a{{b{c}}\\end{book}"; + // 012345678901234567890 123456789 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + logger.reset(); + + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertFieldStart(parser, false, 13, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "b", 15, 16); + assertFieldStart(parser, false, 16, 17); + assertData(parser, "c", 17, 18); + assertFieldEnd(parser, 18, 19); + assertFieldEnd(parser, 19, 20); + ASSERT_THROW(assertRangeEnd(parser, 25, 29), LoggableException); +} + +TEST(OsmlStreamParser, errorBeginEndUnbalancedFields) +{ + const char *testString = "{a"; + // 01 + CharReader charReader(testString); + OsmlStreamParser 
parser(charReader, logger); - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, true, 12, 13); - assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, false, 14, 15); - assertData(reader, "test", 15, 19); - assertFieldEnd(reader, 19, 20); - assertFieldEnd(reader, 25, 29); - assertEnd(reader, 30, 30); + logger.reset(); + + assertFieldStart(parser, false, 0, 1); + assertTextData(parser, "a", 1, 2, 1, 2, WhitespaceMode::PRESERVE); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 2, 2); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoBraceOpen) @@ -852,12 +969,13 @@ TEST(OsmlStreamParser, errorBeginNoBraceOpen) // 01234567 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 7, 8); + assertData(parser, "a", 7, 8); ASSERT_TRUE(logger.hasError()); + assertEnd(parser, 8, 8); } TEST(OsmlStreamParser, errorBeginNoIdentifier) @@ -865,7 +983,7 @@ TEST(OsmlStreamParser, errorBeginNoIdentifier) const char *testString = "\\begin{!"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); @@ -878,7 +996,7 @@ TEST(OsmlStreamParser, errorBeginNoBraceClose) const char *testString = "\\begin{a"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); @@ -891,15 +1009,15 @@ TEST(OsmlStreamParser, errorBeginNoName) const char *testString = "\\begin{a#}"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a"); + assertCommandStart(parser, "a", true); ASSERT_TRUE(logger.hasError()); logger.reset(); 
ASSERT_FALSE(logger.hasError()); - assertEnd(reader); + assertEnd(parser); ASSERT_TRUE(logger.hasError()); } @@ -909,11 +1027,11 @@ TEST(OsmlStreamParser, errorEndNoBraceOpen) // 012345 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 5, 6); + assertData(parser, "a", 5, 6); ASSERT_TRUE(logger.hasError()); } @@ -922,7 +1040,7 @@ TEST(OsmlStreamParser, errorEndNoIdentifier) const char *testString = "\\end{!"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); @@ -935,7 +1053,7 @@ TEST(OsmlStreamParser, errorEndNoBraceClose) const char *testString = "\\end{a"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); @@ -948,7 +1066,7 @@ TEST(OsmlStreamParser, errorEndNoBegin) const char *testString = "\\end{a}"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); @@ -963,14 +1081,13 @@ TEST(OsmlStreamParser, errorBeginEndMismatch) // 0 1 2 3 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, true, 10, 11); - assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, true, 20, 24); - assertData(reader, "test", 20, 24); + assertCommandStart(parser, "a", true, Variant::mapType{}, 7, 8); + assertEmptyData(parser); + assertCommandStart(parser, "b", true, Variant::mapType{}, 17, 18); + assertData(parser, "test", 20, 24); ASSERT_FALSE(logger.hasError()); ASSERT_THROW(parser.parse(), 
LoggableException); ASSERT_TRUE(logger.hasError()); @@ -982,10 +1099,10 @@ TEST(OsmlStreamParser, commandWithNSSep) // 012345678901 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test1:test2", 0, 12); - assertEnd(reader, 12, 12); + assertCommand(parser, "test1:test2", 0, 12); + assertEnd(parser, 12, 12); } TEST(OsmlStreamParser, beginEndWithNSSep) @@ -995,12 +1112,11 @@ TEST(OsmlStreamParser, beginEndWithNSSep) // 0 1 2 3 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, true, 19, 20); - assertFieldEnd(reader, 24, 35); - assertEnd(reader, 36, 36); + assertCommandStart(parser, "test1:test2", true, Variant::mapType{}, 7, 18); + assertRangeEnd(parser, 24, 35); + assertEnd(parser, 36, 36); } TEST(OsmlStreamParser, errorBeginNSSep) @@ -1008,15 +1124,14 @@ TEST(OsmlStreamParser, errorBeginNSSep) const char *testString = "\\begin:test{blub}\\end{blub}"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "blub"); + assertCommandStart(parser, "blub", true, Variant::mapType{}); ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader, true); - assertFieldEnd(reader); - assertEnd(reader); + assertRangeEnd(parser); + assertEnd(parser); } TEST(OsmlStreamParser, errorEndNSSep) @@ -1024,15 +1139,14 @@ TEST(OsmlStreamParser, errorEndNSSep) const char *testString = "\\begin{blub}\\end:test{blub}"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); - assertCommand(reader, "blub"); - assertFieldStart(reader, true); + assertCommandStart(parser, "blub", true, Variant::mapType{}); 
ASSERT_FALSE(logger.hasError()); - assertFieldEnd(reader); + assertRangeEnd(parser); ASSERT_TRUE(logger.hasError()); - assertEnd(reader); + assertEnd(parser); } TEST(OsmlStreamParser, errorEmptyNs) @@ -1040,14 +1154,14 @@ TEST(OsmlStreamParser, errorEmptyNs) const char *testString = "\\test:"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); + assertCommand(parser, "test"); ASSERT_TRUE(logger.hasError()); - assertData(reader, ":"); - assertEnd(reader); + assertData(parser, ":"); + assertEnd(parser); } TEST(OsmlStreamParser, errorRepeatedNs) @@ -1055,14 +1169,14 @@ TEST(OsmlStreamParser, errorRepeatedNs) const char *testString = "\\test::"; CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); + assertCommand(parser, "test"); ASSERT_TRUE(logger.hasError()); - assertData(reader, "::"); - assertEnd(reader); + assertData(parser, "::"); + assertEnd(parser); } TEST(OsmlStreamParser, explicitDefaultField) @@ -1071,14 +1185,14 @@ TEST(OsmlStreamParser, explicitDefaultField) // 01234567 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertData(reader, "b", 4, 5); - assertFieldEnd(reader, 5, 6); - assertData(reader, "c", 6, 7); - assertEnd(reader, 7, 7); + assertCommand(parser, "a", 0, 2); + assertFieldStart(parser, true, 2, 4); + assertData(parser, "b", 4, 5); + assertFieldEnd(parser, 5, 6); + assertData(parser, "c", 6, 7); + assertEnd(parser, 7, 7); } TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) @@ -1087,33 +1201,33 @@ TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) // 0123 4567 
CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertCommand(reader, "b", 4, 6); - assertFieldEnd(reader, 6, 7); - assertData(reader, "c", 7, 8); - assertEnd(reader, 8, 8); + assertCommand(parser, "a", 0, 2); + assertFieldStart(parser, true, 2, 4); + assertCommand(parser, "b", 4, 6); + assertFieldEnd(parser, 6, 7); + assertData(parser, "c", 7, 8); + assertEnd(parser, 8, 8); } -TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) +TEST(OsmlStreamParser, fieldAfterExplicitDefaultField) { const char *testString = "\\a{!\\b}{c}"; // 0123 456789 CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, true, 2, 4); - assertCommand(reader, "b", 4, 6); - assertFieldEnd(reader, 6, 7); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "c", 8, 9); - ASSERT_TRUE(logger.hasError()); - assertEnd(reader, 10, 10); + assertCommand(parser, "a", 0, 2); + assertFieldStart(parser, true, 2, 4); + assertCommand(parser, "b", 4, 6); + assertFieldEnd(parser, 6, 7); + assertFieldStart(parser, false, 7, 8); + assertData(parser, "c", 8, 9); + assertFieldEnd(parser, 9, 10); + assertEnd(parser, 10, 10); } TEST(OsmlStreamParser, annotationStart) @@ -1123,10 +1237,10 @@ TEST(OsmlStreamParser, annotationStart) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); - assertEnd(reader, 3, 3); + assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); + assertEnd(parser, 3, 3); } TEST(OsmlStreamParser, annotationStartWithName) @@ -1137,11 +1251,11 @@ TEST(OsmlStreamParser, annotationStartWithName) CharReader charReader(testString); - 
OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(reader, "annotationWithName", + assertAnnotationStart(parser, "annotationWithName", Variant::mapType{{"name", "aName"}}, 0, 20); - assertEnd(reader, 26, 26); + assertEnd(parser, 26, 26); } TEST(OsmlStreamParser, annotationStartWithArguments) @@ -1152,12 +1266,12 @@ TEST(OsmlStreamParser, annotationStartWithArguments) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); assertAnnotationStart( - reader, "annotationWithName", + parser, "annotationWithName", Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); - assertEnd(reader, 35, 35); + assertEnd(parser, 35, 35); } TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) @@ -1168,16 +1282,16 @@ TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); assertAnnotationStart( - reader, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, + parser, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, 10); - assertFieldStart(reader, true, 26, 27); - assertData(reader, "a", 26, 27); - assertFieldEnd(reader, 33, 35); - assertAnnotationEnd(reader, "", "", 36, 38); - assertEnd(reader, 38, 38); + ASSERT_TRUE(parser.inRangeCommand()); + assertData(parser, "a", 26, 27); + assertRangeEnd(parser, 33, 35); + assertAnnotationEnd(parser, "", "", 36, 38); + assertEnd(parser, 38, 38); } TEST(OsmlStreamParser, annotationEnd) @@ -1187,10 +1301,10 @@ TEST(OsmlStreamParser, annotationEnd) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(reader, "a", "", 0, 2); - assertEnd(reader, 3, 3); + assertAnnotationEnd(parser, "a", "", 0, 2); + assertEnd(parser, 3, 3); } TEST(OsmlStreamParser, 
annotationEndWithName) @@ -1200,10 +1314,10 @@ TEST(OsmlStreamParser, annotationEndWithName) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(reader, "a", "name", 0, 2); - assertEnd(reader, 8, 8); + assertAnnotationEnd(parser, "a", "name", 0, 2); + assertEnd(parser, 8, 8); } TEST(OsmlStreamParser, annotationEndWithNameAsArgs) @@ -1213,10 +1327,10 @@ TEST(OsmlStreamParser, annotationEndWithNameAsArgs) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(reader, "a", "name", 0, 2); - assertEnd(reader, 14, 14); + assertAnnotationEnd(parser, "a", "name", 0, 2); + assertEnd(parser, 14, 14); } TEST(OsmlStreamParser, errorAnnotationEndWithArguments) @@ -1227,14 +1341,14 @@ TEST(OsmlStreamParser, errorAnnotationEndWithArguments) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); logger.reset(); ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a", Variant::mapType{{"foo", "bar"}}, 0, 2); + assertCommandStart(parser, "a", false, Variant::mapType{{"foo", "bar"}}, 0, 2); ASSERT_TRUE(logger.hasError()); - assertData(reader, ">", 11, 12); - assertEnd(reader, 12, 12); + assertData(parser, ">", 11, 12); + assertEnd(parser, 12, 12); } TEST(OsmlStreamParser, closingAnnotation) @@ -1244,11 +1358,11 @@ TEST(OsmlStreamParser, closingAnnotation) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); - assertData(reader, ">", 3, 4); - assertEnd(reader, 4, 4); + assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); + assertData(parser, ">", 3, 4); + assertEnd(parser, 4, 4); } TEST(OsmlStreamParser, annotationWithFields) @@ -1259,23 +1373,23 @@ 
TEST(OsmlStreamParser, annotationWithFields) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); - - assertData(reader, "a", 0, 1); - assertAnnotationStart(reader, "b", Variant::mapType{}, 2, 5); - assertFieldStart(reader, false, 5, 6); - assertData(reader, "c", 6, 7); - assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, false, 8, 9); - assertData(reader, "d", 9, 10); - assertFieldEnd(reader, 10, 11); - assertFieldStart(reader, true, 11, 13); - assertData(reader, "e", 13, 14); - assertFieldEnd(reader, 14, 15); - assertData(reader, "f", 16, 17); - assertAnnotationEnd(reader, "", "", 18, 20); - assertData(reader, "g", 21, 22); - assertEnd(reader, 22, 22); + OsmlStreamParser parser(charReader, logger); + + assertData(parser, "a", 0, 1); + assertAnnotationStart(parser, "b", Variant::mapType{}, 2, 5); + assertFieldStart(parser, false, 5, 6); + assertData(parser, "c", 6, 7); + assertFieldEnd(parser, 7, 8); + assertFieldStart(parser, false, 8, 9); + assertData(parser, "d", 9, 10); + assertFieldEnd(parser, 10, 11); + assertFieldStart(parser, true, 11, 13); + assertData(parser, "e", 13, 14); + assertFieldEnd(parser, 14, 15); + assertData(parser, "f", 16, 17); + assertAnnotationEnd(parser, "", "", 18, 20); + assertData(parser, "g", 21, 22); + assertEnd(parser, 22, 22); } TEST(OsmlStreamParser, annotationStartEscape) @@ -1286,11 +1400,11 @@ TEST(OsmlStreamParser, annotationStartEscape) CharReader charReader(testString); - OsmlStreamParser reader(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertData(reader, "<%test", 0, 7); - assertEnd(reader, 7, 7); + assertData(parser, "<%test", 0, 7); + assertEnd(parser, 7, 7); } -*/ + } -- cgit v1.2.3 From cb6cacdc7eade9d4290767bafb7ccf4e935d0fbf Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 1 Mar 2015 13:49:26 +0100 Subject: allowing to store gaps in SourceOffsetVector and fixed bug with trim not resetting offsets correctly when the new length is zero --- 
src/core/parser/utils/SourceOffsetVector.hpp | 64 ++++++++++++++++------- test/core/parser/utils/SourceOffsetVectorTest.cpp | 2 +- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index 67bacef..f322a88 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -43,6 +44,9 @@ namespace ousia { * a delta compression. */ class SourceOffsetVector { +public: + using OffsPair = std::pair; + private: /** * Type used for representing the length of a character. @@ -81,10 +85,13 @@ private: */ std::vector offsets; + /** + * Map used to store discontinuities in the character offsets. + */ + std::unordered_map gaps; + /** * Last position given as "end" position in the storeOffset() method. - * Used to adapt the length of the previous element in case start and end - * positions do not match. */ SourceOffset lastEnd; @@ -105,19 +112,22 @@ public: // Make sure (end - start) is smaller than MAX_LEN assert(end - start < MAX_LEN); - // Adapt the length of the previous character in case there is a gap - if (!lens.empty() && start > lastEnd) { - lens.back() += start - lastEnd; - } - lastEnd = end; - // Store an absolute offset every OFFSET_INTERVAL elements if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) { offsets.push_back(start); } - // Store the length - lens.push_back(end - start); + // Adapt the length of the previous character in case there is a gap + if (!lens.empty() && start > lastEnd) { + // There is a discontinuity, store the given offsets in the "gaps" + // map + gaps[lens.size()] = OffsPair(start, end); + lens.push_back(MAX_LEN); + } else { + // Store the length + lens.push_back(end - start); + } + lastEnd = end; } /** @@ -127,14 +137,13 @@ public: * read. * @return a pair containing start and end source offset. 
*/ - std::pair loadOffset(size_t idx) const + OffsPair loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); if (idx > 0 && idx == count) { auto offs = loadOffset(count - 1); - return std::pair(offs.second, - offs.second); + return OffsPair(offs.second, offs.second); } // Calculate the start index in the lens vector and in the offsets @@ -146,12 +155,26 @@ public: assert(idx < count); assert(offsetIdx < offsets.size()); + // If the length of the last character is MAX_LEN, the position is + // stored in the "gaps" list + if (lens[idx] == MAX_LEN) { + auto it = gaps.find(idx); + assert(it != gaps.end()); + return it->second; + } + // Sum over the length starting with the start offset SourceOffset start = offsets[offsetIdx]; for (size_t i = sumStartIdx; i < idx; i++) { - start += lens[i]; + if (lens[i] == MAX_LEN) { + auto it = gaps.find(i); + assert(it != gaps.end()); + start = it->second.first; + } else { + start += lens[i]; + } } - return std::pair(start, start + lens[idx]); + return OffsPair(start, start + lens[idx]); } /** @@ -166,13 +189,16 @@ public: * @param length is the number of characters to which the TokenizedData * instance should be trimmed. */ - void trim(size_t length) { + void trim(size_t length) + { if (length < size()) { lens.resize(length); - offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); if (length > 0) { + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); lastEnd = loadOffset(length - 1).second; } else { + offsets.clear(); + gaps.clear(); lastEnd = 0; } } @@ -182,9 +208,11 @@ public: * Resets the SourceOffsetVector to the state it had when it was * constructed. 
*/ - void clear() { + void clear() + { lens.clear(); offsets.clear(); + gaps.clear(); lastEnd = 0; } }; diff --git a/test/core/parser/utils/SourceOffsetVectorTest.cpp b/test/core/parser/utils/SourceOffsetVectorTest.cpp index 25a4163..26254f9 100644 --- a/test/core/parser/utils/SourceOffsetVectorTest.cpp +++ b/test/core/parser/utils/SourceOffsetVectorTest.cpp @@ -51,7 +51,7 @@ TEST(SourceOffsetVector, gaps) for (size_t i = 0; i < 999; i++) { auto elem = vec.loadOffset(i); EXPECT_EQ(i * 3 + 5, elem.first); - EXPECT_EQ((i + 1) * 3 + 5, elem.second); + EXPECT_EQ(i * 3 + 7, elem.second); } auto elem = vec.loadOffset(999); EXPECT_EQ(999U * 3 + 5, elem.first); -- cgit v1.2.3 From 31c83c05d257c9a7a336f12342c401f97d380674 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 1 Mar 2015 13:50:15 +0100 Subject: Prefer longer non-primary tokens --- src/core/parser/utils/Tokenizer.cpp | 45 +++++----- test/core/parser/utils/TokenizerTest.cpp | 148 ++++++++++++++++++++++++++----- 2 files changed, 150 insertions(+), 43 deletions(-) diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 94d9cb0..8d540a6 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) const size_t dataStartOffset = data.size(); // If we do not have a match yet, start a new lookup from the root - if (!bestMatch.hasMatch()) { + if (!bestMatch.hasMatch() || !bestMatch.primary) { lookups.emplace_back(root, charStart, dataStartOffset); } @@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) continue; } - // If the matched token is primary, check whether it is better than - // the current best match, if yes, replace the best match. 
In any - // case just continue - if (match.primary) { - if (match.size() > bestMatch.size()) { - bestMatch = match; - } - continue; + // Replace the best match with longest token + if (match.size() > bestMatch.size()) { + bestMatch = match; } - // Otherwise -- if the matched token is a non-primary token (and no - // primary token has been found until now) -- mark the match in the - // TokenizedData - if (!bestMatch.hasMatch()) { + // If the matched token is a non-primary token -- mark the match in + // the TokenizedData list + if (!match.primary) { data.mark(match.token.id, data.size() - match.size() + 1, match.size()); } } - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (bestMatch.hasMatch()) { - if ((nextLookups.empty() || data.size() > initialDataSize)) { + + // If a token has been found and the token is a primary token, check + // whether we have to abort, otherwise if we have a non-primary match, + // reset it once it can no longer be advanced + if (bestMatch.hasMatch() && nextLookups.empty()) { + if (bestMatch.primary) { break; + } else { + bestMatch = TokenMatch{}; } - } else { - // Record all incomming characters - data.append(c, charStart, charEnd); } + // Record all incomming characters + data.append(c, charStart, charEnd); + + // Swap the lookups and the nextLookups list lookups = std::move(nextLookups); nextLookups.clear(); @@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) // If we found data, emit a corresponding data token if (data.size() > initialDataSize && - (!bestMatch.hasMatch() || + (!bestMatch.hasMatch() || !bestMatch.primary || bestMatch.dataStartOffset > initialDataSize)) { // If we have a "bestMatch" wich starts after text data has started, // trim the TokenizedData to this offset - if (bestMatch.dataStartOffset > initialDataSize) { + if (bestMatch.dataStartOffset > initialDataSize && 
bestMatch.primary) { data.trim(bestMatch.dataStartOffset); } // Create a token containing the data location bestMatch.token = Token{data.getLocation()}; - } else if (bestMatch.hasMatch() && + } else if (bestMatch.hasMatch() && bestMatch.primary && bestMatch.dataStartOffset == initialDataSize) { data.trim(initialDataSize); } diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 9f644c2..45fc77a 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -26,6 +26,60 @@ namespace ousia { +static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer, + TokenId id, const std::string &text, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset, + SourceId sourceId = InvalidSourceId) +{ + Token token; + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + EXPECT_EQ(id, token.id); + EXPECT_EQ(text, token.content); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, token.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, token.getLocation().getEnd()); + } + EXPECT_EQ(sourceId, token.getLocation().getSourceId()); +} + +static void expectData(const std::string &expected, SourceOffset tokenStart, + SourceOffset tokenEnd, SourceOffset textStart, + SourceOffset textEnd, const Token &token, + TokenizedData &data, + WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ + ASSERT_EQ(Tokens::Data, token.id); + + Token textToken; + TokenizedDataReader reader = data.reader(); + ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode)); + + EXPECT_EQ(expected, textToken.content); + EXPECT_EQ(tokenStart, token.location.getStart()); + EXPECT_EQ(tokenEnd, token.location.getEnd()); + EXPECT_EQ(textStart, textToken.getLocation().getStart()); + EXPECT_EQ(textEnd, textToken.getLocation().getEnd()); + EXPECT_TRUE(reader.atEnd()); +} + +static void assertDataToken(CharReader &reader, Tokenizer &tokenizer, + 
const std::string &expected, + SourceOffset tokenStart, SourceOffset tokenEnd, + SourceOffset textStart, SourceOffset textEnd, + WhitespaceMode mode = WhitespaceMode::PRESERVE) +{ + Token token; + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + + expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data, + mode); +} + TEST(Tokenizer, tokenRegistration) { Tokenizer tokenizer; @@ -53,25 +107,6 @@ TEST(Tokenizer, tokenRegistration) ASSERT_EQ("d", tokenizer.lookupToken(1U).string); } -void expectData(const std::string &expected, SourceOffset tokenStart, - SourceOffset tokenEnd, SourceOffset textStart, - SourceOffset textEnd, const Token &token, TokenizedData &data, - WhitespaceMode mode = WhitespaceMode::PRESERVE) -{ - ASSERT_EQ(Tokens::Data, token.id); - - Token textToken; - TokenizedDataReader reader = data.reader(); - ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode)); - - EXPECT_EQ(expected, textToken.content); - EXPECT_EQ(tokenStart, token.location.getStart()); - EXPECT_EQ(tokenEnd, token.location.getEnd()); - EXPECT_EQ(textStart, textToken.getLocation().getStart()); - EXPECT_EQ(textEnd, textToken.getLocation().getEnd()); - EXPECT_TRUE(reader.atEnd()); -} - TEST(Tokenizer, textTokenPreserveWhitespace) { { @@ -451,6 +486,80 @@ TEST(Tokenizer, nonPrimaryTokens) ASSERT_FALSE(tokenizer.read(reader, token, data)); } +TEST(Tokenizer, primaryNonPrimaryTokenInteraction) +{ + CharReader reader{"<><<<>>"}; + // 01234567890123456789012 3456789012345 + // 0 1 2 3 + + Tokenizer tokenizer; + + TokenId tP1 = tokenizer.registerToken("<", true); + TokenId tP2 = tokenizer.registerToken(">", true); + TokenId tP3 = tokenizer.registerToken("\\>", true); + TokenId tN1 = tokenizer.registerToken("<<", false); + TokenId tN2 = tokenizer.registerToken(">>", false); + + TokenSet tokens = TokenSet{tN1, tN2}; + + Token token, textToken; + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, 
token.id); + + TokenizedDataReader dataReader = data.reader(); + assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2); + assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7); + assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9); + assertEnd(dataReader); + } + + assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10); + assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15); + assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16); + + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, token.id); + + TokenizedDataReader dataReader = data.reader(); + assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16, 18); + assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23); + assertEnd(dataReader); + } + + assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25); + + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, token.id); + + TokenizedDataReader dataReader = data.reader(); + assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25, 27); + assertEnd(dataReader); + } + + assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28); + + { + TokenizedData data; + ASSERT_TRUE(tokenizer.read(reader, token, data)); + ASSERT_EQ(Tokens::Data, token.id); + + TokenizedDataReader dataReader = data.reader(); + assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33); + assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33, 35); + assertEnd(dataReader); + } + + assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36); + + TokenizedData data; + ASSERT_FALSE(tokenizer.read(reader, token, data)); +} TEST(Tokenizer, ambiguousTokens2) { @@ -476,6 +585,5 @@ TEST(Tokenizer, ambiguousTokens2) ASSERT_FALSE(tokenizer.read(reader, token, data)); } } - } -- cgit v1.2.3 From 4e199ad0d5c5d94955839da2a52967b4f0f34a43 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 1 Mar 
2015 13:52:34 +0100 Subject: Implemented registration of user-defined tokens, fixed comment handling (do not issue multiple data events if a comment occurs, just skip the comment data like in TeX) --- src/formats/osml/OsmlStreamParser.cpp | 50 +++++++++++++++++++++++------- src/formats/osml/OsmlStreamParser.hpp | 11 +++++-- test/formats/osml/OsmlStreamParserTest.cpp | 13 +++----- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index e467dc5..823075a 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -16,6 +16,10 @@ along with this program. If not, see . */ +#include +#include +#include + #include #include #include @@ -27,9 +31,6 @@ #include "OsmlStreamParser.hpp" -#include -#include - namespace ousia { namespace { @@ -127,7 +128,7 @@ private: /** * Set to true if this is a command with clear begin and end. */ - bool hasRange: 1; + bool hasRange; public: /** @@ -407,6 +408,9 @@ public: State parse(); + TokenId registerToken(const std::string &token); + void unregisterToken(TokenId token); + const TokenizedData &getData() const { return data; } const Variant &getCommandName() const { return cmd().getName(); } const Variant &getCommandArguments() const { return cmd().getArguments(); } @@ -700,10 +704,11 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand( void OsmlStreamParserImpl::parseBlockComment() { Token token; + TokenizedData commentData; size_t depth = 1; - while (tokenizer.read(reader, token, data)) { + while (tokenizer.read(reader, token, commentData)) { // Throw the comment data away - data.clear(); + commentData.clear(); if (token.id == OsmlTokens.BlockCommentEnd) { depth--; @@ -822,6 +827,14 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() } else if (type == Tokens::Data) { reader.consumePeek(); continue; + } else if (type == OsmlTokens.LineComment) { + reader.consumePeek(); + 
parseLineComment(); + continue; + } else if (type == OsmlTokens.BlockCommentStart) { + reader.consumePeek(); + parseBlockComment(); + continue; } // A non-text token was reached, make sure all pending data commands @@ -836,11 +849,7 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() // Synchronize the location with the current token location location = token.location; - if (token.id == OsmlTokens.LineComment) { - parseLineComment(); - } else if (token.id == OsmlTokens.BlockCommentStart) { - parseBlockComment(); - } else if (token.id == OsmlTokens.FieldStart) { + if (token.id == OsmlTokens.FieldStart) { cmd().pushField(false, token.location); return State::FIELD_START; } else if (token.id == OsmlTokens.FieldEnd) { @@ -914,6 +923,16 @@ OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() return State::END; } +TokenId OsmlStreamParserImpl::registerToken(const std::string &token) +{ + return tokenizer.registerToken(token, false); +} + +void OsmlStreamParserImpl::unregisterToken(TokenId token) +{ + assert(tokenizer.unregisterToken(token)); +} + /* Class OsmlStreamParser */ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) @@ -955,4 +974,13 @@ bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); } bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); } +TokenId OsmlStreamParser::registerToken(const std::string &token) +{ + return impl->registerToken(token); +} + +void OsmlStreamParser::unregisterToken(TokenId token) +{ + impl->unregisterToken(token); +} } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 10d5296..b7e64f7 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -32,6 +32,8 @@ #include #include +#include + namespace ousia { // Forward declarations @@ -50,10 +52,10 @@ class Variant; * syntactically valid and tries to recorver from most errors. 
If an error is * irrecoverable (this is the case for errors with wrong nesting of commands or * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. The OsmlStreamParser can be compared to a SAX - * parser for XML. + * LoggableException is thrown. In short, the OsmlStreamParser can be described + * as a SAX parser for OSML. */ -class OsmlStreamParser { +class OsmlStreamParser: public parser_stack::ParserCallbacks { public: /** * Enum used to indicate which state the OsmlStreamParser class is in @@ -204,6 +206,9 @@ public: * "{!" syntax). */ bool inDefaultField() const; + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId token) override; }; } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 3e7f4c1..0ea087f 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -363,8 +363,7 @@ TEST(OsmlStreamParser, singleLineComment) CharReader charReader(testString); OsmlStreamParser parser(charReader, logger); - assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); - assertTextData(parser, "b", 33, 34, 33, 34, WhitespaceMode::PRESERVE); + assertTextData(parser, "ab", 0, 34, 0, 34, WhitespaceMode::PRESERVE); assertEnd(parser, 34, 34); } @@ -376,8 +375,7 @@ TEST(OsmlStreamParser, multilineComment) CharReader charReader(testString); OsmlStreamParser parser(charReader, logger); - assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); - assertTextData(parser, "b", 40, 41, 40, 41, WhitespaceMode::PRESERVE); + assertTextData(parser, "ab", 0, 41, 0, 41, WhitespaceMode::PRESERVE); assertEnd(parser, 41, 41); } @@ -391,10 +389,10 @@ TEST(OsmlStreamParser, unfinishedMultilineComment) logger.reset(); - assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); ASSERT_FALSE(logger.hasError()); - assertEnd(parser, 38, 38); + assertTextData(parser, "a", 0, 1, 0, 1, 
WhitespaceMode::PRESERVE); ASSERT_TRUE(logger.hasError()); + assertEnd(parser, 38, 38); } TEST(OsmlStreamParser, nestedMultilineComment) @@ -405,8 +403,7 @@ TEST(OsmlStreamParser, nestedMultilineComment) CharReader charReader(testString); OsmlStreamParser parser(charReader, logger); - assertTextData(parser, "a", 0, 1, 0, 1, WhitespaceMode::PRESERVE); - assertTextData(parser, "b", 40, 41, 40, 41, WhitespaceMode::PRESERVE); + assertTextData(parser, "ab", 0, 41, 0, 41, WhitespaceMode::PRESERVE); assertEnd(parser, 41, 41); } -- cgit v1.2.3 From 689348baf70d00e5ff1c8eec3959afc56071994e Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 1 Mar 2015 14:25:39 +0100 Subject: Added user defined tokens test --- test/formats/osml/OsmlStreamParserTest.cpp | 1104 ++++++++++++++-------------- 1 file changed, 568 insertions(+), 536 deletions(-) diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 0ea087f..d47f529 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -24,17 +24,17 @@ #include #include #include - #include +#include + namespace ousia { static TerminalLogger logger(std::cerr, true); // static ConcreteLogger logger; static void assertCommandStart(OsmlStreamParser &parser, - const std::string &name, - bool rangeCommand, + const std::string &name, bool rangeCommand, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) { @@ -52,8 +52,7 @@ static void assertCommandStart(OsmlStreamParser &parser, } static void assertCommandStart(OsmlStreamParser &parser, - const std::string &name, - bool rangeCommand, + const std::string &name, bool rangeCommand, const Variant::mapType &args, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) @@ -62,17 +61,16 @@ static void assertCommandStart(OsmlStreamParser &parser, EXPECT_EQ(args, parser.getCommandArguments()); } -static void assertCommand(OsmlStreamParser &parser, - 
const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) +static void assertCommand(OsmlStreamParser &parser, const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) { assertCommandStart(parser, name, false, Variant::mapType{}, start, end); } static void assertRangeEnd(OsmlStreamParser &parser, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) { ASSERT_EQ(OsmlStreamParser::State::RANGE_END, parser.parse()); if (start != InvalidSourceOffset) { @@ -116,11 +114,12 @@ static void assertTextData(OsmlStreamParser &parser, const std::string &text, } static void assertData(OsmlStreamParser &parser, const std::string &text, - SourceOffset textStart = InvalidSourceOffset, - SourceOffset textEnd = InvalidSourceOffset, - WhitespaceMode mode = WhitespaceMode::COLLAPSE) + SourceOffset textStart = InvalidSourceOffset, + SourceOffset textEnd = InvalidSourceOffset, + WhitespaceMode mode = WhitespaceMode::COLLAPSE) { - assertTextData(parser, text, InvalidSourceOffset, InvalidSourceOffset, textStart, textEnd, mode); + assertTextData(parser, text, InvalidSourceOffset, InvalidSourceOffset, + textStart, textEnd, mode); } static void assertEmptyData(OsmlStreamParser &parser) @@ -134,7 +133,6 @@ static void assertEmptyData(OsmlStreamParser &parser) EXPECT_FALSE(dataReader.read(token, TokenSet{}, WhitespaceMode::TRIM)); } - static void assertFieldStart(OsmlStreamParser &parser, bool defaultField, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) @@ -585,363 +583,363 @@ TEST(OsmlStreamParser, dataOutsideField) TEST(OsmlStreamParser, nestedCommand) { - const char *testString = "\\test{a}{\\test2{b} c} d"; - // 012345678 90123456789012 - // 0 1 2 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + 
const char *testString = "\\test{a}{\\test2{b} c} d"; + // 012345678 90123456789012 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "test", 0, 5); - assertFieldStart(parser, false, 5, 6); - assertData(parser, "a", 6, 7); - assertFieldEnd(parser, 7, 8); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertData(parser, "a", 6, 7); + assertFieldEnd(parser, 7, 8); - assertFieldStart(parser, false, 8, 9); - assertCommand(parser, "test2", 9, 15); - assertFieldStart(parser, false, 15, 16); - assertData(parser, "b", 16, 17); - assertFieldEnd(parser, 17, 18); - assertData(parser, "c", 19, 20); - assertFieldEnd(parser, 20, 21); - assertData(parser, "d", 22, 23); - assertEnd(parser, 23, 23); + assertFieldStart(parser, false, 8, 9); + assertCommand(parser, "test2", 9, 15); + assertFieldStart(parser, false, 15, 16); + assertData(parser, "b", 16, 17); + assertFieldEnd(parser, 17, 18); + assertData(parser, "c", 19, 20); + assertFieldEnd(parser, 20, 21); + assertData(parser, "d", 22, 23); + assertEnd(parser, 23, 23); } - TEST(OsmlStreamParser, nestedCommandImmediateEnd) { - const char *testString = "\\test{\\test2{b}} d"; - // 012345 678901234567 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); - - assertCommand(parser, "test", 0, 5); - assertFieldStart(parser, false, 5, 6); - { - assertCommand(parser, "test2", 6, 12); - assertFieldStart(parser, false, 12, 13); - assertData(parser, "b", 13, 14); - assertFieldEnd(parser, 14, 15); - } - assertFieldEnd(parser, 15, 16); - assertData(parser, "d", 17, 18); - assertEnd(parser, 18, 18); + const char *testString = "\\test{\\test2{b}} d"; + // 012345 678901234567 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + { + assertCommand(parser, "test2", 6, 12); + 
assertFieldStart(parser, false, 12, 13); + assertData(parser, "b", 13, 14); + assertFieldEnd(parser, 14, 15); + } + assertFieldEnd(parser, 15, 16); + assertData(parser, "d", 17, 18); + assertEnd(parser, 18, 18); } TEST(OsmlStreamParser, nestedCommandNoData) { - const char *testString = "\\test{\\test2}"; - // 012345 6789012 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + const char *testString = "\\test{\\test2}"; + // 012345 6789012 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "test", 0, 5); - assertFieldStart(parser, false, 5, 6); - assertCommand(parser, "test2", 6, 12); - assertFieldEnd(parser, 12, 13); - assertEnd(parser, 13, 13); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldEnd(parser, 12, 13); + assertEnd(parser, 13, 13); } TEST(OsmlStreamParser, multipleCommands) { - const char *testString = "\\a \\b \\c \\d"; - // 012 345 678 90 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + const char *testString = "\\a \\b \\c \\d"; + // 012 345 678 90 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "a", 0, 2); - assertEmptyData(parser); - assertCommand(parser, "b", 3, 5); - assertEmptyData(parser); - assertCommand(parser, "c", 6, 8); - assertEmptyData(parser); - assertCommand(parser, "d", 9, 11); - assertEnd(parser, 11, 11); + assertCommand(parser, "a", 0, 2); + assertEmptyData(parser); + assertCommand(parser, "b", 3, 5); + assertEmptyData(parser); + assertCommand(parser, "c", 6, 8); + assertEmptyData(parser); + assertCommand(parser, "d", 9, 11); + assertEnd(parser, 11, 11); } TEST(OsmlStreamParser, fieldsWithSpaces) { - const char *testString = "\\a {\\b \\c} \n\n {\\d}"; - // 0123 456 789012 3 456 789 - // 0 1 - CharReader charReader(testString); - OsmlStreamParser 
parser(charReader, logger); - - assertCommand(parser, "a", 0, 2); - assertEmptyData(parser); - assertFieldStart(parser, false, 3, 4); - assertCommand(parser, "b", 4, 6); - assertEmptyData(parser); - assertCommand(parser, "c", 7, 9); - assertFieldEnd(parser, 9, 10); - assertEmptyData(parser); - assertFieldStart(parser, false, 16, 17); - assertCommand(parser, "d", 17, 19); - assertFieldEnd(parser, 19, 20); - assertEnd(parser, 20, 20); + const char *testString = "\\a {\\b \\c} \n\n {\\d}"; + // 0123 456 789012 3 456 789 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + + assertCommand(parser, "a", 0, 2); + assertEmptyData(parser); + assertFieldStart(parser, false, 3, 4); + assertCommand(parser, "b", 4, 6); + assertEmptyData(parser); + assertCommand(parser, "c", 7, 9); + assertFieldEnd(parser, 9, 10); + assertEmptyData(parser); + assertFieldStart(parser, false, 16, 17); + assertCommand(parser, "d", 17, 19); + assertFieldEnd(parser, 19, 20); + assertEnd(parser, 20, 20); } TEST(OsmlStreamParser, errorEndButOpenField) { - const char *testString = "\\a b {"; - // 012345 - // 0 - CharReader charReader(testString); + const char *testString = "\\a b {"; + // 012345 + // 0 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(parser, "a", 0, 2); - assertData(parser, "b", 3, 4); - assertFieldStart(parser, false, 5, 6); - ASSERT_FALSE(logger.hasError()); - assertEnd(parser, 6, 6); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "a", 0, 2); + assertData(parser, "b", 3, 4); + assertFieldStart(parser, false, 5, 6); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 6, 6); + ASSERT_TRUE(logger.hasError()); } - TEST(OsmlStreamParser, errorNoFieldToEnd) { - const char *testString = "\\a b }"; - // 012345 - // 0 - CharReader charReader(testString); + const char *testString = "\\a b }"; + // 012345 + 
// 0 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(parser, "a", 0, 2); - assertData(parser, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(parser, 6, 6); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "a", 0, 2); + assertData(parser, "b", 3, 4); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 6, 6); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorNoFieldEndNested) { - const char *testString = "\\test{\\test2{}}}"; - // 012345 6789012345 - // 0 1 - CharReader charReader(testString); + const char *testString = "\\test{\\test2{}}}"; + // 012345 6789012345 + // 0 1 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(parser, "test", 0, 5); - assertFieldStart(parser, false, 5, 6); - assertCommand(parser, "test2", 6, 12); - assertFieldStart(parser, false, 12, 13); - assertFieldEnd(parser, 13, 14); - assertFieldEnd(parser, 14, 15); - ASSERT_FALSE(logger.hasError()); - assertEnd(parser, 16, 16); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldStart(parser, false, 12, 13); + assertFieldEnd(parser, 13, 14); + assertFieldEnd(parser, 14, 15); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 16, 16); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorNoFieldEndNestedData) { - const char *testString = "\\test{\\test2{}}a}"; - // 012345 67890123456 - // 0 1 - CharReader charReader(testString); + const char *testString = "\\test{\\test2{}}a}"; + // 012345 67890123456 + // 0 1 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - 
assertCommand(parser, "test", 0, 5); - assertFieldStart(parser, false, 5, 6); - assertCommand(parser, "test2", 6, 12); - assertFieldStart(parser, false, 12, 13); - assertFieldEnd(parser, 13, 14); - assertFieldEnd(parser, 14, 15); - assertData(parser, "a", 15, 16); - ASSERT_FALSE(logger.hasError()); - assertEnd(parser, 17, 17); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommand(parser, "test", 0, 5); + assertFieldStart(parser, false, 5, 6); + assertCommand(parser, "test2", 6, 12); + assertFieldStart(parser, false, 12, 13); + assertFieldEnd(parser, 13, 14); + assertFieldEnd(parser, 14, 15); + assertData(parser, "a", 15, 16); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser, 17, 17); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, beginEnd) { - const char *testString = "\\begin{book}\\end{book}"; - // 012345678901 2345678901 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}\\end{book}"; + // 012345678901 2345678901 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertRangeEnd(parser, 17, 21); - assertEnd(parser, 22, 22); + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertRangeEnd(parser, 17, 21); + assertEnd(parser, 22, 22); } TEST(OsmlStreamParser, beginEndWithName) { - const char *testString = "\\begin{book#a}\\end{book}"; - // 01234567890123 4567890123 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book#a}\\end{book}"; + // 01234567890123 4567890123 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "book", true, {{"name", "a"}}, 7, 11); - assertRangeEnd(parser, 19, 23); - assertEnd(parser, 24, 24); + assertCommandStart(parser, "book", true, 
{{"name", "a"}}, 7, 11); + assertRangeEnd(parser, 19, 23); + assertEnd(parser, 24, 24); } TEST(OsmlStreamParser, beginEndWithNameAndArgs) { - const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; - // 0123456789012345678901234 56789 01 2345678901 - // 0 1 2 3 4 - CharReader charReader(testString); + const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; + // 0123456789012345678901234 56789 01 2345678901 + // 0 1 2 3 4 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "book", true, - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertRangeEnd(parser, 37, 41); - assertEnd(parser, 42, 42); + assertCommandStart(parser, "book", true, + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, + 11); + assertRangeEnd(parser, 37, 41); + assertEnd(parser, 42, 42); } TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) { - const char *testString = - "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; - // 0123456789012345678901234 56789 01234 567890123 45678901 2345678901 - // 0 1 2 3 4 5 6 - CharReader charReader(testString); - - OsmlStreamParser parser(charReader, logger); - - assertCommandStart(parser, "book", true, - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(parser, false, 32, 33); - assertData(parser, "a", 33, 34); - assertCommand(parser, "test", 35, 40); - assertFieldEnd(parser, 40, 41); - assertFieldStart(parser, false, 41, 42); - assertData(parser, "b", 42, 43); - assertCommand(parser, "test", 44, 49); - assertFieldStart(parser, false, 49, 50); - assertFieldEnd(parser, 50, 51); - assertFieldEnd(parser, 51, 52); - assertRangeEnd(parser, 57, 61); - assertEnd(parser, 62, 62); + const char *testString = + "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; + // 0123456789012345678901234 56789 01234 567890123 45678901 
2345678901 + // 0 1 2 3 4 5 6 + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + assertCommandStart(parser, "book", true, + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, + 11); + assertFieldStart(parser, false, 32, 33); + assertData(parser, "a", 33, 34); + assertCommand(parser, "test", 35, 40); + assertFieldEnd(parser, 40, 41); + assertFieldStart(parser, false, 41, 42); + assertData(parser, "b", 42, 43); + assertCommand(parser, "test", 44, 49); + assertFieldStart(parser, false, 49, 50); + assertFieldEnd(parser, 50, 51); + assertFieldEnd(parser, 51, 52); + assertRangeEnd(parser, 57, 61); + assertEnd(parser, 62, 62); } TEST(OsmlStreamParser, beginEndWithData) { - const char *testString = "\\begin{book}a\\end{book}"; - // 0123456789012 3456789012 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}a\\end{book}"; + // 0123456789012 3456789012 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertData(parser, "a", 12, 13); - assertRangeEnd(parser, 18, 22); - assertEnd(parser, 23, 23); + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertRangeEnd(parser, 18, 22); + assertEnd(parser, 23, 23); } TEST(OsmlStreamParser, beginEndNested) { - const char *testString = - "\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; - // 012345678901234 5678901234567890 123456 7890123 4567890 - // 0 1 2 3 4 5 - CharReader charReader(testString); + const char *testString = + "\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; + // 012345678901234 5678901234567890 123456 7890123 4567890 + // 0 1 2 3 4 5 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "a", 
true, Variant::mapType{}, 7, 8); - assertFieldStart(parser, false, 9, 10); - assertData(parser, "b", 10, 11); - assertFieldEnd(parser, 11, 12); + assertCommandStart(parser, "a", true, Variant::mapType{}, 7, 8); + assertFieldStart(parser, false, 9, 10); + assertData(parser, "b", 10, 11); + assertFieldEnd(parser, 11, 12); - assertData(parser, "c", 13, 14); + assertData(parser, "c", 13, 14); - assertCommandStart(parser, "d", true, Variant::mapType{}, 22, 23); - assertFieldStart(parser, false, 24, 25); - assertData(parser, "e", 25, 26); - assertFieldEnd(parser, 26, 27); - assertFieldStart(parser, false, 27, 28); - assertData(parser, "f", 28, 29); - assertFieldEnd(parser, 29, 30); + assertCommandStart(parser, "d", true, Variant::mapType{}, 22, 23); + assertFieldStart(parser, false, 24, 25); + assertData(parser, "e", 25, 26); + assertFieldEnd(parser, 26, 27); + assertFieldStart(parser, false, 27, 28); + assertData(parser, "f", 28, 29); + assertFieldEnd(parser, 29, 30); - assertEmptyData(parser); - assertCommand(parser, "g", 31, 33); - assertFieldStart(parser, false, 33, 34); - assertData(parser, "h", 34, 35); - assertFieldEnd(parser, 35, 36); - assertEmptyData(parser); - assertRangeEnd(parser, 42, 43); - assertRangeEnd(parser, 49, 50); - assertEnd(parser, 51, 51); + assertEmptyData(parser); + assertCommand(parser, "g", 31, 33); + assertFieldStart(parser, false, 33, 34); + assertData(parser, "h", 34, 35); + assertFieldEnd(parser, 35, 36); + assertEmptyData(parser); + assertRangeEnd(parser, 42, 43); + assertRangeEnd(parser, 49, 50); + assertEnd(parser, 51, 51); } TEST(OsmlStreamParser, beginEndWithCommand) { - const char *testString = "\\begin{book}\\a{test}\\end{book}"; - // 012345678901 23456789 0123456789 - // 0 1 2 - CharReader charReader(testString); + const char *testString = "\\begin{book}\\a{test}\\end{book}"; + // 012345678901 23456789 0123456789 + // 0 1 2 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser 
parser(charReader, logger); - assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertCommand(parser, "a", 12, 14); - assertFieldStart(parser, false, 14, 15); - assertData(parser, "test", 15, 19); - assertFieldEnd(parser, 19, 20); - assertRangeEnd(parser, 25, 29); - assertEnd(parser, 30, 30); + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertCommand(parser, "a", 12, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "test", 15, 19); + assertFieldEnd(parser, 19, 20); + assertRangeEnd(parser, 25, 29); + assertEnd(parser, 30, 30); } TEST(OsmlStreamParser, beginEndNestedFields) { - const char *testString = "\\begin{book}a{{b{c}}}\\end{book}"; - // 012345678901234567890 1234567890 - // 0 1 2 3 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); - logger.reset(); - - assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertData(parser, "a", 12, 13); - assertFieldStart(parser, false, 13, 14); - assertFieldStart(parser, false, 14, 15); - assertData(parser, "b", 15, 16); - assertFieldStart(parser, false, 16, 17); - assertData(parser, "c", 17, 18); - assertFieldEnd(parser, 18, 19); - assertFieldEnd(parser, 19, 20); - assertFieldEnd(parser, 20, 21); - assertRangeEnd(parser, 26, 30); - assertEnd(parser, 31, 31); + const char *testString = "\\begin{book}a{{b{c}}}\\end{book}"; + // 012345678901234567890 1234567890 + // 0 1 2 3 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + logger.reset(); + + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertFieldStart(parser, false, 13, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "b", 15, 16); + assertFieldStart(parser, false, 16, 17); + assertData(parser, "c", 17, 18); + assertFieldEnd(parser, 18, 19); + assertFieldEnd(parser, 19, 20); + assertFieldEnd(parser, 20, 21); + assertRangeEnd(parser, 26, 30); 
+ assertEnd(parser, 31, 31); } TEST(OsmlStreamParser, errorBeginEndUnbalancedNestedFields) { - const char *testString = "\\begin{book}a{{b{c}}\\end{book}"; - // 012345678901234567890 123456789 - // 0 1 2 - CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); - logger.reset(); - - assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); - assertData(parser, "a", 12, 13); - assertFieldStart(parser, false, 13, 14); - assertFieldStart(parser, false, 14, 15); - assertData(parser, "b", 15, 16); - assertFieldStart(parser, false, 16, 17); - assertData(parser, "c", 17, 18); - assertFieldEnd(parser, 18, 19); - assertFieldEnd(parser, 19, 20); - ASSERT_THROW(assertRangeEnd(parser, 25, 29), LoggableException); + const char *testString = "\\begin{book}a{{b{c}}\\end{book}"; + // 012345678901234567890 123456789 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser parser(charReader, logger); + logger.reset(); + + assertCommandStart(parser, "book", true, Variant::mapType{}, 7, 11); + assertData(parser, "a", 12, 13); + assertFieldStart(parser, false, 13, 14); + assertFieldStart(parser, false, 14, 15); + assertData(parser, "b", 15, 16); + assertFieldStart(parser, false, 16, 17); + assertData(parser, "c", 17, 18); + assertFieldEnd(parser, 18, 19); + assertFieldEnd(parser, 19, 20); + ASSERT_THROW(assertRangeEnd(parser, 25, 29), LoggableException); } TEST(OsmlStreamParser, errorBeginEndUnbalancedFields) @@ -962,446 +960,480 @@ TEST(OsmlStreamParser, errorBeginEndUnbalancedFields) TEST(OsmlStreamParser, errorBeginNoBraceOpen) { - const char *testString = "\\begin a"; - // 01234567 - CharReader charReader(testString); + const char *testString = "\\begin a"; + // 01234567 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(parser, "a", 7, 8); - ASSERT_TRUE(logger.hasError()); - 
assertEnd(parser, 8, 8); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(parser, "a", 7, 8); + ASSERT_TRUE(logger.hasError()); + assertEnd(parser, 8, 8); } TEST(OsmlStreamParser, errorBeginNoIdentifier) { - const char *testString = "\\begin{!"; - CharReader charReader(testString); + const char *testString = "\\begin{!"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoBraceClose) { - const char *testString = "\\begin{a"; - CharReader charReader(testString); + const char *testString = "\\begin{a"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginNoName) { - const char *testString = "\\begin{a#}"; - CharReader charReader(testString); + const char *testString = "\\begin{a#}"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommandStart(parser, "a", true); - ASSERT_TRUE(logger.hasError()); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertEnd(parser); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommandStart(parser, "a", true); + ASSERT_TRUE(logger.hasError()); + 
logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertEnd(parser); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBraceOpen) { - const char *testString = "\\end a"; - // 012345 - CharReader charReader(testString); + const char *testString = "\\end a"; + // 012345 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(parser, "a", 5, 6); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(parser, "a", 5, 6); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoIdentifier) { - const char *testString = "\\end{!"; - CharReader charReader(testString); + const char *testString = "\\end{!"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBraceClose) { - const char *testString = "\\end{a"; - CharReader charReader(testString); + const char *testString = "\\end{a"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorEndNoBegin) { - const char *testString = "\\end{a}"; - CharReader charReader(testString); + const char *testString = "\\end{a}"; + CharReader charReader(testString); - 
OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, errorBeginEndMismatch) { - const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; - // 0123456789 012345678901234 5678901 - // 0 1 2 3 - CharReader charReader(testString); + const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; + // 0123456789 012345678901234 5678901 + // 0 1 2 3 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommandStart(parser, "a", true, Variant::mapType{}, 7, 8); - assertEmptyData(parser); - assertCommandStart(parser, "b", true, Variant::mapType{}, 17, 18); - assertData(parser, "test", 20, 24); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(parser.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + assertCommandStart(parser, "a", true, Variant::mapType{}, 7, 8); + assertEmptyData(parser); + assertCommandStart(parser, "b", true, Variant::mapType{}, 17, 18); + assertData(parser, "test", 20, 24); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(parser.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); } TEST(OsmlStreamParser, commandWithNSSep) { - const char *testString = "\\test1:test2"; - // 012345678901 - CharReader charReader(testString); + const char *testString = "\\test1:test2"; + // 012345678901 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "test1:test2", 0, 12); - assertEnd(parser, 12, 12); + assertCommand(parser, "test1:test2", 0, 12); + assertEnd(parser, 12, 12); } 
TEST(OsmlStreamParser, beginEndWithNSSep) { - const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; - // 0123456789012345678 90123456789012345 - // 0 1 2 3 - CharReader charReader(testString); + const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; + // 0123456789012345678 90123456789012345 + // 0 1 2 3 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommandStart(parser, "test1:test2", true, Variant::mapType{}, 7, 18); - assertRangeEnd(parser, 24, 35); - assertEnd(parser, 36, 36); + assertCommandStart(parser, "test1:test2", true, Variant::mapType{}, 7, 18); + assertRangeEnd(parser, 24, 35); + assertEnd(parser, 36, 36); } TEST(OsmlStreamParser, errorBeginNSSep) { - const char *testString = "\\begin:test{blub}\\end{blub}"; - CharReader charReader(testString); + const char *testString = "\\begin:test{blub}\\end{blub}"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommandStart(parser, "blub", true, Variant::mapType{}); - ASSERT_TRUE(logger.hasError()); - assertRangeEnd(parser); - assertEnd(parser); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommandStart(parser, "blub", true, Variant::mapType{}); + ASSERT_TRUE(logger.hasError()); + assertRangeEnd(parser); + assertEnd(parser); } TEST(OsmlStreamParser, errorEndNSSep) { - const char *testString = "\\begin{blub}\\end:test{blub}"; - CharReader charReader(testString); + const char *testString = "\\begin{blub}\\end:test{blub}"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommandStart(parser, "blub", true, Variant::mapType{}); - ASSERT_FALSE(logger.hasError()); - assertRangeEnd(parser); - ASSERT_TRUE(logger.hasError()); - assertEnd(parser); 
+ logger.reset(); + assertCommandStart(parser, "blub", true, Variant::mapType{}); + ASSERT_FALSE(logger.hasError()); + assertRangeEnd(parser); + ASSERT_TRUE(logger.hasError()); + assertEnd(parser); } TEST(OsmlStreamParser, errorEmptyNs) { - const char *testString = "\\test:"; - CharReader charReader(testString); + const char *testString = "\\test:"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(parser, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(parser, ":"); - assertEnd(parser); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(parser, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(parser, ":"); + assertEnd(parser); } TEST(OsmlStreamParser, errorRepeatedNs) { - const char *testString = "\\test::"; - CharReader charReader(testString); + const char *testString = "\\test::"; + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(parser, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(parser, "::"); - assertEnd(parser); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(parser, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(parser, "::"); + assertEnd(parser); } TEST(OsmlStreamParser, explicitDefaultField) { - const char *testString = "\\a{!b}c"; - // 01234567 - CharReader charReader(testString); + const char *testString = "\\a{!b}c"; + // 01234567 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "a", 0, 2); - assertFieldStart(parser, true, 2, 4); - assertData(parser, "b", 4, 5); - assertFieldEnd(parser, 5, 6); - assertData(parser, "c", 6, 7); - assertEnd(parser, 7, 7); + assertCommand(parser, "a", 
0, 2); + assertFieldStart(parser, true, 2, 4); + assertData(parser, "b", 4, 5); + assertFieldEnd(parser, 5, 6); + assertData(parser, "c", 6, 7); + assertEnd(parser, 7, 7); } TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) { - const char *testString = "\\a{!\\b}c"; - // 0123 4567 - CharReader charReader(testString); + const char *testString = "\\a{!\\b}c"; + // 0123 4567 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertCommand(parser, "a", 0, 2); - assertFieldStart(parser, true, 2, 4); - assertCommand(parser, "b", 4, 6); - assertFieldEnd(parser, 6, 7); - assertData(parser, "c", 7, 8); - assertEnd(parser, 8, 8); + assertCommand(parser, "a", 0, 2); + assertFieldStart(parser, true, 2, 4); + assertCommand(parser, "b", 4, 6); + assertFieldEnd(parser, 6, 7); + assertData(parser, "c", 7, 8); + assertEnd(parser, 8, 8); } TEST(OsmlStreamParser, fieldAfterExplicitDefaultField) { - const char *testString = "\\a{!\\b}{c}"; - // 0123 456789 - CharReader charReader(testString); + const char *testString = "\\a{!\\b}{c}"; + // 0123 456789 + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - assertCommand(parser, "a", 0, 2); - assertFieldStart(parser, true, 2, 4); - assertCommand(parser, "b", 4, 6); - assertFieldEnd(parser, 6, 7); - assertFieldStart(parser, false, 7, 8); - assertData(parser, "c", 8, 9); - assertFieldEnd(parser, 9, 10); - assertEnd(parser, 10, 10); + logger.reset(); + assertCommand(parser, "a", 0, 2); + assertFieldStart(parser, true, 2, 4); + assertCommand(parser, "b", 4, 6); + assertFieldEnd(parser, 6, 7); + assertFieldStart(parser, false, 7, 8); + assertData(parser, "c", 8, 9); + assertFieldEnd(parser, 9, 10); + assertEnd(parser, 10, 10); } TEST(OsmlStreamParser, annotationStart) { - const char *testString = "<\\a"; - // 0 12 + const char *testString = "<\\a"; + // 0 12 - 
CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); - assertEnd(parser, 3, 3); + assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); + assertEnd(parser, 3, 3); } TEST(OsmlStreamParser, annotationStartWithName) { - const char *testString = "<\\annotationWithName#aName"; - // 0 1234567890123456789012345 - // 0 1 2 + const char *testString = "<\\annotationWithName#aName"; + // 0 1234567890123456789012345 + // 0 1 2 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(parser, "annotationWithName", - Variant::mapType{{"name", "aName"}}, 0, 20); - assertEnd(parser, 26, 26); + assertAnnotationStart(parser, "annotationWithName", + Variant::mapType{{"name", "aName"}}, 0, 20); + assertEnd(parser, 26, 26); } TEST(OsmlStreamParser, annotationStartWithArguments) { - const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; - // 0 1234567890123456789012345678901234 - // 0 1 2 3 + const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; + // 0 1234567890123456789012345678901234 + // 0 1 2 3 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart( - parser, "annotationWithName", - Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); - assertEnd(parser, 35, 35); + assertAnnotationStart( + parser, "annotationWithName", + Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); + assertEnd(parser, 35, 35); } TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) { - const char *testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; - // 0 123456789012345678901234567 89012345 67 - // 0 1 2 3 + 
const char *testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; + // 0 123456789012345678901234567 89012345 67 + // 0 1 2 3 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart( - parser, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, - 10); - ASSERT_TRUE(parser.inRangeCommand()); - assertData(parser, "a", 26, 27); - assertRangeEnd(parser, 33, 35); - assertAnnotationEnd(parser, "", "", 36, 38); - assertEnd(parser, 38, 38); + assertAnnotationStart( + parser, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, + 10); + ASSERT_TRUE(parser.inRangeCommand()); + assertData(parser, "a", 26, 27); + assertRangeEnd(parser, 33, 35); + assertAnnotationEnd(parser, "", "", 36, 38); + assertEnd(parser, 38, 38); } TEST(OsmlStreamParser, annotationEnd) { - const char *testString = "\\a>"; - // 012 + const char *testString = "\\a>"; + // 012 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(parser, "a", "", 0, 2); - assertEnd(parser, 3, 3); + assertAnnotationEnd(parser, "a", "", 0, 2); + assertEnd(parser, 3, 3); } TEST(OsmlStreamParser, annotationEndWithName) { - const char *testString = "\\a#name>"; - // 01234567 + const char *testString = "\\a#name>"; + // 01234567 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(parser, "a", "name", 0, 2); - assertEnd(parser, 8, 8); + assertAnnotationEnd(parser, "a", "name", 0, 2); + assertEnd(parser, 8, 8); } TEST(OsmlStreamParser, annotationEndWithNameAsArgs) { - const char *testString = "\\a[name=name]>"; - // 01234567890123 + const char *testString = "\\a[name=name]>"; + // 01234567890123 - 
CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationEnd(parser, "a", "name", 0, 2); - assertEnd(parser, 14, 14); + assertAnnotationEnd(parser, "a", "name", 0, 2); + assertEnd(parser, 14, 14); } TEST(OsmlStreamParser, errorAnnotationEndWithArguments) { - const char *testString = "\\a[foo=bar]>"; - // 012345678901 - // 0 1 + const char *testString = "\\a[foo=bar]>"; + // 012345678901 + // 0 1 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommandStart(parser, "a", false, Variant::mapType{{"foo", "bar"}}, 0, 2); - ASSERT_TRUE(logger.hasError()); - assertData(parser, ">", 11, 12); - assertEnd(parser, 12, 12); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommandStart(parser, "a", false, Variant::mapType{{"foo", "bar"}}, 0, + 2); + ASSERT_TRUE(logger.hasError()); + assertData(parser, ">", 11, 12); + assertEnd(parser, 12, 12); } TEST(OsmlStreamParser, closingAnnotation) { - const char *testString = "<\\a>"; - // 0 123 + const char *testString = "<\\a>"; + // 0 123 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); - assertData(parser, ">", 3, 4); - assertEnd(parser, 4, 4); + assertAnnotationStart(parser, "a", Variant::mapType{}, 0, 3); + assertData(parser, ">", 3, 4); + assertEnd(parser, 4, 4); } TEST(OsmlStreamParser, annotationWithFields) { - const char *testString = "a <\\b{c}{d}{!e} f \\> g"; - // 012 345678901234567 8901 - // 0 1 2 + const char *testString = "a <\\b{c}{d}{!e} f \\> g"; + // 012 345678901234567 8901 + // 0 1 2 - CharReader charReader(testString); + 
CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertData(parser, "a", 0, 1); - assertAnnotationStart(parser, "b", Variant::mapType{}, 2, 5); - assertFieldStart(parser, false, 5, 6); - assertData(parser, "c", 6, 7); - assertFieldEnd(parser, 7, 8); - assertFieldStart(parser, false, 8, 9); - assertData(parser, "d", 9, 10); - assertFieldEnd(parser, 10, 11); - assertFieldStart(parser, true, 11, 13); - assertData(parser, "e", 13, 14); - assertFieldEnd(parser, 14, 15); - assertData(parser, "f", 16, 17); - assertAnnotationEnd(parser, "", "", 18, 20); - assertData(parser, "g", 21, 22); - assertEnd(parser, 22, 22); + assertData(parser, "a", 0, 1); + assertAnnotationStart(parser, "b", Variant::mapType{}, 2, 5); + assertFieldStart(parser, false, 5, 6); + assertData(parser, "c", 6, 7); + assertFieldEnd(parser, 7, 8); + assertFieldStart(parser, false, 8, 9); + assertData(parser, "d", 9, 10); + assertFieldEnd(parser, 10, 11); + assertFieldStart(parser, true, 11, 13); + assertData(parser, "e", 13, 14); + assertFieldEnd(parser, 14, 15); + assertData(parser, "f", 16, 17); + assertAnnotationEnd(parser, "", "", 18, 20); + assertData(parser, "g", 21, 22); + assertEnd(parser, 22, 22); } TEST(OsmlStreamParser, annotationStartEscape) { - const char *testString = "<\\%test"; - // 0 123456 - // 0 + const char *testString = "<\\%test"; + // 0 123456 + // 0 - CharReader charReader(testString); + CharReader charReader(testString); - OsmlStreamParser parser(charReader, logger); + OsmlStreamParser parser(charReader, logger); - assertData(parser, "<%test", 0, 7); - assertEnd(parser, 7, 7); + assertData(parser, "<%test", 0, 7); + assertEnd(parser, 7, 7); } +TEST(OsmlStreamParser, userDefinedTokens) +{ + const char *testString = "<>, the *old man* said."; + // 0123456789012345678901234567890123456789 + // 0 1 2 3 + + CharReader charReader(testString); + + OsmlStreamParser parser(charReader, logger); + + TokenId 
tSpeechStart = parser.registerToken("<<"); + TokenId tSpeechEnd = parser.registerToken(">>"); + TokenId tStar = parser.registerToken("*"); + + ASSERT_TRUE(tSpeechStart != Tokens::Empty); + ASSERT_TRUE(tSpeechEnd != Tokens::Empty); + ASSERT_TRUE(tStar != Tokens::Empty); + + TokenSet tokens{tSpeechStart, tSpeechEnd, tStar}; + + ASSERT_EQ(OsmlStreamParser::State::DATA, parser.parse()); + TokenizedDataReader reader = parser.getData().reader(); + + assertToken(reader, tSpeechStart, "<<", tokens, WhitespaceMode::PRESERVE, 0, 2); + assertText(reader, "My dear fellows", tokens, WhitespaceMode::PRESERVE, 2, 17); + assertToken(reader, tSpeechEnd, ">>", tokens, WhitespaceMode::PRESERVE, 17, 19); + assertText(reader, ", the ", tokens, WhitespaceMode::PRESERVE, 19, 25); + assertToken(reader, tStar, "*", tokens, WhitespaceMode::PRESERVE, 25, 26); + assertText(reader, "old man", tokens, WhitespaceMode::PRESERVE, 26, 33); + assertToken(reader, tStar, "*", tokens, WhitespaceMode::PRESERVE, 33, 34); + assertText(reader, " said.", tokens, WhitespaceMode::PRESERVE, 34, 40); + assertEnd(reader); +} } -- cgit v1.2.3 From e2fd79ac8c85ac6191f6ed895fa5cdff091f7551 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 1 Mar 2015 16:28:35 +0100 Subject: Implemented TokenRegistry class and corresponding simple test case --- CMakeLists.txt | 2 + src/core/parser/stack/TokenRegistry.cpp | 72 +++++++++++++++++++++++ src/core/parser/stack/TokenRegistry.hpp | 87 ++++++++++++++++++++++++++++ test/core/parser/stack/TokenRegistryTest.cpp | 78 +++++++++++++++++++++++++ 4 files changed, 239 insertions(+) create mode 100644 src/core/parser/stack/TokenRegistry.cpp create mode 100644 src/core/parser/stack/TokenRegistry.hpp create mode 100644 test/core/parser/stack/TokenRegistryTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e2d7f7..6e021fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,6 +188,7 @@ ADD_LIBRARY(ousia_core # src/core/parser/stack/ImportIncludeHandler 
src/core/parser/stack/State # src/core/parser/stack/Stack + src/core/parser/stack/TokenRegistry # src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector src/core/parser/utils/TokenizedData @@ -325,6 +326,7 @@ IF(TEST) test/core/parser/ParserScopeTest # test/core/parser/stack/StackTest test/core/parser/stack/StateTest + test/core/parser/stack/TokenRegistryTest test/core/parser/utils/SourceOffsetVectorTest test/core/parser/utils/TokenizedDataTest test/core/parser/utils/TokenizerTest diff --git a/src/core/parser/stack/TokenRegistry.cpp b/src/core/parser/stack/TokenRegistry.cpp new file mode 100644 index 0000000..21ae109 --- /dev/null +++ b/src/core/parser/stack/TokenRegistry.cpp @@ -0,0 +1,72 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "TokenRegistry.hpp" + +namespace ousia { +namespace parser_stack { + +TokenId TokenRegistry::registerToken(const std::string &token) +{ + // Check whether the given token is already registered + auto it = tokens.find(token); + if (it != tokens.end()) { + // Increment the reference count + size_t &refCount = it->second.second; + refCount++; + + // Return the token id + return it->second.first; + } + + // Register the token in the parser + TokenId id = parser.registerToken(token); + tokens[token] = std::pair(id, 1); + tokenIds[id] = token; + return id; +} + +void TokenRegistry::unregisterToken(TokenId id) +{ + // Lookup the token corresponding to the given token id + auto tokenIt = tokenIds.find(id); + if (tokenIt != tokenIds.end()) { + const std::string &token = tokenIt->second; + // Lookup the reference count for the corresponding token + auto idIt = tokens.find(token); + if (idIt != tokens.end()) { + // Decrement the reference count, abort if the refCount is larger + // than zero + size_t &refCount = idIt->second.second; + refCount--; + if (refCount > 0) { + return; + } + + // Unregister the token from the parser + parser.unregisterToken(id); + + // Unregister the token from the internal tokens map + tokens.erase(token); + } + // Unregister the token from the internal id map + tokenIds.erase(id); + } +} +} +} diff --git a/src/core/parser/stack/TokenRegistry.hpp b/src/core/parser/stack/TokenRegistry.hpp new file mode 100644 index 0000000..21c36b5 --- /dev/null +++ b/src/core/parser/stack/TokenRegistry.hpp @@ -0,0 +1,87 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file TokenRegistry.hpp + * + * Contains the TokenRegistry class used for registering all possible tokens + * during the parsing process. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ +#define _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ + +#include +#include + +#include "Callbacks.hpp" + +namespace ousia { +namespace parser_stack { + +/** + * The TokenRegistry class is used for registering all possible tokens during + * the Parsing process. The TokenRegistry class acts as an adapter between the + * parser which allocates TokenId for each unique token and the Handler classes + * which may register tokens multiple times and expect the same TokenId to be + * returned for the same token. + */ +class TokenRegistry : public ParserCallbacks { +private: + /** + * Reference at the ParserCallback instance the tokens are relayed to. + */ + ParserCallbacks &parser; + + /** + * Store containing all TokenId instances for all registered tokens. The map + * maps from the token strings to the corresponding TokenId and a reference + * count. + */ + std::unordered_map> tokens; + + /** + * Reverse map containing the string corresponding to a TokenId. + */ + std::unordered_map tokenIds; + +public: + /** + * Constructor of the TokenRegistry class. + * + * @param parser is the underlying parser implementing the ParserCallbacks + * interface to which all calls are relayed. 
+ */ + TokenRegistry(ParserCallbacks &parser) : parser(parser) {} + + /* No copy construction */ + TokenRegistry(const TokenRegistry &) = delete; + + /* No assignment */ + TokenRegistry &operator=(const TokenRegistry &) = delete; + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId id) override; +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ */ + diff --git a/test/core/parser/stack/TokenRegistryTest.cpp b/test/core/parser/stack/TokenRegistryTest.cpp new file mode 100644 index 0000000..390851e --- /dev/null +++ b/test/core/parser/stack/TokenRegistryTest.cpp @@ -0,0 +1,78 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +namespace ousia { +namespace parser_stack { + +class ParserCallbacksProxy : public ParserCallbacks { +public: + size_t registerTokenCount = 0; + size_t unregisterTokenCount = 0; + + TokenId registerToken(const std::string &token) override + { + registerTokenCount++; + return registerTokenCount; + } + + void unregisterToken(TokenId id) override { unregisterTokenCount++; } +}; + +TEST(TokenRegistry, simple) +{ + ParserCallbacksProxy parser; + TokenRegistry registry(parser); + + ASSERT_EQ(0U, parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + ASSERT_EQ(1U, registry.registerToken("test")); + ASSERT_EQ(1U, registry.registerToken("test")); + ASSERT_EQ(2U, registry.registerToken("test2")); + ASSERT_EQ(2U, registry.registerToken("test2")); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(2); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(2); + ASSERT_EQ(2U, parser.registerTokenCount); + ASSERT_EQ(2U, parser.unregisterTokenCount); +} + +} +} + -- cgit v1.2.3 From 3bdc30e0798d6b356782da430e93b72b4303e963 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:32:34 +0100 Subject: Adapted OsxmlParser to new Stack interface, enabled Osxml code in CMakeLists again --- CMakeLists.txt | 22 ++++++++-------- src/formats/osxml/OsxmlEventParser.cpp | 6 ++--- src/formats/osxml/OsxmlEventParser.hpp | 11 +++----- src/formats/osxml/OsxmlParser.cpp | 13 ++++----- test/formats/osxml/OsxmlEventParserTest.cpp | 41 
++++++++++++++++------------- 5 files changed, 48 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e021fd..2a09b54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,7 +225,7 @@ TARGET_LINK_LIBRARIES(ousia_osml ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser -# src/formats/osxml/OsxmlParser + src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -397,17 +397,17 @@ IF(TEST) ousia_filesystem ) -# ADD_EXECUTABLE(ousia_test_osxml -# test/formats/osxml/OsxmlEventParserTest + ADD_EXECUTABLE(ousia_test_osxml + test/formats/osxml/OsxmlEventParserTest # test/formats/osxml/OsxmlParserTest -# ) + ) -# TARGET_LINK_LIBRARIES(ousia_test_osxml -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_osxml -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_osxml + ${GTEST_LIBRARIES} + ousia_core + ousia_osxml + ousia_filesystem + ) ADD_EXECUTABLE(ousia_test_xml test/plugins/xml/XmlOutputTest @@ -426,7 +426,7 @@ IF(TEST) ADD_TEST(ousia_test_html ousia_test_html) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) ADD_TEST(ousia_test_osml ousia_test_osml) -# ADD_TEST(ousia_test_osxml ousia_test_osxml) + ADD_TEST(ousia_test_osxml ousia_test_osxml) ADD_TEST(ousia_test_xml ousia_test_xml) ENDIF() diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index 855f80d..83c16f0 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -323,7 +323,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); - parser->getEvents().command(nameVar, args); + parser->getEvents().commandStart(nameVar, args); } } @@ -358,8 +358,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) return; } - // Issue the "fieldEnd" event - parser->getEvents().fieldEnd(); + // Issue 
the "rangeEnd" event + parser->getEvents().rangeEnd(); } static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e3fd5d4..7a8c96d 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -59,7 +59,8 @@ public: * @param args is a map containing the arguments that were given to the * command. */ - virtual void command(const Variant &name, const Variant::mapType &args) = 0; + virtual void commandStart(const Variant &name, + const Variant::mapType &args) = 0; /** * Called whenever an annotation starts. Note that this implicitly always @@ -88,13 +89,9 @@ public: const Variant &elementName) = 0; /** - * Called whenever the default field which was implicitly started by - * commandStart or annotationStart ends. Note that this does not end the - * range of an annotation, but the default field of the annotation. To - * signal the end of the annotation this, the annotationEnd method will be - * invoked. + * Called whenever the command or annotation tags end. */ - virtual void fieldEnd() = 0; + virtual void rangeEnd() = 0; /** * Called whenever string data is found. diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c216855..924d11b 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,6 +16,8 @@ along with this program. If not, see . 
*/ +#include +#include #include #include #include @@ -63,17 +65,16 @@ public: */ void parse() { parser.parse(); } - void command(const Variant &name, const Variant::mapType &args) override + void commandStart(const Variant &name, + const Variant::mapType &args) override { - stack.command(name, args); - stack.fieldStart(true); + stack.commandStart(name, args, true); } void annotationStart(const Variant &name, const Variant::mapType &args) override { - stack.annotationStart(name, args); - stack.fieldStart(true); + stack.annotationStart(name, args, true); } void annotationEnd(const Variant &className, @@ -82,7 +83,7 @@ public: stack.annotationEnd(className, elementName); } - void fieldEnd() override { stack.fieldEnd(); } + void rangeEnd() override { stack.rangeEnd(); } void data(const Variant &data) override { stack.data(data); } }; diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 6942166..b24a43d 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -32,10 +32,10 @@ static TerminalLogger logger(std::cerr, true); namespace { enum class OsxmlEvent { - COMMAND, + COMMAND_START, ANNOTATION_START, ANNOTATION_END, - FIELD_END, + RANGE_END, DATA }; @@ -43,9 +43,10 @@ class TestOsxmlEventListener : public OsxmlEvents { public: std::vector> events; - void command(const Variant &name, const Variant::mapType &args) override + void commandStart(const Variant &name, + const Variant::mapType &args) override { - events.emplace_back(OsxmlEvent::COMMAND, + events.emplace_back(OsxmlEvent::COMMAND_START, Variant::arrayType{name, args}); } @@ -63,9 +64,9 @@ public: Variant::arrayType{className, elementName}); } - void fieldEnd() override + void rangeEnd() override { - events.emplace_back(OsxmlEvent::FIELD_END, Variant::arrayType{}); + events.emplace_back(OsxmlEvent::RANGE_END, Variant::arrayType{}); } void data(const Variant &data) override @@ -92,11 +93,11 @@ 
TEST(OsxmlEventParser, simpleCommandWithArgs) // 0 1 2 3 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, + {OsxmlEvent::COMMAND_START, Variant::arrayType{ "a", Variant::mapType{ {"name", "test"}, {"a", 1}, {"b", 2}, {"c", "blub"}}}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + {OsxmlEvent::RANGE_END, Variant::arrayType{}}}; auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); @@ -132,10 +133,12 @@ TEST(OsxmlEventParser, magicTopLevelTag) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}, - {OsxmlEvent::COMMAND, Variant::arrayType{{"b", Variant::mapType{}}}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::RANGE_END, Variant::arrayType{}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"b", Variant::mapType{}}}}, + {OsxmlEvent::RANGE_END, Variant::arrayType{}}}; auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); @@ -146,11 +149,12 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, - {OsxmlEvent::COMMAND, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND_START, Variant::arrayType{{"ousia", Variant::mapType{}}}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + {OsxmlEvent::RANGE_END, Variant::arrayType{}}, + {OsxmlEvent::RANGE_END, Variant::arrayType{}}}; auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); @@ -163,9 +167,10 @@ TEST(OsxmlEventParser, commandWithData) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", 
Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, - {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + {OsxmlEvent::RANGE_END, Variant::arrayType{}}}; auto events = parseXml(testString); ASSERT_EQ(expectedEvents, events); -- cgit v1.2.3 From 8197dc488926e8645efb47e60d0988a6a65fc15f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:33:32 +0100 Subject: Adapted OsmlParser to new Stack interface, reenabled OsmlParser code in CMakeLists --- CMakeLists.txt | 2 +- src/formats/osml/OsmlParser.cpp | 28 +++++++++++++--------------- src/formats/osml/OsmlStreamParser.cpp | 10 +++++----- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a09b54..f6a7257 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,7 +214,7 @@ ADD_LIBRARY(ousia_core #) ADD_LIBRARY(ousia_osml -# src/formats/osml/OsmlParser + src/formats/osml/OsmlParser src/formats/osml/OsmlStreamParser ) diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp index a24f091..c25974f 100644 --- a/src/formats/osml/OsmlParser.cpp +++ b/src/formats/osml/OsmlParser.cpp @@ -88,7 +88,7 @@ public: OsmlStreamParser::State state = parser.parse(); logger.setDefaultLocation(parser.getLocation()); switch (state) { - case OsmlStreamParser::State::COMMAND: { + case OsmlStreamParser::State::COMMAND_START: { // Implicitly create a "document" element if the first // command is not any other top-level command if (needsDocument) { @@ -96,23 +96,23 @@ public: parser.getCommandName().asString(); if (cmd != "typesystem" && cmd != "document" && cmd != "domain") { - stack.command("document", Variant::mapType{}); + stack.commandStart("document", Variant::mapType{}, + false); } needsDocument = false; } - stack.command(parser.getCommandName(), - parser.getCommandArguments().asMap()); + stack.commandStart(parser.getCommandName(), + parser.getCommandArguments().asMap(), + parser.inRangeCommand()); break; } - case 
OsmlStreamParser::State::DATA: - stack.data(parser.getData()); - break; - case OsmlStreamParser::State::ENTITY: - // TODO + case OsmlStreamParser::State::RANGE_END: + stack.rangeEnd(); break; case OsmlStreamParser::State::ANNOTATION_START: stack.annotationStart(parser.getCommandName(), - parser.getCommandArguments().asMap()); + parser.getCommandArguments().asMap(), + parser.inRangeCommand()); break; case OsmlStreamParser::State::ANNOTATION_END: { Variant elementName = Variant::fromString(std::string{}); @@ -130,11 +130,9 @@ public: case OsmlStreamParser::State::FIELD_END: stack.fieldEnd(); break; - case OsmlStreamParser::State::NONE: - case OsmlStreamParser::State::ERROR: - // Internally used in OsmlStreamParser, these states should - // never occur. Just contiunue. - continue; + case OsmlStreamParser::State::DATA: + stack.data(parser.getData()); + break; case OsmlStreamParser::State::END: return; } diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 823075a..64a489d 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -409,7 +409,7 @@ public: State parse(); TokenId registerToken(const std::string &token); - void unregisterToken(TokenId token); + void unregisterToken(TokenId id); const TokenizedData &getData() const { return data; } const Variant &getCommandName() const { return cmd().getName(); } @@ -928,9 +928,9 @@ TokenId OsmlStreamParserImpl::registerToken(const std::string &token) return tokenizer.registerToken(token, false); } -void OsmlStreamParserImpl::unregisterToken(TokenId token) +void OsmlStreamParserImpl::unregisterToken(TokenId id) { - assert(tokenizer.unregisterToken(token)); + assert(tokenizer.unregisterToken(id)); } /* Class OsmlStreamParser */ @@ -979,8 +979,8 @@ TokenId OsmlStreamParser::registerToken(const std::string &token) return impl->registerToken(token); } -void OsmlStreamParser::unregisterToken(TokenId token) +void 
OsmlStreamParser::unregisterToken(TokenId id) { - impl->unregisterToken(token); + impl->unregisterToken(id); } } -- cgit v1.2.3 From 596fdab71b8bd116e20e33647d68f1d7a567696e Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:34:15 +0100 Subject: Wrote isUserDefinedToken function which checks whether a token is a valid user defined token and added unit tests --- src/core/common/Utils.cpp | 24 ++++++++++++++++++++++++ src/core/common/Utils.hpp | 19 +++++++++++++++++++ test/core/common/UtilsTest.cpp | 31 ++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 85d2c28..219b437 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -118,5 +118,29 @@ bool Utils::endsWith(const std::string &s, const std::string &suffix) return suffix.size() <= s.size() && s.substr(s.size() - suffix.size(), suffix.size()) == suffix; } + +bool Utils::isUserDefinedToken(const std::string &token) +{ + // Make sure the token meets is neither empty, nor starts or ends with an + // alphanumeric character + const size_t len = token.size(); + if (len == 0 || isAlphanumeric(token[0]) || isAlphanumeric(token[len - 1])) { + return false; + } + + // Make sure the token is not any special OSML token + if (token == "\\" || token == "%" || token == "%{" || token == "}%" || + token == "{!" || token == "<\\" || token == "\\>") { + return false; + } + + // Make sure the token contains other characters but { and } + for (char c: token) { + if (c != '{' && c != '}') { + return true; + } + } + return false; +} } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 82a8f8c..25a4de5 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -102,6 +102,25 @@ public: */ static bool isNamespacedIdentifier(const std::string &name); + /** + * Returns true if the given characters form a valid user-defined token. 
+ * This function returns true under the following circumstances: + *
    + *
  • The given token is not empty
  • + *
  • The given token starts and ends with a non-alphanumeric character + *
  • + *
  • The token is none of the following character sequences (which are + * special in OSML): + *
      + *
    • '{', '}' or any combined repetition of these characters
    • + *
    • '\', '{!', '<\', '\>'
    • + *
    • '%', '%{', '}%'
    • + *
    + *
  • + *
+ */ + static bool isUserDefinedToken(const std::string &token); + /** * Returns true if the given character is a linebreak character. */ diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 4bf1587..54890ee 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -131,4 +131,33 @@ TEST(Utils, collapse) ASSERT_EQ("long test", Utils::collapse(" long test ")); } -} \ No newline at end of file +TEST(Utils, isUserDefinedToken) +{ + EXPECT_FALSE(Utils::isUserDefinedToken("")); + EXPECT_FALSE(Utils::isUserDefinedToken("a")); + EXPECT_TRUE(Utils::isUserDefinedToken(":")); + EXPECT_TRUE(Utils::isUserDefinedToken("::")); + EXPECT_TRUE(Utils::isUserDefinedToken("!?")); + EXPECT_TRUE(Utils::isUserDefinedToken(".")); + EXPECT_TRUE(Utils::isUserDefinedToken("<<")); + EXPECT_TRUE(Utils::isUserDefinedToken(">>")); + EXPECT_TRUE(Utils::isUserDefinedToken("''")); + EXPECT_TRUE(Utils::isUserDefinedToken("``")); + EXPECT_TRUE(Utils::isUserDefinedToken("´´")); + EXPECT_TRUE(Utils::isUserDefinedToken("´")); + EXPECT_TRUE(Utils::isUserDefinedToken("`")); + EXPECT_TRUE(Utils::isUserDefinedToken("<")); + EXPECT_TRUE(Utils::isUserDefinedToken(">")); + EXPECT_FALSE(Utils::isUserDefinedToken("a:")); + EXPECT_FALSE(Utils::isUserDefinedToken("a:a")); + EXPECT_FALSE(Utils::isUserDefinedToken(":a")); + EXPECT_FALSE(Utils::isUserDefinedToken("{")); + EXPECT_FALSE(Utils::isUserDefinedToken("{{")); + EXPECT_FALSE(Utils::isUserDefinedToken("}}")); + EXPECT_FALSE(Utils::isUserDefinedToken("{{}{}")); + EXPECT_FALSE(Utils::isUserDefinedToken("<\\")); + EXPECT_FALSE(Utils::isUserDefinedToken("\\>")); + EXPECT_FALSE(Utils::isUserDefinedToken("{!")); +} + +} -- cgit v1.2.3 From 231f426708babe0964495ac28a54f0f2835c084a Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:35:36 +0100 Subject: Unregistering all registered tokens when TokenRegistry is destroyed --- src/core/parser/stack/TokenRegistry.cpp | 8 ++++ 
src/core/parser/stack/TokenRegistry.hpp | 47 +++++++++++++++---- test/core/parser/stack/TokenRegistryTest.cpp | 68 +++++++++++++++------------- 3 files changed, 81 insertions(+), 42 deletions(-) diff --git a/src/core/parser/stack/TokenRegistry.cpp b/src/core/parser/stack/TokenRegistry.cpp index 21ae109..c135b98 100644 --- a/src/core/parser/stack/TokenRegistry.cpp +++ b/src/core/parser/stack/TokenRegistry.cpp @@ -16,11 +16,19 @@ along with this program. If not, see . */ +#include "Callbacks.hpp" #include "TokenRegistry.hpp" namespace ousia { namespace parser_stack { +TokenRegistry::~TokenRegistry() +{ + for (const auto &tid: tokenIds) { + parser.unregisterToken(tid.first); + } +} + TokenId TokenRegistry::registerToken(const std::string &token) { // Check whether the given token is already registered diff --git a/src/core/parser/stack/TokenRegistry.hpp b/src/core/parser/stack/TokenRegistry.hpp index 21c36b5..545db39 100644 --- a/src/core/parser/stack/TokenRegistry.hpp +++ b/src/core/parser/stack/TokenRegistry.hpp @@ -19,7 +19,7 @@ /** * @file TokenRegistry.hpp * - * Contains the TokenRegistry class used for registering all possible tokens + * Contains the TokenRegistry class used for registering all user defined tokens * during the parsing process. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) @@ -31,19 +31,22 @@ #include #include -#include "Callbacks.hpp" +#include namespace ousia { namespace parser_stack { +// Forward declarations +class ParserCallbacks; + /** - * The TokenRegistry class is used for registering all possible tokens during - * the Parsing process. The TokenRegistry class acts as an adapter between the - * parser which allocates TokenId for each unique token and the Handler classes - * which may register tokens multiple times and expect the same TokenId to be - * returned for the same token. + * The TokenRegistry class is used for registering all user defined tokens + * during the Parsing process. 
The TokenRegistry class acts as an adapter + * between the parser which allocates a TokenId for each unique token and the + * Handler classes which may register the same token multiple times and expect + * the same TokenId to be returned for the same token. */ -class TokenRegistry : public ParserCallbacks { +class TokenRegistry { private: /** * Reference at the ParserCallback instance the tokens are relayed to. @@ -71,14 +74,38 @@ public: */ TokenRegistry(ParserCallbacks &parser) : parser(parser) {} + /** + * Destructor of the TokenRegistry class, removes all registered tokens from + * the parser. + */ + ~TokenRegistry(); + /* No copy construction */ TokenRegistry(const TokenRegistry &) = delete; /* No assignment */ TokenRegistry &operator=(const TokenRegistry &) = delete; - TokenId registerToken(const std::string &token) override; - void unregisterToken(TokenId id) override; + /** + * Registers the given string token in the underlying parser and returns the + * TokenId of that token. If the same token string is given multiple times, + * the same TokenId is returned. The token is only registered once in the + * parser. + * + * @param token is the token that should be registered. + * @return the TokenId associated with this token. + */ + TokenId registerToken(const std::string &token); + + /** + * Unregisters the token with the given TokenId from the parser. Note that + * the token will only be unregistered if unregisterToken() has been called + * as many times as registerToken() for the same token. + * + * @param id is the id of the token returned by registerToken() that should + * be unregistered. 
+ */ + void unregisterToken(TokenId id); }; } } diff --git a/test/core/parser/stack/TokenRegistryTest.cpp b/test/core/parser/stack/TokenRegistryTest.cpp index 390851e..20d6cd0 100644 --- a/test/core/parser/stack/TokenRegistryTest.cpp +++ b/test/core/parser/stack/TokenRegistryTest.cpp @@ -18,6 +18,7 @@ #include +#include #include namespace ousia { @@ -40,39 +41,42 @@ public: TEST(TokenRegistry, simple) { ParserCallbacksProxy parser; - TokenRegistry registry(parser); - - ASSERT_EQ(0U, parser.registerTokenCount); - ASSERT_EQ(0U, parser.unregisterTokenCount); - - ASSERT_EQ(1U, registry.registerToken("test")); - ASSERT_EQ(1U, registry.registerToken("test")); - ASSERT_EQ(2U, registry.registerToken("test2")); - ASSERT_EQ(2U, registry.registerToken("test2")); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(0U, parser.unregisterTokenCount); - - registry.unregisterToken(1); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(0U, parser.unregisterTokenCount); - - registry.unregisterToken(1); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(1U, parser.unregisterTokenCount); - - registry.unregisterToken(1); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(1U, parser.unregisterTokenCount); - - registry.unregisterToken(2); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(1U, parser.unregisterTokenCount); - - registry.unregisterToken(2); - ASSERT_EQ(2U, parser.registerTokenCount); - ASSERT_EQ(2U, parser.unregisterTokenCount); + { + TokenRegistry registry(parser); + + ASSERT_EQ(0U, parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + ASSERT_EQ(1U, registry.registerToken("test")); + ASSERT_EQ(1U, registry.registerToken("test")); + ASSERT_EQ(2U, registry.registerToken("test2")); + ASSERT_EQ(2U, registry.registerToken("test2")); + ASSERT_EQ(3U, registry.registerToken("test3")); + ASSERT_EQ(3U, parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(3U, 
parser.registerTokenCount); + ASSERT_EQ(0U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(3U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(1); + ASSERT_EQ(3U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(2); + ASSERT_EQ(3U, parser.registerTokenCount); + ASSERT_EQ(1U, parser.unregisterTokenCount); + + registry.unregisterToken(2); + ASSERT_EQ(3U, parser.registerTokenCount); + ASSERT_EQ(2U, parser.unregisterTokenCount); + } + ASSERT_EQ(3U, parser.unregisterTokenCount); } - } } -- cgit v1.2.3 From 7a8b4eb8b9d943959b919076596ec96ef0c4c03c Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:36:18 +0100 Subject: Adapted Callbacks interface and Handlers --- CMakeLists.txt | 2 +- src/core/parser/stack/Callbacks.cpp | 10 ++++ src/core/parser/stack/Callbacks.hpp | 68 ++++++++++++++++----------- src/core/parser/stack/Handler.cpp | 52 +++++++++++---------- src/core/parser/stack/Handler.hpp | 93 ++++++++++++++++++++++--------------- 5 files changed, 134 insertions(+), 91 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6a7257..1e81822 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,7 +184,7 @@ ADD_LIBRARY(ousia_core # src/core/parser/stack/DocumentHandler # src/core/parser/stack/DomainHandler # src/core/parser/stack/GenericParserStates -# src/core/parser/stack/Handler + src/core/parser/stack/Handler # src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State # src/core/parser/stack/Stack diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp index 6ebc549..44b31c6 100644 --- a/src/core/parser/stack/Callbacks.cpp +++ b/src/core/parser/stack/Callbacks.cpp @@ -19,5 +19,15 @@ #include "Callbacks.hpp" namespace ousia { +namespace parser_stack { + +/* Class ParserCallbacks */ + +ParserCallbacks::~ParserCallbacks() +{ + // Do nothing here +} + +} } diff 
--git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index 9c61000..d7b2547 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -30,66 +30,78 @@ #define _OUSIA_PARSER_STACK_CALLBACKS_HPP_ #include +#include #include +#include namespace ousia { + +// Forward declarations +class Variant; + namespace parser_stack { /** - * Interface defining a set of callback functions that act as a basis for the - * StateStackCallbacks and the ParserCallbacks. + * Interface between the Stack class and the underlying parser used for + * registering and unregistering tokens. */ -class Callbacks { +class ParserCallbacks { public: /** * Virtual descructor. */ - virtual ~Callbacks() {}; - - /** - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + virtual ~ParserCallbacks(); /** * Registers the given token as token that should be reported to the handler * using the "token" function. * * @param token is the token string that should be reported. + * @return the token id with which the token will be reported. Should return + * Tokens::Empty if the given token could not be registered. */ - virtual void registerToken(const std::string &token) = 0; + virtual TokenId registerToken(const std::string &token) = 0; /** * Unregisters the given token, it will no longer be reported to the handler * using the "token" function. * - * @param token is the token string that should be unregistered. + * @param id is the token id of the token that should be unregistered. */ - virtual void unregisterToken(const std::string &token) = 0; + virtual void unregisterToken(TokenId id) = 0; }; /** - * Interface defining the callback functions that can be passed from a - * StateStack to the underlying parser. 
+ * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. */ -class ParserCallbacks : public Callbacks { +class HandlerCallbacks: public ParserCallbacks { +public: /** - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". + * Reads a string variant form the current input stream. This function must + * be called from the data() method. * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. + * @return a string variant containing the current text data. The return + * value depends on the currently set whitespace mode and the tokens that + * were enabled using the enableTokens callback method. + */ + Variant readData(); + + /** + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * The tokens described in the token list are the tokens that are currently + * enabled. + * + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. + */ + void pushTokens(const std::vector &tokens); + + /** + * Removes the previously pushed list of tokens from the stack. 
*/ - virtual bool supportsToken(const std::string &token) = 0; + void popTokens(); }; } diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index 3d413e8..734976a 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -30,11 +31,11 @@ namespace parser_stack { /* Class HandlerData */ -HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/ +HandlerData::HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, const std::string &name, const State &state, const SourceLocation &location) : ctx(ctx), - /*callbacks(callbacks),*/ + callbacks(callbacks), name(name), state(state), location(location) @@ -68,19 +69,29 @@ const SourceLocation &Handler::location() const { return handlerData.location; } const std::string &Handler::name() const { return handlerData.name; } -void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +Variant Handler::readData() { - /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ + return handlerData.callbacks.readData(); } -void Handler::registerToken(const std::string &token) +void Handler::pushTokens(const std::vector &tokens) { - /*handlerData.callbacks.registerToken(token);*/ + handlerData.callbacks.pushTokens(tokens); } -void Handler::unregisterToken(const std::string &token) +void Handler::popTokens() { - /*handlerData.callbacks.unregisterToken(token);*/ + handlerData.callbacks.popTokens(); +} + +TokenId Handler::registerToken(const std::string &token) +{ + return handlerData.callbacks.registerToken(token); +} + +void Handler::unregisterToken(TokenId id) +{ + handlerData.callbacks.unregisterToken(id); } const std::string &Handler::getName() const { return name(); } @@ -131,7 +142,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool EmptyHandler::data(TokenizedData &data) +bool EmptyHandler::data() { // Support any data return true; @@ -185,13 
+196,10 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(TokenizedData &data) +bool StaticHandler::data() { - if (data.text(WhitespaceMode::TRIM) != nullptr) { - logger().error("Did not expect any data here", data); - return false; - } - return true; + logger().error("Did not expect any data here", readData()); + return false; } /* Class StaticFieldHandler */ @@ -231,19 +239,15 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(TokenizedData &data) +bool StaticFieldHandler::data() { - Variant text = data.text(WhitespaceMode::TRIM); - if (text == nullptr) { - // Providing no data here is ok as long as the "doHandle" callback - // function has already been called - return handled; - } + // Fetch the actual text data + Variant stringData = readData(); // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(text, args); + doHandle(stringData, args); return true; } @@ -251,7 +255,7 @@ bool StaticFieldHandler::data(TokenizedData &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - text); + stringData); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 929466d..848d395 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -32,6 +32,7 @@ class ParserScope; class ParserContext; class Logger; class TokenizedData; +class Variant; namespace parser_stack { @@ -52,11 +53,11 @@ public: ParserContext &ctx; /** - * Reference at an instance of the Callbacks class, used for - * modifying the behaviour of the parser (like registering tokens, setting - * the data type or changing the whitespace handling mode). 
+ * Reference at a class implementing the HandlerCallbacks interface, used + * for modifying the behaviour of the parser (like registering tokens, + * setting the data type or changing the whitespace handling mode). */ - // Callbacks &callbacks; + HandlerCallbacks &callbacks; /** * Contains the name of the command that is being handled. @@ -83,9 +84,9 @@ public: * @param state is the state this handler was called for. * @param location is the location at which the handler is created. */ - HandlerData(ParserContext &ctx, - /*Callbacks &callbacks,*/ const std::string &name, - const State &state, const SourceLocation &location); + HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, + const std::string &name, const State &state, + const SourceLocation &location); }; /** @@ -159,6 +160,17 @@ protected: */ const std::string &name() const; + /** + * Calls the corresponding method in the HandlerCallbacks instance. Reads a + * string variant form the current input stream. This function must be + * called from the data() method. + * + * @return a string variant containing the current text data. The return + * value depends on the currently set whitespace mode and the tokens that + * were enabled using the enableTokens callback method. + */ + Variant readData(); + /** * Calls the corresponding function in the Callbacks instance. Sets the * whitespace mode that specifies how string data should be processed. The @@ -170,7 +182,7 @@ protected: * @param whitespaceMode specifies one of the three WhitespaceMode constants * PRESERVE, TRIM or COLLAPSE. */ - void pushWhitespaceMode(WhitespaceMode whitespaceMode); + // void pushWhitespaceMode(WhitespaceMode whitespaceMode); /** * Pops a previously pushed whitespace mode. Calls to this function should @@ -178,38 +190,45 @@ protected: * can only undo pushs that were performed by the pushWhitespaceMode() * method of the same handler. 
*/ - void popWhitespaceMode(); + // void popWhitespaceMode(); /** - * Calls the corresponding function in the Callbacks instance. Sets the - * whitespace mode that specifies how string data should be processed. The - * calls to this function are placed on a stack by the underlying Stack - * class. This function should be called from the "fieldStart" callback and - * the "start" callback. If no whitespace mode is pushed in the "start" - * method the whitespace mode "TRIM" is implicitly assumed. + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * The tokens described in the token list are the tokens that are currently + * enabled. * - * @param tokens is a list of tokens that should be reported to this handler - * instance via the "token" method. + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** - * Pops a previously pushed whitespace mode. Calls to this function should - * occur in the "end" callback and the "fieldEnd" callback. This function - * can only undo pushs that were performed by the pushWhitespaceMode() - * method of the same handler. + * Calls the corresponding function in the HandlerCallbacks instance. + * Removes the previously pushed list of tokens from the stack. */ - void popWhitespaceMode(); + void popTokens(); + /** + * Calls the corresponding function in the HandlerCallbacks instance. This + * method registers the given tokens as tokens that are generally available, + * tokens must be explicitly enabled using the "pushTokens" and "popTokens" + * method. Tokens that have not been registered are not guaranteed to be + * reported (except for special tokens, these do not have to be registerd). + * + * @param token is the token string that should be made available. + * @return the TokenId that will be used to refer to the token. 
+ */ + TokenId registerToken(const std::string &token); /** - * Calls the corresponding function in the Callbacks instance. This method - * registers the given tokens as tokens that are generally available, tokens - * must be explicitly enabled using the "pushTokens" and "popTokens" method. - * Tokens that have not been registered are not guaranteed to be reported, - * even though they are + * Calls the corresponding function in the HandlerCallbacks instance. This + * method unregisters the given token. Note that for a token to be no longer + * reported, this function has to be called as many times as registerToken() + * for the corresponding token. + * + * @param id is the id of the Token that should be unregistered. */ - void registerTokens(const std::vector &tokens); + void unregisterToken(TokenId id); public: /** @@ -321,13 +340,12 @@ public: /** * Called whenever raw data (int the form of a string) is available for the * Handler instance. Should return true if the data could be handled, false - * otherwise. + * otherwise. The actual data variant must be retrieved using the "text()" + * callback. * - * @param data is an instance of TokenizedData containing the segmented - * character data and its location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(TokenizedData &data) = 0; + virtual bool data() = 0; }; /** @@ -357,7 +375,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(TokenizedData &data) override; + bool data() override; /** * Creates an instance of the EmptyHandler class. @@ -383,7 +401,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(TokenizedData &data) override; + bool data() override; }; /** @@ -430,13 +448,12 @@ protected: * @param fieldData is the captured field data. 
* @param args are the arguments that were given in the "start" function. */ - virtual void doHandle(const Variant &fieldData, - Variant::mapType &args) = 0; + virtual void doHandle(const Variant &fieldData, Variant::mapType &args) = 0; public: bool start(Variant::mapType &args) override; void end() override; - bool data(TokenizedData &data) override; + bool data() override; }; } } -- cgit v1.2.3 From 2807dc44b0555c19944f2520852d242eacc30b20 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:36:44 +0100 Subject: Added first (not final) version of the TokenStack class which will be used internally by the Stack class --- CMakeLists.txt | 1 + src/core/parser/stack/TokenStack.cpp | 45 ++++++++++++++ src/core/parser/stack/TokenStack.hpp | 111 +++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 src/core/parser/stack/TokenStack.cpp create mode 100644 src/core/parser/stack/TokenStack.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e81822..b206458 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -189,6 +189,7 @@ ADD_LIBRARY(ousia_core src/core/parser/stack/State # src/core/parser/stack/Stack src/core/parser/stack/TokenRegistry + src/core/parser/stack/TokenStack # src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector src/core/parser/utils/TokenizedData diff --git a/src/core/parser/stack/TokenStack.cpp b/src/core/parser/stack/TokenStack.cpp new file mode 100644 index 0000000..6afeaed --- /dev/null +++ b/src/core/parser/stack/TokenStack.cpp @@ -0,0 +1,45 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "TokenStack.hpp" + +namespace ousia { +namespace parser_stack { + +void TokenStack::pushTokens(const std::vector &tokens) +{ + stack.push_back(tokens); +} + +void TokenStack::popTokens() { stack.pop_back(); } + +TokenSet TokenStack::tokens() const +{ + if (stack.empty() && parentStack != nullptr) { + return parentStack->tokens(); + } + + TokenSet res; + for (const TokenSyntaxDescriptor &descr : stack.back()) { + descr.insertIntoTokenSet(res); + } + return res; +} +} +} + diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp new file mode 100644 index 0000000..9669f50 --- /dev/null +++ b/src/core/parser/stack/TokenStack.hpp @@ -0,0 +1,111 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file TokenStack.hpp + * + * Contains the TokenStack class used for collecting the currently enabled user + * defined tokens on a per-field basis. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ +#define _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ + +#include +#include + +#include + +namespace ousia { +namespace parser_stack { + +/** + * The TokenStack class is used by the Stack class to collect all currently + * enabled user defined tokens. + */ +class TokenStack { +private: + /** + * Shared pointer at the parent TokenStack instance. May be nullptr, in + * which case no parent TokenStack instance exists. + */ + const TokenStack *parentStack; + + /** + * Stack containing vectors of TokenSyntaxDescriptor instances as given by + * the user. + */ + std::vector> stack; + + /** + * Constructor of the TokenStack class. + * + * @param parentStack is a pointer at the underlying parentStack instance + * to which calls should be forwarded if no data has been pushed onto this + * stack instance. + */ + TokenStack(const TokenStack *parentStack) : parentStack(parentStack) {} + +public: + /** + * Default constructor of the TokenStack class with no reference at a parent + * stack. + */ + TokenStack() : TokenStack(nullptr) {} + + /** + * Constructor of the TokenStack class with a reference at a parent + * TokenStack instance. + * + * @param parentStack is a reference at a parent TokenStack instance. If no + * data has yet been pushed onto this instance, calls will be forwarded to + * the parent stack. + */ + TokenStack(const TokenStack &parentStack) : TokenStack(&parentStack) {} + + /** + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. + */ + void pushTokens(const std::vector &tokens); + + /** + * Removes the previously pushed list of tokens from the stack. + */ + void popTokens(); + + /** + * Returns a set containing all currently enabled tokens. 
The set of enabled + * tokens are those tokens that were pushed last onto the stack. This set + * has to be passed to the TokenizedData instance in order to gather all + * tokens that are currently possible. + * + * @return a set of tokens containing all the Tokens that are currently + * possible. + */ + TokenSet tokens() const; +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ */ + -- cgit v1.2.3 From 95f0ade7c19d7c6c451025e9a76d66ffb64e1f70 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:41:15 +0100 Subject: Adapted Stack interface (Stack.cpp is a mess right now and does not compile) --- src/core/parser/stack/Stack.cpp | 202 +++++++++++++++++++++++++++++- src/core/parser/stack/Stack.hpp | 270 ++++++---------------------------------- 2 files changed, 238 insertions(+), 234 deletions(-) diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 309c9a0..292e7e2 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -32,6 +32,96 @@ namespace parser_stack { /* Class HandlerInfo */ +/** + * The HandlerInfo class is used internally by the stack to associate additional + * (mutable) data with a handler instance. + */ +class HandlerInfo { +public: + /** + * Pointer pointing at the actual handler instance. + */ + std::shared_ptr handler; + + /** + * Next field index to be passed to the "fieldStart" function of the Handler + * class. + */ + size_t fieldIdx; + + /** + * Set to true if the handler is valid (which is the case if the "start" + * method has returned true). If the handler is invalid, no more calls are + * directed at it until it can be removed from the stack. + */ + bool valid : 1; + + /** + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handler currently is in a field. 
+ */ + bool inField : 1; + + /** + * Set to true if the handler currently is in the default field. + */ + bool inDefaultField : 1; + + /** + * Set to true if the handler currently is in an implicitly started default + * field. + */ + bool inImplicitDefaultField : 1; + + /** + * Set to false if this field is only opened pro-forma and does not accept + * any data. Otherwise set to true. + */ + bool inValidField : 1; + + /** + * Set to true, if the default field was already started. + */ + bool hadDefaultField : 1; + + /** + * Default constructor of the HandlerInfo class. + */ + HandlerInfo(); + /** + * Constructor of the HandlerInfo class, allows to set all flags manually. + */ + HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField, bool inValidField); + + /** + * Constructor of the HandlerInfo class, taking a shared_ptr to the handler + * to which additional information should be attached. + */ + HandlerInfo(std::shared_ptr handler); + + /** + * Destructor of the HandlerInfo class (to allow Handler to be forward + * declared). + */ + ~HandlerInfo(); + + /** + * Updates the "field" flags according to a "fieldStart" event. + */ + void fieldStart(bool isDefault, bool isImplicit, bool isValid); + + /** + * Updates the "fields" flags according to a "fieldEnd" event. + */ + void fieldEnd(); +}; + HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} HandlerInfo::HandlerInfo(std::shared_ptr handler) @@ -117,6 +207,113 @@ static LoggableException buildInvalidCommandException( } } +/* Class StackImpl */ + +class StackImpl { + +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap &states; + + /** + * Internal stack used for managing the currently active Handler instances. 
+ */ + std::vector stack; + + /** + * Return the reference in the Logger instance stored within the context. + */ + Logger &logger(); + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetState(const std::string &name); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + * an equivalent state as the one in the including file. + */ + void deduceState(); + + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo ¤tInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Ends all handlers that currently are not inside a field and already had + * a default field. 
This method is called whenever the data() and command() + * events are reached. + */ + void endOverdueHandlers(); + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + */ + void endCurrentHandler(); + + /** + * Tries to start a default field for the current handler, if currently the + * handler is not inside a field and did not have a default field yet. + * + * @return true if the handler is inside a field, false if no field could + * be started. + */ + bool ensureHandlerIsInField(); + + /** + * Returns true if all handlers on the stack are currently valid, or false + * if at least one handler is invalid. + * + * @return true if all handlers on the stack are valid. + */ + bool handlersValid(); +}; + + /* Class Stack */ Stack::Stack(ParserContext &ctx, @@ -611,10 +808,5 @@ void Stack::annotationEnd(const Variant &className, const Variant &elementName) { // TODO } - -void Stack::token(Variant token) -{ - // TODO -} } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index cd29b28..e1173d0 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -29,226 +29,35 @@ #ifndef _OUSIA_PARSER_STACK_STACK_HPP_ #define _OUSIA_PARSER_STACK_STACK_HPP_ -#include - #include #include -#include -#include - -#include -#include namespace ousia { // Forward declarations class ParserContext; -class Logger; class TokenizedData; +class Variant; namespace parser_stack { // Forward declarations -class Handler; +class StackImpl; class State; -/** - * The HandlerInfo class is used internally by the stack to associate additional - * (mutable) data with a handler instance. - */ -class HandlerInfo { -public: - /** - * Pointer pointing at the actual handler instance. - */ - std::shared_ptr handler; - - /** - * Next field index to be passed to the "fieldStart" function of the Handler - * class. 
- */ - size_t fieldIdx; - - /** - * Set to true if the handler is valid (which is the case if the "start" - * method has returned true). If the handler is invalid, no more calls are - * directed at it until it can be removed from the stack. - */ - bool valid : 1; - - /** - * Set to true if this is an implicit handler, that was created when the - * current stack state was deduced. - */ - bool implicit : 1; - - /** - * Set to true if the handler currently is in a field. - */ - bool inField : 1; - - /** - * Set to true if the handler currently is in the default field. - */ - bool inDefaultField : 1; - - /** - * Set to true if the handler currently is in an implicitly started default - * field. - */ - bool inImplicitDefaultField : 1; - - /** - * Set to false if this field is only opened pro-forma and does not accept - * any data. Otherwise set to true. - */ - bool inValidField : 1; - - /** - * Set to true, if the default field was already started. - */ - bool hadDefaultField : 1; - - /** - * Default constructor of the HandlerInfo class. - */ - HandlerInfo(); - /** - * Constructor of the HandlerInfo class, allows to set all flags manually. - */ - HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, - bool inImplicitDefaultField, bool inValidField); - - /** - * Constructor of the HandlerInfo class, taking a shared_ptr to the handler - * to which additional information should be attached. - */ - HandlerInfo(std::shared_ptr handler); - - /** - * Destructor of the HandlerInfo class (to allow Handler to be forward - * declared). - */ - ~HandlerInfo(); - - /** - * Updates the "field" flags according to a "fieldStart" event. - */ - void fieldStart(bool isDefault, bool isImplicit, bool isValid); - - /** - * Updates the "fields" flags according to a "fieldEnd" event. - */ - void fieldEnd(); -}; - /** * The Stack class is a pushdown automaton responsible for turning a command * stream into a tree of Node instances. 
It does so by following a state * transition graph and creating a set of Handler instances, which are placed - * on the stack. + * on the stack. Additionally it is responsible for the normalization of + * Annotations and for handling tokens. */ class Stack { private: /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. + * Pointer at the internal implementation */ - const std::multimap &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::vector stack; - - /** - * Return the reference in the Logger instance stored within the context. - */ - Logger &logger(); - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - * state otherwise. - */ - const State *findTargetState(const std::string &name); - - /** - * Returns the targetState for a command with the given name that can be - * reached from the current state, also including the wildcard "*" state. - * Throws an exception if the given target state is not a valid identifier. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - * state otherwise. - */ - const State *findTargetStateOrWildcard(const std::string &name); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. 
This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - * an equivalent state as the one in the including file. - */ - void deduceState(); - - /** - * Returns a reference at the current HandlerInfo instance (or a stub - * HandlerInfo instance if the stack is empty). - */ - HandlerInfo ¤tInfo(); - - /** - * Returns a reference at the last HandlerInfo instance (or a stub - * HandlerInfo instance if the stack has only one element). - */ - HandlerInfo &lastInfo(); - - /** - * Ends all handlers that currently are not inside a field and already had - * a default field. This method is called whenever the data() and command() - * events are reached. - */ - void endOverdueHandlers(); - - /** - * Ends the current handler and removes the corresponding element from the - * stack. - */ - void endCurrentHandler(); - - /** - * Tries to start a default field for the current handler, if currently the - * handler is not inside a field and did not have a default field yet. - * - * @return true if the handler is inside a field, false if no field could - * be started. - */ - bool ensureHandlerIsInField(); - - /** - * Returns true if all handlers on the stack are currently valid, or false - * if at least one handler is invalid. - * - * @return true if all handlers on the stack are valid. - */ - bool handlersValid(); + std::unique_ptr impl; public: /** @@ -269,8 +78,8 @@ public: /** * Returns the state the Stack instance currently is in. * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. + * @return the state of the currently active Handler instance or + * States::None if no handler is on the stack. */ const State ¤tState(); @@ -289,28 +98,36 @@ public: * separator ':') and its corresponding location. Must be a string variant. * @param args is a map containing the arguments that were passed to the * command. 
+ * @param range if true, the started command has an explicit range. */ - void command(const Variant &name, const Variant::mapType &args); + void commandStart(const Variant &name, const Variant::mapType &args, + bool range); /** - * Function that should be called whenever character data is found in the - * input stream. May only be called if the currently is a command on the - * stack. + * Function that should be called whenever an annotation starts. * - * @param data is a TokenizedData instance containing the pre-segmented data - * that should be read. + * @param name is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + * @param range if true, the annotation fields have an explicit range. */ - void data(TokenizedData data); + void annotationStart(const Variant &className, const Variant &args, + bool range); /** - * Function that shuold be called whenever character data is found in the - * input stream. The given string variant is converted into a TokenizedData - * instance internally. + * Function that should be called whenever an annotation ends. * - * @param stringData is a string variant containing the data that has been - * found. + * @param name is the name of the annotation class that was ended. + * @param annotationName is the name of the annotation that was ended. */ - void data(const Variant &stringData); + void annotationEnd(const Variant &className, const Variant &elementName); + + /** + * Function the should be called whenever a ranged command or annotation + * ends. Must be called if the range parameter range was set to true when + * annotationStart() or commandStart() were called. + */ + void rangeEnd(); /** * Function that should be called whenever a new field starts. Fields of the @@ -329,29 +146,24 @@ public: void fieldEnd(); /** - * Function that should be called whenever an annotation starts. - * - * @param name is the name of the annotation class. 
- * @param args is a map variant containing the arguments that were passed - * to the annotation. - */ - void annotationStart(const Variant &className, const Variant &args); - - /** - * Function that should be called whenever an annotation ends. + * Function that should be called whenever character data is found in the + * input stream. May only be called if the currently is a command on the + * stack. * - * @param name is the name of the annotation class that was ended. - * @param annotationName is the name of the annotation that was ended. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. */ - void annotationEnd(const Variant &className, const Variant &elementName); + void data(const TokenizedData &data); /** - * Function that should be called whenever a previously registered token - * is found in the input stream. + * Function that shuold be called whenever character data is found in the + * input stream. The given string variant is converted into a TokenizedData + * instance internally. * - * @param token is string variant containing the token that was encountered. + * @param stringData is a string variant containing the data that has been + * found. 
*/ - void token(Variant token); + void data(const Variant &stringData); }; } } -- cgit v1.2.3 From 88afbcc2a4c4cb9956e4459cf1c5aa08e349835e Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:41:35 +0100 Subject: Implemented TokenSyntaxDescriptor structure --- src/core/common/Token.cpp | 16 ++++++++++- src/core/common/Token.hpp | 72 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp index 8bcdbb5..e454ae4 100644 --- a/src/core/common/Token.cpp +++ b/src/core/common/Token.cpp @@ -19,6 +19,20 @@ #include "Token.hpp" namespace ousia { -// Stub to make sure Tokens.hpp is valid + +/* Class TokenSyntaxDescriptor */ + +void TokenSyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const +{ + if (start != Tokens::Empty) { + set.insert(start); + } + if (end != Tokens::Empty) { + set.insert(end); + } + if (shortForm != Tokens::Empty) { + set.insert(shortForm); + } +} } diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index 0cf56b0..f89a0ce 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -134,10 +134,7 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(SourceLocation location) - : id(Tokens::Data), location(location) - { - } + Token(SourceLocation location) : id(Tokens::Data), location(location) {} /** * Constructor of the Token struct. @@ -165,7 +162,7 @@ struct Token { * @return true if the TokenId indicates that this token is a "special" * token. */ - bool isSpecial() const {return id > Tokens::MaxTokenId;} + bool isSpecial() const { return id > Tokens::MaxTokenId; } /** * The getLocation function allows the tokens to be directly passed as @@ -175,6 +172,71 @@ struct Token { */ const SourceLocation &getLocation() const { return location; } }; + +/** + * Class describing the user defined syntax for a single field or annotation. 
+ */ +struct TokenSyntaxDescriptor { + /** + * Possible start token or Tokens::Empty if no token is set. + */ + TokenId start; + + /** + * Possible end token or Tokens::Empty if no token is set. + */ + TokenId end; + + /** + * Possible representation token or Tokens::Empty if no token is set. + */ + TokenId shortForm; + + /** + * Flag specifying whether this TokenSyntaxDescriptor describes an + * annotation. + */ + bool isAnnotation; + + /** + * Default constructor, sets all token ids to Tokens::Empty and isAnnotation + * to false. + */ + TokenSyntaxDescriptor() + : start(Tokens::Empty), + end(Tokens::Empty), + shortForm(Tokens::Empty), + isAnnotation(false) + { + } + + /** + * Member initializer constructor. + * + * @param start is a possible start token. + * @param end is a possible end token. + * @param shortForm is a possible short form token. + * @param isAnnotation is set to true if this syntax descriptor describes an + * annotation. + */ + TokenSyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, + bool isAnnotation) + : start(start), + end(end), + shortForm(shortForm), + isAnnotation(isAnnotation) + { + } + + /** + * Inserts all tokens referenced in this TokenSyntaxDescriptor into the + * given TokenSet. Skips token ids set to Tokens::Empty. + * + * @param set is the TokenSet instance into which the Tokens should be + * inserted. 
+ */ + void insertIntoTokenSet(TokenSet &set) const; +}; } #endif /* _OUSIA_TOKENS_HPP_ */ -- cgit v1.2.3 From 11ee669f29e426effaf4a1e0d82baa978219e92f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 10:35:22 +0100 Subject: OsxmlEventParser also fills a TokenizedData instance now --- src/formats/osxml/OsxmlEventParser.cpp | 87 +++++++++-------------------- src/formats/osxml/OsxmlEventParser.hpp | 10 ++-- src/formats/osxml/OsxmlParser.cpp | 2 +- test/formats/osxml/OsxmlEventParserTest.cpp | 11 +++- 4 files changed, 41 insertions(+), 69 deletions(-) diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index 83c16f0..79a8dbe 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -38,6 +39,11 @@ namespace ousia { */ class OsxmlEventParserData { public: + /** + * Current character data buffer. + */ + TokenizedData data; + /** * Contains the current depth of the parsing process. */ @@ -51,24 +57,13 @@ public: ssize_t annotationEndTagDepth; /** - * Current character data buffer. - */ - std::vector textBuf; - - /** - * Current character data start. - */ - size_t textStart; - - /** - * Current character data end. - */ - size_t textEnd; - - /** - * Default constructor. + * Constructor taking the sourceId of the file from which the XML is being + * parsed. + * + * @param sourceId is the source if of the XML file from which the data is + * currently being parsed. */ - OsxmlEventParserData(); + OsxmlEventParserData(SourceId sourceId); /** * Increments the depth. @@ -91,14 +86,6 @@ public: * @return true if character data is available. */ bool hasText(); - - /** - * Returns a Variant containing the character data and its location. - * - * @return a string variant containing the text data and the character - * location. 
- */ - Variant getText(SourceId sourceId); }; /* Class GuardedExpatXmlParser */ @@ -156,7 +143,7 @@ public: static const std::string TOP_LEVEL_TAG{"ousia"}; /** - * Prefix used to indicate the start of an annoation (note the trailing colon) + * Prefix used to indicate the start of an annoation (note the trailing colon). */ static const std::string ANNOTATION_START_PREFIX{"a:start:"}; @@ -203,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Read the argument locations -- this is only a stupid and slow hack, @@ -348,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Abort if the special ousia tag ends here @@ -381,18 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) // Synchronize the logger position SourceLocation loc = xmlSyncLoggerPosition(p, ulen); - // Fetch some variables for convenience - OsxmlEventParserData &data = parser->getData(); - std::vector &textBuf = data.textBuf; - - // Update start and end position - if (textBuf.empty()) { - data.textStart = loc.getStart(); - } - data.textEnd = loc.getEnd(); - - // Insert the data into the text buffer - textBuf.insert(textBuf.end(), &s[0], &s[ulen]); + // Append the data to the buffer + parser->getData().data.append(std::string(s, ulen), loc.getStart()); } /* Class OsxmlEvents */ @@ -401,8 +380,8 @@ OsxmlEvents::~OsxmlEvents() {} /* Class 
OsxmlEventParser */ -OsxmlEventParserData::OsxmlEventParserData() - : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0) +OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) + : data(sourceId), depth(0), annotationEndTagDepth(-1) { } @@ -423,23 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag() return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } -bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } - -Variant OsxmlEventParserData::getText(SourceId sourceId) -{ - // Create a variant containing the string data and the location - Variant var = - Variant::fromString(std::string{textBuf.data(), textBuf.size()}); - var.setLocation({sourceId, textStart, textEnd}); - - // Reset the text buffers - textBuf.clear(); - textStart = 0; - textEnd = 0; - - // Return the variant - return var; -} +bool OsxmlEventParserData::hasText() { return !data.empty(); } /* Class OsxmlEventParser */ @@ -448,7 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - data(new OsxmlEventParserData()) + data(new OsxmlEventParserData(reader.getSourceId())) { } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index 7a8c96d..4c5a485 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -96,10 +96,10 @@ public: /** * Called whenever string data is found. * - * @param data is a Variant containing the string data that was found in the - * XML file. + * @param data is a TokenizedData instance containing the string data that + * was found in the XML file. */ - virtual void data(const Variant &data) = 0; + virtual void data(const TokenizedData &data) = 0; }; /** @@ -179,7 +179,9 @@ public: OsxmlEvents &getEvents() const; /** - * Returns a reference at the internal data. + * Used internally to fetch a reference at the internal data. 
+ * + * @return a reference at the internal OsxmlEventParserData structure. */ OsxmlEventParserData &getData() const; }; diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index 924d11b..afe0dc6 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -85,7 +85,7 @@ public: void rangeEnd() override { stack.rangeEnd(); } - void data(const Variant &data) override { stack.data(data); } + void data(const TokenizedData &data) override { stack.data(data); } }; /* Class OsxmlParser */ diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index b24a43d..d4e9443 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -69,9 +69,16 @@ public: events.emplace_back(OsxmlEvent::RANGE_END, Variant::arrayType{}); } - void data(const Variant &data) override + void data(const TokenizedData &data) override { - events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{data}); + Token token; + Variant text; + TokenizedDataReader reader = data.reader(); + reader.read(token, TokenSet{}, WhitespaceMode::PRESERVE); + EXPECT_EQ(Tokens::Data, token.id); + text = Variant::fromString(token.content); + text.setLocation(token.getLocation()); + events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{text}); } }; -- cgit v1.2.3 From f65e7af0dd0028ec481360eeaa16c4ff95ce253b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 10:59:50 +0100 Subject: Got all handlers compling again --- CMakeLists.txt | 12 +-- src/core/parser/stack/DocumentHandler.cpp | 18 ++--- src/core/parser/stack/DocumentHandler.hpp | 4 +- src/core/parser/stack/Handler.hpp | 3 +- src/core/parser/stack/Stack.cpp | 123 +++++++++++++++++++++++------- src/core/parser/stack/Stack.hpp | 14 +--- 6 files changed, 112 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b206458..cef1e31 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -181,16 +181,16 @@ ADD_LIBRARY(ousia_core src/core/parser/ParserContext src/core/parser/ParserScope src/core/parser/stack/Callbacks -# src/core/parser/stack/DocumentHandler -# src/core/parser/stack/DomainHandler -# src/core/parser/stack/GenericParserStates + src/core/parser/stack/DocumentHandler + src/core/parser/stack/DomainHandler + src/core/parser/stack/GenericParserStates src/core/parser/stack/Handler -# src/core/parser/stack/ImportIncludeHandler + src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State # src/core/parser/stack/Stack src/core/parser/stack/TokenRegistry src/core/parser/stack/TokenStack -# src/core/parser/stack/TypesystemHandler + src/core/parser/stack/TypesystemHandler src/core/parser/utils/SourceOffsetVector src/core/parser/utils/TokenizedData src/core/parser/utils/Tokenizer @@ -215,8 +215,8 @@ ADD_LIBRARY(ousia_core #) ADD_LIBRARY(ousia_osml - src/formats/osml/OsmlParser src/formats/osml/OsmlStreamParser + src/formats/osml/OsmlParser ) TARGET_LINK_LIBRARIES(ousia_osml diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index d44176a..714ab1b 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -246,8 +246,6 @@ bool DocumentChildHandler::start(Variant::mapType &args) parent->getDescriptor()->getFieldDescriptorIndex(); } // create the entity for the new element at last. 
- // TODO: REMOVE - strct_name = strct->getName(); entity = parent->createChildStructuredEntity(strct, lastFieldIdx, args, nameAttr); } @@ -373,15 +371,8 @@ bool DocumentChildHandler::convertData(Handle field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(TokenizedData &data) +bool DocumentChildHandler::data() { - // TODO: Handle this correctly - Variant text = data.text(WhitespaceMode::TRIM); - if (text == nullptr) { - // For now, except "no data" as success - return true; - } - // We're past the region in which explicit fields can be defined in the // parent structure element scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -401,6 +392,7 @@ bool DocumentChildHandler::data(TokenizedData &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. + Variant text = readData(); if (!convertData(field, text, logger())) { return false; } @@ -419,6 +411,10 @@ bool DocumentChildHandler::data(TokenizedData &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification. forks.emplace_back(logger().fork()); + + // TODO: Actually the data has to be read after the path has been + // created (as createPath may push more tokens onto the stack) + Variant text = readData(); if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -428,7 +424,6 @@ bool DocumentChildHandler::data(TokenizedData &data) // Construct the necessary path NodeVector path = field->pathTo(primitiveField, logger()); - // TODO: Create methods with indices instead of names. 
createPath(fieldIdx, path, parent); // Then create the primitive element @@ -439,6 +434,7 @@ bool DocumentChildHandler::data(TokenizedData &data) // No field was found that might take the data -- dump the error messages // from the loggers -- or, if there were no primitive fields, clearly state // this fact + Variant text = readData(); if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index dda7d8b..c51c188 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -93,8 +93,6 @@ public: class DocumentChildHandler : public Handler { private: bool isExplicitField = false; - //TODO: REMOVE - std::string strct_name; /** * Code shared by both the start(), fieldStart() and the data() method. @@ -167,7 +165,7 @@ public: bool start(Variant::mapType &args) override; void end() override; - bool data(TokenizedData &data) override; + bool data() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 848d395..377a214 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace ousia { @@ -37,7 +38,7 @@ class Variant; namespace parser_stack { // More forward declarations -class Callbacks; +class HandlerCallbacks; class State; /** diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 292e7e2..ff03a6b 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -210,7 +210,6 @@ static LoggableException buildInvalidCommandException( /* Class StackImpl */ class StackImpl { - private: /** * Reference at the parser context. 
@@ -231,7 +230,7 @@ private: /** * Return the reference in the Logger instance stored within the context. */ - Logger &logger(); + Logger &logger() {return ctx.getLogger();} /** * Used internally to get all expected command names for the current state. @@ -311,12 +310,28 @@ private: * @return true if all handlers on the stack are valid. */ bool handlersValid(); -}; +public: + StackImpl(ParserContext &ctx, + const std::multimap &states); + + ~StackImpl(); -/* Class Stack */ + const State ¤tState() const; + std::string currentCommandName() const; -Stack::Stack(ParserContext &ctx, + void commandStart(const Variant &name, const Variant::mapType &args, + bool range); + void annotationStart(const Variant &className, const Variant &args, + bool range); + void annotationEnd(const Variant &className, const Variant &elementName); + void rangeEnd(); + void fieldStart(bool isDefault); + void fieldEnd(); + void data(const TokenizedData &data); +}; + +StackImpl::StackImpl(ParserContext &ctx, const std::multimap &states) : ctx(ctx), states(states) { @@ -327,7 +342,7 @@ Stack::Stack(ParserContext &ctx, } } -Stack::~Stack() +StackImpl::~StackImpl() { while (!stack.empty()) { // Fetch the topmost stack element @@ -351,7 +366,7 @@ Stack::~Stack() } } -void Stack::deduceState() +void StackImpl::deduceState() { // Assemble all states std::vector states; @@ -384,7 +399,7 @@ void Stack::deduceState() info.fieldStart(true, false, true); } -std::set Stack::expectedCommands() +std::set StackImpl::expectedCommands() { const State *currentState = &(this->currentState()); std::set res; @@ -396,17 +411,17 @@ std::set Stack::expectedCommands() return res; } -const State &Stack::currentState() +const State &StackImpl::currentState() { return stack.empty() ? States::None : stack.back().handler->getState(); } -std::string Stack::currentCommandName() +std::string StackImpl::currentCommandName() { return stack.empty() ? 
std::string{} : stack.back().handler->getName(); } -const State *Stack::findTargetState(const std::string &name) +const State *StackImpl::findTargetState(const std::string &name) { const State *currentState = &(this->currentState()); auto range = states.equal_range(name); @@ -420,7 +435,7 @@ const State *Stack::findTargetState(const std::string &name) return nullptr; } -const State *Stack::findTargetStateOrWildcard(const std::string &name) +const State *StackImpl::findTargetStateOrWildcard(const std::string &name) { // Try to find the target state with the given name, if none is found, try // find a matching "*" state. @@ -431,16 +446,16 @@ const State *Stack::findTargetStateOrWildcard(const std::string &name) return targetState; } -HandlerInfo &Stack::currentInfo() +HandlerInfo &StackImpl::currentInfo() { return stack.empty() ? EmptyHandlerInfo : stack.back(); } -HandlerInfo &Stack::lastInfo() +HandlerInfo &StackImpl::lastInfo() { return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; } -void Stack::endCurrentHandler() +void StackImpl::endCurrentHandler() { if (!stack.empty()) { // Fetch the handler info for the current top-level element @@ -467,7 +482,7 @@ void Stack::endCurrentHandler() } } -void Stack::endOverdueHandlers() +void StackImpl::endOverdueHandlers() { if (!stack.empty()) { // Fetch the handler info for the current top-level element @@ -483,7 +498,7 @@ void Stack::endOverdueHandlers() } } -bool Stack::ensureHandlerIsInField() +bool StackImpl::ensureHandlerIsInField() { // If the current handler is not in a field (and actually has a handler) // try to start a default field @@ -507,7 +522,7 @@ bool Stack::ensureHandlerIsInField() return true; } -bool Stack::handlersValid() +bool StackImpl::handlersValid() { for (auto it = stack.crbegin(); it != stack.crend(); it++) { if (!it->valid) { @@ -517,9 +532,7 @@ bool Stack::handlersValid() return true; } -Logger &Stack::logger() { return ctx.getLogger(); } - -void Stack::command(const Variant 
&name, const Variant::mapType &args) +void StackImpl::commandStart(const Variant &name, const Variant::mapType &args) { // End handlers that already had a default field and are currently not // active. @@ -611,7 +624,22 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } -void Stack::data(TokenizedData data) +void StackImpl::annotationStart(const Variant &className, const Variant &args) +{ + // TODO +} + +void StackImpl::annotationEnd(const Variant &className, const Variant &elementName) +{ + // TODO +} + +void StackImpl::rangeEnd() +{ + // TODO +} + +void StackImpl::data(TokenizedData data) { // TODO: Rewrite this function for token handling // TODO: This loop needs to be refactored out @@ -626,7 +654,8 @@ void Stack::data(TokenizedData data) // make sure the data actually is data if (stack.empty()) { if (hasNonWhitespaceText) { - throw LoggableException("No command here to receive data.", data); + throw LoggableException("No command here to receive data.", + data); } return; } @@ -699,7 +728,7 @@ void Stack::data(TokenizedData data) } } -void Stack::data(const Variant &stringData) +void StackImpl::data(const Variant &stringData) { // Fetch the SourceLocation of the given stringData variant SourceLocation loc = stringData.getLocation(); @@ -712,7 +741,7 @@ void Stack::data(const Variant &stringData) data(tokenizedData); } -void Stack::fieldStart(bool isDefault) +void StackImpl::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty if (stack.empty()) { @@ -764,7 +793,7 @@ void Stack::fieldStart(bool isDefault) info.fieldStart(defaultField, false, valid); } -void Stack::fieldEnd() +void StackImpl::fieldEnd() { // Unroll the stack until the next explicitly open field while (!stack.empty()) { @@ -799,14 +828,50 @@ void Stack::fieldEnd() info.fieldEnd(); } -void Stack::annotationStart(const Variant &className, const Variant &args) +/* Class Stack */ + +Stack::Stack(ParserContext &ctx, + const std::multimap &states) + 
: impl(new StackImpl(ctx, states)) +{ +} + +Stack::~Stack() { - // TODO + // Do nothing here, stub needed because StackImpl is incomplete in hpp +} + +const State &Stack::currentState() const { return impl->currentState(); } + +std::string Stack::currentCommandName() const +{ + return impl->currentCommandName(); +} + +void Stack::commandStart(const Variant &name, const Variant::mapType &args, + bool range) +{ + impl->commandStart(name, args, range); +} + +void Stack::annotationStart(const Variant &className, const Variant &args, + bool range) +{ + impl->annotationStart(className, args, range); } void Stack::annotationEnd(const Variant &className, const Variant &elementName) { - // TODO + impl->annotationEnd(className, elementName); } + +void Stack::rangeEnd() { impl->rangeEnd(); } + +void Stack::fieldStart(bool isDefault) { impl->fieldStart(isDefault); } + +void Stack::fieldEnd() { impl->fieldEnd(); } + +void Stack::data(const TokenizedData &data) { impl->data(data); } +}; } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index e1173d0..1d87b9c 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -81,7 +81,7 @@ public: * @return the state of the currently active Handler instance or * States::None if no handler is on the stack. */ - const State ¤tState(); + const State ¤tState() const; /** * Returns the command name that is currently being handled. @@ -89,7 +89,7 @@ public: * @return the name of the command currently being handled by the active * Handler instance or an empty string if no handler is currently active. */ - std::string currentCommandName(); + std::string currentCommandName() const; /** * Function that should be called whenever a new command is reached. @@ -154,16 +154,6 @@ public: * that should be read. */ void data(const TokenizedData &data); - - /** - * Function that shuold be called whenever character data is found in the - * input stream. 
The given string variant is converted into a TokenizedData - * instance internally. - * - * @param stringData is a string variant containing the data that has been - * found. - */ - void data(const Variant &stringData); }; } } -- cgit v1.2.3 From 5b81f755a5303c3eab05c605711ecca32c071b6d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 11:46:47 +0100 Subject: Got Stack compiling again --- src/core/parser/stack/Callbacks.hpp | 28 ++-- src/core/parser/stack/Handler.hpp | 75 ++++----- src/core/parser/stack/Stack.cpp | 297 ++++++++++++++++++++++-------------- src/core/parser/stack/Stack.hpp | 5 +- src/formats/osml/OsmlParser.cpp | 2 +- src/formats/osxml/OsxmlParser.cpp | 15 +- 6 files changed, 251 insertions(+), 171 deletions(-) diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index d7b2547..8acc02d 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -76,18 +76,8 @@ public: * Interface defining a set of callback functions that act as a basis for the * StateStackCallbacks and the ParserCallbacks. */ -class HandlerCallbacks: public ParserCallbacks { +class HandlerCallbacks : public ParserCallbacks { public: - /** - * Reads a string variant form the current input stream. This function must - * be called from the data() method. - * - * @return a string variant containing the current text data. The return - * value depends on the currently set whitespace mode and the tokens that - * were enabled using the enableTokens callback method. - */ - Variant readData(); - /** * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. * The tokens described in the token list are the tokens that are currently @@ -96,14 +86,24 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. 
*/ - void pushTokens(const std::vector &tokens); + virtual void pushTokens( + const std::vector &tokens) = 0; /** * Removes the previously pushed list of tokens from the stack. */ - void popTokens(); -}; + virtual void popTokens() = 0; + /** + * Reads a string variant form the current input stream. This function must + * be called from the data() method. + * + * @return a string variant containing the current text data. The return + * value depends on the currently set whitespace mode and the tokens that + * were enabled using the enableTokens callback method. + */ + virtual Variant readData() = 0; +}; } } diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 377a214..19c3d65 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -161,6 +161,44 @@ protected: */ const std::string &name() const; + /** + * Calls the corresponding function in the HandlerCallbacks instance. This + * method registers the given tokens as tokens that are generally available, + * tokens must be explicitly enabled using the "pushTokens" and "popTokens" + * method. Tokens that have not been registered are not guaranteed to be + * reported (except for special tokens, these do not have to be registerd). + * + * @param token is the token string that should be made available. + * @return the TokenId that will be used to refer to the token. + */ + TokenId registerToken(const std::string &token); + + /** + * Calls the corresponding function in the HandlerCallbacks instance. This + * method unregisters the given token. Note that for a token to be no longer + * reported, this function has to be called as many times as registerToken() + * for the corresponding token. + * + * @param id is the id of the Token that should be unregistered. + */ + void unregisterToken(TokenId id); + + /** + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. 
+ * The tokens described in the token list are the tokens that are currently + * enabled. + * + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. + */ + void pushTokens(const std::vector &tokens); + + /** + * Calls the corresponding function in the HandlerCallbacks instance. + * Removes the previously pushed list of tokens from the stack. + */ + void popTokens(); + /** * Calls the corresponding method in the HandlerCallbacks instance. Reads a * string variant form the current input stream. This function must be @@ -193,43 +231,6 @@ protected: */ // void popWhitespaceMode(); - /** - * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. - * The tokens described in the token list are the tokens that are currently - * enabled. - * - * @param tokens is a list of TokenSyntaxDescriptor instances that should be - * stored on the stack. - */ - void pushTokens(const std::vector &tokens); - - /** - * Calls the corresponding function in the HandlerCallbacks instance. - * Removes the previously pushed list of tokens from the stack. - */ - void popTokens(); - - /** - * Calls the corresponding function in the HandlerCallbacks instance. This - * method registers the given tokens as tokens that are generally available, - * tokens must be explicitly enabled using the "pushTokens" and "popTokens" - * method. Tokens that have not been registered are not guaranteed to be - * reported (except for special tokens, these do not have to be registerd). - * - * @param token is the token string that should be made available. - * @return the TokenId that will be used to refer to the token. - */ - TokenId registerToken(const std::string &token); - - /** - * Calls the corresponding function in the HandlerCallbacks instance. This - * method unregisters the given token. Note that for a token to be no longer - * reported, this function has to be called as many times as registerToken() - * for the corresponding token. 
- * - * @param id is the id of the Token that should be unregistered. - */ - void unregisterToken(TokenId id); public: /** diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index ff03a6b..a556999 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -23,9 +23,12 @@ #include #include +#include "Callbacks.hpp" #include "Handler.hpp" #include "Stack.hpp" #include "State.hpp" +#include "TokenRegistry.hpp" +#include "TokenStack.hpp" namespace ousia { namespace parser_stack { @@ -209,8 +212,14 @@ static LoggableException buildInvalidCommandException( /* Class StackImpl */ -class StackImpl { +class StackImpl : public HandlerCallbacks { private: + /** + * Reference at an implementation of the ParserCallbacks instance to which + * certain handler callbacks are directed. + */ + ParserCallbacks &parser; + /** * Reference at the parser context. */ @@ -222,6 +231,18 @@ private: */ const std::multimap &states; + /** + * Registry responsible for registering the tokens proposed by the + * Handlers in the parser. + */ + TokenRegistry tokenRegistry; + + /** + * Pointer at a TokenizedDataReader instance from which the data should + * currently be read. + */ + TokenizedDataReader *dataReader; + /** * Internal stack used for managing the currently active Handler instances. */ @@ -230,7 +251,7 @@ private: /** * Return the reference in the Logger instance stored within the context. */ - Logger &logger() {return ctx.getLogger();} + Logger &logger() { return ctx.getLogger(); } /** * Used internally to get all expected command names for the current state. 
@@ -312,8 +333,8 @@ private: bool handlersValid(); public: - StackImpl(ParserContext &ctx, - const std::multimap &states); + StackImpl(ParserCallbacks &parser, ParserContext &ctx, + const std::multimap &states); ~StackImpl(); @@ -329,11 +350,22 @@ public: void fieldStart(bool isDefault); void fieldEnd(); void data(const TokenizedData &data); + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId id) override; + Variant readData() override; + bool hasData(); + void pushTokens(const std::vector &tokens) override; + void popTokens() override; }; -StackImpl::StackImpl(ParserContext &ctx, - const std::multimap &states) - : ctx(ctx), states(states) +StackImpl::StackImpl(ParserCallbacks &parser, ParserContext &ctx, + const std::multimap &states) + : parser(parser), + ctx(ctx), + states(states), + tokenRegistry(parser), + dataReader(nullptr) { // If the scope instance is not empty we need to deduce the current parser // state @@ -389,8 +421,8 @@ void StackImpl::deduceState() HandlerConstructor ctor = state.elementHandler ? state.elementHandler : EmptyHandler::create; - std::shared_ptr handler = - std::shared_ptr{ctor({ctx, "", state, SourceLocation{}})}; + std::shared_ptr handler = std::shared_ptr{ + ctor({ctx, *this, "", state, SourceLocation{}})}; stack.emplace_back(handler); // Set the correct flags for this implicit handler @@ -411,12 +443,12 @@ std::set StackImpl::expectedCommands() return res; } -const State &StackImpl::currentState() +const State &StackImpl::currentState() const { return stack.empty() ? States::None : stack.back().handler->getState(); } -std::string StackImpl::currentCommandName() +std::string StackImpl::currentCommandName() const { return stack.empty() ? 
std::string{} : stack.back().handler->getName(); } @@ -532,7 +564,8 @@ bool StackImpl::handlersValid() return true; } -void StackImpl::commandStart(const Variant &name, const Variant::mapType &args) +void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, + bool range) { // End handlers that already had a default field and are currently not // active. @@ -575,8 +608,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args) HandlerConstructor ctor = targetState->elementHandler ? targetState->elementHandler : EmptyHandler::create; - std::shared_ptr handler{ - ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + std::shared_ptr handler{ctor( + {ctx, *this, name.asString(), *targetState, name.getLocation()})}; stack.emplace_back(handler); // Fetch the HandlerInfo for the parent element and the current element @@ -624,12 +657,14 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args) } } -void StackImpl::annotationStart(const Variant &className, const Variant &args) +void StackImpl::annotationStart(const Variant &className, const Variant &args, + bool range) { // TODO } -void StackImpl::annotationEnd(const Variant &className, const Variant &elementName) +void StackImpl::annotationEnd(const Variant &className, + const Variant &elementName) { // TODO } @@ -639,106 +674,93 @@ void StackImpl::rangeEnd() // TODO } -void StackImpl::data(TokenizedData data) +void StackImpl::data(const TokenizedData &data) { // TODO: Rewrite this function for token handling // TODO: This loop needs to be refactored out - while (!data.atEnd()) { - // End handlers that already had a default field and are currently not - // active. 
- endOverdueHandlers(); - - const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); - - // Check whether there is any command the data can be sent to -- if not, - // make sure the data actually is data - if (stack.empty()) { - if (hasNonWhitespaceText) { - throw LoggableException("No command here to receive data.", - data); - } - return; - } - - // Fetch the current command handler information - HandlerInfo &info = currentInfo(); - - // Make sure the current handler has an open field - if (!ensureHandlerIsInField()) { - endCurrentHandler(); - continue; - } - - // If this field should not get any data, log an error and do not call - // the "data" handler - if (!info.inValidField) { - // If the "hadDefaultField" flag is set, we already issued an error - // message - if (!info.hadDefaultField) { - if (hasNonWhitespaceText) { - logger().error("Did not expect any data here", data); - } - return; - } - } - - if (handlersValid() && info.inValidField) { - // Fork the logger and set it as temporary logger for the "start" - // method. We only want to keep error messages if this was not a try - // to implicitly open a default field. 
- LoggerFork loggerFork = logger().fork(); - info.handler->setLogger(loggerFork); - - // Pass the data to the current Handler instance - bool valid = false; - try { - // Create a fork of the TokenizedData and let the handler work - // on it - TokenizedData dataFork = data; - valid = info.handler->data(dataFork); - - // If the data was validly handled by the handler, commit the - // change - if (valid) { - data = dataFork; - } - } - catch (LoggableException ex) { - loggerFork.log(ex); - } - - // Reset the logger instance as soon as possible - info.handler->resetLogger(); - - // If placing the data here failed and we're currently in an - // implicitly opened field, just unroll the stack to the next field - // and try again - if (!valid && info.inImplicitDefaultField) { - endCurrentHandler(); - continue; - } - - // Commit the content of the logger fork. Do not change the valid - // flag. - loggerFork.commit(); - } - - // There was no reason to unroll the stack any further, so continue - return; - } -} - -void StackImpl::data(const Variant &stringData) -{ - // Fetch the SourceLocation of the given stringData variant - SourceLocation loc = stringData.getLocation(); - - // Create a TokenizedData instance and feed the given string data into it - TokenizedData tokenizedData(loc.getSourceId()); - tokenizedData.append(stringData.asString(), loc.getStart()); - - // Call the actual "data" method - data(tokenizedData); + /*while (!data.atEnd()) { + // End handlers that already had a default field and are currently not + // active. 
+ endOverdueHandlers(); + + const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + + // Check whether there is any command the data can be sent to -- if not, + // make sure the data actually is data + if (stack.empty()) { + if (hasNonWhitespaceText) { + throw LoggableException("No command here to receive data.", + data); + } + return; + } + + // Fetch the current command handler information + HandlerInfo &info = currentInfo(); + + // Make sure the current handler has an open field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // If this field should not get any data, log an error and do not call + // the "data" handler + if (!info.inValidField) { + // If the "hadDefaultField" flag is set, we already issued an error + // message + if (!info.hadDefaultField) { + if (hasNonWhitespaceText) { + logger().error("Did not expect any data here", data); + } + return; + } + } + + if (handlersValid() && info.inValidField) { + // Fork the logger and set it as temporary logger for the "start" + // method. We only want to keep error messages if this was not a try + // to implicitly open a default field. 
+ LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + // Create a fork of the TokenizedData and let the handler work + // on it + TokenizedData dataFork = data; + valid = info.handler->data(dataFork); + + // If the data was validly handled by the handler, commit the + // change + if (valid) { + data = dataFork; + } + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid + // flag. + loggerFork.commit(); + } + + // There was no reason to unroll the stack any further, so continue + return; + }*/ } void StackImpl::fieldStart(bool isDefault) @@ -828,11 +850,55 @@ void StackImpl::fieldEnd() info.fieldEnd(); } +TokenId StackImpl::registerToken(const std::string &token) +{ + return tokenRegistry.registerToken(token); +} + +void StackImpl::unregisterToken(TokenId id) +{ + tokenRegistry.unregisterToken(id); +} + +void StackImpl::pushTokens(const std::vector &tokens) +{ + // TODO +} + +void StackImpl::popTokens() +{ + // TODO +} + +Variant StackImpl::readData() +{ + if (dataReader != nullptr) { + TokenizedDataReaderFork dataReaderFork = dataReader->fork(); + Token token; + + // TODO: Use correct token set + TokenSet tokens; + + // TODO: Use correct whitespace mode + WhitespaceMode mode = WhitespaceMode::COLLAPSE; + + dataReaderFork.read(token, tokens, mode); + if (token.id == Tokens::Data) { + Variant res = Variant::fromString(token.content); + res.setLocation(token.getLocation()); + return res; + } + } + return Variant{}; +} + +bool StackImpl::hasData() { return 
readData() != nullptr; } + /* Class Stack */ -Stack::Stack(ParserContext &ctx, +Stack::Stack(ParserCallbacks &parser, ParserContext &ctx, const std::multimap &states) - : impl(new StackImpl(ctx, states)) + : impl(new StackImpl(parser, ctx, states)) { } @@ -872,6 +938,5 @@ void Stack::fieldStart(bool isDefault) { impl->fieldStart(isDefault); } void Stack::fieldEnd() { impl->fieldEnd(); } void Stack::data(const TokenizedData &data) { impl->data(data); } -}; } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index 1d87b9c..de281d4 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -42,6 +42,7 @@ class Variant; namespace parser_stack { // Forward declarations +class ParserCallbacks; class StackImpl; class State; @@ -63,11 +64,13 @@ public: /** * Creates a new instance of the Stack class. * + * @param parser is an implementation of the ParserCallbacks instance to + * which certain calls are directed. * @param ctx is the parser context the parser stack is working on. * @param states is a map containing the command names and pointers at the * corresponding State instances. */ - Stack(ParserContext &ctx, + Stack(ParserCallbacks &parser, ParserContext &ctx, const std::multimap &states); /** diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp index c25974f..36ef2b6 100644 --- a/src/formats/osml/OsmlParser.cpp +++ b/src/formats/osml/OsmlParser.cpp @@ -73,7 +73,7 @@ public: : logger(ctx.getLogger()), ctx(ctx), parser(reader, logger), - stack(ctx, GenericParserStates) + stack(parser, ctx, GenericParserStates) { } diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index afe0dc6..10cc77a 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,7 @@ using namespace parser_stack; /** * Class containing the actual OsxmlParser implementation. 
*/ -class OsxmlParserImplementation : public OsxmlEvents { +class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks { private: /** * Actual xml parser -- converts the xml stream into a set of events. @@ -56,7 +57,7 @@ public: */ OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) : parser(reader, *this, ctx.getLogger()), - stack(ctx, GenericParserStates) + stack(*this, ctx, GenericParserStates) { } @@ -86,6 +87,16 @@ public: void rangeEnd() override { stack.rangeEnd(); } void data(const TokenizedData &data) override { stack.data(data); } + + TokenId registerToken(const std::string &token) override + { + return Tokens::Empty; + } + + void unregisterToken(TokenId id) override + { + // Do nothing here + } }; /* Class OsxmlParser */ -- cgit v1.2.3 From 072992a634d816fc7061b7eee5fd0cabe4242de4 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 13:45:24 +0100 Subject: Added supportsTokens method to StateBuilder --- src/core/parser/stack/State.cpp | 15 ++++++++++++--- src/core/parser/stack/State.hpp | 33 ++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/core/parser/stack/State.cpp b/src/core/parser/stack/State.cpp index d72f533..0feeed6 100644 --- a/src/core/parser/stack/State.cpp +++ b/src/core/parser/stack/State.cpp @@ -23,17 +23,19 @@ namespace parser_stack { /* Class State */ -State::State() : elementHandler(nullptr) {} +State::State() : elementHandler(nullptr), supportsAnnotations(false), supportsTokens(false) {} State::State(StateSet parents, Arguments arguments, RttiSet createdNodeTypes, HandlerConstructor elementHandler, - bool supportsAnnotations) + bool supportsAnnotations, + bool supportsTokens) : parents(parents), arguments(arguments), createdNodeTypes(createdNodeTypes), elementHandler(elementHandler), - supportsAnnotations(supportsAnnotations) + supportsAnnotations(supportsAnnotations), + supportsTokens(supportsTokens) { } @@ -93,6 +95,13 @@ StateBuilder 
&StateBuilder::supportsAnnotations(bool supportsAnnotations) return *this; } +StateBuilder &StateBuilder::supportsTokens(bool supportsTokens) +{ + state.supportsTokens = supportsTokens; + return *this; +} + + const State &StateBuilder::build() const { return state; } /* Class StateDeductor */ diff --git a/src/core/parser/stack/State.hpp b/src/core/parser/stack/State.hpp index 4766235..011ccd6 100644 --- a/src/core/parser/stack/State.hpp +++ b/src/core/parser/stack/State.hpp @@ -82,13 +82,21 @@ struct State { /** * Set to true if this handler does support annotations. This is almost - * always false (e.g. all description handlers), except for document + * always false (e.g. all description handlers), except for document * element handlers. */ - bool supportsAnnotations; + bool supportsAnnotations : 1; /** - * Default constructor, initializes the handlers with nullptr. + * Set to true if this handler does support tokens. This is almost + * always false (e.g. all description handlers), except for document + * element handlers. + */ + bool supportsTokens : 1; + + /** + * Default constructor, initializes the handlers with nullptr and the + * supportsAnnotations and supportsTokens flags with false. */ State(); @@ -108,11 +116,12 @@ struct State { * be nullptr in which case no handler instance is created. * @param supportsAnnotations specifies whether annotations are supported * here at all. + * @param supportsTokens specified whether tokens are supported here at all. */ State(StateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr, - bool supportsAnnotations = false); + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false, bool supportsTokens = false); /** * Creates this State from the given StateBuilder instance. 
@@ -219,6 +228,16 @@ public: */ StateBuilder &supportsAnnotations(bool supportsAnnotations); + /** + * Sets the state of the "supportsTokens" flag (default value is false). + * + * @param supportsTokens should be set to true, if the elementHandler + * registered for this state is capable of handling tokens. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &supportsTokens(bool supportsTokens); + /** * Returns a reference at the internal State instance that was built * using the StateBuilder. @@ -275,7 +294,7 @@ public: * @param states is a list of states that should be checked. */ StateDeductor(std::vector signature, - std::vector states); + std::vector states); /** * Selects all active states from the given states. Only considers those -- cgit v1.2.3 From 5d6ee07995c7f59e66e0df558c8ebe7d2a8d1f68 Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:52:13 +0100 Subject: refactored SyntaxDescriptor to Token.hpp and added TokenDescriptor class. 
--- CMakeLists.txt | 1 + src/core/common/Token.cpp | 14 --- src/core/common/Token.hpp | 67 +----------- src/core/model/Syntax.cpp | 58 +++++++++++ src/core/model/Syntax.hpp | 196 +++++++++++++++++++++++++++++++++++ src/core/parser/stack/Callbacks.hpp | 3 +- src/core/parser/stack/Handler.cpp | 2 +- src/core/parser/stack/Handler.hpp | 3 +- src/core/parser/stack/TokenStack.cpp | 4 +- src/core/parser/stack/TokenStack.hpp | 5 +- 10 files changed, 266 insertions(+), 87 deletions(-) create mode 100644 src/core/model/Syntax.cpp create mode 100644 src/core/model/Syntax.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b206458..13de9ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ ADD_LIBRARY(ousia_core src/core/model/Project src/core/model/RootNode src/core/model/Style + src/core/model/Syntax src/core/model/Typesystem src/core/parser/Parser src/core/parser/ParserContext diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp index e454ae4..17ce03e 100644 --- a/src/core/common/Token.cpp +++ b/src/core/common/Token.cpp @@ -20,19 +20,5 @@ namespace ousia { -/* Class TokenSyntaxDescriptor */ - -void TokenSyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const -{ - if (start != Tokens::Empty) { - set.insert(start); - } - if (end != Tokens::Empty) { - set.insert(end); - } - if (shortForm != Tokens::Empty) { - set.insert(shortForm); - } -} } diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index f89a0ce..f37151f 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -173,71 +173,6 @@ struct Token { const SourceLocation &getLocation() const { return location; } }; -/** - * Class describing the user defined syntax for a single field or annotation. - */ -struct TokenSyntaxDescriptor { - /** - * Possible start token or Tokens::Empty if no token is set. - */ - TokenId start; - - /** - * Possible end token or Tokens::Empty if no token is set. 
- */ - TokenId end; - - /** - * Possible representation token or Tokens::Empty if no token is set. - */ - TokenId shortForm; - - /** - * Flag specifying whether this TokenSyntaxDescriptor describes an - * annotation. - */ - bool isAnnotation; - - /** - * Default constructor, sets all token ids to Tokens::Empty and isAnnotation - * to false. - */ - TokenSyntaxDescriptor() - : start(Tokens::Empty), - end(Tokens::Empty), - shortForm(Tokens::Empty), - isAnnotation(false) - { - } - - /** - * Member initializer constructor. - * - * @param start is a possible start token. - * @param end is a possible end token. - * @param shortForm is a possible short form token. - * @param isAnnotation is set to true if this syntax descriptor describes an - * annotation. - */ - TokenSyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, - bool isAnnotation) - : start(start), - end(end), - shortForm(shortForm), - isAnnotation(isAnnotation) - { - } - - /** - * Inserts all tokens referenced in this TokenSyntaxDescriptor into the - * given TokenSet. Skips token ids set to Tokens::Empty. - * - * @param set is the TokenSet instance into which the Tokens should be - * inserted. - */ - void insertIntoTokenSet(TokenSet &set) const; -}; } -#endif /* _OUSIA_TOKENS_HPP_ */ - +#endif /* _OUSIA_TOKENS_HPP_ */ \ No newline at end of file diff --git a/src/core/model/Syntax.cpp b/src/core/model/Syntax.cpp new file mode 100644 index 0000000..9dbaccc --- /dev/null +++ b/src/core/model/Syntax.cpp @@ -0,0 +1,58 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Syntax.hpp" + +#include "Domain.hpp" + +namespace ousia { + +/* Class TokenSyntaxDescriptor */ + +bool SyntaxDescriptor::isAnnotation() const +{ + return descriptor->isa(&RttiTypes::AnnotationClass); +} +bool SyntaxDescriptor::isFieldDescriptor() const +{ + return descriptor->isa(&RttiTypes::FieldDescriptor); +} +bool SyntaxDescriptor::isStruct() const +{ + return descriptor->isa(&RttiTypes::StructuredClass); +} + +void SyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const +{ + if (start != Tokens::Empty) { + set.insert(start); + } + if (end != Tokens::Empty) { + set.insert(end); + } + if (shortForm != Tokens::Empty) { + set.insert(shortForm); + } +} + +bool SyntaxDescriptor::isEmpty() const +{ + return start == Tokens::Empty && end == Tokens::Empty && + shortForm == Tokens::Empty; +} +} \ No newline at end of file diff --git a/src/core/model/Syntax.hpp b/src/core/model/Syntax.hpp new file mode 100644 index 0000000..4da3408 --- /dev/null +++ b/src/core/model/Syntax.hpp @@ -0,0 +1,196 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Syntax.hpp + * + * This header contains the Descriptor classes for user definable syntax for + * Document entities or fields. These classes are referenced in Ontology.hpp. + */ + +#ifndef _OUSIA_MODEL_SYNTAX_HPP_ +#define _OUSIA_MODEL_SYNTAX_HPP_ + +#include +#include "Node.hpp" + +namespace ousia { + +/** + * Class to describe a single token that shall be used as user-defined syntax. + */ +struct TokenDescriptor { + /** + * The string content of this token, if it is not a special one. + */ + std::string token; + /** + * A flag to be set true if this TokenDescriptor uses a special token. + */ + bool special; + /** + * An id to uniquely identify this token. + */ + TokenId id; + + /** + * Constructor for non-special tokens. The special flag is set to false and + * the id to Tokens::Empty. + * + * @param token The string content of this token, if it is not a special + * one. + */ + TokenDescriptor(std::string token = std::string()) + : token(std::move(token)), special(false), id(Tokens::Empty) + { + } + + /** + * Constructor for special tokens. The token is set to an empty string and + * the special flag to true. + * + * @param id the id of the special token. + */ + TokenDescriptor(TokenId id) : special(true), id(id) {} + + /** + * Returns true if and only if neither a string nor an ID is given. + * + * @return true if and only if neither a string nor an ID is given. + */ + bool isEmpty() const { return token.empty() && id == Tokens::Empty; } +}; + +/** + * Class describing the user defined syntax for a StructuredClass, + * AnnotationClass or FieldDescriptor. + * + * This class is used during parsing of a Document. It is used to describe + * the tokens relevant for one Descriptor that could be created at this point + * during parsing. + */ +struct SyntaxDescriptor { + /** + * Possible start token or Tokens::Empty if no token is set. 
+ */ + TokenId start; + + /** + * Possible end token or Tokens::Empty if no token is set. + */ + TokenId end; + + /** + * Possible representation token or Tokens::Empty if no token is set. + */ + TokenId shortForm; + + /* + * The Descriptor this SyntaxDescriptor belongs to. As this may be + * a FieldDescriptor as well as a class Descriptor (StructuredClass or + * AnnotationClass) we can only use the class Node as inner argument here. + */ + Rooted descriptor; + /* + * Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + ssize_t depth; + + /** + * Default constructor, sets all token ids to Tokens::Empty and the + * descriptor handle to nullptr. + */ + SyntaxDescriptor() + : start(Tokens::Empty), + end(Tokens::Empty), + shortForm(Tokens::Empty), + descriptor(nullptr), + depth(-1) + { + } + + /** + * Member initializer constructor. + * + * @param start is a possible start token. + * @param end is a possible end token. + * @param shortForm is a possible short form token. + * @param descriptor The Descriptor this SyntaxDescriptor belongs to. + * @param depth Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + SyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, + Handle descriptor, ssize_t depth) + : start(start), + end(end), + shortForm(shortForm), + descriptor(descriptor), + depth(depth) + { + } + + /** + * Inserts all tokens referenced in this SyntaxDescriptor into the + * given TokenSet. Skips token ids set to Tokens::Empty. + * + * @param set is the TokenSet instance into which the Tokens should be + * inserted. 
+ */ + void insertIntoTokenSet(TokenSet &set) const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + * + * @return true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + */ + bool isAnnotation() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + * + * @return true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + */ + bool isStruct() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. + * + * @return true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. + */ + bool isFieldDescriptor() const; + + /** + * Returns true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + * + * @return true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + */ + bool isEmpty() const; +}; +} +#endif \ No newline at end of file diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index d7b2547..e471881 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -34,6 +34,7 @@ #include #include +#include namespace ousia { @@ -96,7 +97,7 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Removes the previously pushed list of tokens from the stack. 
diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index 734976a..12df0fd 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -74,7 +74,7 @@ Variant Handler::readData() return handlerData.callbacks.readData(); } -void Handler::pushTokens(const std::vector &tokens) +void Handler::pushTokens(const std::vector &tokens) { handlerData.callbacks.pushTokens(tokens); } diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 848d395..19660d0 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace ousia { @@ -200,7 +201,7 @@ protected: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Calls the corresponding function in the HandlerCallbacks instance. diff --git a/src/core/parser/stack/TokenStack.cpp b/src/core/parser/stack/TokenStack.cpp index 6afeaed..ac1d94e 100644 --- a/src/core/parser/stack/TokenStack.cpp +++ b/src/core/parser/stack/TokenStack.cpp @@ -21,7 +21,7 @@ namespace ousia { namespace parser_stack { -void TokenStack::pushTokens(const std::vector &tokens) +void TokenStack::pushTokens(const std::vector &tokens) { stack.push_back(tokens); } @@ -35,7 +35,7 @@ TokenSet TokenStack::tokens() const } TokenSet res; - for (const TokenSyntaxDescriptor &descr : stack.back()) { + for (const SyntaxDescriptor &descr : stack.back()) { descr.insertIntoTokenSet(res); } return res; diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp index 9669f50..af734bb 100644 --- a/src/core/parser/stack/TokenStack.hpp +++ b/src/core/parser/stack/TokenStack.hpp @@ -32,6 +32,7 @@ #include #include +#include namespace ousia { namespace parser_stack { @@ -52,7 +53,7 @@ private: * Stack containing vectors of 
TokenSyntaxDescriptor instances as given by * the user. */ - std::vector> stack; + std::vector> stack; /** * Constructor of the TokenStack class. @@ -86,7 +87,7 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Removes the previously pushed list of tokens from the stack. -- cgit v1.2.3 From 522580cfdfc9e6dc3448240448c29533e68f240f Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:52:34 +0100 Subject: added check for witespace characters in Utils::isUserDefinedToken --- src/core/common/Utils.cpp | 15 +++++++++++---- src/core/common/Utils.hpp | 1 + test/core/common/UtilsTest.cpp | 2 ++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 219b437..a87ff6d 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -124,7 +124,8 @@ bool Utils::isUserDefinedToken(const std::string &token) // Make sure the token meets is neither empty, nor starts or ends with an // alphanumeric character const size_t len = token.size(); - if (len == 0 || isAlphanumeric(token[0]) || isAlphanumeric(token[len - 1])) { + if (len == 0 || isAlphanumeric(token[0]) || + isAlphanumeric(token[len - 1])) { return false; } @@ -134,13 +135,19 @@ bool Utils::isUserDefinedToken(const std::string &token) return false; } + // Make sure the token does not contain any whitespaces. 
+ for (char c : token) { + if (isWhitespace(c)) { + return false; + } + } + // Make sure the token contains other characters but { and } - for (char c: token) { + for (char c : token) { if (c != '{' && c != '}') { return true; } } return false; } -} - +} \ No newline at end of file diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 25a4de5..d9e26da 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -117,6 +117,7 @@ public: *
  • '%', '%{', '}%'
  • * * + *
  • The token does not contain any whitespaces.
  • * */ static bool isUserDefinedToken(const std::string &token); diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 54890ee..2aaa430 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -148,6 +148,7 @@ TEST(Utils, isUserDefinedToken) EXPECT_TRUE(Utils::isUserDefinedToken("`")); EXPECT_TRUE(Utils::isUserDefinedToken("<")); EXPECT_TRUE(Utils::isUserDefinedToken(">")); + EXPECT_TRUE(Utils::isUserDefinedToken("<+>")); EXPECT_FALSE(Utils::isUserDefinedToken("a:")); EXPECT_FALSE(Utils::isUserDefinedToken("a:a")); EXPECT_FALSE(Utils::isUserDefinedToken(":a")); @@ -158,6 +159,7 @@ TEST(Utils, isUserDefinedToken) EXPECT_FALSE(Utils::isUserDefinedToken("<\\")); EXPECT_FALSE(Utils::isUserDefinedToken("\\>")); EXPECT_FALSE(Utils::isUserDefinedToken("{!")); + EXPECT_FALSE(Utils::isUserDefinedToken("< + >")); } } -- cgit v1.2.3 From ee943c5e9b60cf577ff236a694df180db89b0972 Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:53:20 +0100 Subject: integrated syntax tokens in Domain. --- src/core/model/Domain.cpp | 193 +++++++++++++++++++++++--- src/core/model/Domain.hpp | 297 +++++++++++++++++++++++++++++++++++++---- test/core/model/DomainTest.cpp | 165 ++++++++++++++++++++++- 3 files changed, 607 insertions(+), 48 deletions(-) diff --git a/src/core/model/Domain.cpp b/src/core/model/Domain.cpp index 8255401..587a382 100644 --- a/src/core/model/Domain.cpp +++ b/src/core/model/Domain.cpp @@ -20,8 +20,9 @@ #include #include -#include #include +#include +#include #include "Domain.hpp" @@ -169,52 +170,60 @@ static NodeVector pathTo(const Node *start, Logger &logger, return shortest; } +struct CollectState { + Node *n; + size_t depth; + + CollectState(Node *n, size_t depth) : n(n), depth(depth) {} +}; + template static NodeVector collect(const Node *start, F match) { // result NodeVector res; // queue for breadth-first search of graph. 
- std::queue> q; + std::queue q; // put the initial node on the stack. - q.push(const_cast(start)); + q.push(CollectState(const_cast(start), 0)); // set of visited nodes. std::unordered_set visited; while (!q.empty()) { - Rooted n = q.front(); + CollectState state = q.front(); q.pop(); // do not proceed if this node was already visited. - if (!visited.insert(n.get()).second) { + if (!visited.insert(state.n).second) { continue; } - if (n->isa(&RttiTypes::StructuredClass)) { - Rooted strct = n.cast(); + if (state.n->isa(&RttiTypes::Descriptor)) { + Rooted strct{static_cast(state.n)}; // look through all fields. NodeVector fields = strct->getFieldDescriptors(); for (auto fd : fields) { // note matches. - if (match(fd)) { + if (match(fd, state.depth)) { res.push_back(fd); } // only continue in the TREE field. if (fd->getFieldType() == FieldDescriptor::FieldType::TREE) { - q.push(fd); + q.push(CollectState(fd.get(), state.depth)); } } } else { // otherwise this is a FieldDescriptor. - Rooted field = n.cast(); + Rooted field{ + static_cast(state.n)}; // and we proceed by visiting all permitted children. for (auto c : field->getChildrenWithSubclasses()) { // note matches. - if (match(c)) { + if (match(c, state.depth)) { res.push_back(c); } // We only continue our search via transparent children. if (c->isTransparent()) { - q.push(c); + q.push(CollectState(c.get(), state.depth + 1)); } } } @@ -222,28 +231,59 @@ static NodeVector collect(const Node *start, F match) return res; } +static std::vector collectPermittedTokens( + const Node *start, Handle domain) +{ + // gather SyntaxDescriptors for structure children first. + std::vector res; + collect(start, [&res](Handle n, size_t depth) { + SyntaxDescriptor stx; + if (n->isa(&RttiTypes::FieldDescriptor)) { + stx = n.cast()->getSyntaxDescriptor(depth); + } else { + stx = n.cast()->getSyntaxDescriptor(depth); + } + // do not add trivial SyntaxDescriptors. 
+ if (!stx.isEmpty()) { + res.push_back(stx); + } + return false; + }); + // gather SyntaxDescriptors for AnnotationClasses. + for (auto a : domain->getAnnotationClasses()) { + SyntaxDescriptor stx = a->getSyntaxDescriptor(); + if (!stx.isEmpty()) { + res.push_back(stx); + } + } + return res; +} + /* Class FieldDescriptor */ FieldDescriptor::FieldDescriptor(Manager &mgr, Handle primitiveType, Handle parent, FieldType fieldType, - std::string name, bool optional) + std::string name, bool optional, + WhitespaceMode whitespaceMode) : Node(mgr, std::move(name), parent), children(this), fieldType(fieldType), primitiveType(acquire(primitiveType)), optional(optional), - primitive(true) + primitive(true), + whitespaceMode(whitespaceMode) { } FieldDescriptor::FieldDescriptor(Manager &mgr, Handle parent, FieldType fieldType, std::string name, - bool optional) + bool optional, WhitespaceMode whitespaceMode) : Node(mgr, std::move(name), parent), children(this), fieldType(fieldType), optional(optional), - primitive(false) + primitive(false), + whitespaceMode(whitespaceMode) { } @@ -272,6 +312,25 @@ bool FieldDescriptor::doValidate(Logger &logger) const } else { valid = valid & validateName(logger); } + // check start and end token. + if (!startToken.special && !startToken.token.empty() && + !Utils::isUserDefinedToken(startToken.token)) { + // TODO: Correct error message. + logger.error(std::string("Field \"") + getName() + + "\" has an invalid custom start token: " + + startToken.token, + *this); + valid = false; + } + if (!endToken.special && !endToken.token.empty() && + !Utils::isUserDefinedToken(endToken.token)) { + // TODO: Correct error message. + logger.error(std::string("Field \"") + getName() + + "\" has an invalid custom end token: " + + endToken.token, + *this); + valid = false; + } // check consistency of FieldType with the rest of the FieldDescriptor. 
if (primitive) { @@ -325,7 +384,7 @@ bool FieldDescriptor::doValidate(Logger &logger) const } static void gatherSubclasses( - std::unordered_set& visited, + std::unordered_set &visited, NodeVector &res, Handle strct) { // this check is to prevent cycles. @@ -334,7 +393,7 @@ static void gatherSubclasses( } for (auto sub : strct->getSubclasses()) { // this check is to prevent cycles. - if(visited.count(sub.get())){ + if (visited.count(sub.get())) { continue; } res.push_back(sub); @@ -381,7 +440,7 @@ NodeVector FieldDescriptor::pathTo(Handle field, NodeVector FieldDescriptor::getDefaultFields() const { // TODO: In principle a cast would be nicer here, but for now we copy. - NodeVector nodes = collect(this, [](Handle n) { + NodeVector nodes = collect(this, [](Handle n, size_t depth) { if (!n->isa(&RttiTypes::FieldDescriptor)) { return false; } @@ -396,6 +455,16 @@ NodeVector FieldDescriptor::getDefaultFields() const return res; } +std::vector FieldDescriptor::getPermittedTokens() const +{ + if (getParent() == nullptr || + getParent().cast()->getParent() == nullptr) { + return std::vector(); + } + return collectPermittedTokens( + this, getParent().cast()->getParent().cast()); +} + /* Class Descriptor */ void Descriptor::doResolve(ResolutionState &state) @@ -443,6 +512,25 @@ bool Descriptor::doValidate(Logger &logger) const } valid = valid & attributesDescriptor->validate(logger); } + + // check start and end token. 
+ if (!startToken.special && !startToken.token.empty() && + !Utils::isUserDefinedToken(startToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom start token: " + + startToken.token, + *this); + valid = false; + } + if (!endToken.special && !endToken.token.empty() && + !Utils::isUserDefinedToken(endToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom end token: " + + endToken.token, + *this); + valid = false; + } + // check that only one FieldDescriptor is of type TREE. auto fds = Descriptor::getFieldDescriptors(); bool hasTREE = false; @@ -483,7 +571,7 @@ std::pair, bool> Descriptor::pathTo( NodeVector Descriptor::getDefaultFields() const { // TODO: In principle a cast would be nicer here, but for now we copy. - NodeVector nodes = collect(this, [](Handle n) { + NodeVector nodes = collect(this, [](Handle n, size_t depth) { if (!n->isa(&RttiTypes::FieldDescriptor)) { return false; } @@ -501,7 +589,7 @@ NodeVector Descriptor::getDefaultFields() const NodeVector Descriptor::getPermittedChildren() const { // TODO: In principle a cast would be nicer here, but for now we copy. - NodeVector nodes = collect(this, [](Handle n) { + NodeVector nodes = collect(this, [](Handle n, size_t depth) { return n->isa(&RttiTypes::StructuredClass); }); NodeVector res; @@ -669,6 +757,14 @@ std::pair, bool> Descriptor::createFieldDescriptor( return std::make_pair(fd, sorted); } +std::vector Descriptor::getPermittedTokens() const +{ + if (getParent() == nullptr) { + return std::vector(); + } + return collectPermittedTokens(this, getParent().cast()); +} + /* Class StructuredClass */ StructuredClass::StructuredClass(Manager &mgr, std::string name, @@ -709,6 +805,16 @@ bool StructuredClass::doValidate(Logger &logger) const logger.error(cardinality.toString() + " is not a cardinality!", *this); valid = false; } + + // check short token. 
+ if (!shortToken.special && !shortToken.token.empty() && + !Utils::isUserDefinedToken(shortToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom short form token: " + + shortToken.token, + *this); + valid = false; + } // check the validity of this superclass. if (superclass != nullptr) { valid = valid & superclass->validate(logger); @@ -961,6 +1067,51 @@ Rooted Domain::createAnnotationClass(std::string name) new AnnotationClass(getManager(), std::move(name), this)}; } +static void gatherTokenDescriptors( + Handle desc, std::vector &res, + std::unordered_set &visited) +{ + // add the TokenDescriptors for the Descriptor itself. + if (!desc->getStartToken().isEmpty()) { + res.push_back(desc->getStartTokenPointer()); + } + if (!desc->getEndToken().isEmpty()) { + res.push_back(desc->getEndTokenPointer()); + } + // add the TokenDescriptors for its FieldDescriptors. + for (auto fd : desc->getFieldDescriptors()) { + if (!visited.insert(fd.get()).second) { + continue; + } + if (!fd->getStartToken().isEmpty()) { + res.push_back(fd->getStartTokenPointer()); + } + if (!fd->getEndToken().isEmpty()) { + res.push_back(fd->getEndTokenPointer()); + } + } +} + +std::vector Domain::getAllTokenDescriptors() const +{ + std::vector res; + // note all fields that are already visited because FieldReferences might + // lead to doubled fields. + std::unordered_set visited; + // add the TokenDescriptors for the StructuredClasses (and their fields). + for (auto s : structuredClasses) { + if (!s->getShortToken().isEmpty()) { + res.push_back(s->getShortTokenPointer()); + } + gatherTokenDescriptors(s, res, visited); + } + // add the TokenDescriptors for the AnnotationClasses (and their fields). 
+ for (auto a : annotationClasses) { + gatherTokenDescriptors(a, res, visited); + } + return res; +} + /* Type registrations */ namespace RttiTypes { diff --git a/src/core/model/Domain.hpp b/src/core/model/Domain.hpp index 7e10d91..e984ed9 100644 --- a/src/core/model/Domain.hpp +++ b/src/core/model/Domain.hpp @@ -167,11 +167,13 @@ #ifndef _OUSIA_MODEL_DOMAIN_HPP_ #define _OUSIA_MODEL_DOMAIN_HPP_ +#include #include #include #include "Node.hpp" #include "RootNode.hpp" +#include "Syntax.hpp" #include "Typesystem.hpp" namespace ousia { @@ -225,6 +227,9 @@ private: Owned primitiveType; bool optional; bool primitive; + TokenDescriptor startToken; + TokenDescriptor endToken; + WhitespaceMode whitespaceMode; protected: bool doValidate(Logger &logger) const override; @@ -233,39 +238,46 @@ public: /** * This is the constructor for primitive fields. * - * @param mgr is the global Manager instance. - * @param parent is a handle of the Descriptor node that has this - * FieldDescriptor. - * @param primitiveType is a handle to some Type in some Typesystem of which - * one instance is allowed to fill this field. - * @param name is the name of this field. - * @param optional should be set to 'false' is this field needs to be - * filled in order for an instance of the parent - * Descriptor to be valid. + * @param mgr is the global Manager instance. + * @param parent is a handle of the Descriptor node that has this + * FieldDescriptor. + * @param primitiveType is a handle to some Type in some Typesystem of + *which + * one instance is allowed to fill this field. + * @param name is the name of this field. + * @param optional should be set to 'false' is this field needs to be + * filled in order for an instance of the parent + * Descriptor to be valid. + * @param whitespaceMode the WhitespaceMode to be used when an instance of + * this FieldDescriptor is parsed. 
*/ FieldDescriptor(Manager &mgr, Handle primitiveType, Handle parent, FieldType fieldType = FieldType::TREE, - std::string name = "", bool optional = false); + std::string name = "", bool optional = false, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * This is the constructor for non-primitive fields. You have to provide * children here later on. * - * @param mgr is the global Manager instance. - * @param parent is a handle of the Descriptor node that has this - * FieldDescriptor. - * @param fieldType is the FieldType of this FieldDescriptor, either - * TREE for the main or default structure or SUBTREE - * for supporting structures. - * @param name is the name of this field. - * @param optional should be set to 'false' is this field needs to be - * filled in order for an instance of the parent - * Descriptor to be valid. + * @param mgr is the global Manager instance. + * @param parent is a handle of the Descriptor node that has this + * FieldDescriptor. + * @param fieldType is the FieldType of this FieldDescriptor, either + * TREE for the main or default structure or SUBTREE + * for supporting structures. + * @param name is the name of this field. + * @param optional should be set to 'false' is this field needs to be + * filled in order for an instance of the parent + * Descriptor to be valid. + * @param whitespaceMode the WhitespaceMode to be used when an instance of + * this FieldDescriptor is parsed. */ FieldDescriptor(Manager &mgr, Handle parent = nullptr, FieldType fieldType = FieldType::TREE, - std::string name = "", bool optional = false); + std::string name = "", bool optional = false, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Returns a const reference to the NodeVector of StructuredClasses whose @@ -437,6 +449,109 @@ public: * children of an instance of this Descriptor. */ NodeVector getDefaultFields() const; + + /** + * Returns a pointer to the start TokenDescriptor. 
This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * Note that this does not invalidate the FieldDescriptor. So use with + * care. + * + * @return a pointer to the start TokenDescriptor. + */ + TokenDescriptor *getStartTokenPointer() { return &startToken; } + + /** + * Returns a copy of the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a copy of the start TokenDescriptor. + */ + TokenDescriptor getStartToken() const { return startToken; } + + /** + * Sets the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @param st the new start TokenDescriptor. + */ + void setStartToken(TokenDescriptor st) + { + invalidate(); + startToken = st; + } + + /** + * Returns a pointer to the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a pointer to the end TokenDescriptor. + */ + TokenDescriptor *getEndTokenPointer() { return &endToken; } + + /** + * Returns a copy of the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a copy of the end TokenDescriptor. + */ + TokenDescriptor getEndToken() const { return endToken; } + + /** + * Sets the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @param e the new end TokenDescriptor. + */ + void setEndToken(TokenDescriptor e) + { + invalidate(); + endToken = e; + } + + /** + * Returns the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + * + * @return the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. 
+ */ + WhitespaceMode getWhitespaceMode() const { return whitespaceMode; } + + /** + * Sets the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + * + * @param wm the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + */ + WhitespaceMode setWhitespaceMode(WhitespaceMode wm) + { + return whitespaceMode = wm; + } + + /** + * Returns the SyntaxDescriptor for this FieldDescriptor. + * + * @return the SyntaxDescriptor for this FieldDescriptor. + */ + SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) + { + SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty, + const_cast(this), depth}; + return stx; + } + + /** + * Returns a vector of SyntaxDescriptors, one for each Descriptor + * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is + * permitted as child of this FieldDescriptor. This also makes use + * of transparency. + * + * @return a vector of SyntaxDescriptors, one for each Descriptor that is + * permitted as child of this FieldDescriptor + */ + std::vector getPermittedTokens() const; }; /** @@ -460,7 +575,10 @@ public: * * \endcode * - * key="value" inside the A-node would be an attribute, while value + * key="value" inside the A-node would be an attribute, while + * \code{.xml} + * value + * \endcode * would be a primitive field. While equivalent in XML the semantics are * different: An attribute describes indeed attributes, features of one single * node whereas a primitive field describes the _content_ of a node. @@ -472,6 +590,8 @@ class Descriptor : public Node { private: Owned attributesDescriptor; NodeVector fieldDescriptors; + TokenDescriptor startToken; + TokenDescriptor endToken; bool addAndSortFieldDescriptor(Handle fd, Logger &logger); @@ -720,6 +840,85 @@ public: * of an instance of this Descriptor in the structure tree. */ NodeVector getPermittedChildren() const; + + /** + * Returns a pointer to the start TokenDescriptor. 
This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a pointer to the start TokenDescriptor. + */ + TokenDescriptor *getStartTokenPointer() { return &startToken; } + + /** + * Returns a copy of the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a copy of the start TokenDescriptor. + */ + TokenDescriptor getStartToken() const { return startToken; } + + /** + * Sets the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @param st the new start TokenDescriptor. + */ + void setStartToken(TokenDescriptor st) + { + invalidate(); + startToken = st; + } + + /** + * Returns a pointer to the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a pointer to the end TokenDescriptor. + */ + TokenDescriptor *getEndTokenPointer() { return &endToken; } + + /** + * Returns a copy of the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a copy of the end TokenDescriptor. + */ + TokenDescriptor getEndToken() const { return endToken; } + + /** + * Sets the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @param e the new end TokenDescriptor. + */ + void setEndToken(TokenDescriptor e) + { + invalidate(); + endToken = e; + } + + /** + * Returns the SyntaxDescriptor for this Descriptor. + * + * @return the SyntaxDescriptor for this Descriptor. 
+ */ + virtual SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) + { + SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty, + const_cast(this), depth}; + return stx; + } + + /** + * Returns a vector of SyntaxDescriptors, one for each Descriptor + * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is + * permitted as child of this Descriptor. This also makes use + * of transparency. + * + * @return a vector of SyntaxDescriptors, one for each Descriptor that is + * permitted as child of this Descriptor. + */ + std::vector getPermittedTokens() const; }; /* * TODO: We should discuss Cardinalities one more time. Is it smart to define @@ -806,6 +1005,7 @@ private: NodeVector subclasses; bool transparent; bool root; + TokenDescriptor shortToken; /** * Helper method for getFieldDescriptors. @@ -963,6 +1163,50 @@ public: invalidate(); root = std::move(r); } + + /** + * Returns a pointer to the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. + * + * @return a pointer to the short TokenDescriptor. + */ + TokenDescriptor *getShortTokenPointer() { return &shortToken; } + + /** + * Returns a copy of the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. + * + * @return a copy of the short TokenDescriptor. + */ + TokenDescriptor getShortToken() const { return shortToken; } + + /** + * Sets the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. + * + * @param s the new short TokenDescriptor. + */ + void setShortToken(TokenDescriptor s) + { + invalidate(); + shortToken = s; + } + + /** + * Returns the SyntaxDescriptor for this StructuredClass. + * + * @return the SyntaxDescriptor for this StructuredClass. 
+ */ + SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) override + { + SyntaxDescriptor stx{getStartToken().id, getEndToken().id, + shortToken.id, const_cast(this), + depth}; + return stx; + } }; /** @@ -1188,6 +1432,13 @@ public: { domains.insert(domains.end(), ds.begin(), ds.end()); } + + /** + * Returns all TokenDescriptors of classes and fields in this Ontology. + * + * @return all TokenDescriptors of classes and fields in this Ontology. + */ + std::vector getAllTokenDescriptors() const; }; namespace RttiTypes { @@ -1200,4 +1451,4 @@ extern const Rtti Domain; } } -#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */ +#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */ \ No newline at end of file diff --git a/test/core/model/DomainTest.cpp b/test/core/model/DomainTest.cpp index 6bbf26d..f59e745 100644 --- a/test/core/model/DomainTest.cpp +++ b/test/core/model/DomainTest.cpp @@ -82,9 +82,7 @@ TEST(Domain, testDomainResolving) } // i use this wrapper due to the strange behaviour of GTEST. -static void assertFalse(bool b){ - ASSERT_FALSE(b); -} +static void assertFalse(bool b) { ASSERT_FALSE(b); } static Rooted createUnsortedPrimitiveField( Handle strct, Handle type, Logger &logger, bool tree, @@ -170,7 +168,6 @@ TEST(StructuredClass, getFieldDescriptors) } } - TEST(StructuredClass, getFieldDescriptorsCycles) { Logger logger; @@ -523,6 +520,91 @@ TEST(Descriptor, getPermittedChildrenCycles) ASSERT_EQ(A, children[0]); } +TEST(Descriptor, getSyntaxDescriptor) +{ + // build an ontology with some custom syntax. 
+ Manager mgr{1}; + Logger logger; + Rooted sys{new SystemTypesystem(mgr)}; + // Construct the domain + Rooted domain{new Domain(mgr, sys, "ontology")}; + Rooted A{new StructuredClass( + mgr, "A", domain, Cardinality::any(), {nullptr}, true, true)}; + A->setStartToken(TokenDescriptor(Tokens::Indent)); + A->setEndToken(TokenDescriptor(Tokens::Dedent)); + { + TokenDescriptor sh{"<+>"}; + sh.id = 1; + A->setShortToken(sh); + } + // check the SyntaxDescriptor + SyntaxDescriptor stx = A->getSyntaxDescriptor(); + ASSERT_EQ(Tokens::Indent, stx.start); + ASSERT_EQ(Tokens::Dedent, stx.end); + ASSERT_EQ(1, stx.shortForm); + ASSERT_EQ(A, stx.descriptor); + ASSERT_TRUE(stx.isStruct()); + ASSERT_FALSE(stx.isAnnotation()); + ASSERT_FALSE(stx.isFieldDescriptor()); +} + +TEST(Descriptor, getPermittedTokens) +{ + // build an ontology with some custom syntax. + Manager mgr{1}; + Logger logger; + Rooted sys{new SystemTypesystem(mgr)}; + // Construct the domain + Rooted domain{new Domain(mgr, sys, "ontology")}; + // add one StructuredClass with all tokens set. + Rooted A{new StructuredClass( + mgr, "A", domain, Cardinality::any(), {nullptr}, true, true)}; + A->setStartToken(TokenDescriptor(Tokens::Indent)); + A->setEndToken(TokenDescriptor(Tokens::Dedent)); + { + TokenDescriptor sh{"<+>"}; + sh.id = 1; + A->setShortToken(sh); + } + // add a field with one token set. + Rooted A_field = A->createFieldDescriptor(logger).first; + A_field->setEndToken(TokenDescriptor(Tokens::Newline)); + A_field->addChild(A); + // add an annotation with start and end set. + Rooted A_anno = domain->createAnnotationClass("A"); + { + TokenDescriptor start{"<"}; + start.id = 7; + A_anno->setStartToken(start); + } + { + TokenDescriptor end{">"}; + end.id = 8; + A_anno->setEndToken(end); + } + // add a trivial annotation, which should not be returned. + Rooted B_anno = domain->createAnnotationClass("B"); + ASSERT_TRUE(domain->validate(logger)); + + // check result. 
+ std::vector stxs = A->getPermittedTokens(); + ASSERT_EQ(3, stxs.size()); + // the field should be first, because A itself should not be collected + // directly. + ASSERT_EQ(A_field, stxs[0].descriptor); + ASSERT_EQ(Tokens::Empty, stxs[0].start); + ASSERT_EQ(Tokens::Newline, stxs[0].end); + ASSERT_EQ(Tokens::Empty, stxs[0].shortForm); + ASSERT_EQ(A, stxs[1].descriptor); + ASSERT_EQ(Tokens::Indent, stxs[1].start); + ASSERT_EQ(Tokens::Dedent, stxs[1].end); + ASSERT_EQ(1, stxs[1].shortForm); + ASSERT_EQ(A_anno, stxs[2].descriptor); + ASSERT_EQ(7, stxs[2].start); + ASSERT_EQ(8, stxs[2].end); + ASSERT_EQ(Tokens::Empty, stxs[2].shortForm); +} + TEST(StructuredClass, isSubclassOf) { // create an inheritance hierarchy. @@ -629,6 +711,14 @@ TEST(Domain, validate) base_field->setPrimitiveType(sys->getStringType()); ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); ASSERT_TRUE(domain->validate(logger)); + // add an invalid start token. + base_field->setStartToken(TokenDescriptor("< + >")); + ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); + ASSERT_FALSE(domain->validate(logger)); + // make it valid. + base_field->setStartToken(TokenDescriptor("<")); + ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); + ASSERT_TRUE(domain->validate(logger)); // add a subclass for our base class. Rooted sub{new StructuredClass(mgr, "sub", domain)}; // this should be valid in itself. @@ -686,4 +776,71 @@ TEST(Domain, validate) ASSERT_TRUE(domain->validate(logger)); } } + +TEST(Domain, getAllTokenDescriptors) +{ + // build an ontology with some custom syntax. + Manager mgr{1}; + Logger logger; + Rooted sys{new SystemTypesystem(mgr)}; + // Construct the domain + Rooted domain{new Domain(mgr, sys, "ontology")}; + // add one StructuredClass with all tokens set. 
+ Rooted A{new StructuredClass( + mgr, "A", domain, Cardinality::any(), {nullptr}, true, true)}; + A->setStartToken(TokenDescriptor(Tokens::Indent)); + A->setEndToken(TokenDescriptor(Tokens::Dedent)); + { + TokenDescriptor sh{"<+>"}; + sh.id = 1; + A->setShortToken(sh); + } + // add a field with one token set. + Rooted A_field = A->createFieldDescriptor(logger).first; + A_field->setEndToken(TokenDescriptor(Tokens::Newline)); + A_field->addChild(A); + // add an annotation with start and end set. + Rooted A_anno = domain->createAnnotationClass("A"); + { + TokenDescriptor start{"<"}; + start.id = 7; + A_anno->setStartToken(start); + } + { + TokenDescriptor end{">"}; + end.id = 8; + A_anno->setEndToken(end); + } + // add a trivial annotation, which should not be returned. + Rooted B_anno = domain->createAnnotationClass("B"); + ASSERT_TRUE(domain->validate(logger)); + + // check the result. + std::vector tks = domain->getAllTokenDescriptors(); + + // A short token + ASSERT_EQ("<+>", tks[0]->token); + ASSERT_EQ(1, tks[0]->id); + ASSERT_FALSE(tks[0]->special); + // A start token + ASSERT_EQ("", tks[1]->token); + ASSERT_EQ(Tokens::Indent, tks[1]->id); + ASSERT_TRUE(tks[1]->special); + // A end token + ASSERT_EQ("", tks[2]->token); + ASSERT_EQ(Tokens::Dedent, tks[2]->id); + ASSERT_TRUE(tks[2]->special); + // A field end token + ASSERT_EQ("", tks[3]->token); + ASSERT_EQ(Tokens::Newline, tks[3]->id); + ASSERT_TRUE(tks[3]->special); + // A anno start token + ASSERT_EQ("<", tks[4]->token); + ASSERT_EQ(7, tks[4]->id); + ASSERT_FALSE(tks[4]->special); + // A anno end token + ASSERT_EQ(">", tks[5]->token); + ASSERT_EQ(8, tks[5]->id); + ASSERT_FALSE(tks[5]->special); +} } \ No newline at end of file -- cgit v1.2.3 From 4b5f37d07e4e691848b243ae795bb59893a6379c Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:55:41 +0100 Subject: added another domain test case for invalid syntax tokens. 
--- test/core/model/DomainTest.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/core/model/DomainTest.cpp b/test/core/model/DomainTest.cpp index f59e745..b3c5771 100644 --- a/test/core/model/DomainTest.cpp +++ b/test/core/model/DomainTest.cpp @@ -701,6 +701,14 @@ TEST(Domain, validate) base->setName("myClass"); ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); ASSERT_TRUE(domain->validate(logger)); + // add an invalid short token. + base->setShortToken(TokenDescriptor("bla")); + ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); + ASSERT_FALSE(domain->validate(logger)); + // make it valid. + base->setShortToken(TokenDescriptor("!bla!")); + ASSERT_EQ(ValidationState::UNKNOWN, domain->getValidationState()); + ASSERT_TRUE(domain->validate(logger)); // Let's add a primitive field (without a primitive type at first) Rooted base_field = base->createPrimitiveFieldDescriptor(nullptr, logger).first; -- cgit v1.2.3 From 3cc6ebf406c53b0c82a52f0daf1ce14c62f7b521 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 16:30:51 +0100 Subject: Implemented new "start" methods in the Handler instances --- CMakeLists.txt | 6 +- src/core/parser/stack/DocumentHandler.cpp | 133 +++++++++++++++--------- src/core/parser/stack/DocumentHandler.hpp | 69 ++++++++++--- src/core/parser/stack/DomainHandler.cpp | 76 ++++++++------ src/core/parser/stack/DomainHandler.hpp | 22 ++-- src/core/parser/stack/Handler.cpp | 102 +++++++++---------- src/core/parser/stack/Handler.hpp | 150 +++++++++++++++------------- src/core/parser/stack/Stack.cpp | 24 +++-- src/core/parser/stack/TypesystemHandler.cpp | 29 +++--- src/core/parser/stack/TypesystemHandler.hpp | 15 ++- 10 files changed, 375 insertions(+), 251 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cef1e31..45310a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,7 +187,7 @@ ADD_LIBRARY(ousia_core src/core/parser/stack/Handler 
src/core/parser/stack/ImportIncludeHandler src/core/parser/stack/State -# src/core/parser/stack/Stack + src/core/parser/stack/Stack src/core/parser/stack/TokenRegistry src/core/parser/stack/TokenStack src/core/parser/stack/TypesystemHandler @@ -387,7 +387,7 @@ IF(TEST) # ) ADD_EXECUTABLE(ousia_test_osml -# test/formats/osml/OsmlParserTest + test/formats/osml/OsmlParserTest test/formats/osml/OsmlStreamParserTest ) @@ -400,7 +400,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_osxml test/formats/osxml/OsxmlEventParserTest -# test/formats/osxml/OsxmlParserTest + test/formats/osxml/OsxmlParserTest ) TARGET_LINK_LIBRARIES(ousia_test_osxml diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index 714ab1b..de6e367 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -37,7 +37,8 @@ namespace parser_stack { /* DocumentHandler */ -bool DocumentHandler::start(Variant::mapType &args) +bool DocumentHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted document = context().getProject()->createDocument(args["name"].asString()); @@ -52,6 +53,25 @@ void DocumentHandler::end() { scope().pop(logger()); } /* DocumentChildHandler */ +DocumentChildHandler::DocumentChildHandler(const HandlerData &handlerData) + : Handler(handlerData), mode(Mode::STRUCT) +{ +} + +void DocumentChildHandler::setMode(Mode mode, const std::string &name) +{ + this->mode = mode; + this->name = name; + this->token = Token(); +} + +void DocumentChildHandler::setMode(Mode mode, const Token &token) +{ + this->mode = mode; + this->name = token.content; + this->token = token; +} + void DocumentChildHandler::preamble(Rooted &parentNode, size_t &fieldIdx, DocumentEntity *&parent) { @@ -122,10 +142,14 @@ void DocumentChildHandler::createPath(const size_t &firstFieldIdx, scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, false); } -bool DocumentChildHandler::start(Variant::mapType &args) +bool 
DocumentChildHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { - // extract the special "name" attribute from the input arguments. - // the remaining attributes will be forwarded to the newly constructed + // Set the internal mode to STRUCT and copy the name + setMode(Mode::STRUCT, name); + + // Extract the special "name" attribute from the input arguments. + // The remaining attributes will be forwarded to the newly constructed // element. std::string nameAttr; { @@ -152,11 +176,11 @@ bool DocumentChildHandler::start(Variant::mapType &args) return false; } Rooted strct = scope().resolve( - Utils::split(name(), ':'), logger()); + Utils::split(name, ':'), logger()); if (strct == nullptr) { // if we could not resolve the name, throw an exception. throw LoggableException( - std::string("\"") + name() + "\" could not be resolved.", + std::string("\"") + name + "\" could not be resolved.", location()); } entity = parentNode.cast()->createRootStructuredEntity( @@ -169,13 +193,6 @@ bool DocumentChildHandler::start(Variant::mapType &args) preamble(parentNode, fieldIdx, parent); - // TODO: REMOVE - std::string thisName = name(); - std::string parentClassName; - if (parent != nullptr) { - parentClassName = parent->getDescriptor()->getName(); - } - /* * Try to find a FieldDescriptor for the given tag if we are not in * a field already. 
This does _not_ try to construct transparent @@ -183,7 +200,7 @@ bool DocumentChildHandler::start(Variant::mapType &args) */ { ssize_t newFieldIdx = - parent->getDescriptor()->getFieldDescriptorIndex(name()); + parent->getDescriptor()->getFieldDescriptorIndex(name); if (newFieldIdx != -1) { // Check whether explicit fields are allowed here, if not if (scope().getFlag(ParserFlag::POST_EXPLICIT_FIELDS)) { @@ -191,17 +208,17 @@ bool DocumentChildHandler::start(Variant::mapType &args) std::string( "Data or structure commands have already been " "given, command \"") + - name() + std::string( - "\" is not interpreted explicit " - "field. Move explicit field " - "references to the beginning."), + name + std::string( + "\" is not interpreted explicit " + "field. Move explicit field " + "references to the beginning."), location()); } else { Rooted field{new DocumentField( manager(), parentNode, newFieldIdx, false)}; field->setLocation(location()); scope().push(field); - isExplicitField = true; + setMode(Mode::EXPLICIT_FIELD, name); return true; } } @@ -210,11 +227,11 @@ bool DocumentChildHandler::start(Variant::mapType &args) // Otherwise create a new StructuredEntity // TODO: Consider Anchors and AnnotationEntities Rooted strct = scope().resolve( - Utils::split(name(), ':'), logger()); + Utils::split(name, ':'), logger()); if (strct == nullptr) { // if we could not resolve the name, throw an exception. 
throw LoggableException( - std::string("\"") + name() + "\" could not be resolved.", + std::string("\"") + name + "\" could not be resolved.", location()); } @@ -261,24 +278,56 @@ bool DocumentChildHandler::start(Variant::mapType &args) } } +bool DocumentChildHandler::startAnnotation(const std::string &name, + Variant::mapType &args, + AnnotationType annotationType) +{ + // Set the internal mode and name correctly + if (annotationType == AnnotationType::START) { + setMode(Mode::ANNOTATION_START, name); + } else { + setMode(Mode::ANNOTATION_END, name); + } + + // TODO: Handle annotation + return false; +} + +bool DocumentChildHandler::startToken(const Token &token, Handle node) +{ + // Set the internal mode correctly + setMode(Mode::TOKEN, token); + + // TODO: Handle token start + return false; +} + +DocumentChildHandler::EndTokenResult DocumentChildHandler::endToken( + const Token &token, Handle node) +{ + // TODO: Handle token end + return EndTokenResult::ENDED_NONE; +} + void DocumentChildHandler::end() { - // in case of explicit fields we do not want to pop something from the + // In case of explicit fields we do not want to pop something from the // stack. - if (isExplicitField) { - return; + if (mode == Mode::STRUCT) { + // pop the "main" element. + scope().pop(logger()); } - // pop the "main" element. - scope().pop(logger()); } bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) { - if (isExplicitField) { + // TODO: Handle other cases + if (mode == Mode::EXPLICIT_FIELD) { // In case of explicit fields we do not want to create another field. 
isDefault = true; return fieldIdx == 0; } + Rooted parentNode = scope().getLeaf(); assert(parentNode->isa(&RttiTypes::StructuredEntity) || parentNode->isa(&RttiTypes::AnnotationEntity)); @@ -291,7 +340,7 @@ bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) parent->getDescriptor()->getFieldDescriptors(); if (isDefault) { - if(fields.empty()){ + if (fields.empty()) { return false; } fieldIdx = fields.size() - 1; @@ -317,33 +366,19 @@ void DocumentChildHandler::fieldEnd() { assert(scope().getLeaf()->isa(&RttiTypes::DocumentField)); - // pop the field from the stack. + // Pop the field from the stack. scope().pop(logger()); - // pop all remaining transparent elements. + // Pop all remaining transparent elements. while (scope().getLeaf()->isa(&RttiTypes::StructuredEntity) && scope().getLeaf().cast()->isTransparent()) { - // pop the transparent element. + // Pop the transparent element. scope().pop(logger()); - // pop the transparent field. + // Pop the transparent field. 
scope().pop(logger()); } } -bool DocumentChildHandler::annotationStart(const Variant &className, - Variant::mapType &args) -{ - // TODO: Implement - return false; -} - -bool DocumentChildHandler::annotationEnd(const Variant &className, - const Variant &elementName) -{ - // TODO: Implement - return false; -} - bool DocumentChildHandler::convertData(Handle field, Variant &data, Logger &logger) { @@ -436,7 +471,7 @@ bool DocumentChildHandler::data() // this fact Variant text = readData(); if (defaultFields.empty()) { - logger().error("Got data, but structure \"" + name() + + logger().error("Got data, but structure \"" + name + "\" does not have any primitive field", text); } else { @@ -467,7 +502,9 @@ const State DocumentChild = StateBuilder() .createdNodeTypes({&RttiTypes::StructureNode, &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); + .elementHandler(DocumentChildHandler::create) + .supportsAnnotations(true) + .supportsTokens(true); } } diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index c51c188..9a41508 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -53,7 +53,8 @@ class DocumentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; void end() override; /** @@ -91,8 +92,55 @@ public: * defined elements in an Ousía document. */ class DocumentChildHandler : public Handler { +public: + /** + * Enum type used to represent the mode of the DocumentChildHandler. + * TODO: Having to have such a type is actually quite stupid, it would be + * nicer to have separate handler classes for each of these cases. But this + * is a story for a different day. 
+ */ + enum class Mode { + STRUCT, + EXPLICIT_FIELD, + ANNOTATION_START, + ANNOTATION_END, + TOKEN + }; + private: - bool isExplicitField = false; + /** + * Internal Mode of the DocumentChildHandler. + */ + Mode mode; + + /** + * Contains the name of the command or the annotation that is represented + * by this DocumentChildHandler. + */ + std::string name; + + /** + * Token represented by the document child handler. + */ + Token token; + + /** + * Switches the mode to the given mode and copies the given name. Resets the + * token. + * + * @param mode is the new mode. + * @param name is the new name. + */ + void setMode(Mode mode, const std::string &name); + + /** + * Switches the mode to the given mode and copies the given token, sets the + * name to the content of the token. + * + * @param mode is the new mode. + * @param token is the new token. + */ + void setMode(Mode mode, const Token &token); /** * Code shared by both the start(), fieldStart() and the data() method. @@ -161,22 +209,19 @@ private: Logger &logger); public: - using Handler::Handler; + DocumentChildHandler(const HandlerData &handlerData); - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; + bool startAnnotation(const std::string &name, Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(const Token &token, Handle node) override; + EndTokenResult endToken(const Token &token, Handle node) override; void end() override; bool data() override; - bool fieldStart(bool &isDefault, size_t fieldIdx) override; - void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - - bool annotationEnd(const Variant &className, - const Variant &elementName) override; - /** * Creates a new instance of the DocumentChildHandler. 
* diff --git a/src/core/parser/stack/DomainHandler.cpp b/src/core/parser/stack/DomainHandler.cpp index aa18faa..5ca4f5b 100644 --- a/src/core/parser/stack/DomainHandler.cpp +++ b/src/core/parser/stack/DomainHandler.cpp @@ -33,7 +33,8 @@ namespace parser_stack { /* DomainHandler */ -bool DomainHandler::start(Variant::mapType &args) +bool DomainHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { // Create the Domain node Rooted domain = @@ -57,7 +58,8 @@ void DomainHandler::end() { scope().pop(logger()); } /* DomainStructHandler */ -bool DomainStructHandler::start(Variant::mapType &args) +bool DomainStructHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -88,7 +90,8 @@ bool DomainStructHandler::start(Variant::mapType &args) void DomainStructHandler::end() { scope().pop(logger()); } /* DomainAnnotationHandler */ -bool DomainAnnotationHandler::start(Variant::mapType &args) +bool DomainAnnotationHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -106,7 +109,8 @@ void DomainAnnotationHandler::end() { scope().pop(logger()); } /* DomainAttributesHandler */ -bool DomainAttributesHandler::start(Variant::mapType &args) +bool DomainAttributesHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { // Fetch the current typesystem and create the struct node Rooted parent = scope().selectOrThrow(); @@ -122,7 +126,8 @@ void DomainAttributesHandler::end() { scope().pop(logger()); } /* DomainFieldHandler */ -bool DomainFieldHandler::start(Variant::mapType &args) +bool DomainFieldHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { FieldDescriptor::FieldType type; if (args["isSubtree"].asBool()) { @@ -152,15 +157,16 @@ void DomainFieldHandler::end() { scope().pop(logger()); } /* DomainFieldRefHandler */ -bool 
DomainFieldRefHandler::start(Variant::mapType &args) +bool DomainFieldRefHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); - const std::string &name = args["ref"].asString(); + const std::string &ref = args["ref"].asString(); auto loc = location(); - scope().resolveFieldDescriptor(name, parent, logger(), + scope().resolveFieldDescriptor(ref, parent, logger(), [loc](Handle field, Handle parent, Logger &logger) { if (field != nullptr) { @@ -182,7 +188,8 @@ void DomainFieldRefHandler::end() {} /* DomainPrimitiveHandler */ -bool DomainPrimitiveHandler::start(Variant::mapType &args) +bool DomainPrimitiveHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -222,7 +229,8 @@ void DomainPrimitiveHandler::end() { scope().pop(logger()); } /* DomainChildHandler */ -bool DomainChildHandler::start(Variant::mapType &args) +bool DomainChildHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted field = scope().selectOrThrow(); @@ -240,7 +248,8 @@ bool DomainChildHandler::start(Variant::mapType &args) /* DomainParentHandler */ -bool DomainParentHandler::start(Variant::mapType &args) +bool DomainParentHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted strct = scope().selectOrThrow(); @@ -255,7 +264,8 @@ void DomainParentHandler::end() { scope().pop(logger()); } /* DomainParentFieldHandler */ -bool DomainParentFieldHandler::start(Variant::mapType &args) +bool DomainParentFieldHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); FieldDescriptor::FieldType type; @@ -265,7 +275,7 @@ bool DomainParentFieldHandler::start(Variant::mapType &args) type = FieldDescriptor::FieldType::TREE; } - const std::string &name = args["name"].asString(); + const std::string &fieldName = args["name"].asString(); 
const bool optional = args["optional"].asBool(); Rooted strct = parentNameNode->getParent().cast(); @@ -274,12 +284,12 @@ bool DomainParentFieldHandler::start(Variant::mapType &args) // StructuredClass as child to it. scope().resolve( parentNameNode->getName(), strct, logger(), - [type, name, optional](Handle parent, Handle strct, - Logger &logger) { + [type, fieldName, optional](Handle parent, Handle strct, + Logger &logger) { if (parent != nullptr) { Rooted field = (parent.cast()->createFieldDescriptor( - logger, type, name, optional)).first; + logger, type, fieldName, optional)).first; field->addChild(strct.cast()); } }); @@ -288,32 +298,32 @@ bool DomainParentFieldHandler::start(Variant::mapType &args) /* DomainParentFieldRefHandler */ -bool DomainParentFieldRefHandler::start(Variant::mapType &args) +bool DomainParentFieldRefHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); - const std::string &name = args["ref"].asString(); + const std::string &ref = args["ref"].asString(); Rooted strct = parentNameNode->getParent().cast(); auto loc = location(); // resolve the parent, get the referenced field and add the declared // StructuredClass as child to it. 
- scope().resolve( - parentNameNode->getName(), strct, logger(), - [name, loc](Handle parent, Handle strct, Logger &logger) { - if (parent != nullptr) { - Rooted field = - parent.cast()->getFieldDescriptor(name); - if (field == nullptr) { - logger.error( - std::string("Could not find referenced field ") + name, - loc); - return; - } - field->addChild(strct.cast()); - } - }); + scope().resolve(parentNameNode->getName(), strct, logger(), + [ref, loc](Handle parent, + Handle strct, Logger &logger) { + if (parent != nullptr) { + Rooted field = + parent.cast()->getFieldDescriptor(ref); + if (field == nullptr) { + logger.error( + std::string("Could not find referenced field ") + ref, loc); + return; + } + field->addChild(strct.cast()); + } + }); return true; } diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 76172d6..4116919 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -46,7 +46,7 @@ class DomainHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -59,7 +59,7 @@ class DomainStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -72,7 +72,7 @@ class DomainAnnotationHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -85,7 +85,7 @@ class DomainAttributesHandler : public StaticHandler { 
public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -98,7 +98,7 @@ class DomainFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -111,7 +111,7 @@ class DomainFieldRefHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -124,7 +124,7 @@ class DomainPrimitiveHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -137,7 +137,7 @@ class DomainChildHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -154,7 +154,7 @@ class DomainParentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -167,7 +167,7 @@ class DomainParentFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool 
startCommand(const std::string &name, Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -179,7 +179,7 @@ class DomainParentFieldRefHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &name, Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index 734976a..f9cefc2 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -32,13 +32,8 @@ namespace parser_stack { /* Class HandlerData */ HandlerData::HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, - const std::string &name, const State &state, - const SourceLocation &location) - : ctx(ctx), - callbacks(callbacks), - name(name), - state(state), - location(location) + const State &state, const SourceLocation &location) + : ctx(ctx), callbacks(callbacks), state(state), location(location) { } @@ -67,22 +62,14 @@ Logger &Handler::logger() const SourceLocation &Handler::location() const { return handlerData.location; } -const std::string &Handler::name() const { return handlerData.name; } - -Variant Handler::readData() -{ - return handlerData.callbacks.readData(); -} +Variant Handler::readData() { return handlerData.callbacks.readData(); } void Handler::pushTokens(const std::vector &tokens) { handlerData.callbacks.pushTokens(tokens); } -void Handler::popTokens() -{ - handlerData.callbacks.popTokens(); -} +void Handler::popTokens() { handlerData.callbacks.popTokens(); } TokenId Handler::registerToken(const std::string &token) { @@ -94,8 +81,6 @@ void Handler::unregisterToken(TokenId id) handlerData.callbacks.unregisterToken(id); } -const std::string &Handler::getName() const { return name(); } - const State &Handler::getState() const { return handlerData.state; } void Handler::setLogger(Logger &logger) { 
internalLogger = &logger; } @@ -106,42 +91,51 @@ const SourceLocation &Handler::getLocation() const { return location(); } /* Class EmptyHandler */ -bool EmptyHandler::start(Variant::mapType &args) +bool EmptyHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { - // Just accept anything + // Well, we'll support any command we get, don't we? return true; } -void EmptyHandler::end() +bool EmptyHandler::startAnnotation(const std::string &name, + Variant::mapType &args, + Handler::AnnotationType annotationType) { - // Do nothing if a command ends + // Do not support annotations. Annotations are too complicated for poor + // EmptyHandler. + return false; } -bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) +bool EmptyHandler::startToken(const Token &token, Handle node) { - // Accept any field - return true; + // EmptyHandler does not support tokens. + return false; } -void EmptyHandler::fieldEnd() +Handler::EndTokenResult EmptyHandler::endToken(const Token &token, + Handle node) { - // Do not handle fields + // There are no tokens to end here. 
+ return EndTokenResult::ENDED_NONE; } -bool EmptyHandler::annotationStart(const Variant &className, - Variant::mapType &args) +void EmptyHandler::end() { - // Accept any data - return true; + // Do nothing if a command ends } -bool EmptyHandler::annotationEnd(const Variant &className, - const Variant &elementName) +bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) { - // Accept any annotation + // Accept any field return true; } +void EmptyHandler::fieldEnd() +{ + // Do not handle field ends +} + bool EmptyHandler::data() { // Support any data @@ -155,12 +149,31 @@ Handler *EmptyHandler::create(const HandlerData &handlerData) /* Class StaticHandler */ -bool StaticHandler::start(Variant::mapType &args) +bool StaticHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { // Do nothing in the default implementation, accept anything return true; } +bool StaticHandler::startAnnotation(const std::string &name, + Variant::mapType &args, + Handler::AnnotationType annotationType) +{ + return false; +} + +bool StaticHandler::startToken(const Token &token, Handle node) +{ + return false; +} + +Handler::EndTokenResult StaticHandler::endToken(const Token &token, + Handle node) +{ + return EndTokenResult::ENDED_NONE; +} + void StaticHandler::end() { // Do nothing here @@ -182,20 +195,6 @@ void StaticHandler::fieldEnd() // Do nothing here } -bool StaticHandler::annotationStart(const Variant &className, - Variant::mapType &args) -{ - // No annotations supported - return false; -} - -bool StaticHandler::annotationEnd(const Variant &className, - const Variant &elementName) -{ - // No annotations supported - return false; -} - bool StaticHandler::data() { logger().error("Did not expect any data here", readData()); @@ -210,7 +209,8 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, { } -bool StaticFieldHandler::start(Variant::mapType &args) +bool StaticFieldHandler::startCommand(const std::string &commandName, + 
Variant::mapType &args) { if (!argName.empty()) { auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 19c3d65..f0968e7 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -25,6 +25,7 @@ #include #include #include +#include namespace ousia { @@ -60,11 +61,6 @@ public: */ HandlerCallbacks &callbacks; - /** - * Contains the name of the command that is being handled. - */ - std::string name; - /** * Contains the current state of the state machine. */ @@ -81,13 +77,11 @@ public: * @param ctx is the parser context the handler should be executed in. * @param callbacks is an instance of Callbacks used to notify * the parser about certain state changes. - * @param name is the name of the string. * @param state is the state this handler was called for. * @param location is the location at which the handler is created. */ HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, - const std::string &name, const State &state, - const SourceLocation &location); + const State &state, const SourceLocation &location); }; /** @@ -154,13 +148,6 @@ protected: */ const SourceLocation &location() const; - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name() const; - /** * Calls the corresponding function in the HandlerCallbacks instance. This * method registers the given tokens as tokens that are generally available, @@ -231,19 +218,23 @@ protected: */ // void popWhitespaceMode(); - public: /** - * Virtual destructor. + * Enum representing the type of the annotation a Handle instance handles. + * It may either handle the start of an annotation or the end of an + * annotation. */ - virtual ~Handler(); + enum class AnnotationType { START, END }; /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. 
+ * Enum type representing the possible outcomes of the endToken() method. */ - const std::string &getName() const; + enum class EndTokenResult { ENDED_THIS, ENDED_HIDDEN, ENDED_NONE }; + + /** + * Virtual destructor. + */ + virtual ~Handler(); /** * Reference at the State descriptor for which this Handler was created. @@ -274,14 +265,63 @@ public: const SourceLocation &getLocation() const; /** - * Called when the command that was specified in the constructor is - * instanciated. + * Called whenever the handler should handle the start of a command. This + * method (or any other of the "start" methods) is called exactly once, + * after the constructor. * + * @param name is the name of the command that is started here. * @param args is a map from strings to variants (argument name and value). - * @return true if the handler was successful in starting the element it - * represents, false otherwise. + * @return true if the handler was successful in starting an element with + * the given name represents, false otherwise. */ - virtual bool start(Variant::mapType &args) = 0; + virtual bool startCommand(const std::string &commandName, + Variant::mapType &args) = 0; + + /** + * Called whenever the handler should handle the start of an annotation. + * This method (or any other of the "start" methods) is called exactly once, + * after the constructor. This method is only called if the + * "supportsAnnotations" flag of the State instance referencing this Handler + * is set to true. + * + * @param name is the name of the annotation that is started here. + * @param args is a map from strings to variants (argument name and value). + * @param type specifies whether this handler should handle the start of an + * annotation or the end of an annotation. + */ + virtual bool startAnnotation(const std::string &name, + Variant::mapType &args, + AnnotationType annotationType) = 0; + + /** + * Called whenever the handler should handle the start of a token. 
This + * method (or any other of the "start" methods) is called exactly once, + * after the constructor. This method is only called if the "supportsTokens" + * flag of the State instance referencing this Handler is set to true. + * + * @param token is the Token for which the handler should be started. + * @param node is the node for which this token was registered. + */ + virtual bool startToken(const Token &token, Handle node) = 0; + + /** + * Called whenever a token is marked as "end" token and this handler happens + * to be the currently active handler. This operation may have three + * outcomes: + *
      + *
    1. The token marks the end of the complete handler and the calling + * code should call the "end" method.
    2. + *
    3. The token marks the end of some element that is unknown the calling + * code. So the operation itself was a success, but the calling code + * should not call the "end" method. + *
    4. The token did not anything in this context. Basically this shuold + * never happen, but who knows.
    5. + *
    + * + * @param id is the Token for which the handler should be started. + * @param node is the node for which this token was registered. + */ + virtual EndTokenResult endToken(const Token &token, Handle node) = 0; /** * Called before the command for which this handler is defined ends (is @@ -310,35 +350,6 @@ public: */ virtual void fieldEnd() = 0; - /** - * Called whenever an annotation starts while this handler is active. The - * function should return true if starting the annotation was successful, - * false otherwise. - * - * @param className is a string variant containing the name of the - * annotation class and the location of the name in the source code. - * @param args is a map from strings to variants (argument name and value). - * @return true if the mentioned annotation could be started here, false - * if an error occurred. - */ - virtual bool annotationStart(const Variant &className, - Variant::mapType &args) = 0; - - /** - * Called whenever an annotation ends while this handler is active. The - * function should return true if ending the annotation was successful, - * false otherwise. - * - * @param className is a string variant containing the name of the - * annotation class and the location of the class name in the source code. - * @param elementName is a string variant containing the name of the - * annotation class and the location of the element name in the source code. - * @return true if the mentioned annotation could be started here, false if - * an error occurred. - */ - virtual bool annotationEnd(const Variant &className, - const Variant &elementName) = 0; - /** * Called whenever raw data (int the form of a string) is available for the * Handler instance. 
Should return true if the data could be handled, false @@ -369,14 +380,15 @@ protected: using Handler::Handler; public: - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; + bool startAnnotation(const std::string &name, Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(const Token &token, Handle node) override; + EndTokenResult endToken(const Token &token, Handle node) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - bool annotationEnd(const Variant &className, - const Variant &elementName) override; bool data() override; /** @@ -395,14 +407,15 @@ protected: using Handler::Handler; public: - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; + bool startAnnotation(const std::string &name, Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(const Token &token, Handle node) override; + EndTokenResult endToken(const Token &token, Handle node) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - bool annotationEnd(const Variant &className, - const Variant &elementName) override; bool data() override; }; @@ -453,9 +466,10 @@ protected: virtual void doHandle(const Variant &fieldData, Variant::mapType &args) = 0; public: - bool start(Variant::mapType &args) override; - void end() override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; bool data() override; + void end() override; }; } } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index a556999..3545c37 100644 --- 
a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -32,7 +32,7 @@ namespace ousia { namespace parser_stack { - +namespace { /* Class HandlerInfo */ /** @@ -41,6 +41,11 @@ namespace parser_stack { */ class HandlerInfo { public: + /** + * Name of the command or the token sequence. + */ + std::string name; + /** * Pointer pointing at the actual handler instance. */ @@ -96,6 +101,7 @@ public: * Default constructor of the HandlerInfo class. */ HandlerInfo(); + /** * Constructor of the HandlerInfo class, allows to set all flags manually. */ @@ -182,6 +188,7 @@ void HandlerInfo::fieldEnd() * Stub instance of HandlerInfo containing no handler information. */ static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; +} /* Helper functions */ @@ -387,7 +394,7 @@ StackImpl::~StackImpl() !info.inImplicitDefaultField) { logger().error( std::string("Reached end of stream, but command \"") + - info.handler->getName() + + info.name + "\" has not ended yet. Command was started here:", info.handler->getLocation()); } @@ -421,8 +428,8 @@ void StackImpl::deduceState() HandlerConstructor ctor = state.elementHandler ? state.elementHandler : EmptyHandler::create; - std::shared_ptr handler = std::shared_ptr{ - ctor({ctx, *this, "", state, SourceLocation{}})}; + std::shared_ptr handler = + std::shared_ptr{ctor({ctx, *this, state, SourceLocation{}})}; stack.emplace_back(handler); // Set the correct flags for this implicit handler @@ -450,7 +457,7 @@ const State &StackImpl::currentState() const std::string StackImpl::currentCommandName() const { - return stack.empty() ? std::string{} : stack.back().handler->getName(); + return stack.empty() ? std::string{} : stack.back().name; } const State *StackImpl::findTargetState(const std::string &name) @@ -608,8 +615,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, HandlerConstructor ctor = targetState->elementHandler ? 
targetState->elementHandler : EmptyHandler::create; - std::shared_ptr handler{ctor( - {ctx, *this, name.asString(), *targetState, name.getLocation()})}; + std::shared_ptr handler{ + ctor({ctx, *this, *targetState, name.getLocation()})}; stack.emplace_back(handler); // Fetch the HandlerInfo for the parent element and the current element @@ -631,7 +638,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, handler->setLogger(loggerFork); try { - info.valid = handler->start(canonicalArgs); + info.valid = + handler->startCommand(name.asString(), canonicalArgs); } catch (LoggableException ex) { loggerFork.log(ex); diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index de8ee49..110c56f 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -32,7 +32,8 @@ namespace parser_stack { /* TypesystemHandler */ -bool TypesystemHandler::start(Variant::mapType &args) +bool TypesystemHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { // Create the typesystem instance Rooted typesystem = @@ -63,7 +64,8 @@ void TypesystemHandler::end() { scope().pop(logger()); } /* TypesystemEnumHandler */ -bool TypesystemEnumHandler::start(Variant::mapType &args) +bool TypesystemEnumHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -91,17 +93,18 @@ void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, /* TypesystemStructHandler */ -bool TypesystemStructHandler::start(Variant::mapType &args) +bool TypesystemStructHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); // Fetch the arguments used for creating this type - const std::string &name = args["name"].asString(); + const std::string &structNmae = args["name"].asString(); const std::string &parent = 
args["parent"].asString(); // Fetch the current typesystem and create the struct node Rooted typesystem = scope().selectOrThrow(); - Rooted structType = typesystem->createStructType(name); + Rooted structType = typesystem->createStructType(structNmae); structType->setLocation(location()); // Try to resolve the parent type and set it as parent structure @@ -124,18 +127,19 @@ void TypesystemStructHandler::end() { scope().pop(logger()); } /* TypesystemStructFieldHandler */ -bool TypesystemStructFieldHandler::start(Variant::mapType &args) +bool TypesystemStructFieldHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { // Read the argument values - const std::string &name = args["name"].asString(); + const std::string &fieldName = args["name"].asString(); const std::string &type = args["type"].asString(); const Variant &defaultValue = args["default"]; const bool optional = !(defaultValue.isObject() && defaultValue.asObject() == nullptr); Rooted structType = scope().selectOrThrow(); - Rooted attribute = - structType->createAttribute(name, defaultValue, optional, logger()); + Rooted attribute = structType->createAttribute( + fieldName, defaultValue, optional, logger()); attribute->setLocation(location()); // Try to resolve the type and default value @@ -163,17 +167,18 @@ bool TypesystemStructFieldHandler::start(Variant::mapType &args) /* TypesystemConstantHandler */ -bool TypesystemConstantHandler::start(Variant::mapType &args) +bool TypesystemConstantHandler::startCommand(const std::string &commandName, + Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); // Read the argument values - const std::string &name = args["name"].asString(); + const std::string &constantName = args["name"].asString(); const std::string &type = args["type"].asString(); const Variant &value = args["value"]; Rooted typesystem = scope().selectOrThrow(); - Rooted constant = typesystem->createConstant(name, value); + Rooted constant = 
typesystem->createConstant(constantName, value); constant->setLocation(location()); // Try to resolve the type diff --git a/src/core/parser/stack/TypesystemHandler.hpp b/src/core/parser/stack/TypesystemHandler.hpp index 85494f1..75cba01 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -43,7 +43,8 @@ class TypesystemHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; void end() override; /** @@ -67,7 +68,8 @@ class TypesystemEnumHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; void end() override; /** @@ -114,7 +116,8 @@ class TypesystemStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; void end() override; /** @@ -139,7 +142,8 @@ class TypesystemStructFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; /** * Creates a new instance of the TypesystemStructFieldHandler. @@ -162,7 +166,8 @@ class TypesystemConstantHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(const std::string &commandName, + Variant::mapType &args) override; /** * Creates a new instance of the TypesystemConstantHandler. 
-- cgit v1.2.3 From e31968c9e073c64cf718fbcaebbc83ee2bee48c8 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 18:09:34 +0100 Subject: Added additional constructor to Token --- src/core/common/Token.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index f37151f..4b56f1a 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -134,7 +134,9 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(SourceLocation location) : id(Tokens::Data), location(location) {} + Token(const SourceLocation &location) : id(Tokens::Data), location(location) + { + } /** * Constructor of the Token struct. @@ -144,11 +146,25 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(TokenId id, const std::string &content, SourceLocation location) + Token(TokenId id, const std::string &content, + const SourceLocation &location) : id(id), content(content), location(location) { } + /** + * Constructor of the a "data" Token with the given string data and + * location. + * + * @param content is the string content that should be stored in the token. + * @param location is the location of the content within the source file. 
+ */ + Token(const std::string &content, + const SourceLocation &location = SourceLocation{}) + : id(Tokens::Data), content(content), location(location) + { + } + /** * Constructor of the Token struct, only initializes the token id * @@ -172,7 +188,6 @@ struct Token { */ const SourceLocation &getLocation() const { return location; } }; - } -#endif /* _OUSIA_TOKENS_HPP_ */ \ No newline at end of file +#endif /* _OUSIA_TOKENS_HPP_ */ -- cgit v1.2.3 From e0b9f6ef6692ee8c37386c23f721dc6a57f69ae6 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 18:10:28 +0100 Subject: Storing type and name in the HandlerData once again, using a Token --- src/core/parser/stack/Callbacks.hpp | 3 +- src/core/parser/stack/DocumentHandler.cpp | 61 +++++-------------- src/core/parser/stack/DocumentHandler.hpp | 59 +++---------------- src/core/parser/stack/DomainHandler.cpp | 55 +++++++---------- src/core/parser/stack/DomainHandler.hpp | 22 +++---- src/core/parser/stack/Handler.cpp | 44 +++++++------- src/core/parser/stack/Stack.cpp | 91 +++++++++++++++++------------ src/core/parser/stack/TokenStack.hpp | 4 +- src/core/parser/stack/TypesystemHandler.cpp | 15 ++--- src/core/parser/stack/TypesystemHandler.hpp | 15 ++--- 10 files changed, 148 insertions(+), 221 deletions(-) diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index 092664a..dfe41fc 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -87,7 +87,8 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + virtual void pushTokens(const std::vector &tokens) = 0; + /** * Removes the previously pushed list of tokens from the stack. 
*/ diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index de6e367..e931d8d 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -37,8 +37,7 @@ namespace parser_stack { /* DocumentHandler */ -bool DocumentHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DocumentHandler::startCommand(Variant::mapType &args) { Rooted document = context().getProject()->createDocument(args["name"].asString()); @@ -54,24 +53,10 @@ void DocumentHandler::end() { scope().pop(logger()); } /* DocumentChildHandler */ DocumentChildHandler::DocumentChildHandler(const HandlerData &handlerData) - : Handler(handlerData), mode(Mode::STRUCT) + : Handler(handlerData), isExplicitField(false) { } -void DocumentChildHandler::setMode(Mode mode, const std::string &name) -{ - this->mode = mode; - this->name = name; - this->token = Token(); -} - -void DocumentChildHandler::setMode(Mode mode, const Token &token) -{ - this->mode = mode; - this->name = token.content; - this->token = token; -} - void DocumentChildHandler::preamble(Rooted &parentNode, size_t &fieldIdx, DocumentEntity *&parent) { @@ -142,12 +127,8 @@ void DocumentChildHandler::createPath(const size_t &firstFieldIdx, scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, false); } -bool DocumentChildHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DocumentChildHandler::startCommand(Variant::mapType &args) { - // Set the internal mode to STRUCT and copy the name - setMode(Mode::STRUCT, name); - // Extract the special "name" attribute from the input arguments. // The remaining attributes will be forwarded to the newly constructed // element. 
@@ -176,11 +157,11 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, return false; } Rooted strct = scope().resolve( - Utils::split(name, ':'), logger()); + Utils::split(name(), ':'), logger()); if (strct == nullptr) { // if we could not resolve the name, throw an exception. throw LoggableException( - std::string("\"") + name + "\" could not be resolved.", + std::string("\"") + name() + "\" could not be resolved.", location()); } entity = parentNode.cast()->createRootStructuredEntity( @@ -200,7 +181,7 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, */ { ssize_t newFieldIdx = - parent->getDescriptor()->getFieldDescriptorIndex(name); + parent->getDescriptor()->getFieldDescriptorIndex(name()); if (newFieldIdx != -1) { // Check whether explicit fields are allowed here, if not if (scope().getFlag(ParserFlag::POST_EXPLICIT_FIELDS)) { @@ -208,7 +189,7 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, std::string( "Data or structure commands have already been " "given, command \"") + - name + std::string( + name() + std::string( "\" is not interpreted explicit " "field. Move explicit field " "references to the beginning."), @@ -218,7 +199,7 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, manager(), parentNode, newFieldIdx, false)}; field->setLocation(location()); scope().push(field); - setMode(Mode::EXPLICIT_FIELD, name); + isExplicitField = true; return true; } } @@ -227,11 +208,11 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, // Otherwise create a new StructuredEntity // TODO: Consider Anchors and AnnotationEntities Rooted strct = scope().resolve( - Utils::split(name, ':'), logger()); + Utils::split(name(), ':'), logger()); if (strct == nullptr) { // if we could not resolve the name, throw an exception. 
throw LoggableException( - std::string("\"") + name + "\" could not be resolved.", + std::string("\"") + name() + "\" could not be resolved.", location()); } @@ -278,26 +259,15 @@ bool DocumentChildHandler::startCommand(const std::string &commandName, } } -bool DocumentChildHandler::startAnnotation(const std::string &name, - Variant::mapType &args, +bool DocumentChildHandler::startAnnotation(Variant::mapType &args, AnnotationType annotationType) { - // Set the internal mode and name correctly - if (annotationType == AnnotationType::START) { - setMode(Mode::ANNOTATION_START, name); - } else { - setMode(Mode::ANNOTATION_END, name); - } - // TODO: Handle annotation return false; } -bool DocumentChildHandler::startToken(const Token &token, Handle node) +bool DocumentChildHandler::startToken(Handle node) { - // Set the internal mode correctly - setMode(Mode::TOKEN, token); - // TODO: Handle token start return false; } @@ -313,7 +283,7 @@ void DocumentChildHandler::end() { // In case of explicit fields we do not want to pop something from the // stack. - if (mode == Mode::STRUCT) { + if (!isExplicitField) { // pop the "main" element. scope().pop(logger()); } @@ -321,8 +291,7 @@ void DocumentChildHandler::end() bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) { - // TODO: Handle other cases - if (mode == Mode::EXPLICIT_FIELD) { + if (isExplicitField) { // In case of explicit fields we do not want to create another field. 
isDefault = true; return fieldIdx == 0; @@ -471,7 +440,7 @@ bool DocumentChildHandler::data() // this fact Variant text = readData(); if (defaultFields.empty()) { - logger().error("Got data, but structure \"" + name + + logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", text); } else { diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 9a41508..d34c020 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -53,8 +53,7 @@ class DocumentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -92,55 +91,11 @@ public: * defined elements in an Ousía document. */ class DocumentChildHandler : public Handler { -public: - /** - * Enum type used to represent the mode of the DocumentChildHandler. - * TODO: Having to have such a type is actually quite stupid, it would be - * nicer to have separate handler classes for each of these cases. But this - * is a story for a different day. - */ - enum class Mode { - STRUCT, - EXPLICIT_FIELD, - ANNOTATION_START, - ANNOTATION_END, - TOKEN - }; - private: /** - * Internal Mode of the DocumentChildHandler. - */ - Mode mode; - - /** - * Contains the name of the command or the annotation that is represented - * by this DocumentChildHandler. - */ - std::string name; - - /** - * Token represented by the document child handler. + * If set to true, this handler represents an explicit field. */ - Token token; - - /** - * Switches the mode to the given mode and copies the given name. Resets the - * token. - * - * @param mode is the new mode. - * @param name is the new name. 
- */ - void setMode(Mode mode, const std::string &name); - - /** - * Switches the mode to the given mode and copies the given token, sets the - * name to the content of the token. - * - * @param mode is the new mode. - * @param token is the new token. - */ - void setMode(Mode mode, const Token &token); + bool isExplicitField; /** * Code shared by both the start(), fieldStart() and the data() method. @@ -211,11 +166,10 @@ private: public: DocumentChildHandler(const HandlerData &handlerData); - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; - bool startAnnotation(const std::string &name, Variant::mapType &args, + bool startCommand(Variant::mapType &args) override; + bool startAnnotation(Variant::mapType &args, AnnotationType annotationType) override; - bool startToken(const Token &token, Handle node) override; + bool startToken(Handle node) override; EndTokenResult endToken(const Token &token, Handle node) override; void end() override; bool data() override; @@ -257,3 +211,4 @@ extern const Rtti DocumentField; } #endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/DomainHandler.cpp b/src/core/parser/stack/DomainHandler.cpp index 5ca4f5b..aef5b47 100644 --- a/src/core/parser/stack/DomainHandler.cpp +++ b/src/core/parser/stack/DomainHandler.cpp @@ -33,8 +33,7 @@ namespace parser_stack { /* DomainHandler */ -bool DomainHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainHandler::startCommand(Variant::mapType &args) { // Create the Domain node Rooted domain = @@ -58,8 +57,7 @@ void DomainHandler::end() { scope().pop(logger()); } /* DomainStructHandler */ -bool DomainStructHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainStructHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -90,8 +88,7 @@ bool DomainStructHandler::startCommand(const std::string &commandName, void 
DomainStructHandler::end() { scope().pop(logger()); } /* DomainAnnotationHandler */ -bool DomainAnnotationHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainAnnotationHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -109,8 +106,7 @@ void DomainAnnotationHandler::end() { scope().pop(logger()); } /* DomainAttributesHandler */ -bool DomainAttributesHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainAttributesHandler::startCommand(Variant::mapType &args) { // Fetch the current typesystem and create the struct node Rooted parent = scope().selectOrThrow(); @@ -126,8 +122,7 @@ void DomainAttributesHandler::end() { scope().pop(logger()); } /* DomainFieldHandler */ -bool DomainFieldHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainFieldHandler::startCommand(Variant::mapType &args) { FieldDescriptor::FieldType type; if (args["isSubtree"].asBool()) { @@ -157,16 +152,15 @@ void DomainFieldHandler::end() { scope().pop(logger()); } /* DomainFieldRefHandler */ -bool DomainFieldRefHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainFieldRefHandler::startCommand(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); - const std::string &ref = args["ref"].asString(); + const std::string &name = args["ref"].asString(); auto loc = location(); - scope().resolveFieldDescriptor(ref, parent, logger(), + scope().resolveFieldDescriptor(name, parent, logger(), [loc](Handle field, Handle parent, Logger &logger) { if (field != nullptr) { @@ -188,8 +182,7 @@ void DomainFieldRefHandler::end() {} /* DomainPrimitiveHandler */ -bool DomainPrimitiveHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainPrimitiveHandler::startCommand(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -229,14 +222,13 @@ void 
DomainPrimitiveHandler::end() { scope().pop(logger()); } /* DomainChildHandler */ -bool DomainChildHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainChildHandler::startCommand(Variant::mapType &args) { Rooted field = scope().selectOrThrow(); - const std::string &ref = args["ref"].asString(); + const std::string &name = args["ref"].asString(); scope().resolve( - ref, field, logger(), + name, field, logger(), [](Handle child, Handle field, Logger &logger) { if (child != nullptr) { field.cast()->addChild( @@ -248,8 +240,7 @@ bool DomainChildHandler::startCommand(const std::string &commandName, /* DomainParentHandler */ -bool DomainParentHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainParentHandler::startCommand(Variant::mapType &args) { Rooted strct = scope().selectOrThrow(); @@ -264,8 +255,7 @@ void DomainParentHandler::end() { scope().pop(logger()); } /* DomainParentFieldHandler */ -bool DomainParentFieldHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainParentFieldHandler::startCommand(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); FieldDescriptor::FieldType type; @@ -275,7 +265,7 @@ bool DomainParentFieldHandler::startCommand(const std::string &commandName, type = FieldDescriptor::FieldType::TREE; } - const std::string &fieldName = args["name"].asString(); + const std::string &name = args["name"].asString(); const bool optional = args["optional"].asBool(); Rooted strct = parentNameNode->getParent().cast(); @@ -284,12 +274,12 @@ bool DomainParentFieldHandler::startCommand(const std::string &commandName, // StructuredClass as child to it. 
scope().resolve( parentNameNode->getName(), strct, logger(), - [type, fieldName, optional](Handle parent, Handle strct, + [type, name, optional](Handle parent, Handle strct, Logger &logger) { if (parent != nullptr) { Rooted field = (parent.cast()->createFieldDescriptor( - logger, type, fieldName, optional)).first; + logger, type, name, optional)).first; field->addChild(strct.cast()); } }); @@ -298,12 +288,11 @@ bool DomainParentFieldHandler::startCommand(const std::string &commandName, /* DomainParentFieldRefHandler */ -bool DomainParentFieldRefHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool DomainParentFieldRefHandler::startCommand(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); - const std::string &ref = args["ref"].asString(); + const std::string &name = args["ref"].asString(); Rooted strct = parentNameNode->getParent().cast(); auto loc = location(); @@ -311,14 +300,14 @@ bool DomainParentFieldRefHandler::startCommand(const std::string &commandName, // resolve the parent, get the referenced field and add the declared // StructuredClass as child to it. 
scope().resolve(parentNameNode->getName(), strct, logger(), - [ref, loc](Handle parent, + [name, loc](Handle parent, Handle strct, Logger &logger) { if (parent != nullptr) { Rooted field = - parent.cast()->getFieldDescriptor(ref); + parent.cast()->getFieldDescriptor(name); if (field == nullptr) { logger.error( - std::string("Could not find referenced field ") + ref, loc); + std::string("Could not find referenced field ") + name, loc); return; } field->addChild(strct.cast()); diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 4116919..f12d863 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -46,7 +46,7 @@ class DomainHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -59,7 +59,7 @@ class DomainStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -72,7 +72,7 @@ class DomainAnnotationHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -85,7 +85,7 @@ class DomainAttributesHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -98,7 
+98,7 @@ class DomainFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -111,7 +111,7 @@ class DomainFieldRefHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -124,7 +124,7 @@ class DomainPrimitiveHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -137,7 +137,7 @@ class DomainChildHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -154,7 +154,7 @@ class DomainParentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -167,7 +167,7 @@ class DomainParentFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -179,7 +179,7 @@ class DomainParentFieldRefHandler : public StaticHandler { public: using 
StaticHandler::StaticHandler; - bool startCommand(const std::string &name, Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index 006e521..c01e74c 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -32,8 +32,9 @@ namespace parser_stack { /* Class HandlerData */ HandlerData::HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, - const State &state, const SourceLocation &location) - : ctx(ctx), callbacks(callbacks), state(state), location(location) + const State &state, const Token &token, + HandlerType type) + : ctx(ctx), callbacks(callbacks), state(state), token(token), type(type) { } @@ -60,7 +61,20 @@ Logger &Handler::logger() return handlerData.ctx.getLogger(); } -const SourceLocation &Handler::location() const { return handlerData.location; } +const std::string &Handler::name() const { return handlerData.token.content; } + +TokenId Handler::tokenId() const { return handlerData.token.id; } + +const Token &Handler::token() const { return handlerData.token; } + +const SourceLocation &Handler::location() const +{ + return handlerData.token.location; +} + +HandlerType Handler::type() const { return handlerData.type; } + +const State &Handler::state() const { return handlerData.state; } Variant Handler::readData() { return handlerData.callbacks.readData(); } @@ -81,8 +95,6 @@ void Handler::unregisterToken(TokenId id) handlerData.callbacks.unregisterToken(id); } -const State &Handler::getState() const { return handlerData.state; } - void Handler::setLogger(Logger &logger) { internalLogger = &logger; } void Handler::resetLogger() { internalLogger = nullptr; } @@ -91,15 +103,13 @@ const SourceLocation &Handler::getLocation() const { return location(); } /* Class EmptyHandler */ -bool EmptyHandler::startCommand(const std::string &commandName, - 
Variant::mapType &args) +bool EmptyHandler::startCommand(Variant::mapType &args) { // Well, we'll support any command we get, don't we? return true; } -bool EmptyHandler::startAnnotation(const std::string &name, - Variant::mapType &args, +bool EmptyHandler::startAnnotation(Variant::mapType &args, Handler::AnnotationType annotationType) { // Do not support annotations. Annotations are too complicated for poor @@ -107,7 +117,7 @@ bool EmptyHandler::startAnnotation(const std::string &name, return false; } -bool EmptyHandler::startToken(const Token &token, Handle node) +bool EmptyHandler::startToken(Handle node) { // EmptyHandler does not support tokens. return false; @@ -149,24 +159,19 @@ Handler *EmptyHandler::create(const HandlerData &handlerData) /* Class StaticHandler */ -bool StaticHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool StaticHandler::startCommand(Variant::mapType &args) { // Do nothing in the default implementation, accept anything return true; } -bool StaticHandler::startAnnotation(const std::string &name, - Variant::mapType &args, +bool StaticHandler::startAnnotation(Variant::mapType &args, Handler::AnnotationType annotationType) { return false; } -bool StaticHandler::startToken(const Token &token, Handle node) -{ - return false; -} +bool StaticHandler::startToken(Handle node) { return false; } Handler::EndTokenResult StaticHandler::endToken(const Token &token, Handle node) @@ -209,8 +214,7 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, { } -bool StaticFieldHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool StaticFieldHandler::startCommand(Variant::mapType &args) { if (!argName.empty()) { auto it = args.find(argName); diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 3545c37..cad4078 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -41,11 +41,6 @@ namespace { */ class HandlerInfo { 
public: - /** - * Name of the command or the token sequence. - */ - std::string name; - /** * Pointer pointing at the actual handler instance. */ @@ -362,7 +357,7 @@ public: void unregisterToken(TokenId id) override; Variant readData() override; bool hasData(); - void pushTokens(const std::vector &tokens) override; + void pushTokens(const std::vector &tokens) override; void popTokens() override; }; @@ -394,7 +389,7 @@ StackImpl::~StackImpl() !info.inImplicitDefaultField) { logger().error( std::string("Reached end of stream, but command \"") + - info.name + + currentCommandName() + "\" has not ended yet. Command was started here:", info.handler->getLocation()); } @@ -428,8 +423,8 @@ void StackImpl::deduceState() HandlerConstructor ctor = state.elementHandler ? state.elementHandler : EmptyHandler::create; - std::shared_ptr handler = - std::shared_ptr{ctor({ctx, *this, state, SourceLocation{}})}; + std::shared_ptr handler = std::shared_ptr{ + ctor({ctx, *this, state, SourceLocation{}, HandlerType::COMMAND})}; stack.emplace_back(handler); // Set the correct flags for this implicit handler @@ -452,12 +447,12 @@ std::set StackImpl::expectedCommands() const State &StackImpl::currentState() const { - return stack.empty() ? States::None : stack.back().handler->getState(); + return stack.empty() ? States::None : stack.back().handler->state(); } std::string StackImpl::currentCommandName() const { - return stack.empty() ? std::string{} : stack.back().name; + return stack.empty() ? std::string{} : stack.back().handler->name(); } const State *StackImpl::findTargetState(const std::string &name) @@ -616,21 +611,29 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, ? 
targetState->elementHandler : EmptyHandler::create; std::shared_ptr handler{ - ctor({ctx, *this, *targetState, name.getLocation()})}; + ctor({ctx, + *this, + *targetState, + {name.asString(), name.getLocation()}, + HandlerType::COMMAND})}; stack.emplace_back(handler); - // Fetch the HandlerInfo for the parent element and the current element + // Fetch the HandlerInfo for the parent element and the current + // element HandlerInfo &parentInfo = lastInfo(); HandlerInfo &info = currentInfo(); - // Call the "start" method of the handler, store the result of the start - // method as the validity of the handler -- do not call the start method + // Call the "start" method of the handler, store the result of the + // start + // method as the validity of the handler -- do not call the start + // method // if the stack is currently invalid (as this may cause further, // unwanted errors) bool validStack = handlersValid(); info.valid = false; if (validStack) { - // Canonicalize the arguments (if this has not already been done), + // Canonicalize the arguments (if this has not already been + // done), // allow additional arguments and numeric indices Variant::mapType canonicalArgs = args; targetState->arguments.validateMap(canonicalArgs, loggerFork, true, @@ -638,8 +641,7 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, handler->setLogger(loggerFork); try { - info.valid = - handler->startCommand(name.asString(), canonicalArgs); + info.valid = handler->startCommand(canonicalArgs); } catch (LoggableException ex) { loggerFork.log(ex); @@ -647,8 +649,10 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, handler->resetLogger(); } - // We started the command within an implicit default field and it is not - // valid -- remove both the new handler and the parent field from the + // We started the command within an implicit default field and it is + // not + // valid -- remove both the new handler and the parent field 
from + // the // stack if (!info.valid && parentInfo.inImplicitDefaultField) { endCurrentHandler(); @@ -656,7 +660,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, continue; } - // If we ended up here, starting the command may or may not have worked, + // If we ended up here, starting the command may or may not have + // worked, // but after all, we cannot unroll the stack any further. Update the // "valid" flag, commit any potential error messages and return. info.valid = parentInfo.valid && info.valid; @@ -687,13 +692,15 @@ void StackImpl::data(const TokenizedData &data) // TODO: Rewrite this function for token handling // TODO: This loop needs to be refactored out /*while (!data.atEnd()) { - // End handlers that already had a default field and are currently not + // End handlers that already had a default field and are currently + not // active. endOverdueHandlers(); const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); - // Check whether there is any command the data can be sent to -- if not, + // Check whether there is any command the data can be sent to -- if + not, // make sure the data actually is data if (stack.empty()) { if (hasNonWhitespaceText) { @@ -712,10 +719,12 @@ void StackImpl::data(const TokenizedData &data) continue; } - // If this field should not get any data, log an error and do not call + // If this field should not get any data, log an error and do not + call // the "data" handler if (!info.inValidField) { - // If the "hadDefaultField" flag is set, we already issued an error + // If the "hadDefaultField" flag is set, we already issued an + error // message if (!info.hadDefaultField) { if (hasNonWhitespaceText) { @@ -726,8 +735,10 @@ void StackImpl::data(const TokenizedData &data) } if (handlersValid() && info.inValidField) { - // Fork the logger and set it as temporary logger for the "start" - // method. 
We only want to keep error messages if this was not a try + // Fork the logger and set it as temporary logger for the + "start" + // method. We only want to keep error messages if this was not a + try // to implicitly open a default field. LoggerFork loggerFork = logger().fork(); info.handler->setLogger(loggerFork); @@ -735,12 +746,14 @@ void StackImpl::data(const TokenizedData &data) // Pass the data to the current Handler instance bool valid = false; try { - // Create a fork of the TokenizedData and let the handler work + // Create a fork of the TokenizedData and let the handler + work // on it TokenizedData dataFork = data; valid = info.handler->data(dataFork); - // If the data was validly handled by the handler, commit the + // If the data was validly handled by the handler, commit + the // change if (valid) { data = dataFork; @@ -754,14 +767,16 @@ void StackImpl::data(const TokenizedData &data) info.handler->resetLogger(); // If placing the data here failed and we're currently in an - // implicitly opened field, just unroll the stack to the next field + // implicitly opened field, just unroll the stack to the next + field // and try again if (!valid && info.inImplicitDefaultField) { endCurrentHandler(); continue; } - // Commit the content of the logger fork. Do not change the valid + // Commit the content of the logger fork. Do not change the + valid // flag. 
loggerFork.commit(); } @@ -783,12 +798,14 @@ void StackImpl::fieldStart(bool isDefault) HandlerInfo &info = currentInfo(); if (info.inField) { logger().error( - "Got field start, but there is no command for which to start the " + "Got field start, but there is no command for which to start " + "the " "field."); return; } - // If the handler already had a default field we cannot start a new field + // If the handler already had a default field we cannot start a new + // field // (the default field always is the last field) -- mark the command as // invalid if (info.hadDefaultField) { @@ -797,7 +814,8 @@ void StackImpl::fieldStart(bool isDefault) std::string("\" does not have any more fields")); } - // Copy the isDefault flag to a local variable, the fieldStart method will + // Copy the isDefault flag to a local variable, the fieldStart method + // will // write into this variable bool defaultField = isDefault; @@ -843,7 +861,8 @@ void StackImpl::fieldEnd() return; } - // Only continue if the current handler stack is in a valid state, do not + // Only continue if the current handler stack is in a valid state, do + // not // call the fieldEnd function if something went wrong before if (handlersValid() && !info.hadDefaultField && info.inValidField) { try { @@ -868,7 +887,7 @@ void StackImpl::unregisterToken(TokenId id) tokenRegistry.unregisterToken(id); } -void StackImpl::pushTokens(const std::vector &tokens) +void StackImpl::pushTokens(const std::vector &tokens) { // TODO } diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp index af734bb..f2e7edc 100644 --- a/src/core/parser/stack/TokenStack.hpp +++ b/src/core/parser/stack/TokenStack.hpp @@ -82,9 +82,9 @@ public: TokenStack(const TokenStack &parentStack) : TokenStack(&parentStack) {} /** - * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * Pushes a list of SyntaxDescriptor instances onto the internal stack. 
* - * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * @param tokens is a list of SyntaxDescriptor instances that should be * stored on the stack. */ void pushTokens(const std::vector &tokens); diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index 110c56f..3fa641a 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -32,8 +32,7 @@ namespace parser_stack { /* TypesystemHandler */ -bool TypesystemHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool TypesystemHandler::startCommand(Variant::mapType &args) { // Create the typesystem instance Rooted typesystem = @@ -64,8 +63,7 @@ void TypesystemHandler::end() { scope().pop(logger()); } /* TypesystemEnumHandler */ -bool TypesystemEnumHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool TypesystemEnumHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -93,8 +91,7 @@ void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, /* TypesystemStructHandler */ -bool TypesystemStructHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool TypesystemStructHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -127,8 +124,7 @@ void TypesystemStructHandler::end() { scope().pop(logger()); } /* TypesystemStructFieldHandler */ -bool TypesystemStructFieldHandler::startCommand(const std::string &commandName, - Variant::mapType &args) +bool TypesystemStructFieldHandler::startCommand(Variant::mapType &args) { // Read the argument values const std::string &fieldName = args["name"].asString(); @@ -167,8 +163,7 @@ bool TypesystemStructFieldHandler::startCommand(const std::string &commandName, /* TypesystemConstantHandler */ -bool TypesystemConstantHandler::startCommand(const std::string &commandName, - 
Variant::mapType &args) +bool TypesystemConstantHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); diff --git a/src/core/parser/stack/TypesystemHandler.hpp b/src/core/parser/stack/TypesystemHandler.hpp index 75cba01..0773a3a 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -43,8 +43,7 @@ class TypesystemHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -68,8 +67,7 @@ class TypesystemEnumHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -116,8 +114,7 @@ class TypesystemStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -142,8 +139,7 @@ class TypesystemStructFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; /** * Creates a new instance of the TypesystemStructFieldHandler. @@ -166,8 +162,7 @@ class TypesystemConstantHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool startCommand(const std::string &commandName, - Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; /** * Creates a new instance of the TypesystemConstantHandler. 
-- cgit v1.2.3 From cdae062d0cbc19ce605df24b2fff5e3808f21ca6 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 18:33:46 +0100 Subject: Added range flag to HandlerInfo --- src/core/parser/stack/Stack.cpp | 60 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index cad4078..e5bd224 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -65,6 +65,11 @@ public: */ bool implicit : 1; + /** + * Set to true if the handled command or annotation has a range. + */ + bool range : 1; + /** * Set to true if the handler currently is in a field. */ @@ -100,8 +105,9 @@ public: /** * Constructor of the HandlerInfo class, allows to set all flags manually. */ - HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, - bool inImplicitDefaultField, bool inValidField); + HandlerInfo(bool valid, bool implicit, bool range, bool inField, + bool inDefaultField, bool inImplicitDefaultField, + bool inValidField); /** * Constructor of the HandlerInfo class, taking a shared_ptr to the handler @@ -124,6 +130,30 @@ public: * Updates the "fields" flags according to a "fieldEnd" event. */ void fieldEnd(); + + /** + * Returns the name of the referenced handler or an empty string if no + * handler is present. + * + * @return the current handler name. + */ + std::string name() const; + + /** + * Returns the type of the referenced handler or COMMAND if no handler is + * present. + * + * @return the current handler type. + */ + HandlerType type() const; + + /** + * Returns the current state the handler is on or States::None if no handler + * is present. + * + * @return the current state machine state. 
+ */ + const State &state() const; }; HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} @@ -133,6 +163,7 @@ HandlerInfo::HandlerInfo(std::shared_ptr handler) fieldIdx(0), valid(true), implicit(false), + range(false), inField(false), inDefaultField(false), inImplicitDefaultField(false), @@ -141,13 +172,14 @@ HandlerInfo::HandlerInfo(std::shared_ptr handler) { } -HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, +HandlerInfo::HandlerInfo(bool valid, bool implicit, bool range, bool inField, bool inDefaultField, bool inImplicitDefaultField, bool inValidField) : handler(nullptr), fieldIdx(0), valid(valid), implicit(implicit), + range(range), inField(inField), inDefaultField(inDefaultField), inImplicitDefaultField(inImplicitDefaultField), @@ -156,6 +188,21 @@ HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, { } +std::string HandlerInfo::name() const +{ + return handler == nullptr ? std::string{} : handler->name(); +} + +HandlerType HandlerInfo::type() const +{ + return handler == nullptr ? HandlerType::COMMAND : handler->type(); +} + +const State &HandlerInfo::state() const +{ + return handler == nullptr ? States::None : handler->state(); +} + HandlerInfo::~HandlerInfo() { // Do nothing @@ -182,7 +229,7 @@ void HandlerInfo::fieldEnd() /** * Stub instance of HandlerInfo containing no handler information. */ -static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; +static HandlerInfo EmptyHandlerInfo{true, true, false, true, true, false, true}; } /* Helper functions */ @@ -447,12 +494,12 @@ std::set StackImpl::expectedCommands() const State &StackImpl::currentState() const { - return stack.empty() ? States::None : stack.back().handler->state(); + return stack.empty() ? States::None : stack.back().state(); } std::string StackImpl::currentCommandName() const { - return stack.empty() ? std::string{} : stack.back().handler->name(); + return stack.empty() ? 
std::string{} : stack.back().name(); } const State *StackImpl::findTargetState(const std::string &name) @@ -665,6 +712,7 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, // but after all, we cannot unroll the stack any further. Update the // "valid" flag, commit any potential error messages and return. info.valid = parentInfo.valid && info.valid; + info.range = range; loggerFork.commit(); return; } -- cgit v1.2.3 From 1c33913ebb5d9202575d3ca99bd17366d30f2261 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Tue, 3 Mar 2015 00:30:38 +0100 Subject: Started restructuring and adapting Stack class, reenabled unit tests (does not compile right now) --- CMakeLists.txt | 2 +- src/core/parser/stack/Stack.cpp | 447 ++++++++------- src/core/parser/stack/Stack.hpp | 13 +- src/core/parser/utils/TokenizedData.cpp | 10 +- src/core/parser/utils/TokenizedData.hpp | 12 + test/core/parser/stack/StackTest.cpp | 959 ++++++++++++++++---------------- 6 files changed, 744 insertions(+), 699 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f99c212..c7ad7a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,7 +326,7 @@ IF(TEST) test/core/model/StyleTest test/core/model/TypesystemTest test/core/parser/ParserScopeTest -# test/core/parser/stack/StackTest + test/core/parser/stack/StackTest test/core/parser/stack/StateTest test/core/parser/stack/TokenRegistryTest test/core/parser/utils/SourceOffsetVectorTest diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index e5bd224..89217ea 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -86,6 +86,12 @@ public: */ bool inImplicitDefaultField : 1; + /** + * Set to true if the handler current is in an implicitly started range + * field. + */ + bool inImplicitRangeField: 1; + /** * Set to false if this field is only opened pro-forma and does not accept * any data. Otherwise set to true. 
@@ -230,6 +236,18 @@ void HandlerInfo::fieldEnd() * Stub instance of HandlerInfo containing no handler information. */ static HandlerInfo EmptyHandlerInfo{true, true, false, true, true, false, true}; + +/** + * Small helper class makeing sure the reference at some variable is reset once + * the scope is left. + */ +template +struct GuardedTemporaryPointer { + T **ptr; + GuardedTemporaryPointer(T *ref, T **ptr) : ptr(ptr) { *ptr = ref; } + + ~GuardedTemporaryPointer() { *ptr = nullptr; } +}; } /* Helper functions */ @@ -352,11 +370,18 @@ private: HandlerInfo &lastInfo(); /** - * Ends all handlers that currently are not inside a field and already had - * a default field. This method is called whenever the data() and command() - * events are reached. + * Returns a set containing the tokens that should currently be processed + * by the TokenizedData instance. + * + * @return a TokenSet instance containing all tokens that should currently + * be processed. */ - void endOverdueHandlers(); + TokenSet currentTokens() const; + + /** + * Returns the whitespace mode defined by the current command. + */ + WhitespaceMode currentWhitespaceMode() const; /** * Ends the current handler and removes the corresponding element from the @@ -365,13 +390,14 @@ private: void endCurrentHandler(); /** - * Tries to start a default field for the current handler, if currently the - * handler is not inside a field and did not have a default field yet. - * - * @return true if the handler is inside a field, false if no field could - * be started. + * Ends all handlers that currently are not inside a field and already had + * a default field. Tries to start a default field for the current handler, + * if currently the handler is not inside a field and did not have a default + * field yet. This method is called whenever the data(), startAnnotation(), + * startToken(), startCommand(), annotationStart() or annotationEnd() events + * are reached. 
*/ - bool ensureHandlerIsInField(); + void prepareCurrentHandler(); /** * Returns true if all handlers on the stack are currently valid, or false @@ -381,6 +407,30 @@ private: */ bool handlersValid(); + /** + * Called whenever there is an actual data pending on the current + * TokenizedDataReader. Tries to feed this data to the current handler. + */ + void handleData(); + + /** + * Called whenever there is a token waiting to be processed. If possible + * tries to end a current handler with this token or to start a new handler + * with the token. + * + * @param token is the token that should be handled. + */ + void handleToken(const Token &token); + + /** + * Called by the rangeEnd() and fieldEnd() methods to end the current ranged + * command. + * + * @param rangeCommand specifies whether this should end the range of a + * command with range. + */ + void handleFieldEnd(bool rangeCommand); + public: StackImpl(ParserCallbacks &parser, ParserContext &ctx, const std::multimap &states); @@ -403,7 +453,6 @@ public: TokenId registerToken(const std::string &token) override; void unregisterToken(TokenId id) override; Variant readData() override; - bool hasData(); void pushTokens(const std::vector &tokens) override; void popTokens() override; }; @@ -492,16 +541,6 @@ std::set StackImpl::expectedCommands() return res; } -const State &StackImpl::currentState() const -{ - return stack.empty() ? States::None : stack.back().state(); -} - -std::string StackImpl::currentCommandName() const -{ - return stack.empty() ? std::string{} : stack.back().name(); -} - const State *StackImpl::findTargetState(const std::string &name) { const State *currentState = &(this->currentState()); @@ -527,6 +566,28 @@ const State *StackImpl::findTargetStateOrWildcard(const std::string &name) return targetState; } +const State &StackImpl::currentState() const +{ + return stack.empty() ? States::None : stack.back().state(); +} + +std::string StackImpl::currentCommandName() const +{ + return stack.empty() ? 
std::string{} : stack.back().name(); +} + +TokenSet StackImpl::currentTokens() const +{ + // TODO: Implement + return Tokens{}; +} + +WhitespaceMode currentWhitespaceMode() const +{ + // TODO: Implement + return WhitespaceMode::COLLAPSE; +} + HandlerInfo &StackImpl::currentInfo() { return stack.empty() ? EmptyHandlerInfo : stack.back(); @@ -536,6 +597,8 @@ HandlerInfo &StackImpl::lastInfo() return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; } +/* Stack helper functions */ + void StackImpl::endCurrentHandler() { if (!stack.empty()) { @@ -563,44 +626,37 @@ void StackImpl::endCurrentHandler() } } -void StackImpl::endOverdueHandlers() +void StackImpl::prepareCurrentHandler() { - if (!stack.empty()) { - // Fetch the handler info for the current top-level element - HandlerInfo &info = stack.back(); + // Repeat until a valid handler is found on the stack + while (true) { + // Fetch the handler for the current top-level element + HandlerInfo &info = currentInfo(); - // Abort if this handler currently is inside a field - if (info.inField || (!info.hadDefaultField && info.valid)) { + // If the current Handler is in a field, there is nothing to be done, + // abort + if (info.inField) { return; } - // Otherwise end the current handler - endCurrentHandler(); - } -} - -bool StackImpl::ensureHandlerIsInField() -{ - // If the current handler is not in a field (and actually has a handler) - // try to start a default field - HandlerInfo &info = currentInfo(); - if (!info.inField && info.handler != nullptr) { - // Abort if the element already had a default field or the handler is - // not valid + // If the current field already had a default field or is not valid, + // end it and repeat if (info.hadDefaultField || !info.valid) { - return false; + endCurrentHandler(); + continue; } // Try to start a new default field, abort if this did not work bool isDefault = true; if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { - return false; + endCurrentHandler(); + 
continue; } - // Mark the field as started - info.fieldStart(true, true, true); + // Mark the field as started and return -- the field should be marked + // is implicit if this is not a field with range + info.fieldStart(true, !info.range, true, info.range); } - return true; } bool StackImpl::handlersValid() @@ -613,13 +669,105 @@ bool StackImpl::handlersValid() return true; } +void StackImpl::handleData() +{ + // Repeat until we found some handle willingly consuming the data + while (true) { + // Prepare the stack -- make sure all overdue handlers are ended and + // we currently are in an open field + prepareCurrentHandler(); + + // Fetch the current handler information + HandlerInfo &info = currentInfo(); + + // If this field should not get any data, log an error and do not + // call the "data" handler + if (!info.inValidField) { + if (!info.hadDefaultField) { + logger().error("Did not expect any data here", data); + } + return; + } + + // If we're currently in an invalid subtree, just eat the data and abort + if (!handlersValid()) { + return; + } + + // Fork the logger and set it as temporary logger for the "data" + // method. We only want to keep error messages if this was not a + // try to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + valid = info.handler->data(); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance of the handler as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid flag. 
+ loggerFork.commit(); + } +} + +void StackImpl::handleToken(const Token &token) { + // TODO: Implement + // Just eat them for now +} + +void StackImpl::handleFieldEnd(bool rangedCommand) +{ + // Throw away all overdue handlers, start the default field at least once + // if this has not been done yet (this is important for range commands) + prepareStack(); + + // Close all implicit default fields + while (!stack.empty()) { + HandlerInfo &info = currentInfo(); + if (!info.inImplicitDefaultField) { + break; + } + endCurrentHandler(); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (!info.inField || stack.empty()) { + logger().error("Got field end, but there is no field here to end"); + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + if (info.range && info.inDefaultField) + info.handler->fieldEnd(); + } + + // This command no longer is in a field + info.fieldEnd(); +} + +/* Class StackImpl public functions */ + void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, bool range) { - // End handlers that already had a default field and are currently not - // active. 
- endOverdueHandlers(); - // Make sure the given identifier is valid (preventing "*" from being // malicously passed to this function) if (!Utils::isNamespacedIdentifier(name.asString())) { @@ -629,6 +777,10 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, } while (true) { + // Prepare the stack -- make sure all overdue handlers are ended and + // we currently are in an open field + prepareCurrentHandler(); + // Try to find a target state for the given command, if none can be // found and the current command does not have an open field, then try // to create an empty default field, otherwise this is an exception @@ -644,12 +796,6 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, } } - // Make sure we're currently inside a field - if (!ensureHandlerIsInField()) { - endCurrentHandler(); - continue; - } - // Fork the logger. We do not want any validation errors to skip LoggerFork loggerFork = logger().fork(); @@ -670,17 +816,14 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, HandlerInfo &parentInfo = lastInfo(); HandlerInfo &info = currentInfo(); - // Call the "start" method of the handler, store the result of the - // start - // method as the validity of the handler -- do not call the start - // method + // Call the "start" method of the handler, store the result of the start + // method as the validity of the handler -- do not call the start method // if the stack is currently invalid (as this may cause further, // unwanted errors) bool validStack = handlersValid(); info.valid = false; if (validStack) { - // Canonicalize the arguments (if this has not already been - // done), + // Canonicalize the arguments (if this has not already been done), // allow additional arguments and numeric indices Variant::mapType canonicalArgs = args; targetState->arguments.validateMap(canonicalArgs, loggerFork, true, @@ -697,10 +840,8 @@ void StackImpl::commandStart(const Variant 
&name, const Variant::mapType &args, } // We started the command within an implicit default field and it is - // not - // valid -- remove both the new handler and the parent field from - // the - // stack + // not valid -- remove both the new handler and the parent field from + // the stack if (!info.valid && parentInfo.inImplicitDefaultField) { endCurrentHandler(); endCurrentHandler(); @@ -708,9 +849,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, } // If we ended up here, starting the command may or may not have - // worked, - // but after all, we cannot unroll the stack any further. Update the - // "valid" flag, commit any potential error messages and return. + // worked, but after all, we cannot unroll the stack any further. Update + // the "valid" flag, commit any potential error messages and return. info.valid = parentInfo.valid && info.valid; info.range = range; loggerFork.commit(); @@ -732,106 +872,31 @@ void StackImpl::annotationEnd(const Variant &className, void StackImpl::rangeEnd() { - // TODO + handleFieldEnd(true); } void StackImpl::data(const TokenizedData &data) { - // TODO: Rewrite this function for token handling - // TODO: This loop needs to be refactored out - /*while (!data.atEnd()) { - // End handlers that already had a default field and are currently - not - // active. 
- endOverdueHandlers(); - - const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); - - // Check whether there is any command the data can be sent to -- if - not, - // make sure the data actually is data - if (stack.empty()) { - if (hasNonWhitespaceText) { - throw LoggableException("No command here to receive data.", - data); - } - return; - } - - // Fetch the current command handler information - HandlerInfo &info = currentInfo(); - - // Make sure the current handler has an open field - if (!ensureHandlerIsInField()) { - endCurrentHandler(); - continue; - } - - // If this field should not get any data, log an error and do not - call - // the "data" handler - if (!info.inValidField) { - // If the "hadDefaultField" flag is set, we already issued an - error - // message - if (!info.hadDefaultField) { - if (hasNonWhitespaceText) { - logger().error("Did not expect any data here", data); - } - return; - } - } - - if (handlersValid() && info.inValidField) { - // Fork the logger and set it as temporary logger for the - "start" - // method. We only want to keep error messages if this was not a - try - // to implicitly open a default field. 
- LoggerFork loggerFork = logger().fork(); - info.handler->setLogger(loggerFork); - - // Pass the data to the current Handler instance - bool valid = false; - try { - // Create a fork of the TokenizedData and let the handler - work - // on it - TokenizedData dataFork = data; - valid = info.handler->data(dataFork); - - // If the data was validly handled by the handler, commit - the - // change - if (valid) { - data = dataFork; - } - } - catch (LoggableException ex) { - loggerFork.log(ex); - } - - // Reset the logger instance as soon as possible - info.handler->resetLogger(); - - // If placing the data here failed and we're currently in an - // implicitly opened field, just unroll the stack to the next - field - // and try again - if (!valid && info.inImplicitDefaultField) { - endCurrentHandler(); - continue; - } - - // Commit the content of the logger fork. Do not change the - valid - // flag. - loggerFork.commit(); - } - - // There was no reason to unroll the stack any further, so continue - return; - }*/ + // Fetch a reader for the given tokenized data instance. + TokenizedDataReader reader = data.reader(); + + // Use the GuardedTemporaryPointer to make sure that the member variable + // dataReader is resetted to nullptr once this scope is left. 
+ GuardedTemporaryPointer ptr(&reader, &dataReader); + + // Peek a token from the reader, repeat until all tokens have been read + Token token; + while (reader.peek(token, currentTokens(), currentWhitespaceMode())) { + // Handle the token as text data or as actual token + if (token.id == Tokens::Data) { + handleData(); + } else { + handleToken(token); + } + + // Consume the peeked token + reader.consumePeek(); + } } void StackImpl::fieldStart(bool isDefault) @@ -853,8 +918,7 @@ void StackImpl::fieldStart(bool isDefault) } // If the handler already had a default field we cannot start a new - // field - // (the default field always is the last field) -- mark the command as + // field (the default field always is the last field) -- mark the command as // invalid if (info.hadDefaultField) { logger().error(std::string("Got field start, but command \"") + @@ -862,8 +926,7 @@ void StackImpl::fieldStart(bool isDefault) std::string("\" does not have any more fields")); } - // Copy the isDefault flag to a local variable, the fieldStart method - // will + // Copy the isDefault flag to a local variable, the fieldStart method will // write into this variable bool defaultField = isDefault; @@ -891,40 +954,11 @@ void StackImpl::fieldStart(bool isDefault) void StackImpl::fieldEnd() { - // Unroll the stack until the next explicitly open field - while (!stack.empty()) { - HandlerInfo &info = currentInfo(); - if (info.inField && !info.inImplicitDefaultField) { - break; - } - endCurrentHandler(); - } - - // Fetch the information attached to the current handler - HandlerInfo &info = currentInfo(); - if (!info.inField || info.inImplicitDefaultField || stack.empty()) { - logger().error( - "Got field end, but there is no command for which to end the " - "field."); - return; - } - - // Only continue if the current handler stack is in a valid state, do - // not - // call the fieldEnd function if something went wrong before - if (handlersValid() && !info.hadDefaultField && info.inValidField) 
{ - try { - info.handler->fieldEnd(); - } - catch (LoggableException ex) { - logger().log(ex); - } - } - - // This command no longer is in a field - info.fieldEnd(); + handleFieldEnd(false); } +/* Class StackImpl HandlerCallbacks */ + TokenId StackImpl::registerToken(const std::string &token) { return tokenRegistry.registerToken(token); @@ -950,14 +984,7 @@ Variant StackImpl::readData() if (dataReader != nullptr) { TokenizedDataReaderFork dataReaderFork = dataReader->fork(); Token token; - - // TODO: Use correct token set - TokenSet tokens; - - // TODO: Use correct whitespace mode - WhitespaceMode mode = WhitespaceMode::COLLAPSE; - - dataReaderFork.read(token, tokens, mode); + dataReaderFork.read(token, currentTokens(), currentWhitespaceMode()); if (token.id == Tokens::Data) { Variant res = Variant::fromString(token.content); res.setLocation(token.getLocation()); @@ -967,8 +994,6 @@ Variant StackImpl::readData() return Variant{}; } -bool StackImpl::hasData() { return readData() != nullptr; } - /* Class Stack */ Stack::Stack(ParserCallbacks &parser, ParserContext &ctx, @@ -1013,5 +1038,7 @@ void Stack::fieldStart(bool isDefault) { impl->fieldStart(isDefault); } void Stack::fieldEnd() { impl->fieldEnd(); } void Stack::data(const TokenizedData &data) { impl->data(data); } + +void Stack::data(const std::string &str) { data(TokenizedData(str)); } } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index de281d4..1de7cff 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -150,13 +150,24 @@ public: /** * Function that should be called whenever character data is found in the - * input stream. May only be called if the currently is a command on the + * input stream. May only be called if there currently is a command on the * stack. * * @param data is a TokenizedData instance containing the pre-segmented data * that should be read. 
*/ void data(const TokenizedData &data); + + /** + * Function that may be called whenever character data is found in the + * input stream. May only be called if the currently is a command on the + * stack. This method is mainly intended for unit testing. Pass a + * TokenizedData instance to the + * + * @param str is a string containing the data that should be passed to the + * tokenizer. + */ + void data(const std::string &str); }; } } diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index c3c4f98..d8a8b37 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -29,8 +29,7 @@ namespace ousia { /** * Maximum token length. */ -constexpr TokenLength MaxTokenLength = - std::numeric_limits::max(); +constexpr TokenLength MaxTokenLength = std::numeric_limits::max(); namespace { /** @@ -510,6 +509,13 @@ TokenizedData::TokenizedData(SourceId sourceId) { } +TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart, + SourceId sourceId) + : TokenizedData(sourceId) +{ + append(data, offsStart); +} + TokenizedData::~TokenizedData() {} size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index b72ca02..bc937f2 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -95,6 +95,18 @@ public: */ TokenizedData(SourceId sourceId); + /** + * Creates a new instance of TokenizedData, takes a SourceId and an initial + * string buffer. + * + * @param data is the string that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @param sourceId is the source identifier that should be used for + * constructing the location when returning tokens. 
+ */ + TokenizedData(const std::string &data, SourceOffset offsStart = 0, + SourceId sourceId = InvalidSourceId); + /** * Destructor. Needs to be defined explicitly for freeing a shared pointer * of the incomplete TokenizedDataImpl type. diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index 83966d5..8f6c4df 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -38,70 +39,69 @@ static StandaloneEnvironment env(logger); namespace { +class Parser : public ParserCallbacks { + TokenId registerToken(const std::string &token) override + { + return Tokens::Empty; + } + + void unregisterToken(TokenId id) override + { + // Do nothing here + } +}; + +static Parser parser; + struct Tracker { - int startCount; + int startCommandCount; + int startAnnotationCount; + int startTokenCount; + int endTokenCount; int endCount; int fieldStartCount; int fieldEndCount; - int annotationStartCount; - int annotationEndCount; int dataCount; - Variant::mapType startArgs; - bool fieldStartIsDefault; - size_t fieldStartIdx; - Variant annotationStartClassName; - Variant::mapType annotationStartArgs; - Variant annotationEndClassName; - Variant annotationEndElementName; - TokenizedData dataData; - - bool startResult; - bool fieldStartSetIsDefault; + bool startCommandResult; + bool startAnnotationResult; + bool startTokenResult; + Handler::EndTokenResult endTokenResult; bool fieldStartResult; - bool annotationStartResult; - bool annotationEndResult; bool dataResult; Tracker() { reset(); } void reset() { - startCount = 0; + startCommandCount = 0; + startAnnotationCount = 0; + startTokenCount = 0; + endTokenCount = 0; endCount = 0; fieldStartCount = 0; fieldEndCount = 0; - annotationStartCount = 0; - annotationEndCount = 0; dataCount = 0; - startArgs = Variant::mapType{}; - fieldStartIsDefault = false; - fieldStartIdx = 0; - 
annotationStartClassName = Variant::fromString(std::string{}); - annotationStartArgs = Variant::mapType{}; - annotationEndClassName = Variant::fromString(std::string{}); - annotationEndElementName = Variant::fromString(std::string{}); - dataData = TokenizedData(); - - startResult = true; - fieldStartSetIsDefault = false; + startCommandResult = true; + startAnnotationResult = true; + startTokenResult = true; + endTokenResult = Handler::EndTokenResult::ENDED_THIS; fieldStartResult = true; - annotationStartResult = true; - annotationEndResult = true; dataResult = true; } - void expect(int startCount, int endCount, int fieldStartCount, - int fieldEndCount, int annotationStartCount, - int annotationEndCount, int dataCount) + void expect(int startCommandCount, int endCount, int fieldStartCount, + int fieldEndCount, int dataCount, int startAnnotationCount = 0, + int startTokenCount = 0, int endTokenCount = 0) { - EXPECT_EQ(startCount, this->startCount); + EXPECT_EQ(startCommandCount, this->startCommandCount); + EXPECT_EQ(startAnnotationCount, this->startAnnotationCount); + EXPECT_EQ(startTokenCount, this->startTokenCount); + EXPECT_EQ(endTokenCount, this->endTokenCount); EXPECT_EQ(endCount, this->endCount); EXPECT_EQ(fieldStartCount, this->fieldStartCount); EXPECT_EQ(fieldEndCount, this->fieldEndCount); - EXPECT_EQ(annotationStartCount, this->annotationStartCount); - EXPECT_EQ(annotationEndCount, this->annotationEndCount); EXPECT_EQ(dataCount, this->dataCount); } }; @@ -113,55 +113,44 @@ private: TestHandler(const HandlerData &handlerData) : Handler(handlerData) {} public: - bool start(Variant::mapType &args) override + bool startCommand(Variant::mapType &args) override { - tracker.startCount++; - tracker.startArgs = args; - if (!tracker.startResult) { - logger().error( - "The TestHandler was told not to allow a field start. So it " - "doesn't. 
The TestHandler always obeys its master."); - } - return tracker.startResult; + tracker.startCommandCount++; + return tracker.startCommandResult; } - void end() override { tracker.endCount++; } - - bool fieldStart(bool &isDefault, size_t fieldIdx) override + bool startAnnotation(Variant::mapType &args, + AnnotationType annotationType) override { - tracker.fieldStartCount++; - tracker.fieldStartIsDefault = isDefault; - tracker.fieldStartIdx = fieldIdx; - if (tracker.fieldStartSetIsDefault) { - isDefault = true; - } - return tracker.fieldStartResult; + tracker.startAnnotationCount++; + return tracker.startAnnotationResult; } - void fieldEnd() override { tracker.fieldEndCount++; } + bool startToken(Handle node) override + { + tracker.startTokenCount++; + return tracker.startTokenResult; + } - bool annotationStart(const Variant &className, - Variant::mapType &args) override + EndTokenResult endToken(const Token &token, Handle node) override { - tracker.annotationStartCount++; - tracker.annotationStartClassName = className; - tracker.annotationStartArgs = args; - return tracker.annotationStartResult; + tracker.endTokenCount++; + return tracker.endTokenResult; } - bool annotationEnd(const Variant &className, - const Variant &elementName) override + void end() override { tracker.endCount++; } + + bool fieldStart(bool &isDefault, size_t fieldIdx) override { - tracker.annotationEndCount++; - tracker.annotationEndClassName = className; - tracker.annotationEndElementName = elementName; - return tracker.annotationEndResult; + tracker.fieldStartCount++; + return tracker.fieldStartResult; } - bool data(TokenizedData &data) override + void fieldEnd() override { tracker.fieldEndCount++; } + + bool data() override { tracker.dataCount++; - tracker.dataData = data; return tracker.dataResult; } @@ -205,544 +194,544 @@ TEST(Stack, basicTest) tracker.reset(); logger.reset(); { - Stack s{env.context, States::TestHandlers}; + Stack s{parser, env.context, States::TestHandlers}; 
EXPECT_EQ("", s.currentCommandName()); EXPECT_EQ(&States::None, &s.currentState()); - s.command("document", {}); + s.commandStart("document", {}, true); s.fieldStart(true); s.data("test1"); EXPECT_EQ("document", s.currentCommandName()); EXPECT_EQ(&States::Document, &s.currentState()); - tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fse, dc, sac, stc, etc - s.command("body", {}); + s.commandStart("body", {}, true); s.fieldStart(true); s.data("test2"); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(2, 0, 2, 0, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 2, 0, 2); // scc, ec, fsc, fse, dc, sac, stc, etc - s.command("inner", {}); + s.commandStart("inner", {}, true); s.fieldStart(true); EXPECT_EQ("inner", s.currentCommandName()); EXPECT_EQ(&States::BodyChildren, &s.currentState()); s.fieldEnd(); - tracker.expect(3, 0, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(3, 0, 3, 1, 2); // scc, ec, fsc, fse, dc, sac, stc, etc s.fieldEnd(); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(3, 1, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(3, 1, 3, 2, 2); // scc, ec, fsc, fse, dc, sac, stc, etc - s.command("body", {}); + s.commandStart("body", {}, true); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(4, 2, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(4, 2, 3, 2, 2); // scc, ec, fsc, fse, dc, sac, stc, etc s.fieldStart(true); s.data("test3"); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); s.fieldEnd(); - tracker.expect(4, 2, 4, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(4, 2, 4, 3, 3); // scc, ec, fsc, fse, dc, sac, stc, etc EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, 
&s.currentState()); s.fieldEnd(); - tracker.expect(4, 3, 4, 4, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(4, 3, 4, 4, 3); // scc, ec, fsc, fse, dc, sac, stc, etc EXPECT_EQ("document", s.currentCommandName()); EXPECT_EQ(&States::Document, &s.currentState()); } - tracker.expect(4, 4, 4, 4, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(4, 4, 4, 4, 3); // scc, ec, fsc, fse, dc, sac, stc, etc ASSERT_FALSE(logger.hasError()); } - +/* TEST(Stack, errorInvalidCommands) { - Stack s{env.context, States::TestHandlers}; - tracker.reset(); - EXPECT_THROW(s.command("body", {}), LoggableException); - s.command("document", {}); - s.fieldStart(true); - EXPECT_THROW(s.command("document", {}), LoggableException); - s.command("empty", {}); - s.fieldStart(true); - EXPECT_THROW(s.command("body", {}), LoggableException); - s.command("special", {}); - s.fieldStart(true); - s.fieldEnd(); - s.fieldEnd(); - s.fieldEnd(); - - logger.reset(); - s.fieldEnd(); - ASSERT_TRUE(logger.hasError()); - - EXPECT_THROW(s.data("test"), LoggableException); - EXPECT_EQ(&States::None, &s.currentState()); + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("document", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("document", {}), LoggableException); + s.command("empty", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("special", {}); + s.fieldStart(true); + s.fieldEnd(); + s.fieldEnd(); + s.fieldEnd(); + + logger.reset(); + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); + + EXPECT_THROW(s.data("test"), LoggableException); + EXPECT_EQ(&States::None, &s.currentState()); } TEST(Stack, validation) { - Stack s{env.context, States::TestHandlers}; - tracker.reset(); - logger.reset(); - - s.command("arguments", {}); - EXPECT_TRUE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); - - logger.reset(); - s.command("arguments", {{"a", 5}}); - 
EXPECT_TRUE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); - - logger.reset(); - s.command("arguments", {{"a", 5}, {"b", "test"}}); - EXPECT_FALSE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + logger.reset(); + + s.command("arguments", {}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}, {"b", "test"}}); + EXPECT_FALSE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); } TEST(Stack, invalidCommandName) { - tracker.reset(); - logger.reset(); - - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("a_", {}); - tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(2, 1, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("a_:b", {}); - tracker.expect(3, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(3, 2, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("_a", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("a:", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("a:_b", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc 
+ s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 1, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_:b", {}); + tracker.expect(3, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(3, 2, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + ASSERT_THROW(s.command("_a", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + ASSERT_THROW(s.command("a:", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + ASSERT_THROW(s.command("a:_b", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, multipleFields) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {{"a", false}}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("a", s.currentCommandName()); - EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startArgs); - - s.fieldStart(false); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_FALSE(tracker.fieldStartIsDefault); - EXPECT_EQ(0U, tracker.fieldStartIdx); - - s.data("test"); - tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test", tracker.dataData.text().asString()); - - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(false); - tracker.expect(1, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_FALSE(tracker.fieldStartIsDefault); - EXPECT_EQ(1U, tracker.fieldStartIdx); - - s.data("test2"); - tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test2", 
tracker.dataData.text().asString()); - - s.fieldEnd(); - tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(true); - tracker.expect(1, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_TRUE(tracker.fieldStartIsDefault); - EXPECT_EQ(2U, tracker.fieldStartIdx); - - s.data("test3"); - tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test3", tracker.dataData.text().asString()); - - s.fieldEnd(); - tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {{"a", false}}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("a", s.currentCommandName()); + EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startArgs); + + s.fieldStart(false); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(0U, tracker.fieldStartIdx); + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test", tracker.dataData.text().asString()); + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + tracker.expect(1, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(1U, tracker.fieldStartIdx); + + s.data("test2"); + tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test2", tracker.dataData.text().asString()); + + s.fieldEnd(); + tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(true); + tracker.expect(1, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_TRUE(tracker.fieldStartIsDefault); + EXPECT_EQ(2U, tracker.fieldStartIdx); + 
+ s.data("test3"); + tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test3", tracker.dataData.text().asString()); + + s.fieldEnd(); + tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnNewCommand) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnNewCommandWithExplicitDefaultField) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - - s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(2, 0, 2, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); 
+ tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 0, 2, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noImplicitDefaultFieldOnIncompatibleCommand) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - - tracker.fieldStartResult = false; - s.command("b", {}); - tracker.expect(2, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.fieldStartResult = false; + s.command("b", {}); + tracker.expect(2, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noImplicitDefaultFieldIfDefaultFieldGiven) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - s.fieldStart(true); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", 
s.currentCommandName()); - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - - s.command("b", {}); - tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldStart(true); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noEndIfStartFails) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - - tracker.startResult = false; - s.command("b", {}); - tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_TRUE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.startResult = false; + s.command("b", {}); + tracker.expect(3, 1, 1, 
1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_TRUE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnData) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.data("test"); - tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, autoFieldEnd) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, autoImplicitFieldEnd) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - s.command("b", {}); - s.command("c", {}); - s.command("d", {}); - s.command("e", {}); - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(5, 0, 5, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - 
tracker.expect(5, 5, 5, 5, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + s.command("b", {}); + s.command("c", {}); + s.command("d", {}); + s.command("e", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(5, 0, 5, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(5, 5, 5, 5, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, invalidDefaultField) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, errorInvalidDefaultFieldData) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - s.fieldStart(true); - ASSERT_FALSE(logger.hasError()); - s.data("test"); - ASSERT_TRUE(logger.hasError()); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + ASSERT_FALSE(logger.hasError()); + s.data("test"); + 
ASSERT_TRUE(logger.hasError()); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, errorInvalidFieldData) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - ASSERT_FALSE(logger.hasError()); - s.fieldStart(false); - ASSERT_TRUE(logger.hasError()); - s.data("test"); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + s.data("test"); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, errorFieldStartNoCommand) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - Stack s{env.context, States::AnyHandlers}; - ASSERT_THROW(s.fieldStart(false), LoggableException); - ASSERT_THROW(s.fieldStart(true), LoggableException); - tracker.expect(0, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + Stack s{env.context, States::AnyHandlers}; + ASSERT_THROW(s.fieldStart(false), LoggableException); + ASSERT_THROW(s.fieldStart(true), LoggableException); + tracker.expect(0, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, errorMultipleFieldStarts) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(false); - ASSERT_FALSE(logger.hasError()); - s.fieldStart(false); - ASSERT_TRUE(logger.hasError()); - 
tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, errorMultipleFieldEnds) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(false); - s.fieldEnd(); - ASSERT_FALSE(logger.hasError()); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldEnd(); - ASSERT_TRUE(logger.hasError()); - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + s.fieldEnd(); + ASSERT_FALSE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, errorOpenField) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, 
States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(false); - ASSERT_FALSE(logger.hasError()); - } - ASSERT_TRUE(logger.hasError()); - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + } + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc } TEST(Stack, fieldEndWhenImplicitDefaultFieldOpen) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - s.fieldStart(true); - s.command("b", {}); - s.data("test"); - s.fieldEnd(); - tracker.expect(2, 1, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(2, 2, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + s.fieldStart(true); + s.command("b", {}); + s.data("test"); + s.fieldEnd(); + tracker.expect(2, 1, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(2, 2, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, fieldAfterDefaultField) { - tracker.reset(); - logger.reset(); - - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldStart(false); - tracker.expect(2, 0, 2, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.data("f1"); - tracker.expect(2, 0, 2, 0, 
0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - s.fieldEnd(); - tracker.expect(2, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - tracker.fieldStartSetIsDefault = true; - - s.fieldStart(false); - tracker.fieldStartSetIsDefault = false; - tracker.expect(2, 0, 3, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - s.data("f2"); - tracker.expect(2, 0, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - s.fieldEnd(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_FALSE(logger.hasError()); - s.fieldStart(false); - ASSERT_TRUE(logger.hasError()); - logger.reset(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - s.data("f3"); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - s.fieldEnd(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - - s.fieldEnd(); - tracker.expect(2, 1, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(2, 2, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); -} + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldStart(true); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + tracker.expect(2, 0, 2, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.data("f1"); + tracker.expect(2, 0, 2, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + tracker.expect(2, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.fieldStartSetIsDefault = true; + + s.fieldStart(false); + tracker.fieldStartSetIsDefault = false; + tracker.expect(2, 0, 3, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + s.data("f2"); + tracker.expect(2, 0, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + 
tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + logger.reset(); + tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + s.data("f3"); + tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + tracker.expect(2, 1, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(2, 2, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +}*/ } } -- cgit v1.2.3 From 21aa94db203c0b1bcab18bc4858edcdb2afc894d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Tue, 3 Mar 2015 14:33:55 +0100 Subject: Reactivated main program --- CMakeLists.txt | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c7ad7a3..f6807f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,19 +276,19 @@ TARGET_LINK_LIBRARIES(ousia_xml # Command line interface -#ADD_EXECUTABLE(ousia -# src/cli/Main -#) +ADD_EXECUTABLE(ousia + src/cli/Main +) -#TARGET_LINK_LIBRARIES(ousia -# ousia_core -# ousia_filesystem -# ousia_html -# ousia_xml -# ousia_osml -# ousia_osxml -# ${Boost_LIBRARIES} -#) +TARGET_LINK_LIBRARIES(ousia + ousia_core + ousia_filesystem + ousia_html + ousia_xml + ousia_osml + ousia_osxml + ${Boost_LIBRARIES} +) # If testing is enabled, build the unit tests IF(TEST) -- cgit v1.2.3 From fb8d4cdf01909b61e4e5d0806ec6de178ff0058c Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Tue, 3 Mar 2015 14:34:14 +0100 Subject: Finished stack and adapted all unit tests --- src/core/parser/stack/Stack.cpp | 218 +++++++--- src/core/parser/stack/Stack.hpp | 4 +- test/core/parser/stack/StackTest.cpp | 772 ++++++++++++++++++++--------------- 3 files changed, 595 insertions(+), 399 deletions(-) diff --git 
a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 89217ea..f341f1d 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -30,9 +30,15 @@ #include "TokenRegistry.hpp" #include "TokenStack.hpp" +#define STACK_DEBUG_OUTPUT 0 +#if STACK_DEBUG_OUTPUT +#include +#endif + namespace ousia { namespace parser_stack { namespace { + /* Class HandlerInfo */ /** @@ -86,12 +92,6 @@ public: */ bool inImplicitDefaultField : 1; - /** - * Set to true if the handler current is in an implicitly started range - * field. - */ - bool inImplicitRangeField: 1; - /** * Set to false if this field is only opened pro-forma and does not accept * any data. Otherwise set to true. @@ -109,11 +109,10 @@ public: HandlerInfo(); /** - * Constructor of the HandlerInfo class, allows to set all flags manually. + * Constructor of the HandlerInfo class, allows to set some flags manually. */ - HandlerInfo(bool valid, bool implicit, bool range, bool inField, - bool inDefaultField, bool inImplicitDefaultField, - bool inValidField); + HandlerInfo(bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField); /** * Constructor of the HandlerInfo class, taking a shared_ptr to the handler @@ -178,18 +177,17 @@ HandlerInfo::HandlerInfo(std::shared_ptr handler) { } -HandlerInfo::HandlerInfo(bool valid, bool implicit, bool range, bool inField, - bool inDefaultField, bool inImplicitDefaultField, - bool inValidField) +HandlerInfo::HandlerInfo(bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField) : handler(nullptr), fieldIdx(0), - valid(valid), + valid(true), implicit(implicit), - range(range), + range(false), inField(inField), inDefaultField(inDefaultField), inImplicitDefaultField(inImplicitDefaultField), - inValidField(inValidField), + inValidField(true), hadDefaultField(false) { } @@ -235,7 +233,7 @@ void HandlerInfo::fieldEnd() /** * Stub instance of HandlerInfo containing no handler information. 
*/ -static HandlerInfo EmptyHandlerInfo{true, true, false, true, true, false, true}; +static HandlerInfo EmptyHandlerInfo{true, true, true, true}; /** * Small helper class makeing sure the reference at some variable is reset once @@ -386,8 +384,10 @@ private: /** * Ends the current handler and removes the corresponding element from the * stack. + * + * @return true if a command was ended, false otherwise. */ - void endCurrentHandler(); + bool endCurrentHandler(); /** * Ends all handlers that currently are not inside a field and already had @@ -396,8 +396,10 @@ private: * field yet. This method is called whenever the data(), startAnnotation(), * startToken(), startCommand(), annotationStart() or annotationEnd() events * are reached. + * + * @return true if the current command is in a valid field. */ - void prepareCurrentHandler(); + bool prepareCurrentHandler(bool startImplicitDefaultField = true); /** * Returns true if all handlers on the stack are currently valid, or false @@ -413,23 +415,23 @@ private: */ void handleData(); - /** - * Called whenever there is a token waiting to be processed. If possible - * tries to end a current handler with this token or to start a new handler - * with the token. - * - * @param token is the token that should be handled. - */ - void handleToken(const Token &token); + /** + * Called whenever there is a token waiting to be processed. If possible + * tries to end a current handler with this token or to start a new handler + * with the token. + * + * @param token is the token that should be handled. + */ + void handleToken(const Token &token); /** * Called by the rangeEnd() and fieldEnd() methods to end the current ranged * command. * - * @param rangeCommand specifies whether this should end the range of a + * @param endRange specifies whether this should end the range of a * command with range. 
*/ - void handleFieldEnd(bool rangeCommand); + void handleFieldEnd(bool endRange); public: StackImpl(ParserCallbacks &parser, ParserContext &ctx, @@ -579,10 +581,10 @@ std::string StackImpl::currentCommandName() const TokenSet StackImpl::currentTokens() const { // TODO: Implement - return Tokens{}; + return TokenSet{}; } -WhitespaceMode currentWhitespaceMode() const +WhitespaceMode StackImpl::currentWhitespaceMode() const { // TODO: Implement return WhitespaceMode::COLLAPSE; @@ -599,7 +601,7 @@ HandlerInfo &StackImpl::lastInfo() /* Stack helper functions */ -void StackImpl::endCurrentHandler() +bool StackImpl::endCurrentHandler() { if (!stack.empty()) { // Fetch the handler info for the current top-level element @@ -623,29 +625,43 @@ void StackImpl::endCurrentHandler() // Remove the element from the stack stack.pop_back(); + return true; } + return false; } -void StackImpl::prepareCurrentHandler() +bool StackImpl::prepareCurrentHandler(bool startImplicitDefaultField) { // Repeat until a valid handler is found on the stack - while (true) { + while (!stack.empty()) { // Fetch the handler for the current top-level element HandlerInfo &info = currentInfo(); // If the current Handler is in a field, there is nothing to be done, // abort if (info.inField) { - return; + return true; } // If the current field already had a default field or is not valid, // end it and repeat - if (info.hadDefaultField || !info.valid) { + if ((info.hadDefaultField || !startImplicitDefaultField) || + !info.valid) { + // We cannot end the command if it is marked as "range" command + if (info.range) { + return false; + } + + // End the current handler endCurrentHandler(); continue; } + // Abort if starting new default fields is not allowed here + if (!startImplicitDefaultField) { + return false; + } + // Try to start a new default field, abort if this did not work bool isDefault = true; if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { @@ -655,8 +671,10 @@ void 
StackImpl::prepareCurrentHandler() // Mark the field as started and return -- the field should be marked // is implicit if this is not a field with range - info.fieldStart(true, !info.range, true, info.range); + info.fieldStart(true, !info.range, true); + return true; } + return false; } bool StackImpl::handlersValid() @@ -675,7 +693,9 @@ void StackImpl::handleData() while (true) { // Prepare the stack -- make sure all overdue handlers are ended and // we currently are in an open field - prepareCurrentHandler(); + if (stack.empty() || !prepareCurrentHandler()) { + throw LoggableException("Did not expect any data here"); + } // Fetch the current handler information HandlerInfo &info = currentInfo(); @@ -684,7 +704,7 @@ void StackImpl::handleData() // call the "data" handler if (!info.inValidField) { if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); + logger().error("Did not expect any data here"); } return; } @@ -722,24 +742,25 @@ void StackImpl::handleData() // Commit the content of the logger fork. Do not change the valid flag. 
loggerFork.commit(); + return; } } -void StackImpl::handleToken(const Token &token) { +void StackImpl::handleToken(const Token &token) +{ // TODO: Implement // Just eat them for now } -void StackImpl::handleFieldEnd(bool rangedCommand) +void StackImpl::handleFieldEnd(bool endRange) { - // Throw away all overdue handlers, start the default field at least once - // if this has not been done yet (this is important for range commands) - prepareStack(); + // Throw away all overdue handlers + prepareCurrentHandler(false); // Close all implicit default fields while (!stack.empty()) { HandlerInfo &info = currentInfo(); - if (!info.inImplicitDefaultField) { + if (!info.inImplicitDefaultField || info.range) { break; } endCurrentHandler(); @@ -747,16 +768,37 @@ void StackImpl::handleFieldEnd(bool rangedCommand) // Fetch the information attached to the current handler HandlerInfo &info = currentInfo(); - if (!info.inField || stack.empty()) { - logger().error("Got field end, but there is no field here to end"); + if (stack.empty() || (!info.inField && !endRange) || + (!info.range && endRange)) { + if (endRange) { + logger().error( + "Got end of range, but there is no command here to end"); + } else { + logger().error("Got field end, but there is no field here to end"); + } return; } // Only continue if the current handler stack is in a valid state, do not // call the fieldEnd function if something went wrong before if (handlersValid()) { - if (info.range && info.inDefaultField) - info.handler->fieldEnd(); + // End the current field if it is valid + if (info.inValidField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // End the complete command if this is a range command, start the + // default field for once if range command did not have a default field + if (info.range && endRange) { + if (!info.hadDefaultField) { + bool isDefault = true; + info.handler->fieldStart(isDefault, true); + info.fieldStart(true, true, true); + } + endCurrentHandler(); + return; + } } // 
This command no longer is in a field @@ -768,6 +810,9 @@ void StackImpl::handleFieldEnd(bool rangedCommand) void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, bool range) { + // Call prepareCurrentHandler once to end all overdue commands + prepareCurrentHandler(); + // Make sure the given identifier is valid (preventing "*" from being // malicously passed to this function) if (!Utils::isNamespacedIdentifier(name.asString())) { @@ -787,8 +832,8 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, const State *targetState = findTargetStateOrWildcard(name.asString()); if (targetState == nullptr) { HandlerInfo &info = currentInfo(); - if (info.inImplicitDefaultField || !info.inField) { - endCurrentHandler(); + if ((info.inImplicitDefaultField || !info.inField) && + endCurrentHandler()) { continue; } else { throw buildInvalidCommandException(name.asString(), @@ -843,9 +888,10 @@ void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, // not valid -- remove both the new handler and the parent field from // the stack if (!info.valid && parentInfo.inImplicitDefaultField) { - endCurrentHandler(); - endCurrentHandler(); - continue; + // Only continue if the parent handler could actually be removed + if (endCurrentHandler() && endCurrentHandler()) { + continue; + } } // If we ended up here, starting the command may or may not have @@ -870,10 +916,7 @@ void StackImpl::annotationEnd(const Variant &className, // TODO } -void StackImpl::rangeEnd() -{ - handleFieldEnd(true); -} +void StackImpl::rangeEnd() { handleFieldEnd(true); } void StackImpl::data(const TokenizedData &data) { @@ -882,7 +925,7 @@ void StackImpl::data(const TokenizedData &data) // Use the GuardedTemporaryPointer to make sure that the member variable // dataReader is resetted to nullptr once this scope is left. 
- GuardedTemporaryPointer ptr(&reader, &dataReader); + GuardedTemporaryPointer ptr(&reader, &dataReader); // Peek a token from the reader, repeat until all tokens have been read Token token; @@ -952,10 +995,7 @@ void StackImpl::fieldStart(bool isDefault) info.fieldStart(defaultField, false, valid); } -void StackImpl::fieldEnd() -{ - handleFieldEnd(false); -} +void StackImpl::fieldEnd() { handleFieldEnd(false); } /* Class StackImpl HandlerCallbacks */ @@ -1017,28 +1057,70 @@ std::string Stack::currentCommandName() const void Stack::commandStart(const Variant &name, const Variant::mapType &args, bool range) { +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: commandStart " << name << " " << args << " " << range + << std::endl; +#endif impl->commandStart(name, args, range); } void Stack::annotationStart(const Variant &className, const Variant &args, bool range) { +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: annotationStart " << className << " " << args << " " + << range << std::endl; +#endif impl->annotationStart(className, args, range); } void Stack::annotationEnd(const Variant &className, const Variant &elementName) { +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: annotationEnd " << className << " " << elementName + << std::endl; +#endif impl->annotationEnd(className, elementName); } -void Stack::rangeEnd() { impl->rangeEnd(); } +void Stack::rangeEnd() +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: rangeEnd" << std::endl; +#endif + impl->rangeEnd(); +} -void Stack::fieldStart(bool isDefault) { impl->fieldStart(isDefault); } +void Stack::fieldStart(bool isDefault) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: fieldStart " << isDefault << std::endl; +#endif + impl->fieldStart(isDefault); +} -void Stack::fieldEnd() { impl->fieldEnd(); } +void Stack::fieldEnd() +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: fieldEnd" << std::endl; +#endif + impl->fieldEnd(); +} -void Stack::data(const TokenizedData &data) { impl->data(data); } +void Stack::data(const 
TokenizedData &data) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: data" << std::endl; +#endif + impl->data(data); +} -void Stack::data(const std::string &str) { data(TokenizedData(str)); } +void Stack::data(const std::string &str) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: data (string) " << str << std::endl; +#endif + data(TokenizedData(str)); +} } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index 1de7cff..6d42f10 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -104,7 +104,7 @@ public: * @param range if true, the started command has an explicit range. */ void commandStart(const Variant &name, const Variant::mapType &args, - bool range); + bool range = false); /** * Function that should be called whenever an annotation starts. @@ -115,7 +115,7 @@ public: * @param range if true, the annotation fields have an explicit range. */ void annotationStart(const Variant &className, const Variant &args, - bool range); + bool range = false); /** * Function that should be called whenever an annotation ends. 
diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index 8f6c4df..a831c32 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -70,6 +70,16 @@ struct Tracker { bool fieldStartResult; bool dataResult; + Variant::mapType startCommandArgs; + Variant::mapType startAnnotationArgs; + + bool fieldStartReturnValue; + size_t fieldStartIdx; + bool fieldStartIsDefault; + bool fieldStartSetIsDefault; + + Variant dataData; + Tracker() { reset(); } void reset() @@ -89,6 +99,15 @@ struct Tracker { endTokenResult = Handler::EndTokenResult::ENDED_THIS; fieldStartResult = true; dataResult = true; + + startCommandArgs = Variant::mapType{}; + startAnnotationArgs = Variant::mapType{}; + + fieldStartIdx = 0; + fieldStartIsDefault = false; + fieldStartSetIsDefault = false; + + dataData = Variant{}; } void expect(int startCommandCount, int endCount, int fieldStartCount, @@ -115,13 +134,20 @@ private: public: bool startCommand(Variant::mapType &args) override { + tracker.startCommandArgs = args; tracker.startCommandCount++; + if (!tracker.startCommandResult) { + logger().error( + "TestHandler was told not to allow a command start. 
" + "TestHandler always obeys its master."); + } return tracker.startCommandResult; } bool startAnnotation(Variant::mapType &args, AnnotationType annotationType) override { + tracker.startAnnotationArgs = args; tracker.startAnnotationCount++; return tracker.startAnnotationResult; } @@ -142,6 +168,11 @@ public: bool fieldStart(bool &isDefault, size_t fieldIdx) override { + tracker.fieldStartIsDefault = isDefault; + tracker.fieldStartIdx = fieldIdx; + if (tracker.fieldStartSetIsDefault) { + isDefault = true; + } tracker.fieldStartCount++; return tracker.fieldStartResult; } @@ -150,6 +181,7 @@ public: bool data() override { + tracker.dataData = readData(); tracker.dataCount++; return tracker.dataResult; } @@ -199,456 +231,518 @@ TEST(Stack, basicTest) EXPECT_EQ("", s.currentCommandName()); EXPECT_EQ(&States::None, &s.currentState()); - s.commandStart("document", {}, true); + s.commandStart("document", {}); s.fieldStart(true); s.data("test1"); EXPECT_EQ("document", s.currentCommandName()); EXPECT_EQ(&States::Document, &s.currentState()); - tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc - s.commandStart("body", {}, true); + s.commandStart("body", {}); s.fieldStart(true); s.data("test2"); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(2, 0, 2, 0, 2); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(2, 0, 2, 0, 2); // scc, ec, fsc, fec, dc, sac, stc, etc - s.commandStart("inner", {}, true); + s.commandStart("inner", {}); s.fieldStart(true); EXPECT_EQ("inner", s.currentCommandName()); EXPECT_EQ(&States::BodyChildren, &s.currentState()); s.fieldEnd(); - tracker.expect(3, 0, 3, 1, 2); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(3, 0, 3, 1, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldEnd(); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(3, 
1, 3, 2, 2); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(3, 1, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc - s.commandStart("body", {}, true); + s.commandStart("body", {}); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); - tracker.expect(4, 2, 3, 2, 2); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(4, 2, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldStart(true); s.data("test3"); EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); s.fieldEnd(); - tracker.expect(4, 2, 4, 3, 3); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(4, 2, 4, 3, 3); // scc, ec, fsc, fec, dc, sac, stc, etc EXPECT_EQ("body", s.currentCommandName()); EXPECT_EQ(&States::Body, &s.currentState()); s.fieldEnd(); - tracker.expect(4, 3, 4, 4, 3); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(4, 3, 4, 4, 3); // scc, ec, fsc, fec, dc, sac, stc, etc EXPECT_EQ("document", s.currentCommandName()); EXPECT_EQ(&States::Document, &s.currentState()); } - tracker.expect(4, 4, 4, 4, 3); // scc, ec, fsc, fse, dc, sac, stc, etc + tracker.expect(4, 4, 4, 4, 3); // scc, ec, fsc, fec, dc, sac, stc, etc ASSERT_FALSE(logger.hasError()); } -/* + +TEST(Stack, basicTestRangeCommands) +{ + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::TestHandlers}; + + EXPECT_EQ("", s.currentCommandName()); + EXPECT_EQ(&States::None, &s.currentState()); + + s.commandStart("document", {}, true); + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + s.data("test1"); + tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + + s.commandStart("body", {}, true); + tracker.expect(2, 0, 1, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + s.data("test2"); + tracker.expect(2, 0, 2, 0, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + 
EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + + s.commandStart("inner", {}, true); + tracker.expect(3, 0, 2, 0, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("inner", s.currentCommandName()); + EXPECT_EQ(&States::BodyChildren, &s.currentState()); + s.rangeEnd(); + tracker.expect(3, 1, 3, 1, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + s.rangeEnd(); + tracker.expect(3, 2, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + + s.commandStart("body", {}, true); + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + tracker.expect(4, 2, 3, 2, 2); // scc, ec, fsc, fse, dc, sac, stc, etc + s.fieldStart(true); + tracker.expect(4, 2, 4, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + s.data("test3"); + tracker.expect(4, 2, 4, 2, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + s.fieldEnd(); + tracker.expect(4, 2, 4, 3, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + s.rangeEnd(); + tracker.expect(4, 3, 4, 3, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + s.rangeEnd(); + tracker.expect(4, 4, 4, 4, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(4, 4, 4, 4, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); +} + TEST(Stack, errorInvalidCommands) { - Stack s{env.context, States::TestHandlers}; - tracker.reset(); - EXPECT_THROW(s.command("body", {}), LoggableException); - s.command("document", {}); - s.fieldStart(true); - EXPECT_THROW(s.command("document", {}), LoggableException); - s.command("empty", {}); - s.fieldStart(true); - EXPECT_THROW(s.command("body", {}), 
LoggableException); - s.command("special", {}); - s.fieldStart(true); - s.fieldEnd(); - s.fieldEnd(); - s.fieldEnd(); + Stack s{parser, env.context, States::TestHandlers}; + tracker.reset(); + EXPECT_THROW(s.commandStart("body", {}), LoggableException); + s.commandStart("document", {}); + s.fieldStart(true); + EXPECT_THROW(s.commandStart("document", {}), LoggableException); + s.commandStart("empty", {}); + s.fieldStart(true); + EXPECT_THROW(s.commandStart("body", {}), LoggableException); + s.commandStart("special", {}); + s.fieldStart(true); + s.fieldEnd(); + s.fieldEnd(); + s.fieldEnd(); - logger.reset(); - s.fieldEnd(); - ASSERT_TRUE(logger.hasError()); + logger.reset(); + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); - EXPECT_THROW(s.data("test"), LoggableException); - EXPECT_EQ(&States::None, &s.currentState()); + EXPECT_THROW(s.data("test"), LoggableException); + EXPECT_EQ(&States::None, &s.currentState()); } TEST(Stack, validation) { - Stack s{env.context, States::TestHandlers}; - tracker.reset(); - logger.reset(); + Stack s{parser, env.context, States::TestHandlers}; + tracker.reset(); + logger.reset(); - s.command("arguments", {}); - EXPECT_TRUE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); + s.commandStart("arguments", {}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); - logger.reset(); - s.command("arguments", {{"a", 5}}); - EXPECT_TRUE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); + logger.reset(); + s.commandStart("arguments", {{"a", 5}}, false); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); - logger.reset(); - s.command("arguments", {{"a", 5}, {"b", "test"}}); - EXPECT_FALSE(logger.hasError()); - s.fieldStart(true); - s.fieldEnd(); + logger.reset(); + s.commandStart("arguments", {{"a", 5}, {"b", "test"}}, false); + EXPECT_FALSE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); } TEST(Stack, invalidCommandName) { - tracker.reset(); - logger.reset(); + tracker.reset(); 
+ logger.reset(); - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("a_", {}); - tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(2, 1, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - s.command("a_:b", {}); - tracker.expect(3, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(3, 2, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("_a", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("a:", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - - ASSERT_THROW(s.command("a:_b", {}), LoggableException); - tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + s.commandStart("a_", {}); + tracker.expect(2, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 1, 2, 2, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + s.commandStart("a_:b", {}); + tracker.expect(3, 2, 2, 2, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(3, 2, 3, 3, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + ASSERT_THROW(s.commandStart("_a", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + ASSERT_THROW(s.commandStart("a:", {}), LoggableException); + tracker.expect(3, 3, 
3, 3, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + + ASSERT_THROW(s.commandStart("a:_b", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, multipleFields) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {{"a", false}}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("a", s.currentCommandName()); - EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startArgs); + s.commandStart("a", {{"a", false}}, false); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("a", s.currentCommandName()); + EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startCommandArgs); - s.fieldStart(false); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_FALSE(tracker.fieldStartIsDefault); - EXPECT_EQ(0U, tracker.fieldStartIdx); + s.fieldStart(false); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(0U, tracker.fieldStartIdx); - s.data("test"); - tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test", tracker.dataData.text().asString()); + s.data("test"); + tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("test", tracker.dataData.asString()); - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 1); // scc, ec, fsc, fec, dc, sac, stc, etc - s.fieldStart(false); - tracker.expect(1, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_FALSE(tracker.fieldStartIsDefault); - EXPECT_EQ(1U, tracker.fieldStartIdx); + s.fieldStart(false); + tracker.expect(1, 0, 2, 1, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_FALSE(tracker.fieldStartIsDefault); 
+ EXPECT_EQ(1U, tracker.fieldStartIdx); - s.data("test2"); - tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test2", tracker.dataData.text().asString()); + s.data("test2"); + tracker.expect(1, 0, 2, 1, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("test2", tracker.dataData.asString()); - s.fieldEnd(); - tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + tracker.expect(1, 0, 2, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc - s.fieldStart(true); - tracker.expect(1, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_TRUE(tracker.fieldStartIsDefault); - EXPECT_EQ(2U, tracker.fieldStartIdx); + s.fieldStart(true); + tracker.expect(1, 0, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_TRUE(tracker.fieldStartIsDefault); + EXPECT_EQ(2U, tracker.fieldStartIdx); - s.data("test3"); - tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - EXPECT_EQ("test3", tracker.dataData.text().asString()); + s.data("test3"); + tracker.expect(1, 0, 3, 2, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ("test3", tracker.dataData.asString()); - s.fieldEnd(); - tracker.expect(1, 0, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + s.fieldEnd(); + tracker.expect(1, 0, 3, 3, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 3, 3, 3); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnNewCommand) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - 
s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + s.commandStart("b", {}); + tracker.expect(2, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(2, 2, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnNewCommandWithExplicitDefaultField) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); - s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(2, 0, 2, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + s.commandStart("b", {}); + tracker.expect(2, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("b", s.currentCommandName()); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 0, 2, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 2, 2, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noImplicitDefaultFieldOnIncompatibleCommand) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; 
- s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); - tracker.fieldStartResult = false; - s.command("b", {}); - tracker.expect(2, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + tracker.fieldStartResult = false; + s.commandStart("b", {}); + tracker.expect(2, 1, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noImplicitDefaultFieldIfDefaultFieldGiven) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - s.fieldStart(true); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldStart(true); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); - s.command("b", {}); - tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, 
dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + s.commandStart("b", {}); + tracker.expect(2, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, noEndIfStartFails) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("a", s.currentCommandName()); + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_EQ("a", s.currentCommandName()); - tracker.startResult = false; - s.command("b", {}); - tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_EQ("b", s.currentCommandName()); - } - tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_TRUE(logger.hasError()); + tracker.startCommandResult = false; + s.commandStart("b", {}); + tracker.expect(3, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + EXPECT_EQ(&States::None, &s.currentState()); + } + tracker.expect(3, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_TRUE(logger.hasError()); } TEST(Stack, implicitDefaultFieldOnData) { - tracker.reset(); - logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + { + Stack s{parser, env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - s.data("test"); - tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - } - 
tracker.expect(1, 1, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + s.data("test"); + tracker.expect(1, 0, 1, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 1, 1); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, autoFieldEnd) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, autoImplicitFieldEnd) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - s.command("b", {}); - s.command("c", {}); - s.command("d", {}); - s.command("e", {}); - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(5, 0, 5, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(5, 5, 5, 5, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + s.commandStart("b", {}); + s.commandStart("c", {}); + s.commandStart("d", {}); + s.commandStart("e", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(5, 0, 5, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(5, 5, 5, 5, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, invalidDefaultField) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, 
States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - s.fieldStart(true); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - ASSERT_FALSE(logger.hasError()); + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); } TEST(Stack, errorInvalidDefaultFieldData) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - s.fieldStart(true); - ASSERT_FALSE(logger.hasError()); - s.data("test"); - ASSERT_TRUE(logger.hasError()); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + ASSERT_FALSE(logger.hasError()); + s.data("test"); + ASSERT_TRUE(logger.hasError()); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, errorInvalidFieldData) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.fieldStartResult = false; - ASSERT_FALSE(logger.hasError()); - s.fieldStart(false); - ASSERT_TRUE(logger.hasError()); - s.data("test"); - s.fieldEnd(); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 0, 0, 
0, 0); // sc, ec, fsc, fse, asc, aec, dc + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.fieldStartResult = false; + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + s.data("test"); + s.fieldEnd(); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, errorFieldStartNoCommand) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - Stack s{env.context, States::AnyHandlers}; - ASSERT_THROW(s.fieldStart(false), LoggableException); - ASSERT_THROW(s.fieldStart(true), LoggableException); - tracker.expect(0, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + Stack s{parser, env.context, States::AnyHandlers}; + ASSERT_THROW(s.fieldStart(false), LoggableException); + ASSERT_THROW(s.fieldStart(true), LoggableException); + tracker.expect(0, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, errorMultipleFieldStarts) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - s.fieldStart(false); - ASSERT_FALSE(logger.hasError()); - s.fieldStart(false); - ASSERT_TRUE(logger.hasError()); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - s.fieldEnd(); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + 
tracker.expect(1, 0, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, errorMultipleFieldEnds) { - tracker.reset(); - logger.reset(); + tracker.reset(); + logger.reset(); - { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + { + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - s.fieldStart(false); - s.fieldEnd(); - ASSERT_FALSE(logger.hasError()); - tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - s.fieldEnd(); - ASSERT_TRUE(logger.hasError()); - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc - } - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldStart(false); + s.fieldEnd(); + ASSERT_FALSE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(1, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, errorOpenField) @@ -657,15 +751,15 @@ TEST(Stack, errorOpenField) logger.reset(); { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldStart(false); ASSERT_FALSE(logger.hasError()); } ASSERT_TRUE(logger.hasError()); - tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(1, 1, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc } TEST(Stack, fieldEndWhenImplicitDefaultFieldOpen) @@ -674,15 +768,15 @@ TEST(Stack, 
fieldEndWhenImplicitDefaultFieldOpen) logger.reset(); { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); s.fieldStart(true); - s.command("b", {}); + s.commandStart("b", {}); s.data("test"); s.fieldEnd(); - tracker.expect(2, 1, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 1, 2, 2, 1); // scc, ec, fsc, fec, dc, sac, stc, etc } - tracker.expect(2, 2, 2, 2, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 2, 2, 2, 1); // scc, ec, fsc, fec, dc, sac, stc, etc ASSERT_FALSE(logger.hasError()); } @@ -692,46 +786,66 @@ TEST(Stack, fieldAfterDefaultField) logger.reset(); { - Stack s{env.context, States::AnyHandlers}; - s.command("a", {}); - tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + Stack s{parser, env.context, States::AnyHandlers}; + s.commandStart("a", {}); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldStart(true); - tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(1, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc - s.command("b", {}); - tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.commandStart("b", {}); + tracker.expect(2, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldStart(false); - tracker.expect(2, 0, 2, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 2, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc s.data("f1"); - tracker.expect(2, 0, 2, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 2, 0, 1); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldEnd(); - tracker.expect(2, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 2, 1, 1); // scc, ec, fsc, fec, dc, sac, stc, etc tracker.fieldStartSetIsDefault = true; s.fieldStart(false); tracker.fieldStartSetIsDefault = false; - tracker.expect(2, 0, 3, 1, 0, 0, 1); // sc, ec, 
fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 1, 1); // scc, ec, fsc, fec, dc, sac, stc, etc s.data("f2"); - tracker.expect(2, 0, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 1, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldEnd(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc ASSERT_FALSE(logger.hasError()); s.fieldStart(false); ASSERT_TRUE(logger.hasError()); logger.reset(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.data("f3"); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldEnd(); - tracker.expect(2, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 0, 3, 2, 2); // scc, ec, fsc, fec, dc, sac, stc, etc s.fieldEnd(); - tracker.expect(2, 1, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 1, 3, 3, 2); // scc, ec, fsc, fec, dc, sac, stc, etc } - tracker.expect(2, 2, 3, 3, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + tracker.expect(2, 2, 3, 3, 2); // scc, ec, fsc, fec, dc, sac, stc, etc ASSERT_FALSE(logger.hasError()); -}*/ +} + +TEST(Stack, rangeCommandUnranged) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{parser, env.context, States::AnyHandlers}; + tracker.expect(0, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.commandStart("a", {}, true); + tracker.expect(1, 0, 0, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.commandStart("b", {}); + tracker.expect(2, 0, 1, 0, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + s.rangeEnd(); + tracker.expect(2, 2, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + } + tracker.expect(2, 2, 1, 1, 0); // scc, ec, fsc, fec, dc, sac, stc, etc + ASSERT_FALSE(logger.hasError()); +} + } } -- cgit v1.2.3