diff options
Diffstat (limited to 'src/core')
35 files changed, 3272 insertions, 1430 deletions
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp index d5d379c..f7dbdf3 100644 --- a/src/core/common/SourceContextReader.cpp +++ b/src/core/common/SourceContextReader.cpp @@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader, ctx.relLen = end - start; // end >= start (I2) // Remove linebreaks at the beginning and the end - const std::pair<size_t, size_t> b = - Utils::trim(lineBuf, Utils::isLinebreak); + const std::pair<size_t, size_t> b = Utils::trim( + lineBuf, + [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); }); ssize_t s = b.first, e = b.second; s = std::min(s, static_cast<ssize_t>(ctx.relPos)); diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp index 8bcdbb5..17ce03e 100644 --- a/src/core/parser/utils/Token.cpp +++ b/src/core/common/Token.cpp @@ -19,6 +19,6 @@ #include "Token.hpp" namespace ousia { -// Stub to make sure Tokens.hpp is valid + } diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp index f907450..4b56f1a 100644 --- a/src/core/parser/utils/Token.hpp +++ b/src/core/common/Token.hpp @@ -30,6 +30,7 @@ #include <cstdint> #include <limits> #include <string> +#include <unordered_set> #include <core/common/Location.hpp> @@ -46,6 +47,11 @@ using TokenId = uint32_t; using TokenLength = uint16_t; /** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set<TokenId>; + +/** * Namespace containing constants for TokenId instances with special meaning. */ namespace Tokens { @@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2; /** * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. + * newlines occur with optionally any amout of whitespace between them. The + * paragraph token is not repeated until more text is reached. 
*/ constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3; /** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amout of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; + +/** * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. + * this line is larger than the indentation of the previous line. + */ +constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; + +/** + * Token which represents an dedentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. */ -constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4; +constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6; /** * Maximum token id to be used. Tokens allocated for users should not surpass @@ -109,6 +129,16 @@ struct Token { Token() : id(Tokens::Empty) {} /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(const SourceLocation &location) : id(Tokens::Data), location(location) + { + } + + /** * Constructor of the Token struct. * * @param id represents the token id. @@ -116,12 +146,26 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(TokenId id, const std::string &content, SourceLocation location) + Token(TokenId id, const std::string &content, + const SourceLocation &location) : id(id), content(content), location(location) { } /** + * Constructor of the a "data" Token with the given string data and + * location. + * + * @param content is the string content that should be stored in the token. 
+ * @param location is the location of the content within the source file. + */ + Token(const std::string &content, + const SourceLocation &location = SourceLocation{}) + : id(Tokens::Data), content(content), location(location) + { + } + + /** * Constructor of the Token struct, only initializes the token id * * @param id is the id corresponding to the id of the token. @@ -129,6 +173,14 @@ struct Token { Token(TokenId id) : id(id) {} /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + bool isSpecial() const { return id > Tokens::MaxTokenId; } + + /** * The getLocation function allows the tokens to be directly passed as * parameter to Logger or LoggableException instances. * @@ -139,4 +191,3 @@ struct Token { } #endif /* _OUSIA_TOKENS_HPP_ */ - diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index a77951e..a87ff6d 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename) return std::string{}; } -std::string Utils::trim(const std::string &s) -{ - std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::startsWith(const std::string &s, const std::string &prefix) { return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; @@ -124,5 +118,36 @@ bool Utils::endsWith(const std::string &s, const std::string &suffix) return suffix.size() <= s.size() && s.substr(s.size() - suffix.size(), suffix.size()) == suffix; } -} +bool Utils::isUserDefinedToken(const std::string &token) +{ + // Make sure the token meets is neither empty, nor starts or ends with an + // alphanumeric character + const size_t len = token.size(); + if (len == 0 || isAlphanumeric(token[0]) || + isAlphanumeric(token[len - 1])) { + return false; + } + + // Make sure the token is not any special OSML 
token + if (token == "\\" || token == "%" || token == "%{" || token == "}%" || + token == "{!" || token == "<\\" || token == "\\>") { + return false; + } + + // Make sure the token does not contain any whitespaces. + for (char c : token) { + if (isWhitespace(c)) { + return false; + } + } + + // Make sure the token contains other characters but { and } + for (char c : token) { + if (c != '{' && c != '}') { + return true; + } + } + return false; +} +}
\ No newline at end of file diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 7d96562..d9e26da 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -103,6 +103,26 @@ public: static bool isNamespacedIdentifier(const std::string &name); /** + * Returns true if the given characters form a valid user-defined token. + * This function returns true under the following circumstances: + * <ul> + * <li>The given token is not empty</li> + * <li>The given token starts and ends with a non-alphanumeric character + * </li> + * <li>The token is none of the following character sequences (which are + * special in OSML): + * <ul> + * <li>'{', '}' or any combined repetition of these characters</li> + * <li>'\', '{!', '<\', '\>'</li> + * <li>'%', '%{', '}%'</li> + * </ul> + * </li> + * <li>The token does not contain any whitespaces.</li> + * </ul> + */ + static bool isUserDefinedToken(const std::string &token); + + /** * Returns true if the given character is a linebreak character. */ static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } @@ -124,14 +144,6 @@ public: static bool hasNonWhitepaceChar(const std::string &s); /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** * Trims the given string or vector of chars by returning the start and end * index. * @@ -153,8 +165,8 @@ public: * * @param s is the container that should be trimmed. * @param len is the number of elements in the container. - * @param f is a function that returns true for values that should be - * removed. + * @param f is a function that returns true for values at a certain index + * that should be removed. * @return start and end index. 
Note that "end" points at the character * beyond the end, thus "end" minus "start" */ @@ -163,7 +175,7 @@ public: { size_t start = 0; for (size_t i = 0; i < len; i++) { - if (!f(s[i])) { + if (!f(i)) { start = i; break; } @@ -171,7 +183,7 @@ public: size_t end = 0; for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) { - if (!f(s[i])) { + if (!f(i)) { end = i + 1; break; } @@ -198,17 +210,33 @@ public: * the collapsed version of the string ends. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" + * @param f is a function that returns true for values at a certain index + * that should be removed. */ - template <class T> - static std::string trim(const T &s, size_t len, size_t &start, size_t &end) + template <class T, class Filter> + static std::string trim(const T &s, size_t len, size_t &start, size_t &end, + Filter f) { - auto res = trim(s, len, isWhitespace); + auto res = trim(s, len, f); start = res.first; end = res.second; return std::string(&s[start], end - start); } /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s) + { + std::pair<size_t, size_t> bounds = + trim(s, [&s](size_t i) { return isWhitespace(s[i]); }); + return s.substr(bounds.first, bounds.second - bounds.first); + } + + /** * Collapses the whitespaces in the given string (trims the string and * replaces all whitespace characters by a single one). 
* @@ -219,7 +247,8 @@ public: { size_t start; size_t end; - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -236,7 +265,8 @@ public: static std::string collapse(const std::string &s, size_t &start, size_t &end) { - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -244,6 +274,8 @@ public: * replaces all whitespace characters by a single one). * * @tparam T is the string type that should be used. + * @tparam Filter is a filter function used for detecting the character + * indices that might be removed. * @param s is the string in which the whitespace should be collapsed. * @param len is the length of the input string * @param start is an output parameter which is set to the offset at which @@ -252,9 +284,9 @@ public: * the collapsed version of the string ends. * @return a copy of s with collapsed whitespace. 
*/ - template <class T> + template <class T, class Filter> static std::string collapse(const T &s, size_t len, size_t &start, - size_t &end) + size_t &end, Filter f) { // Result vector std::vector<char> res; @@ -268,8 +300,7 @@ public: bool hadWhitespace = false; for (size_t i = 0; i < len; i++) { const char c = s[i]; - const bool whitespace = isWhitespace(c); - if (whitespace) { + if (f(i)) { hadWhitespace = !res.empty(); } else { // Adapt the start and end position diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. 
- * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include <string> -#include <vector> - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. 
- */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. 
- * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, std::vector<char> &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. 
- */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. 
- */ -template <typename WhitespaceHandler, typename Buffer> -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - diff --git a/src/core/model/Ontology.cpp b/src/core/model/Ontology.cpp index 8829139..3af727d 100644 --- a/src/core/model/Ontology.cpp +++ b/src/core/model/Ontology.cpp @@ -20,8 +20,9 @@ #include <queue> #include <set> -#include <core/common/RttiBuilder.hpp> #include <core/common/Exceptions.hpp> +#include <core/common/RttiBuilder.hpp> +#include <core/common/Utils.hpp> #include "Ontology.hpp" @@ -169,52 +170,60 @@ static NodeVector<Node> pathTo(const Node *start, Logger &logger, return shortest; } +struct CollectState { + Node *n; + size_t depth; + + CollectState(Node *n, size_t depth) : n(n), depth(depth) {} +}; + template <typename F> static NodeVector<Node> collect(const Node *start, F match) { // result NodeVector<Node> res; // queue for breadth-first search of graph. - std::queue<Rooted<Node>> q; + std::queue<CollectState> q; // put the initial node on the stack. - q.push(const_cast<Node *>(start)); + q.push(CollectState(const_cast<Node *>(start), 0)); // set of visited nodes. std::unordered_set<const Node *> visited; while (!q.empty()) { - Rooted<Node> n = q.front(); + CollectState state = q.front(); q.pop(); // do not proceed if this node was already visited. - if (!visited.insert(n.get()).second) { + if (!visited.insert(state.n).second) { continue; } - if (n->isa(&RttiTypes::StructuredClass)) { - Rooted<StructuredClass> strct = n.cast<StructuredClass>(); + if (state.n->isa(&RttiTypes::Descriptor)) { + Rooted<Descriptor> strct{static_cast<Descriptor *>(state.n)}; // look through all fields. NodeVector<FieldDescriptor> fields = strct->getFieldDescriptors(); for (auto fd : fields) { // note matches. 
- if (match(fd)) { + if (match(fd, state.depth)) { res.push_back(fd); } // only continue in the TREE field. if (fd->getFieldType() == FieldDescriptor::FieldType::TREE) { - q.push(fd); + q.push(CollectState(fd.get(), state.depth)); } } } else { // otherwise this is a FieldDescriptor. - Rooted<FieldDescriptor> field = n.cast<FieldDescriptor>(); + Rooted<FieldDescriptor> field{ + static_cast<FieldDescriptor *>(state.n)}; // and we proceed by visiting all permitted children. for (auto c : field->getChildrenWithSubclasses()) { // note matches. - if (match(c)) { + if (match(c, state.depth)) { res.push_back(c); } // We only continue our search via transparent children. if (c->isTransparent()) { - q.push(c); + q.push(CollectState(c.get(), state.depth + 1)); } } } @@ -222,28 +231,59 @@ static NodeVector<Node> collect(const Node *start, F match) return res; } +static std::vector<SyntaxDescriptor> collectPermittedTokens( + const Node *start, Handle<Domain> domain) +{ + // gather SyntaxDescriptors for structure children first. + std::vector<SyntaxDescriptor> res; + collect(start, [&res](Handle<Node> n, size_t depth) { + SyntaxDescriptor stx; + if (n->isa(&RttiTypes::FieldDescriptor)) { + stx = n.cast<FieldDescriptor>()->getSyntaxDescriptor(depth); + } else { + stx = n.cast<Descriptor>()->getSyntaxDescriptor(depth); + } + // do not add trivial SyntaxDescriptors. + if (!stx.isEmpty()) { + res.push_back(stx); + } + return false; + }); + // gather SyntaxDescriptors for AnnotationClasses. 
+ for (auto a : domain->getAnnotationClasses()) { + SyntaxDescriptor stx = a->getSyntaxDescriptor(); + if (!stx.isEmpty()) { + res.push_back(stx); + } + } + return res; +} + /* Class FieldDescriptor */ FieldDescriptor::FieldDescriptor(Manager &mgr, Handle<Type> primitiveType, Handle<Descriptor> parent, FieldType fieldType, - std::string name, bool optional) + std::string name, bool optional, + WhitespaceMode whitespaceMode) : Node(mgr, std::move(name), parent), children(this), fieldType(fieldType), primitiveType(acquire(primitiveType)), optional(optional), - primitive(true) + primitive(true), + whitespaceMode(whitespaceMode) { } FieldDescriptor::FieldDescriptor(Manager &mgr, Handle<Descriptor> parent, FieldType fieldType, std::string name, - bool optional) + bool optional, WhitespaceMode whitespaceMode) : Node(mgr, std::move(name), parent), children(this), fieldType(fieldType), optional(optional), - primitive(false) + primitive(false), + whitespaceMode(whitespaceMode) { } @@ -272,6 +312,25 @@ bool FieldDescriptor::doValidate(Logger &logger) const } else { valid = valid & validateName(logger); } + // check start and end token. + if (!startToken.special && !startToken.token.empty() && + !Utils::isUserDefinedToken(startToken.token)) { + // TODO: Correct error message. + logger.error(std::string("Field \"") + getName() + + "\" has an invalid custom start token: " + + startToken.token, + *this); + valid = false; + } + if (!endToken.special && !endToken.token.empty() && + !Utils::isUserDefinedToken(endToken.token)) { + // TODO: Correct error message. + logger.error(std::string("Field \"") + getName() + + "\" has an invalid custom end token: " + + endToken.token, + *this); + valid = false; + } // check consistency of FieldType with the rest of the FieldDescriptor. 
if (primitive) { @@ -325,7 +384,7 @@ bool FieldDescriptor::doValidate(Logger &logger) const } static void gatherSubclasses( - std::unordered_set<const StructuredClass *>& visited, + std::unordered_set<const StructuredClass *> &visited, NodeVector<StructuredClass> &res, Handle<StructuredClass> strct) { // this check is to prevent cycles. @@ -334,7 +393,7 @@ static void gatherSubclasses( } for (auto sub : strct->getSubclasses()) { // this check is to prevent cycles. - if(visited.count(sub.get())){ + if (visited.count(sub.get())) { continue; } res.push_back(sub); @@ -381,7 +440,7 @@ NodeVector<Node> FieldDescriptor::pathTo(Handle<FieldDescriptor> field, NodeVector<FieldDescriptor> FieldDescriptor::getDefaultFields() const { // TODO: In principle a cast would be nicer here, but for now we copy. - NodeVector<Node> nodes = collect(this, [](Handle<Node> n) { + NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) { if (!n->isa(&RttiTypes::FieldDescriptor)) { return false; } @@ -396,6 +455,16 @@ NodeVector<FieldDescriptor> FieldDescriptor::getDefaultFields() const return res; } +std::vector<SyntaxDescriptor> FieldDescriptor::getPermittedTokens() const +{ + if (getParent() == nullptr || + getParent().cast<Descriptor>()->getParent() == nullptr) { + return std::vector<SyntaxDescriptor>(); + } + return collectPermittedTokens( + this, getParent().cast<Descriptor>()->getParent().cast<Domain>()); +} + /* Class Descriptor */ void Descriptor::doResolve(ResolutionState &state) @@ -443,6 +512,25 @@ bool Descriptor::doValidate(Logger &logger) const } valid = valid & attributesDescriptor->validate(logger); } + + // check start and end token. 
+ if (!startToken.special && !startToken.token.empty() && + !Utils::isUserDefinedToken(startToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom start token: " + + startToken.token, + *this); + valid = false; + } + if (!endToken.special && !endToken.token.empty() && + !Utils::isUserDefinedToken(endToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom end token: " + + endToken.token, + *this); + valid = false; + } + // check that only one FieldDescriptor is of type TREE. auto fds = Descriptor::getFieldDescriptors(); bool hasTREE = false; @@ -483,7 +571,7 @@ std::pair<NodeVector<Node>, bool> Descriptor::pathTo( NodeVector<FieldDescriptor> Descriptor::getDefaultFields() const { // TODO: In principle a cast would be nicer here, but for now we copy. - NodeVector<Node> nodes = collect(this, [](Handle<Node> n) { + NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) { if (!n->isa(&RttiTypes::FieldDescriptor)) { return false; } @@ -501,7 +589,7 @@ NodeVector<FieldDescriptor> Descriptor::getDefaultFields() const NodeVector<StructuredClass> Descriptor::getPermittedChildren() const { // TODO: In principle a cast would be nicer here, but for now we copy. 
- NodeVector<Node> nodes = collect(this, [](Handle<Node> n) { + NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) { return n->isa(&RttiTypes::StructuredClass); }); NodeVector<StructuredClass> res; @@ -669,6 +757,14 @@ std::pair<Rooted<FieldDescriptor>, bool> Descriptor::createFieldDescriptor( return std::make_pair(fd, sorted); } +std::vector<SyntaxDescriptor> Descriptor::getPermittedTokens() const +{ + if (getParent() == nullptr) { + return std::vector<SyntaxDescriptor>(); + } + return collectPermittedTokens(this, getParent().cast<Domain>()); +} + /* Class StructuredClass */ StructuredClass::StructuredClass(Manager &mgr, std::string name, @@ -709,6 +805,16 @@ bool StructuredClass::doValidate(Logger &logger) const logger.error(cardinality.toString() + " is not a cardinality!", *this); valid = false; } + + // check short token. + if (!shortToken.special && !shortToken.token.empty() && + !Utils::isUserDefinedToken(shortToken.token)) { + logger.error(std::string("Descriptor \"") + getName() + + "\" has an invalid custom short form token: " + + shortToken.token, + *this); + valid = false; + } // check the validity of this superclass. if (superclass != nullptr) { valid = valid & superclass->validate(logger); @@ -961,6 +1067,51 @@ Rooted<AnnotationClass> Ontology::createAnnotationClass(std::string name) new AnnotationClass(getManager(), std::move(name), this)}; } +static void gatherTokenDescriptors( + Handle<Descriptor> desc, std::vector<TokenDescriptor *> &res, + std::unordered_set<FieldDescriptor *> &visited) +{ + // add the TokenDescriptors for the Descriptor itself. + if (!desc->getStartToken().isEmpty()) { + res.push_back(desc->getStartTokenPointer()); + } + if (!desc->getEndToken().isEmpty()) { + res.push_back(desc->getEndTokenPointer()); + } + // add the TokenDescriptors for its FieldDescriptors. 
+ for (auto fd : desc->getFieldDescriptors()) { + if (!visited.insert(fd.get()).second) { + continue; + } + if (!fd->getStartToken().isEmpty()) { + res.push_back(fd->getStartTokenPointer()); + } + if (!fd->getEndToken().isEmpty()) { + res.push_back(fd->getEndTokenPointer()); + } + } +} + +std::vector<TokenDescriptor *> Domain::getAllTokenDescriptors() const +{ + std::vector<TokenDescriptor *> res; + // note all fields that are already visited because FieldReferences might + // lead to doubled fields. + std::unordered_set<FieldDescriptor *> visited; + // add the TokenDescriptors for the StructuredClasses (and their fields). + for (auto s : structuredClasses) { + if (!s->getShortToken().isEmpty()) { + res.push_back(s->getShortTokenPointer()); + } + gatherTokenDescriptors(s, res, visited); + } + // add the TokenDescriptors for the AnnotationClasses (and their fields). + for (auto a : annotationClasses) { + gatherTokenDescriptors(a, res, visited); + } + return res; +} + /* Type registrations */ namespace RttiTypes { diff --git a/src/core/model/Ontology.hpp b/src/core/model/Ontology.hpp index e1fbe96..d682bdf 100644 --- a/src/core/model/Ontology.hpp +++ b/src/core/model/Ontology.hpp @@ -168,11 +168,13 @@ #ifndef _OUSIA_MODEL_DOMAIN_HPP_ #define _OUSIA_MODEL_DOMAIN_HPP_ +#include <core/common/Whitespace.hpp> #include <core/managed/ManagedContainer.hpp> #include <core/RangeSet.hpp> #include "Node.hpp" #include "RootNode.hpp" +#include "Syntax.hpp" #include "Typesystem.hpp" namespace ousia { @@ -226,6 +228,9 @@ private: Owned<Type> primitiveType; bool optional; bool primitive; + TokenDescriptor startToken; + TokenDescriptor endToken; + WhitespaceMode whitespaceMode; protected: bool doValidate(Logger &logger) const override; @@ -234,39 +239,46 @@ public: /** * This is the constructor for primitive fields. * - * @param mgr is the global Manager instance. - * @param parent is a handle of the Descriptor node that has this - * FieldDescriptor. 
- * @param primitiveType is a handle to some Type in some Typesystem of which - * one instance is allowed to fill this field. - * @param name is the name of this field. - * @param optional should be set to 'false' is this field needs to be - * filled in order for an instance of the parent - * Descriptor to be valid. + * @param mgr is the global Manager instance. + * @param parent is a handle of the Descriptor node that has this + * FieldDescriptor. + * @param primitiveType is a handle to some Type in some Typesystem of + *which + * one instance is allowed to fill this field. + * @param name is the name of this field. + * @param optional should be set to 'false' is this field needs to be + * filled in order for an instance of the parent + * Descriptor to be valid. + * @param whitespaceMode the WhitespaceMode to be used when an instance of + * this FieldDescriptor is parsed. */ FieldDescriptor(Manager &mgr, Handle<Type> primitiveType, Handle<Descriptor> parent, FieldType fieldType = FieldType::TREE, - std::string name = "", bool optional = false); + std::string name = "", bool optional = false, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * This is the constructor for non-primitive fields. You have to provide * children here later on. * - * @param mgr is the global Manager instance. - * @param parent is a handle of the Descriptor node that has this - * FieldDescriptor. - * @param fieldType is the FieldType of this FieldDescriptor, either - * TREE for the main or default structure or SUBTREE - * for supporting structures. - * @param name is the name of this field. - * @param optional should be set to 'false' is this field needs to be - * filled in order for an instance of the parent - * Descriptor to be valid. + * @param mgr is the global Manager instance. + * @param parent is a handle of the Descriptor node that has this + * FieldDescriptor. 
+ * @param fieldType is the FieldType of this FieldDescriptor, either + * TREE for the main or default structure or SUBTREE + * for supporting structures. + * @param name is the name of this field. + * @param optional should be set to 'false' is this field needs to be + * filled in order for an instance of the parent + * Descriptor to be valid. + * @param whitespaceMode the WhitespaceMode to be used when an instance of + * this FieldDescriptor is parsed. */ FieldDescriptor(Manager &mgr, Handle<Descriptor> parent = nullptr, FieldType fieldType = FieldType::TREE, - std::string name = "", bool optional = false); + std::string name = "", bool optional = false, + WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Returns a const reference to the NodeVector of StructuredClasses whose @@ -455,6 +467,109 @@ public: return std::move(name); } } + + /** + * Returns a pointer to the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * Note that this does not invalidate the FieldDescriptor. So use with + * care. + * + * @return a pointer to the start TokenDescriptor. + */ + TokenDescriptor *getStartTokenPointer() { return &startToken; } + + /** + * Returns a copy of the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a copy of the start TokenDescriptor. + */ + TokenDescriptor getStartToken() const { return startToken; } + + /** + * Sets the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @param st the new start TokenDescriptor. + */ + void setStartToken(TokenDescriptor st) + { + invalidate(); + startToken = st; + } + + /** + * Returns a pointer to the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. 
+ * + * @return a pointer to the end TokenDescriptor. + */ + TokenDescriptor *getEndTokenPointer() { return &endToken; } + + /** + * Returns a copy of the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a copy of the end TokenDescriptor. + */ + TokenDescriptor getEndToken() const { return endToken; } + + /** + * Sets the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @param e the new end TokenDescriptor. + */ + void setEndToken(TokenDescriptor e) + { + invalidate(); + endToken = e; + } + + /** + * Returns the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + * + * @return the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + */ + WhitespaceMode getWhitespaceMode() const { return whitespaceMode; } + + /** + * Sets the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + * + * @param wm the WhitespaceMode to be used when an instance of this + * FieldDescriptor is parsed. + */ + WhitespaceMode setWhitespaceMode(WhitespaceMode wm) + { + return whitespaceMode = wm; + } + + /** + * Returns the SyntaxDescriptor for this FieldDescriptor. + * + * @return the SyntaxDescriptor for this FieldDescriptor. + */ + SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) + { + SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty, + const_cast<FieldDescriptor *>(this), depth}; + return stx; + } + + /** + * Returns a vector of SyntaxDescriptors, one for each Descriptor + * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is + * permitted as child of this FieldDescriptor. This also makes use + * of transparency. 
+ * + * @return a vector of SyntaxDescriptors, one for each Descriptor that is + * permitted as child of this FieldDescriptor + */ + std::vector<SyntaxDescriptor> getPermittedTokens() const; }; /** @@ -478,7 +593,10 @@ public: * </A> * \endcode * - * key="value" inside the A-node would be an attribute, while <key>value</key> + * key="value" inside the A-node would be an attribute, while + * \code{.xml} + * <key>value</key> + * \endcode * would be a primitive field. While equivalent in XML the semantics are * different: An attribute describes indeed attributes, features of one single * node whereas a primitive field describes the _content_ of a node. @@ -490,6 +608,8 @@ class Descriptor : public Node { private: Owned<StructType> attributesDescriptor; NodeVector<FieldDescriptor> fieldDescriptors; + TokenDescriptor startToken; + TokenDescriptor endToken; bool addAndSortFieldDescriptor(Handle<FieldDescriptor> fd, Logger &logger); @@ -738,6 +858,85 @@ public: * of an instance of this Descriptor in the structure tree. */ NodeVector<StructuredClass> getPermittedChildren() const; + + /** + * Returns a pointer to the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a pointer to the start TokenDescriptor. + */ + TokenDescriptor *getStartTokenPointer() { return &startToken; } + + /** + * Returns a copy of the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @return a copy of the start TokenDescriptor. + */ + TokenDescriptor getStartToken() const { return startToken; } + + /** + * Sets the start TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor starts. + * + * @param st the new start TokenDescriptor. + */ + void setStartToken(TokenDescriptor st) + { + invalidate(); + startToken = st; + } + + /** + * Returns a pointer to the end TokenDescriptor. 
This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a pointer to the end TokenDescriptor. + */ + TokenDescriptor *getEndTokenPointer() { return &endToken; } + + /** + * Returns a copy of the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @return a copy of the end TokenDescriptor. + */ + TokenDescriptor getEndToken() const { return endToken; } + + /** + * Sets the end TokenDescriptor. This Token is used as a + * signifier during parsing that an instance of this FieldDescriptor ends. + * + * @param e the new end TokenDescriptor. + */ + void setEndToken(TokenDescriptor e) + { + invalidate(); + endToken = e; + } + + /** + * Returns the SyntaxDescriptor for this Descriptor. + * + * @return the SyntaxDescriptor for this Descriptor. + */ + virtual SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) + { + SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty, + const_cast<Descriptor *>(this), depth}; + return stx; + } + + /** + * Returns a vector of SyntaxDescriptors, one for each Descriptor + * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is + * permitted as child of this Descriptor. This also makes use + * of transparency. + * + * @return a vector of SyntaxDescriptors, one for each Descriptor that is + * permitted as child of this Descriptor. + */ + std::vector<SyntaxDescriptor> getPermittedTokens() const; }; /* * TODO: We should discuss Cardinalities one more time. Is it smart to define @@ -824,6 +1023,7 @@ private: NodeVector<StructuredClass> subclasses; bool transparent; bool root; + TokenDescriptor shortToken; /** * Helper method for getFieldDescriptors. @@ -981,6 +1181,50 @@ public: invalidate(); root = std::move(r); } + + /** + * Returns a pointer to the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. 
+ * + * @return a pointer to the short TokenDescriptor. + */ + TokenDescriptor *getShortTokenPointer() { return &shortToken; } + + /** + * Returns a copy of the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. + * + * @return a copy of the short TokenDescriptor. + */ + TokenDescriptor getShortToken() const { return shortToken; } + + /** + * Sets the short TokenDescriptor. During parsing an + * occurence of this token will be translated to an empty instance of this + * StructuredClass. + * + * @param s the new short TokenDescriptor. + */ + void setShortToken(TokenDescriptor s) + { + invalidate(); + shortToken = s; + } + + /** + * Returns the SyntaxDescriptor for this StructuredClass. + * + * @return the SyntaxDescriptor for this StructuredClass. + */ + SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) override + { + SyntaxDescriptor stx{getStartToken().id, getEndToken().id, + shortToken.id, const_cast<StructuredClass *>(this), + depth}; + return stx; + } }; /** @@ -1207,6 +1451,13 @@ public: { ontologies.insert(ontologies.end(), ds.begin(), ds.end()); } + + /** + * Returns all TokenDescriptors of classes and fields in this Ontology. + * + * @return all TokenDescriptors of classes and fields in this Ontology. + */ + std::vector<TokenDescriptor *> getAllTokenDescriptors() const; }; namespace RttiTypes { @@ -1219,4 +1470,4 @@ extern const Rtti Ontology; } } -#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */
\ No newline at end of file +#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */ diff --git a/src/core/model/Syntax.cpp b/src/core/model/Syntax.cpp new file mode 100644 index 0000000..9dbaccc --- /dev/null +++ b/src/core/model/Syntax.cpp @@ -0,0 +1,58 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Syntax.hpp" + +#include "Domain.hpp" + +namespace ousia { + +/* Class TokenSyntaxDescriptor */ + +bool SyntaxDescriptor::isAnnotation() const +{ + return descriptor->isa(&RttiTypes::AnnotationClass); +} +bool SyntaxDescriptor::isFieldDescriptor() const +{ + return descriptor->isa(&RttiTypes::FieldDescriptor); +} +bool SyntaxDescriptor::isStruct() const +{ + return descriptor->isa(&RttiTypes::StructuredClass); +} + +void SyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const +{ + if (start != Tokens::Empty) { + set.insert(start); + } + if (end != Tokens::Empty) { + set.insert(end); + } + if (shortForm != Tokens::Empty) { + set.insert(shortForm); + } +} + +bool SyntaxDescriptor::isEmpty() const +{ + return start == Tokens::Empty && end == Tokens::Empty && + shortForm == Tokens::Empty; +} +}
\ No newline at end of file diff --git a/src/core/model/Syntax.hpp b/src/core/model/Syntax.hpp new file mode 100644 index 0000000..4da3408 --- /dev/null +++ b/src/core/model/Syntax.hpp @@ -0,0 +1,196 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Syntax.hpp + * + * This header contains the Descriptor classes for user definable syntax for + * Document entities or fields. These classes are referenced in Ontology.hpp. + */ + +#ifndef _OUSIA_MODEL_SYNTAX_HPP_ +#define _OUSIA_MODEL_SYNTAX_HPP_ + +#include <core/common/Token.hpp> +#include "Node.hpp" + +namespace ousia { + +/** + * Class to describe a single token that shall be used as user-defined syntax. + */ +struct TokenDescriptor { + /** + * The string content of this token, if it is not a special one. + */ + std::string token; + /** + * A flag to be set true if this TokenDescriptor uses a special token. + */ + bool special; + /** + * An id to uniquely identify this token. + */ + TokenId id; + + /** + * Constructor for non-special tokens. The special flag is set to false and + * the id to Tokens::Empty. + * + * @param token The string content of this token, if it is not a special + * one. 
+ */ + TokenDescriptor(std::string token = std::string()) + : token(std::move(token)), special(false), id(Tokens::Empty) + { + } + + /** + * Constructor for special tokens. The token is set to an empty string and + * the special flag to true. + * + * @param id the id of the special token. + */ + TokenDescriptor(TokenId id) : special(true), id(id) {} + + /** + * Returns true if and only if neither a string nor an ID is given. + * + * @return true if and only if neither a string nor an ID is given. + */ + bool isEmpty() const { return token.empty() && id == Tokens::Empty; } +}; + +/** + * Class describing the user defined syntax for a StructuredClass, + * AnnotationClass or FieldDescriptor. + * + * This class is used during parsing of a Document. It is used to describe + * the tokens relevant for one Descriptor that could be created at this point + * during parsing. + */ +struct SyntaxDescriptor { + /** + * Possible start token or Tokens::Empty if no token is set. + */ + TokenId start; + + /** + * Possible end token or Tokens::Empty if no token is set. + */ + TokenId end; + + /** + * Possible representation token or Tokens::Empty if no token is set. + */ + TokenId shortForm; + + /* + * The Descriptor this SyntaxDescriptor belongs to. As this may be + * a FieldDescriptor as well as a class Descriptor (StructuredClass or + * AnnotationClass) we can only use the class Node as inner argument here. + */ + Rooted<Node> descriptor; + /* + * Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + ssize_t depth; + + /** + * Default constructor, sets all token ids to Tokens::Empty and the + * descriptor handle to nullptr. + */ + SyntaxDescriptor() + : start(Tokens::Empty), + end(Tokens::Empty), + shortForm(Tokens::Empty), + descriptor(nullptr), + depth(-1) + { + } + + /** + * Member initializer constructor. 
+ * + * @param start is a possible start token. + * @param end is a possible end token. + * @param shortForm is a possible short form token. + * @param descriptor The Descriptor this SyntaxDescriptor belongs to. + * @param depth Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + SyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, + Handle<Node> descriptor, ssize_t depth) + : start(start), + end(end), + shortForm(shortForm), + descriptor(descriptor), + depth(depth) + { + } + + /** + * Inserts all tokens referenced in this SyntaxDescriptor into the + * given TokenSet. Skips token ids set to Tokens::Empty. + * + * @param set is the TokenSet instance into which the Tokens should be + * inserted. + */ + void insertIntoTokenSet(TokenSet &set) const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + * + * @return true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + */ + bool isAnnotation() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + * + * @return true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + */ + bool isStruct() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. + * + * @return true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. + */ + bool isFieldDescriptor() const; + + /** + * Returns true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + * + * @return true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + */ + bool isEmpty() const; +}; +} +#endif
\ No newline at end of file diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp index 6ebc549..44b31c6 100644 --- a/src/core/parser/stack/Callbacks.cpp +++ b/src/core/parser/stack/Callbacks.cpp @@ -19,5 +19,15 @@ #include "Callbacks.hpp" namespace ousia { +namespace parser_stack { + +/* Class ParserCallbacks */ + +ParserCallbacks::~ParserCallbacks() +{ + // Do nothing here +} + +} } diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index 9c61000..dfe41fc 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -30,68 +30,80 @@ #define _OUSIA_PARSER_STACK_CALLBACKS_HPP_ #include <string> +#include <vector> #include <core/common/Whitespace.hpp> +#include <core/common/Token.hpp> +#include <core/model/Syntax.hpp> namespace ousia { + +// Forward declarations +class Variant; + namespace parser_stack { /** - * Interface defining a set of callback functions that act as a basis for the - * StateStackCallbacks and the ParserCallbacks. + * Interface between the Stack class and the underlying parser used for + * registering and unregistering tokens. */ -class Callbacks { +class ParserCallbacks { public: /** * Virtual descructor. */ - virtual ~Callbacks() {}; - - /** - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + virtual ~ParserCallbacks(); /** * Registers the given token as token that should be reported to the handler * using the "token" function. * * @param token is the token string that should be reported. + * @return the token id with which the token will be reported. Should return + * Tokens::Empty if the given token could not be registered. 
*/ - virtual void registerToken(const std::string &token) = 0; + virtual TokenId registerToken(const std::string &token) = 0; /** * Unregisters the given token, it will no longer be reported to the handler * using the "token" function. * - * @param token is the token string that should be unregistered. + * @param id is the token id of the token that should be unregistered. */ - virtual void unregisterToken(const std::string &token) = 0; + virtual void unregisterToken(TokenId id) = 0; }; /** - * Interface defining the callback functions that can be passed from a - * StateStack to the underlying parser. + * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. */ -class ParserCallbacks : public Callbacks { +class HandlerCallbacks : public ParserCallbacks { +public: /** - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * The tokens described in the token list are the tokens that are currently + * enabled. * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. */ - virtual bool supportsToken(const std::string &token) = 0; -}; + virtual void pushTokens(const std::vector<SyntaxDescriptor> &tokens) = 0; + + /** + * Removes the previously pushed list of tokens from the stack. 
+ */ + virtual void popTokens() = 0; + /** + * Reads a string variant form the current input stream. This function must + * be called from the data() method. + * + * @return a string variant containing the current text data. The return + * value depends on the currently set whitespace mode and the tokens that + * were enabled using the enableTokens callback method. + */ + virtual Variant readData() = 0; +}; } } diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index a307f71..26b9b6e 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@ #include <core/model/Ontology.hpp> #include <core/model/Project.hpp> #include <core/model/Typesystem.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserScope.hpp> #include <core/parser/ParserContext.hpp> @@ -36,7 +37,7 @@ namespace parser_stack { /* DocumentHandler */ -bool DocumentHandler::start(Variant::mapType &args) +bool DocumentHandler::startCommand(Variant::mapType &args) { Rooted<Document> document = context().getProject()->createDocument(args["name"].asString()); @@ -51,6 +52,11 @@ void DocumentHandler::end() { scope().pop(logger()); } /* DocumentChildHandler */ +DocumentChildHandler::DocumentChildHandler(const HandlerData &handlerData) + : Handler(handlerData), isExplicitField(false) +{ +} + void DocumentChildHandler::preamble(Rooted<Node> &parentNode, size_t &fieldIdx, DocumentEntity *&parent) { @@ -121,10 +127,10 @@ void DocumentChildHandler::createPath(const size_t &firstFieldIdx, scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, false); } -bool DocumentChildHandler::start(Variant::mapType &args) +bool DocumentChildHandler::startCommand(Variant::mapType &args) { - // extract the special "name" attribute from the input arguments. - // the remaining attributes will be forwarded to the newly constructed + // Extract the special "name" attribute from the input arguments. 
+ // The remaining attributes will be forwarded to the newly constructed // element. std::string nameAttr; { @@ -168,13 +174,6 @@ bool DocumentChildHandler::start(Variant::mapType &args) preamble(parentNode, fieldIdx, parent); - // TODO: REMOVE - std::string thisName = name(); - std::string parentClassName; - if (parent != nullptr) { - parentClassName = parent->getDescriptor()->getName(); - } - /* * Try to find a FieldDescriptor for the given tag if we are not in * a field already. This does _not_ try to construct transparent @@ -191,9 +190,9 @@ bool DocumentChildHandler::start(Variant::mapType &args) "Data or structure commands have already been " "given, command \"") + name() + std::string( - "\" is not interpreted explicit " - "field. Move explicit field " - "references to the beginning."), + "\" is not interpreted explicit " + "field. Move explicit field " + "references to the beginning."), location()); } else { Rooted<DocumentField> field{new DocumentField( @@ -260,15 +259,34 @@ bool DocumentChildHandler::start(Variant::mapType &args) } } +bool DocumentChildHandler::startAnnotation(Variant::mapType &args, + AnnotationType annotationType) +{ + // TODO: Handle annotation + return false; +} + +bool DocumentChildHandler::startToken(Handle<Node> node) +{ + // TODO: Handle token start + return false; +} + +DocumentChildHandler::EndTokenResult DocumentChildHandler::endToken( + const Token &token, Handle<Node> node) +{ + // TODO: Handle token end + return EndTokenResult::ENDED_NONE; +} + void DocumentChildHandler::end() { - // in case of explicit fields we do not want to pop something from the + // In case of explicit fields we do not want to pop something from the // stack. - if (isExplicitField) { - return; + if (!isExplicitField) { + // pop the "main" element. + scope().pop(logger()); } - // pop the "main" element. 
- scope().pop(logger()); } bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) @@ -278,6 +296,7 @@ bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) isDefault = true; return fieldIdx == 0; } + Rooted<Node> parentNode = scope().getLeaf(); assert(parentNode->isa(&RttiTypes::StructuredEntity) || parentNode->isa(&RttiTypes::AnnotationEntity)); @@ -290,7 +309,7 @@ bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx) parent->getDescriptor()->getFieldDescriptors(); if (isDefault) { - if(fields.empty()){ + if (fields.empty()) { return false; } fieldIdx = fields.size() - 1; @@ -316,33 +335,19 @@ void DocumentChildHandler::fieldEnd() { assert(scope().getLeaf()->isa(&RttiTypes::DocumentField)); - // pop the field from the stack. + // Pop the field from the stack. scope().pop(logger()); - // pop all remaining transparent elements. + // Pop all remaining transparent elements. while (scope().getLeaf()->isa(&RttiTypes::StructuredEntity) && scope().getLeaf().cast<StructuredEntity>()->isTransparent()) { - // pop the transparent element. + // Pop the transparent element. scope().pop(logger()); - // pop the transparent field. + // Pop the transparent field. 
scope().pop(logger()); } } -bool DocumentChildHandler::annotationStart(const Variant &className, - Variant::mapType &args) -{ - // TODO: Implement - return false; -} - -bool DocumentChildHandler::annotationEnd(const Variant &className, - const Variant &elementName) -{ - // TODO: Implement - return false; -} - bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field, Variant &data, Logger &logger) { @@ -370,7 +375,7 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data() { // We're past the region in which explicit fields can be defined in the // parent structure element @@ -391,11 +396,12 @@ bool DocumentChildHandler::data(Variant &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. - if (!convertData(field, data, logger())) { + Variant text = readData(); + if (!convertData(field, text, logger())) { return false; } - parent->createChildDocumentPrimitive(data, fieldIdx); + parent->createChildDocumentPrimitive(text, fieldIdx); return true; } @@ -409,7 +415,11 @@ bool DocumentChildHandler::data(Variant &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification. forks.emplace_back(logger().fork()); - if (!convertData(primitiveField, data, forks.back())) { + + // TODO: Actually the data has to be read after the path has been + // created (as createPath may push more tokens onto the stack) + Variant text = readData(); + if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -418,24 +428,24 @@ bool DocumentChildHandler::data(Variant &data) // Construct the necessary path NodeVector<Node> path = field->pathTo(primitiveField, logger()); - // TODO: Create methods with indices instead of names. 
createPath(fieldIdx, path, parent); // Then create the primitive element - parent->createChildDocumentPrimitive(data); + parent->createChildDocumentPrimitive(text); return true; } // No field was found that might take the data -- dump the error messages // from the loggers -- or, if there were no primitive fields, clearly state // this fact + Variant text = readData(); if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", - data); + text); } else { logger().error("Could not read data with any of the possible fields:", - data); + text); size_t f = 0; for (auto field : defaultFields) { logger().note(std::string("Field ") + @@ -461,7 +471,9 @@ const State DocumentChild = StateBuilder() .createdNodeTypes({&RttiTypes::StructureNode, &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); + .elementHandler(DocumentChildHandler::create) + .supportsAnnotations(true) + .supportsTokens(true); } } @@ -469,4 +481,4 @@ namespace RttiTypes { const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>( "DocumentField").parent(&Node); } -}
\ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 44feb2b..0e35558 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -53,7 +53,7 @@ class DocumentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -92,9 +92,10 @@ public: */ class DocumentChildHandler : public Handler { private: - bool isExplicitField = false; - //TODO: REMOVE - std::string strct_name; + /** + * If set to true, this handler represents an explicit field. + */ + bool isExplicitField; /** * Code shared by both the start(), fieldStart() and the data() method. @@ -163,22 +164,18 @@ private: Logger &logger); public: - using Handler::Handler; + DocumentChildHandler(const HandlerData &handlerData); - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; + bool startAnnotation(Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(Handle<Node> node) override; + EndTokenResult endToken(const Token &token, Handle<Node> node) override; void end() override; - bool data(Variant &data) override; - + bool data() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; - void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - - bool annotationEnd(const Variant &className, - const Variant &elementName) override; - /** * Creates a new instance of the DocumentChildHandler. * @@ -213,4 +210,5 @@ extern const Rtti DocumentField; } } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
\ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..c01e74c 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,8 @@ #include <core/common/Exceptions.hpp> #include <core/common/Logger.hpp> +#include <core/common/Variant.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserContext.hpp> #include "Callbacks.hpp" @@ -29,14 +31,10 @@ namespace parser_stack { /* Class HandlerData */ -HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/ - const std::string &name, const State &state, - const SourceLocation &location) - : ctx(ctx), - /*callbacks(callbacks),*/ - name(name), - state(state), - location(location) +HandlerData::HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, + const State &state, const Token &token, + HandlerType type) + : ctx(ctx), callbacks(callbacks), state(state), token(token), type(type) { } @@ -63,28 +61,39 @@ Logger &Handler::logger() return handlerData.ctx.getLogger(); } -const SourceLocation &Handler::location() const { return handlerData.location; } +const std::string &Handler::name() const { return handlerData.token.content; } -const std::string &Handler::name() const { return handlerData.name; } +TokenId Handler::tokenId() const { return handlerData.token.id; } -void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +const Token &Handler::token() const { return handlerData.token; } + +const SourceLocation &Handler::location() const { - /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ + return handlerData.token.location; } -void Handler::registerToken(const std::string &token) +HandlerType Handler::type() const { return handlerData.type; } + +const State &Handler::state() const { return handlerData.state; } + +Variant Handler::readData() { return handlerData.callbacks.readData(); } + +void 
Handler::pushTokens(const std::vector<SyntaxDescriptor> &tokens) { - /*handlerData.callbacks.registerToken(token);*/ + handlerData.callbacks.pushTokens(tokens); } -void Handler::unregisterToken(const std::string &token) +void Handler::popTokens() { handlerData.callbacks.popTokens(); } + +TokenId Handler::registerToken(const std::string &token) { - /*handlerData.callbacks.unregisterToken(token);*/ + return handlerData.callbacks.registerToken(token); } -const std::string &Handler::getName() const { return name(); } - -const State &Handler::getState() const { return handlerData.state; } +void Handler::unregisterToken(TokenId id) +{ + handlerData.callbacks.unregisterToken(id); +} void Handler::setLogger(Logger &logger) { internalLogger = &logger; } @@ -94,43 +103,50 @@ const SourceLocation &Handler::getLocation() const { return location(); } /* Class EmptyHandler */ -bool EmptyHandler::start(Variant::mapType &args) +bool EmptyHandler::startCommand(Variant::mapType &args) { - // Just accept anything + // Well, we'll support any command we get, don't we? return true; } -void EmptyHandler::end() +bool EmptyHandler::startAnnotation(Variant::mapType &args, + Handler::AnnotationType annotationType) { - // Do nothing if a command ends + // Do not support annotations. Annotations are too complicated for poor + // EmptyHandler. + return false; } -bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) +bool EmptyHandler::startToken(Handle<Node> node) { - // Accept any field - return true; + // EmptyHandler does not support tokens. + return false; } -void EmptyHandler::fieldEnd() +Handler::EndTokenResult EmptyHandler::endToken(const Token &token, + Handle<Node> node) { - // Do not handle fields + // There are no tokens to end here. 
+ return EndTokenResult::ENDED_NONE; } -bool EmptyHandler::annotationStart(const Variant &className, - Variant::mapType &args) +void EmptyHandler::end() { - // Accept any data - return true; + // Do nothing if a command ends } -bool EmptyHandler::annotationEnd(const Variant &className, - const Variant &elementName) +bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) { - // Accept any annotation + // Accept any field return true; } -bool EmptyHandler::data(Variant &data) +void EmptyHandler::fieldEnd() +{ + // Do not handle field ends +} + +bool EmptyHandler::data() { // Support any data return true; @@ -143,12 +159,26 @@ Handler *EmptyHandler::create(const HandlerData &handlerData) /* Class StaticHandler */ -bool StaticHandler::start(Variant::mapType &args) +bool StaticHandler::startCommand(Variant::mapType &args) { // Do nothing in the default implementation, accept anything return true; } +bool StaticHandler::startAnnotation(Variant::mapType &args, + Handler::AnnotationType annotationType) +{ + return false; +} + +bool StaticHandler::startToken(Handle<Node> node) { return false; } + +Handler::EndTokenResult StaticHandler::endToken(const Token &token, + Handle<Node> node) +{ + return EndTokenResult::ENDED_NONE; +} + void StaticHandler::end() { // Do nothing here @@ -170,23 +200,9 @@ void StaticHandler::fieldEnd() // Do nothing here } -bool StaticHandler::annotationStart(const Variant &className, - Variant::mapType &args) -{ - // No annotations supported - return false; -} - -bool StaticHandler::annotationEnd(const Variant &className, - const Variant &elementName) +bool StaticHandler::data() { - // No annotations supported - return false; -} - -bool StaticHandler::data(Variant &data) -{ - logger().error("Did not expect any data here", data); + logger().error("Did not expect any data here", readData()); return false; } @@ -198,7 +214,7 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, { } -bool 
StaticFieldHandler::start(Variant::mapType &args) +bool StaticFieldHandler::startCommand(Variant::mapType &args) { if (!argName.empty()) { auto it = args.find(argName); @@ -227,12 +243,15 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data() { + // Fetch the actual text data + Variant stringData = readData(); + // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(data, args); + doHandle(stringData, args); return true; } @@ -240,7 +259,7 @@ bool StaticFieldHandler::data(Variant &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - data); + stringData); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..67fde06 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -1,6 +1,6 @@ /* Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,6 +16,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +/** + * @file Handler.hpp + * + * Contains the definition of the Handler class, used for representing Handlers + * for certain syntactic elements. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + #ifndef _OUSIA_PARSER_STACK_HANDLER_HPP_ #define _OUSIA_PARSER_STACK_HANDLER_HPP_ @@ -24,6 +33,9 @@ #include <core/common/Location.hpp> #include <core/common/Variant.hpp> #include <core/common/Whitespace.hpp> +#include <core/common/Token.hpp> +#include <core/model/Node.hpp> +#include <core/model/Syntax.hpp> namespace ousia { @@ -31,14 +43,23 @@ namespace ousia { class ParserScope; class ParserContext; class Logger; +class TokenizedData; +class Variant; namespace parser_stack { // More forward declarations -class Callbacks; +class HandlerCallbacks; class State; /** + * Enum describing the type of the Handler instance -- a document handler may + * be created for handling a simple command, a token or an annotation start and + * end. + */ +enum class HandlerType { COMMAND, ANNOTATION_START, ANNOTATION_END, TOKEN }; + +/** * Class collecting all the data that is being passed to a Handler * instance. */ @@ -51,26 +72,28 @@ public: ParserContext &ctx; /** - * Reference at an instance of the Callbacks class, used for - * modifying the behaviour of the parser (like registering tokens, setting - * the data type or changing the whitespace handling mode). + * Reference at a class implementing the HandlerCallbacks interface, used + * for modifying the behaviour of the parser (like registering tokens, + * setting the data type or changing the whitespace handling mode). */ - // Callbacks &callbacks; + HandlerCallbacks &callbacks; /** - * Contains the name of the command that is being handled. + * Contains the current state of the state machine. */ - std::string name; + const State &state; /** - * Contains the current state of the state machine. + * Token containing the name of the command that is being handled, the + * location of the element in the source code or the token id of the token + * that is being handled. */ - const State &state; + Token token; /** - * Current source code location. 
+ * Type describing for which purpose the HandlerData instance was created. */ - SourceLocation location; + HandlerType type; /** * Constructor of the HandlerData class. @@ -78,13 +101,13 @@ public: * @param ctx is the parser context the handler should be executed in. * @param callbacks is an instance of Callbacks used to notify * the parser about certain state changes. - * @param name is the name of the string. * @param state is the state this handler was called for. - * @param location is the location at which the handler is created. + * @param token contains name, token id and location of the command that is + * being handled. + * @param type describes the purpose of the Handler instance at hand. */ - HandlerData(ParserContext &ctx, - /*Callbacks &callbacks,*/ const std::string &name, - const State &state, const SourceLocation &location); + HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks, + const State &state, const Token &token, HandlerType type); }; /** @@ -115,6 +138,94 @@ protected: Handler(const HandlerData &handlerData); /** + * Calls the corresponding function in the HandlerCallbacks instance. This + * method registers the given tokens as tokens that are generally available, + * tokens must be explicitly enabled using the "pushTokens" and "popTokens" + * method. Tokens that have not been registered are not guaranteed to be + * reported (except for special tokens, these do not have to be registerd). + * + * @param token is the token string that should be made available. + * @return the TokenId that will be used to refer to the token. + */ + TokenId registerToken(const std::string &token); + + /** + * Calls the corresponding function in the HandlerCallbacks instance. This + * method unregisters the given token. Note that for a token to be no longer + * reported, this function has to be called as many times as registerToken() + * for the corresponding token. + * + * @param id is the id of the Token that should be unregistered. 
+ */ + void unregisterToken(TokenId id); + + /** + * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack. + * The tokens described in the token list are the tokens that are currently + * enabled. + * + * @param tokens is a list of TokenSyntaxDescriptor instances that should be + * stored on the stack. + */ + void pushTokens(const std::vector<SyntaxDescriptor> &tokens); + + /** + * Calls the corresponding function in the HandlerCallbacks instance. + * Removes the previously pushed list of tokens from the stack. + */ + void popTokens(); + + /** + * Calls the corresponding method in the HandlerCallbacks instance. Reads a + * string variant form the current input stream. This function must be + * called from the data() method. + * + * @return a string variant containing the current text data. The return + * value depends on the currently set whitespace mode and the tokens that + * were enabled using the enableTokens callback method. + */ + Variant readData(); + + /** + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + // void pushWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushs that were performed by the pushWhitespaceMode() + * method of the same handler. 
+ */ + // void popWhitespaceMode(); + +public: + /** + * Enum representing the type of the annotation a Handle instance handles. + * It may either handle the start of an annotation or the end of an + * annotation. + */ + enum class AnnotationType { START, END }; + + /** + * Enum type representing the possible outcomes of the endToken() method. + */ + enum class EndTokenResult { ENDED_THIS, ENDED_HIDDEN, ENDED_NONE }; + + /** + * Virtual destructor. + */ + virtual ~Handler(); + + /** * Returns a reference at the ParserContext. * * @return a reference at the ParserContext. @@ -144,68 +255,55 @@ protected: Logger &logger(); /** - * Returns the location of the element in the source file, for which this - * Handler was created. + * Returns the name of the command or annotation the handler is currently + * handling. In case the command is currently handling a token, the name + * corresponds to the token string sequence. * - * @return the location of the Handler in the source file. - */ - const SourceLocation &location() const; - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. + * @return the name of the command or the string sequence of the token that + * is being handled by this handler. */ const std::string &name() const; -public: - /** - * Virtual destructor. - */ - virtual ~Handler(); - /** - * Calls the corresponding function in the Callbacks instance. Sets the - * whitespace mode that specifies how string data should be processed. The - * calls to this function are placed on a stack by the underlying Stack - * class. + * Returns the token id of the token that is currently being handled by the + * handler. In case the handler currently handles a command or annotation, + * the token id is set to Tokens::Data. * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. 
+ * @return the current token id or Tokens::Data if no token is being + * handled. */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); + TokenId tokenId() const; /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. + * Returns a reference at the Token instance, containing either the token + * that is currently being handled or the name of the command and annotation + * and their location. * - * @param token is the token string that should be reported. + * @return a const reference at the internal token instance. */ - void registerToken(const std::string &token); + const Token &token() const; /** - * Calls the corresponding function in the Callbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. + * Returns the location of the element in the source file, for which this + * Handler was created. * - * @param token is the token string that should be unregistered. + * @return the location of the Handler in the source file. */ - void unregisterToken(const std::string &token); + const SourceLocation &location() const; /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. + * Returns the type describing the purpose for which the handler instance + * was created. */ - const std::string &getName() const; + HandlerType type() const; /** - * Reference at the State descriptor for which this Handler was created. + * Returns a reference at the State descriptor for which this Handler was + * created. * * @return a const reference at the constructing State descriptor. */ - const State &getState() const; + const State &state() const; /** * Sets the internal logger to the given logger instance. 
@@ -229,14 +327,62 @@ public: const SourceLocation &getLocation() const; /** - * Called when the command that was specified in the constructor is - * instanciated. + * Called whenever the handler should handle the start of a command. This + * method (or any other of the "start" methods) is called exactly once, + * after the constructor. The name of the command that is started here can + * be accessed using the name() method. + * + * @param args is a map from strings to variants (argument name and value). + * @return true if the handler was successful in starting an element with + * the given name represents, false otherwise. + */ + virtual bool startCommand(Variant::mapType &args) = 0; + + /** + * Called whenever the handler should handle the start of an annotation. + * This method (or any other of the "start" methods) is called exactly once, + * after the constructor. This method is only called if the + * "supportsAnnotations" flag of the State instance referencing this Handler + * is set to true. The name of the command that is started here can be + * accessed using the name() method. * * @param args is a map from strings to variants (argument name and value). - * @return true if the handler was successful in starting the element it - * represents, false otherwise. + * @param type specifies whether this handler should handle the start of an + * annotation or the end of an annotation. + */ + virtual bool startAnnotation(Variant::mapType &args, + AnnotationType annotationType) = 0; + + /** + * Called whenever the handler should handle the start of a token. This + * method (or any other of the "start" methods) is called exactly once, + * after the constructor. This method is only called if the "supportsTokens" + * flag of the State instance referencing this Handler is set to true. The + * token id of the token that is should be handled can be accessed using the + * tokenId() method. + * + * @param node is the node for which this token was registered. 
+ */ + virtual bool startToken(Handle<Node> node) = 0; + + /** + * Called whenever a token is marked as "end" token and this handler happens + * to be the currently active handler. This operation may have three + * outcomes: + * <ol> + * <li>The token marks the end of the complete handler and the calling + * code should call the "end" method.</li> + * <li>The token marks the end of some element that is unknown the calling + * code. So the operation itself was a success, but the calling code + * should not call the "end" method. + * <li>The token did not anything in this context. Basically this shuold + * never happen, but who knows.</li> + * </ol> + * + * @param id is the Token for which the handler should be started. + * @param node is the node for which this token was registered. */ - virtual bool start(Variant::mapType &args) = 0; + virtual EndTokenResult endToken(const Token &token, Handle<Node> node) = 0; /** * Called before the command for which this handler is defined ends (is @@ -266,44 +412,14 @@ public: virtual void fieldEnd() = 0; /** - * Called whenever an annotation starts while this handler is active. The - * function should return true if starting the annotation was successful, - * false otherwise. - * - * @param className is a string variant containing the name of the - * annotation class and the location of the name in the source code. - * @param args is a map from strings to variants (argument name and value). - * @return true if the mentioned annotation could be started here, false - * if an error occurred. - */ - virtual bool annotationStart(const Variant &className, - Variant::mapType &args) = 0; - - /** - * Called whenever an annotation ends while this handler is active. The - * function should return true if ending the annotation was successful, - * false otherwise. - * - * @param className is a string variant containing the name of the - * annotation class and the location of the class name in the source code. 
- * @param elementName is a string variant containing the name of the - * annotation class and the location of the element name in the source code. - * @return true if the mentioned annotation could be started here, false if - * an error occurred. - */ - virtual bool annotationEnd(const Variant &className, - const Variant &elementName) = 0; - - /** * Called whenever raw data (int the form of a string) is available for the * Handler instance. Should return true if the data could be handled, false - * otherwise. + * otherwise. The actual data variant must be retrieved using the "text()" + * callback. * - * @param data is a string variant containing the character data and its - * location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(Variant &data) = 0; + virtual bool data() = 0; }; /** @@ -325,15 +441,15 @@ protected: using Handler::Handler; public: - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; + bool startAnnotation(Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(Handle<Node> node) override; + EndTokenResult endToken(const Token &token, Handle<Node> node) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - bool annotationEnd(const Variant &className, - const Variant &elementName) override; - bool data(Variant &data) override; + bool data() override; /** * Creates an instance of the EmptyHandler class. 
@@ -351,15 +467,15 @@ protected: using Handler::Handler; public: - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; + bool startAnnotation(Variant::mapType &args, + AnnotationType annotationType) override; + bool startToken(Handle<Node> node) override; + EndTokenResult endToken(const Token &token, Handle<Node> node) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; - bool annotationStart(const Variant &className, - Variant::mapType &args) override; - bool annotationEnd(const Variant &className, - const Variant &elementName) override; - bool data(Variant &data) override; + bool data() override; }; /** @@ -406,13 +522,12 @@ protected: * @param fieldData is the captured field data. * @param args are the arguments that were given in the "start" function. */ - virtual void doHandle(const Variant &fieldData, - Variant::mapType &args) = 0; + virtual void doHandle(const Variant &fieldData, Variant::mapType &args) = 0; public: - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; + bool data() override; void end() override; - bool data(Variant &data) override; }; } } diff --git a/src/core/parser/stack/OntologyHandler.cpp b/src/core/parser/stack/OntologyHandler.cpp index 8c0e4d9..3b3b386 100644 --- a/src/core/parser/stack/OntologyHandler.cpp +++ b/src/core/parser/stack/OntologyHandler.cpp @@ -33,7 +33,7 @@ namespace parser_stack { /* OntologyHandler */ -bool OntologyHandler::start(Variant::mapType &args) +bool DomainHandler::startCommand(Variant::mapType &args) { // Create the Ontology node Rooted<Ontology> ontology = @@ -226,9 +226,9 @@ bool OntologyChildHandler::start(Variant::mapType &args) { Rooted<FieldDescriptor> field = scope().selectOrThrow<FieldDescriptor>(); - const std::string &ref = args["ref"].asString(); + const std::string &name = args["ref"].asString(); scope().resolve<StructuredClass>( - ref, 
field, logger(), + name, field, logger(), [](Handle<Node> child, Handle<Node> field, Logger &logger) { if (child != nullptr) { field.cast<FieldDescriptor>()->addChild( @@ -275,7 +275,7 @@ bool OntologyParentFieldHandler::start(Variant::mapType &args) scope().resolve<Descriptor>( parentNameNode->getName(), strct, logger(), [type, name, optional](Handle<Node> parent, Handle<Node> strct, - Logger &logger) { + Logger &logger) { if (parent != nullptr) { Rooted<FieldDescriptor> field = (parent.cast<Descriptor>()->createFieldDescriptor( @@ -299,21 +299,20 @@ bool OntologyParentFieldRefHandler::start(Variant::mapType &args) // resolve the parent, get the referenced field and add the declared // StructuredClass as child to it. - scope().resolve<Descriptor>( - parentNameNode->getName(), strct, logger(), - [name, loc](Handle<Node> parent, Handle<Node> strct, Logger &logger) { - if (parent != nullptr) { - Rooted<FieldDescriptor> field = - parent.cast<Descriptor>()->getFieldDescriptor(name); - if (field == nullptr) { - logger.error( - std::string("Could not find referenced field ") + name, - loc); - return; - } - field->addChild(strct.cast<StructuredClass>()); - } - }); + scope().resolve<Descriptor>(parentNameNode->getName(), strct, logger(), + [name, loc](Handle<Node> parent, + Handle<Node> strct, Logger &logger) { + if (parent != nullptr) { + Rooted<FieldDescriptor> field = + parent.cast<Descriptor>()->getFieldDescriptor(name); + if (field == nullptr) { + logger.error( + std::string("Could not find referenced field ") + name, loc); + return; + } + field->addChild(strct.cast<StructuredClass>()); + } + }); return true; } diff --git a/src/core/parser/stack/OntologyHandler.hpp b/src/core/parser/stack/OntologyHandler.hpp index caeacc7..66146bd 100644 --- a/src/core/parser/stack/OntologyHandler.hpp +++ b/src/core/parser/stack/OntologyHandler.hpp @@ -46,7 +46,7 @@ class OntologyHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType 
&args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -59,7 +59,7 @@ class OntologyStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -72,7 +72,7 @@ class OntologyAnnotationHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -85,7 +85,7 @@ class OntologyAttributesHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -98,7 +98,7 @@ class OntologyFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -111,7 +111,7 @@ class OntologyFieldRefHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -124,7 +124,7 @@ class OntologyPrimitiveHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -137,7 +137,7 @@ class OntologyChildHandler : public StaticHandler { public: 
using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -154,7 +154,7 @@ class OntologyParentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -167,7 +167,7 @@ class OntologyParentFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -179,7 +179,7 @@ class OntologyParentFieldRefHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..f341f1d 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,18 +19,148 @@ #include <core/common/Logger.hpp> #include <core/common/Utils.hpp> #include <core/common/Exceptions.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include <core/parser/ParserScope.hpp> #include <core/parser/ParserContext.hpp> +#include "Callbacks.hpp" #include "Handler.hpp" #include "Stack.hpp" #include "State.hpp" +#include "TokenRegistry.hpp" +#include "TokenStack.hpp" + +#define STACK_DEBUG_OUTPUT 0 +#if STACK_DEBUG_OUTPUT +#include <iostream> +#endif namespace ousia { namespace parser_stack { +namespace { /* Class HandlerInfo */ +/** + * The HandlerInfo class is used internally by the stack to associate additional + * (mutable) data with a handler instance. 
+ */ +class HandlerInfo { +public: + /** + * Pointer pointing at the actual handler instance. + */ + std::shared_ptr<Handler> handler; + + /** + * Next field index to be passed to the "fieldStart" function of the Handler + * class. + */ + size_t fieldIdx; + + /** + * Set to true if the handler is valid (which is the case if the "start" + * method has returned true). If the handler is invalid, no more calls are + * directed at it until it can be removed from the stack. + */ + bool valid : 1; + + /** + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handled command or annotation has a range. + */ + bool range : 1; + + /** + * Set to true if the handler currently is in a field. + */ + bool inField : 1; + + /** + * Set to true if the handler currently is in the default field. + */ + bool inDefaultField : 1; + + /** + * Set to true if the handler currently is in an implicitly started default + * field. + */ + bool inImplicitDefaultField : 1; + + /** + * Set to false if this field is only opened pro-forma and does not accept + * any data. Otherwise set to true. + */ + bool inValidField : 1; + + /** + * Set to true, if the default field was already started. + */ + bool hadDefaultField : 1; + + /** + * Default constructor of the HandlerInfo class. + */ + HandlerInfo(); + + /** + * Constructor of the HandlerInfo class, allows to set some flags manually. + */ + HandlerInfo(bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField); + + /** + * Constructor of the HandlerInfo class, taking a shared_ptr to the handler + * to which additional information should be attached. + */ + HandlerInfo(std::shared_ptr<Handler> handler); + + /** + * Destructor of the HandlerInfo class (to allow Handler to be forward + * declared). + */ + ~HandlerInfo(); + + /** + * Updates the "field" flags according to a "fieldStart" event. 
+ */ + void fieldStart(bool isDefault, bool isImplicit, bool isValid); + + /** + * Updates the "fields" flags according to a "fieldEnd" event. + */ + void fieldEnd(); + + /** + * Returns the name of the referenced handler or an empty string if no + * handler is present. + * + * @return the current handler name. + */ + std::string name() const; + + /** + * Returns the type of the referenced handler or COMMAND if no handler is + * present. + * + * @return the current handler type. + */ + HandlerType type() const; + + /** + * Returns the current state the handler is on or States::None if no handler + * is present. + * + * @return the current state machine state. + */ + const State &state() const; +}; + HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler) @@ -38,6 +168,7 @@ HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler) fieldIdx(0), valid(true), implicit(false), + range(false), inField(false), inDefaultField(false), inImplicitDefaultField(false), @@ -46,21 +177,36 @@ HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler) { } -HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, - bool inDefaultField, bool inImplicitDefaultField, - bool inValidField) +HandlerInfo::HandlerInfo(bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField) : handler(nullptr), fieldIdx(0), - valid(valid), + valid(true), implicit(implicit), + range(false), inField(inField), inDefaultField(inDefaultField), inImplicitDefaultField(inImplicitDefaultField), - inValidField(inValidField), + inValidField(true), hadDefaultField(false) { } +std::string HandlerInfo::name() const +{ + return handler == nullptr ? std::string{} : handler->name(); +} + +HandlerType HandlerInfo::type() const +{ + return handler == nullptr ? HandlerType::COMMAND : handler->type(); +} + +const State &HandlerInfo::state() const +{ + return handler == nullptr ? 
States::None : handler->state(); +} + HandlerInfo::~HandlerInfo() { // Do nothing @@ -87,7 +233,20 @@ void HandlerInfo::fieldEnd() /** * Stub instance of HandlerInfo containing no handler information. */ -static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; +static HandlerInfo EmptyHandlerInfo{true, true, true, true}; + +/** + * Small helper class makeing sure the reference at some variable is reset once + * the scope is left. + */ +template <class T> +struct GuardedTemporaryPointer { + T **ptr; + GuardedTemporaryPointer(T *ref, T **ptr) : ptr(ptr) { *ptr = ref; } + + ~GuardedTemporaryPointer() { *ptr = nullptr; } +}; +} /* Helper functions */ @@ -116,11 +275,197 @@ static LoggableException buildInvalidCommandException( } } -/* Class Stack */ - -Stack::Stack(ParserContext &ctx, - const std::multimap<std::string, const State *> &states) - : ctx(ctx), states(states) +/* Class StackImpl */ + +class StackImpl : public HandlerCallbacks { +private: + /** + * Reference at an implementation of the ParserCallbacks instance to which + * certain handler callbacks are directed. + */ + ParserCallbacks &parser; + + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap<std::string, const State *> &states; + + /** + * Registry responsible for registering the tokens proposed by the + * Handlers in the parser. + */ + TokenRegistry tokenRegistry; + + /** + * Pointer at a TokenizedDataReader instance from which the data should + * currently be read. + */ + TokenizedDataReader *dataReader; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::vector<HandlerInfo> stack; + + /** + * Return the reference in the Logger instance stored within the context. 
+ */ + Logger &logger() { return ctx.getLogger(); } + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set<std::string> expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetState(const std::string &name); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + * an equivalent state as the one in the including file. + */ + void deduceState(); + + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo &currentInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Returns a set containing the tokens that should currently be processed + * by the TokenizedData instance. + * + * @return a TokenSet instance containing all tokens that should currently + * be processed. 
+ */ + TokenSet currentTokens() const; + + /** + * Returns the whitespace mode defined by the current command. + */ + WhitespaceMode currentWhitespaceMode() const; + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + * + * @return true if a command was ended, false otherwise. + */ + bool endCurrentHandler(); + + /** + * Ends all handlers that currently are not inside a field and already had + * a default field. Tries to start a default field for the current handler, + * if currently the handler is not inside a field and did not have a default + * field yet. This method is called whenever the data(), startAnnotation(), + * startToken(), startCommand(), annotationStart() or annotationEnd() events + * are reached. + * + * @return true if the current command is in a valid field. + */ + bool prepareCurrentHandler(bool startImplicitDefaultField = true); + + /** + * Returns true if all handlers on the stack are currently valid, or false + * if at least one handler is invalid. + * + * @return true if all handlers on the stack are valid. + */ + bool handlersValid(); + + /** + * Called whenever there is an actual data pending on the current + * TokenizedDataReader. Tries to feed this data to the current handler. + */ + void handleData(); + + /** + * Called whenever there is a token waiting to be processed. If possible + * tries to end a current handler with this token or to start a new handler + * with the token. + * + * @param token is the token that should be handled. + */ + void handleToken(const Token &token); + + /** + * Called by the rangeEnd() and fieldEnd() methods to end the current ranged + * command. + * + * @param endRange specifies whether this should end the range of a + * command with range. 
+ */ + void handleFieldEnd(bool endRange); + +public: + StackImpl(ParserCallbacks &parser, ParserContext &ctx, + const std::multimap<std::string, const State *> &states); + + ~StackImpl(); + + const State &currentState() const; + std::string currentCommandName() const; + + void commandStart(const Variant &name, const Variant::mapType &args, + bool range); + void annotationStart(const Variant &className, const Variant &args, + bool range); + void annotationEnd(const Variant &className, const Variant &elementName); + void rangeEnd(); + void fieldStart(bool isDefault); + void fieldEnd(); + void data(const TokenizedData &data); + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId id) override; + Variant readData() override; + void pushTokens(const std::vector<SyntaxDescriptor> &tokens) override; + void popTokens() override; +}; + +StackImpl::StackImpl(ParserCallbacks &parser, ParserContext &ctx, + const std::multimap<std::string, const State *> &states) + : parser(parser), + ctx(ctx), + states(states), + tokenRegistry(parser), + dataReader(nullptr) { // If the scope instance is not empty we need to deduce the current parser // state
state.elementHandler : EmptyHandler::create; - std::shared_ptr<Handler> handler = - std::shared_ptr<Handler>{ctor({ctx, "", state, SourceLocation{}})}; + std::shared_ptr<Handler> handler = std::shared_ptr<Handler>{ + ctor({ctx, *this, state, SourceLocation{}, HandlerType::COMMAND})}; stack.emplace_back(handler); // Set the correct flags for this implicit handler @@ -186,7 +531,7 @@ void Stack::deduceState() info.fieldStart(true, false, true); } -std::set<std::string> Stack::expectedCommands() +std::set<std::string> StackImpl::expectedCommands() { const State *currentState = &(this->currentState()); std::set<std::string> res; @@ -198,17 +543,7 @@ std::set<std::string> Stack::expectedCommands() return res; } -const State &Stack::currentState() -{ - return stack.empty() ? States::None : stack.back().handler->getState(); -} - -std::string Stack::currentCommandName() -{ - return stack.empty() ? std::string{} : stack.back().handler->getName(); -} - -const State *Stack::findTargetState(const std::string &name) +const State *StackImpl::findTargetState(const std::string &name) { const State *currentState = &(this->currentState()); auto range = states.equal_range(name); @@ -222,7 +557,7 @@ const State *Stack::findTargetState(const std::string &name) return nullptr; } -const State *Stack::findTargetStateOrWildcard(const std::string &name) +const State *StackImpl::findTargetStateOrWildcard(const std::string &name) { // Try to find the target state with the given name, if none is found, try // find a matching "*" state. @@ -233,16 +568,40 @@ const State *Stack::findTargetStateOrWildcard(const std::string &name) return targetState; } -HandlerInfo &Stack::currentInfo() +const State &StackImpl::currentState() const +{ + return stack.empty() ? States::None : stack.back().state(); +} + +std::string StackImpl::currentCommandName() const +{ + return stack.empty() ? 
std::string{} : stack.back().name(); +} + +TokenSet StackImpl::currentTokens() const +{ + // TODO: Implement + return TokenSet{}; +} + +WhitespaceMode StackImpl::currentWhitespaceMode() const +{ + // TODO: Implement + return WhitespaceMode::COLLAPSE; +} + +HandlerInfo &StackImpl::currentInfo() { return stack.empty() ? EmptyHandlerInfo : stack.back(); } -HandlerInfo &Stack::lastInfo() +HandlerInfo &StackImpl::lastInfo() { return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; } -void Stack::endCurrentHandler() +/* Stack helper functions */ + +bool StackImpl::endCurrentHandler() { if (!stack.empty()) { // Fetch the handler info for the current top-level element @@ -266,50 +625,59 @@ void Stack::endCurrentHandler() // Remove the element from the stack stack.pop_back(); + return true; } + return false; } -void Stack::endOverdueHandlers() +bool StackImpl::prepareCurrentHandler(bool startImplicitDefaultField) { - if (!stack.empty()) { - // Fetch the handler info for the current top-level element - HandlerInfo &info = stack.back(); + // Repeat until a valid handler is found on the stack + while (!stack.empty()) { + // Fetch the handler for the current top-level element + HandlerInfo &info = currentInfo(); - // Abort if this handler currently is inside a field - if (info.inField || (!info.hadDefaultField && info.valid)) { - return; + // If the current Handler is in a field, there is nothing to be done, + // abort + if (info.inField) { + return true; } - // Otherwise end the current handler - endCurrentHandler(); - } -} + // If the current field already had a default field or is not valid, + // end it and repeat + if ((info.hadDefaultField || !startImplicitDefaultField) || + !info.valid) { + // We cannot end the command if it is marked as "range" command + if (info.range) { + return false; + } -bool Stack::ensureHandlerIsInField() -{ - // If the current handler is not in a field (and actually has a handler) - // try to start a default field - HandlerInfo 
&info = currentInfo(); - if (!info.inField && info.handler != nullptr) { - // Abort if the element already had a default field or the handler is - // not valid - if (info.hadDefaultField || !info.valid) { + // End the current handler + endCurrentHandler(); + continue; + } + + // Abort if starting new default fields is not allowed here + if (!startImplicitDefaultField) { return false; } // Try to start a new default field, abort if this did not work bool isDefault = true; if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { - return false; + endCurrentHandler(); + continue; } - // Mark the field as started - info.fieldStart(true, true, true); + // Mark the field as started and return -- the field should be marked + // is implicit if this is not a field with range + info.fieldStart(true, !info.range, true); + return true; } - return true; + return false; } -bool Stack::handlersValid() +bool StackImpl::handlersValid() { for (auto it = stack.crbegin(); it != stack.crend(); it++) { if (!it->valid) { @@ -319,13 +687,131 @@ bool Stack::handlersValid() return true; } -Logger &Stack::logger() { return ctx.getLogger(); } +void StackImpl::handleData() +{ + // Repeat until we found some handle willingly consuming the data + while (true) { + // Prepare the stack -- make sure all overdue handlers are ended and + // we currently are in an open field + if (stack.empty() || !prepareCurrentHandler()) { + throw LoggableException("Did not expect any data here"); + } + + // Fetch the current handler information + HandlerInfo &info = currentInfo(); + + // If this field should not get any data, log an error and do not + // call the "data" handler + if (!info.inValidField) { + if (!info.hadDefaultField) { + logger().error("Did not expect any data here"); + } + return; + } + + // If we're currently in an invalid subtree, just eat the data and abort + if (!handlersValid()) { + return; + } + + // Fork the logger and set it as temporary logger for the "data" + // method. 
We only want to keep error messages if this was not a + // try to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + valid = info.handler->data(); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance of the handler as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid flag. + loggerFork.commit(); + return; + } +} + +void StackImpl::handleToken(const Token &token) +{ + // TODO: Implement + // Just eat them for now +} + +void StackImpl::handleFieldEnd(bool endRange) +{ + // Throw away all overdue handlers + prepareCurrentHandler(false); + + // Close all implicit default fields + while (!stack.empty()) { + HandlerInfo &info = currentInfo(); + if (!info.inImplicitDefaultField || info.range) { + break; + } + endCurrentHandler(); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (stack.empty() || (!info.inField && !endRange) || + (!info.range && endRange)) { + if (endRange) { + logger().error( + "Got end of range, but there is no command here to end"); + } else { + logger().error("Got field end, but there is no field here to end"); + } + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + // End the current field if it is valid + if (info.inValidField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // End the complete command if this is a range command, start the + // default field for once 
if range command did not have a default field + if (info.range && endRange) { + if (!info.hadDefaultField) { + bool isDefault = true; + info.handler->fieldStart(isDefault, true); + info.fieldStart(true, true, true); + } + endCurrentHandler(); + return; + } + } + + // This command no longer is in a field + info.fieldEnd(); +} + +/* Class StackImpl public functions */ -void Stack::command(const Variant &name, const Variant::mapType &args) +void StackImpl::commandStart(const Variant &name, const Variant::mapType &args, + bool range) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); + // Call prepareCurrentHandler once to end all overdue commands + prepareCurrentHandler(); // Make sure the given identifier is valid (preventing "*" from being // malicously passed to this function) @@ -336,14 +822,18 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } while (true) { + // Prepare the stack -- make sure all overdue handlers are ended and + // we currently are in an open field + prepareCurrentHandler(); + // Try to find a target state for the given command, if none can be // found and the current command does not have an open field, then try // to create an empty default field, otherwise this is an exception const State *targetState = findTargetStateOrWildcard(name.asString()); if (targetState == nullptr) { HandlerInfo &info = currentInfo(); - if (info.inImplicitDefaultField || !info.inField) { - endCurrentHandler(); + if ((info.inImplicitDefaultField || !info.inField) && + endCurrentHandler()) { continue; } else { throw buildInvalidCommandException(name.asString(), @@ -351,12 +841,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } - // Make sure we're currently inside a field - if (!ensureHandlerIsInField()) { - endCurrentHandler(); - continue; - } - // Fork the logger. 
We do not want any validation errors to skip LoggerFork loggerFork = logger().fork(); @@ -365,10 +849,15 @@ void Stack::command(const Variant &name, const Variant::mapType &args) ? targetState->elementHandler : EmptyHandler::create; std::shared_ptr<Handler> handler{ - ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + ctor({ctx, + *this, + *targetState, + {name.asString(), name.getLocation()}, + HandlerType::COMMAND})}; stack.emplace_back(handler); - // Fetch the HandlerInfo for the parent element and the current element + // Fetch the HandlerInfo for the parent element and the current + // element HandlerInfo &parentInfo = lastInfo(); HandlerInfo &info = currentInfo(); @@ -387,7 +876,7 @@ void Stack::command(const Variant &name, const Variant::mapType &args) handler->setLogger(loggerFork); try { - info.valid = handler->start(canonicalArgs); + info.valid = handler->startCommand(canonicalArgs); } catch (LoggableException ex) { loggerFork.log(ex); @@ -395,94 +884,65 @@ void Stack::command(const Variant &name, const Variant::mapType &args) handler->resetLogger(); } - // We started the command within an implicit default field and it is not - // valid -- remove both the new handler and the parent field from the - // stack + // We started the command within an implicit default field and it is + // not valid -- remove both the new handler and the parent field from + // the stack if (!info.valid && parentInfo.inImplicitDefaultField) { - endCurrentHandler(); - endCurrentHandler(); - continue; + // Only continue if the parent handler could actually be removed + if (endCurrentHandler() && endCurrentHandler()) { + continue; + } } - // If we ended up here, starting the command may or may not have worked, - // but after all, we cannot unroll the stack any further. Update the - // "valid" flag, commit any potential error messages and return. 
+ // If we ended up here, starting the command may or may not have + // worked, but after all, we cannot unroll the stack any further. Update + // the "valid" flag, commit any potential error messages and return. info.valid = parentInfo.valid && info.valid; + info.range = range; loggerFork.commit(); return; } } -void Stack::data(const Variant &data) +void StackImpl::annotationStart(const Variant &className, const Variant &args, + bool range) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); - - while (true) { - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException("No command here to receive data.", data); - } - - // Fetch the current command handler information - HandlerInfo &info = currentInfo(); - - // Make sure the current handler has an open field - if (!ensureHandlerIsInField()) { - endCurrentHandler(); - continue; - } - - // If this field should not get any data, log an error and do not call - // the "data" handler - if (!info.inValidField) { - // If the "hadDefaultField" flag is set, we already issued an error - // message - if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); - } - } - - if (handlersValid() && info.inValidField) { - // Fork the logger and set it as temporary logger for the "start" - // method. We only want to keep error messages if this was not a try - // to implicitly open a default field. 
- LoggerFork loggerFork = logger().fork(); - info.handler->setLogger(loggerFork); - - // Pass the data to the current Handler instance - bool valid = false; - try { - Variant dataCopy = data; - valid = info.handler->data(dataCopy); - } - catch (LoggableException ex) { - loggerFork.log(ex); - } + // TODO +} - // Reset the logger instance as soon as possible - info.handler->resetLogger(); +void StackImpl::annotationEnd(const Variant &className, + const Variant &elementName) +{ + // TODO +} - // If placing the data here failed and we're currently in an - // implicitly opened field, just unroll the stack to the next field - // and try again - if (!valid && info.inImplicitDefaultField) { - endCurrentHandler(); - continue; - } +void StackImpl::rangeEnd() { handleFieldEnd(true); } - // Commit the content of the logger fork. Do not change the valid - // flag. - loggerFork.commit(); +void StackImpl::data(const TokenizedData &data) +{ + // Fetch a reader for the given tokenized data instance. + TokenizedDataReader reader = data.reader(); + + // Use the GuardedTemporaryPointer to make sure that the member variable + // dataReader is resetted to nullptr once this scope is left. 
+ GuardedTemporaryPointer<TokenizedDataReader> ptr(&reader, &dataReader); + + // Peek a token from the reader, repeat until all tokens have been read + Token token; + while (reader.peek(token, currentTokens(), currentWhitespaceMode())) { + // Handle the token as text data or as actual token + if (token.id == Tokens::Data) { + handleData(); + } else { + handleToken(token); } - // There was no reason to unroll the stack any further, so continue - return; + // Consume the peeked token + reader.consumePeek(); } } -void Stack::fieldStart(bool isDefault) +void StackImpl::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty if (stack.empty()) { @@ -494,13 +954,14 @@ void Stack::fieldStart(bool isDefault) HandlerInfo &info = currentInfo(); if (info.inField) { logger().error( - "Got field start, but there is no command for which to start the " + "Got field start, but there is no command for which to start " + "the " "field."); return; } - // If the handler already had a default field we cannot start a new field - // (the default field always is the last field) -- mark the command as + // If the handler already had a default field we cannot start a new + // field (the default field always is the last field) -- mark the command as // invalid if (info.hadDefaultField) { logger().error(std::string("Got field start, but command \"") + @@ -534,54 +995,132 @@ void Stack::fieldStart(bool isDefault) info.fieldStart(defaultField, false, valid); } -void Stack::fieldEnd() +void StackImpl::fieldEnd() { handleFieldEnd(false); } + +/* Class StackImpl HandlerCallbacks */ + +TokenId StackImpl::registerToken(const std::string &token) { - // Unroll the stack until the next explicitly open field - while (!stack.empty()) { - HandlerInfo &info = currentInfo(); - if (info.inField && !info.inImplicitDefaultField) { - break; - } - endCurrentHandler(); - } + return tokenRegistry.registerToken(token); +} - // Fetch the information attached to the current handler - 
HandlerInfo &info = currentInfo(); - if (!info.inField || info.inImplicitDefaultField || stack.empty()) { - logger().error( - "Got field end, but there is no command for which to end the " - "field."); - return; - } +void StackImpl::unregisterToken(TokenId id) +{ + tokenRegistry.unregisterToken(id); +} - // Only continue if the current handler stack is in a valid state, do not - // call the fieldEnd function if something went wrong before - if (handlersValid() && !info.hadDefaultField && info.inValidField) { - try { - info.handler->fieldEnd(); - } - catch (LoggableException ex) { - logger().log(ex); +void StackImpl::pushTokens(const std::vector<SyntaxDescriptor> &tokens) +{ + // TODO +} + +void StackImpl::popTokens() +{ + // TODO +} + +Variant StackImpl::readData() +{ + if (dataReader != nullptr) { + TokenizedDataReaderFork dataReaderFork = dataReader->fork(); + Token token; + dataReaderFork.read(token, currentTokens(), currentWhitespaceMode()); + if (token.id == Tokens::Data) { + Variant res = Variant::fromString(token.content); + res.setLocation(token.getLocation()); + return res; } } + return Variant{}; +} - // This command no longer is in a field - info.fieldEnd(); +/* Class Stack */ + +Stack::Stack(ParserCallbacks &parser, ParserContext &ctx, + const std::multimap<std::string, const State *> &states) + : impl(new StackImpl(parser, ctx, states)) +{ } -void Stack::annotationStart(const Variant &className, const Variant &args) +Stack::~Stack() { - // TODO + // Do nothing here, stub needed because StackImpl is incomplete in hpp +} + +const State &Stack::currentState() const { return impl->currentState(); } + +std::string Stack::currentCommandName() const +{ + return impl->currentCommandName(); +} + +void Stack::commandStart(const Variant &name, const Variant::mapType &args, + bool range) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: commandStart " << name << " " << args << " " << range + << std::endl; +#endif + impl->commandStart(name, args, range); +} + +void 
Stack::annotationStart(const Variant &className, const Variant &args, + bool range) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: annotationStart " << className << " " << args << " " + << range << std::endl; +#endif + impl->annotationStart(className, args, range); } void Stack::annotationEnd(const Variant &className, const Variant &elementName) { - // TODO +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: annotationEnd " << className << " " << elementName + << std::endl; +#endif + impl->annotationEnd(className, elementName); } -void Stack::token(Variant token) +void Stack::rangeEnd() { - // TODO +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: rangeEnd" << std::endl; +#endif + impl->rangeEnd(); +} + +void Stack::fieldStart(bool isDefault) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: fieldStart " << isDefault << std::endl; +#endif + impl->fieldStart(isDefault); +} + +void Stack::fieldEnd() +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: fieldEnd" << std::endl; +#endif + impl->fieldEnd(); +} + +void Stack::data(const TokenizedData &data) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: data" << std::endl; +#endif + impl->data(data); +} + +void Stack::data(const std::string &str) +{ +#if STACK_DEBUG_OUTPUT + std::cout << "STACK: data (string) " << str << std::endl; +#endif + data(TokenizedData(str)); +} } } -}
\ No newline at end of file diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..6d42f10 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -29,235 +29,48 @@ #ifndef _OUSIA_PARSER_STACK_STACK_HPP_ #define _OUSIA_PARSER_STACK_STACK_HPP_ -#include <cstdint> - #include <map> #include <memory> -#include <set> -#include <vector> - -#include <core/common/Variant.hpp> -#include <core/parser/Parser.hpp> namespace ousia { // Forward declarations class ParserContext; -class Logger; +class TokenizedData; +class Variant; namespace parser_stack { // Forward declarations -class Handler; +class ParserCallbacks; +class StackImpl; class State; /** - * The HandlerInfo class is used internally by the stack to associate additional - * (mutable) data with a handler instance. - */ -class HandlerInfo { -public: - /** - * Pointer pointing at the actual handler instance. - */ - std::shared_ptr<Handler> handler; - - /** - * Next field index to be passed to the "fieldStart" function of the Handler - * class. - */ - size_t fieldIdx; - - /** - * Set to true if the handler is valid (which is the case if the "start" - * method has returned true). If the handler is invalid, no more calls are - * directed at it until it can be removed from the stack. - */ - bool valid : 1; - - /** - * Set to true if this is an implicit handler, that was created when the - * current stack state was deduced. - */ - bool implicit : 1; - - /** - * Set to true if the handler currently is in a field. - */ - bool inField : 1; - - /** - * Set to true if the handler currently is in the default field. - */ - bool inDefaultField : 1; - - /** - * Set to true if the handler currently is in an implicitly started default - * field. - */ - bool inImplicitDefaultField : 1; - - /** - * Set to false if this field is only opened pro-forma and does not accept - * any data. Otherwise set to true. 
- */ - bool inValidField : 1; - - /** - * Set to true, if the default field was already started. - */ - bool hadDefaultField : 1; - - /** - * Default constructor of the HandlerInfo class. - */ - HandlerInfo(); - /** - * Constructor of the HandlerInfo class, allows to set all flags manually. - */ - HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, - bool inImplicitDefaultField, bool inValidField); - - /** - * Constructor of the HandlerInfo class, taking a shared_ptr to the handler - * to which additional information should be attached. - */ - HandlerInfo(std::shared_ptr<Handler> handler); - - /** - * Destructor of the HandlerInfo class (to allow Handler to be forward - * declared). - */ - ~HandlerInfo(); - - /** - * Updates the "field" flags according to a "fieldStart" event. - */ - void fieldStart(bool isDefault, bool isImplicit, bool isValid); - - /** - * Updates the "fields" flags according to a "fieldEnd" event. - */ - void fieldEnd(); -}; - -/** * The Stack class is a pushdown automaton responsible for turning a command * stream into a tree of Node instances. It does so by following a state * transition graph and creating a set of Handler instances, which are placed - * on the stack. + * on the stack. Additionally it is responsible for the normalization of + * Annotations and for handling tokens. */ class Stack { private: /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap<std::string, const State *> &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::vector<HandlerInfo> stack; - - /** - * Return the reference in the Logger instance stored within the context. - */ - Logger &logger(); - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. 
- * - * @return a set of strings containing the names of the expected commands. + * Pointer at the internal implementation */ - std::set<std::string> expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - * state otherwise. - */ - const State *findTargetState(const std::string &name); - - /** - * Returns the targetState for a command with the given name that can be - * reached from the current state, also including the wildcard "*" state. - * Throws an exception if the given target state is not a valid identifier. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - * state otherwise. - */ - const State *findTargetStateOrWildcard(const std::string &name); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - * an equivalent state as the one in the including file. - */ - void deduceState(); - - /** - * Returns a reference at the current HandlerInfo instance (or a stub - * HandlerInfo instance if the stack is empty). - */ - HandlerInfo ¤tInfo(); - - /** - * Returns a reference at the last HandlerInfo instance (or a stub - * HandlerInfo instance if the stack has only one element). - */ - HandlerInfo &lastInfo(); - - /** - * Ends all handlers that currently are not inside a field and already had - * a default field. This method is called whenever the data() and command() - * events are reached. - */ - void endOverdueHandlers(); - - /** - * Ends the current handler and removes the corresponding element from the - * stack. 
- */ - void endCurrentHandler(); - - /** - * Tries to start a default field for the current handler, if currently the - * handler is not inside a field and did not have a default field yet. - * - * @return true if the handler is inside a field, false if no field could - * be started. - */ - bool ensureHandlerIsInField(); - - /** - * Returns true if all handlers on the stack are currently valid, or false - * if at least one handler is invalid. - * - * @return true if all handlers on the stack are valid. - */ - bool handlersValid(); + std::unique_ptr<StackImpl> impl; public: /** * Creates a new instance of the Stack class. * + * @param parser is an implementation of the ParserCallbacks instance to + * which certain calls are directed. * @param ctx is the parser context the parser stack is working on. * @param states is a map containing the command names and pointers at the * corresponding State instances. */ - Stack(ParserContext &ctx, + Stack(ParserCallbacks &parser, ParserContext &ctx, const std::multimap<std::string, const State *> &states); /** @@ -268,10 +81,10 @@ public: /** * Returns the state the Stack instance currently is in. * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. + * @return the state of the currently active Handler instance or + * States::None if no handler is on the stack. */ - const State ¤tState(); + const State ¤tState() const; /** * Returns the command name that is currently being handled. @@ -279,7 +92,7 @@ public: * @return the name of the command currently being handled by the active * Handler instance or an empty string if no handler is currently active. */ - std::string currentCommandName(); + std::string currentCommandName() const; /** * Function that should be called whenever a new command is reached. @@ -288,17 +101,36 @@ public: * separator ':') and its corresponding location. Must be a string variant. 
* @param args is a map containing the arguments that were passed to the * command. + * @param range if true, the started command has an explicit range. */ - void command(const Variant &name, const Variant::mapType &args); + void commandStart(const Variant &name, const Variant::mapType &args, + bool range = false); /** - * Function that shuold be called whenever character data is found in the - * input stream. May only be called if the currently is a command on the - * stack. + * Function that should be called whenever an annotation starts. + * + * @param name is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + * @param range if true, the annotation fields have an explicit range. + */ + void annotationStart(const Variant &className, const Variant &args, + bool range = false); + + /** + * Function that should be called whenever an annotation ends. * - * @param data is a string variant containing the data that has been found. + * @param name is the name of the annotation class that was ended. + * @param annotationName is the name of the annotation that was ended. */ - void data(const Variant &data); + void annotationEnd(const Variant &className, const Variant &elementName); + + /** + * Function the should be called whenever a ranged command or annotation + * ends. Must be called if the range parameter range was set to true when + * annotationStart() or commandStart() were called. + */ + void rangeEnd(); /** * Function that should be called whenever a new field starts. Fields of the @@ -317,29 +149,25 @@ public: void fieldEnd(); /** - * Function that should be called whenever an annotation starts. - * - * @param name is the name of the annotation class. - * @param args is a map variant containing the arguments that were passed - * to the annotation. 
- */ - void annotationStart(const Variant &className, const Variant &args); - - /** - * Function that should be called whenever an annotation ends. + * Function that should be called whenever character data is found in the + * input stream. May only be called if there currently is a command on the + * stack. * - * @param name is the name of the annotation class that was ended. - * @param annotationName is the name of the annotation that was ended. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. */ - void annotationEnd(const Variant &className, const Variant &elementName); + void data(const TokenizedData &data); /** - * Function that should be called whenever a previously registered token - * is found in the input stream. + * Function that may be called whenever character data is found in the + * input stream. May only be called if the currently is a command on the + * stack. This method is mainly intended for unit testing. Pass a + * TokenizedData instance to the * - * @param token is string variant containing the token that was encountered. + * @param str is a string containing the data that should be passed to the + * tokenizer. 
*/ - void token(Variant token); + void data(const std::string &str); }; } } diff --git a/src/core/parser/stack/State.cpp b/src/core/parser/stack/State.cpp index d72f533..0feeed6 100644 --- a/src/core/parser/stack/State.cpp +++ b/src/core/parser/stack/State.cpp @@ -23,17 +23,19 @@ namespace parser_stack { /* Class State */ -State::State() : elementHandler(nullptr) {} +State::State() : elementHandler(nullptr), supportsAnnotations(false), supportsTokens(false) {} State::State(StateSet parents, Arguments arguments, RttiSet createdNodeTypes, HandlerConstructor elementHandler, - bool supportsAnnotations) + bool supportsAnnotations, + bool supportsTokens) : parents(parents), arguments(arguments), createdNodeTypes(createdNodeTypes), elementHandler(elementHandler), - supportsAnnotations(supportsAnnotations) + supportsAnnotations(supportsAnnotations), + supportsTokens(supportsTokens) { } @@ -93,6 +95,13 @@ StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations) return *this; } +StateBuilder &StateBuilder::supportsTokens(bool supportsTokens) +{ + state.supportsTokens = supportsTokens; + return *this; +} + + const State &StateBuilder::build() const { return state; } /* Class StateDeductor */ diff --git a/src/core/parser/stack/State.hpp b/src/core/parser/stack/State.hpp index 4766235..011ccd6 100644 --- a/src/core/parser/stack/State.hpp +++ b/src/core/parser/stack/State.hpp @@ -82,13 +82,21 @@ struct State { /** * Set to true if this handler does support annotations. This is almost - * always false (e.g. all description handlers), except for document + * always false (e.g. all description handlers), except for document * element handlers. */ - bool supportsAnnotations; + bool supportsAnnotations : 1; /** - * Default constructor, initializes the handlers with nullptr. + * Set to true if this handler does support tokens. This is almost + * always false (e.g. all description handlers), except for document + * element handlers. 
+ */ + bool supportsTokens : 1; + + /** + * Default constructor, initializes the handlers with nullptr and the + * supportsAnnotations and supportsTokens flags with false. */ State(); @@ -108,11 +116,12 @@ struct State { * be nullptr in which case no handler instance is created. * @param supportsAnnotations specifies whether annotations are supported * here at all. + * @param supportsTokens specified whether tokens are supported here at all. */ State(StateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr, - bool supportsAnnotations = false); + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false, bool supportsTokens = false); /** * Creates this State from the given StateBuilder instance. @@ -220,6 +229,16 @@ public: StateBuilder &supportsAnnotations(bool supportsAnnotations); /** + * Sets the state of the "supportsTokens" flag (default value is false). + * + * @param supportsTokens should be set to true, if the elementHandler + * registered for this state is capable of handling tokens. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &supportsTokens(bool supportsTokens); + + /** * Returns a reference at the internal State instance that was built * using the StateBuilder. * @@ -275,7 +294,7 @@ public: * @param states is a list of states that should be checked. */ StateDeductor(std::vector<const Rtti *> signature, - std::vector<const State *> states); + std::vector<const State *> states); /** * Selects all active states from the given states. 
Only considers those diff --git a/src/core/parser/stack/TokenRegistry.cpp b/src/core/parser/stack/TokenRegistry.cpp new file mode 100644 index 0000000..c135b98 --- /dev/null +++ b/src/core/parser/stack/TokenRegistry.cpp @@ -0,0 +1,80 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Callbacks.hpp" +#include "TokenRegistry.hpp" + +namespace ousia { +namespace parser_stack { + +TokenRegistry::~TokenRegistry() +{ + for (const auto &tid: tokenIds) { + parser.unregisterToken(tid.first); + } +} + +TokenId TokenRegistry::registerToken(const std::string &token) +{ + // Check whether the given token is already registered + auto it = tokens.find(token); + if (it != tokens.end()) { + // Increment the reference count + size_t &refCount = it->second.second; + refCount++; + + // Return the token id + return it->second.first; + } + + // Register the token in the parser + TokenId id = parser.registerToken(token); + tokens[token] = std::pair<TokenId, size_t>(id, 1); + tokenIds[id] = token; + return id; +} + +void TokenRegistry::unregisterToken(TokenId id) +{ + // Lookup the token corresponding to the given token id + auto tokenIt = tokenIds.find(id); + if (tokenIt != tokenIds.end()) { + const std::string &token = tokenIt->second; + // Lookup the reference count for the corresponding token + auto idIt = 
tokens.find(token); + if (idIt != tokens.end()) { + // Decrement the reference count, abort if the refCount is larger + // than zero + size_t &refCount = idIt->second.second; + refCount--; + if (refCount > 0) { + return; + } + + // Unregister the token from the parser + parser.unregisterToken(id); + + // Unregister the token from the internal tokens map + tokens.erase(token); + } + // Unregister the token from the internal id map + tokenIds.erase(id); + } +} +} +} diff --git a/src/core/parser/stack/TokenRegistry.hpp b/src/core/parser/stack/TokenRegistry.hpp new file mode 100644 index 0000000..545db39 --- /dev/null +++ b/src/core/parser/stack/TokenRegistry.hpp @@ -0,0 +1,114 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenRegistry.hpp + * + * Contains the TokenRegistry class used for registering all user defined tokens + * during the parsing process. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ +#define _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ + +#include <string> +#include <unordered_map> + +#include <core/common/Token.hpp> + +namespace ousia { +namespace parser_stack { + +// Forward declarations +class ParserCallbacks; + +/** + * The TokenRegistry class is used for registering all user defined tokens + * during the Parsing process. The TokenRegistry class acts as an adapter + * between the parser which allocates a TokenId for each unique token and the + * Handler classes which may register the same token multiple times and expect + * the same TokenId to be returned for the same token. + */ +class TokenRegistry { +private: + /** + * Reference at the ParserCallback instance the tokens are relayed to. + */ + ParserCallbacks &parser; + + /** + * Store containing all TokenId instances for all registered tokens. The map + * maps from the token strings to the corresponding TokenId and a reference + * count. + */ + std::unordered_map<std::string, std::pair<TokenId, size_t>> tokens; + + /** + * Reverse map containing the string corresponding to a TokenId. + */ + std::unordered_map<TokenId, std::string> tokenIds; + +public: + /** + * Constructor of the TokenRegistry class. + * + * @param parser is the underlying parser implementing the ParserCallbacks + * interface to which all calls are relayed. + */ + TokenRegistry(ParserCallbacks &parser) : parser(parser) {} + + /** + * Destructor of the TokenRegistry class, removes all registered tokens from + * the parser. + */ + ~TokenRegistry(); + + /* No copy construction */ + TokenRegistry(const TokenRegistry &) = delete; + + /* No assignment */ + TokenRegistry &operator=(const TokenRegistry &) = delete; + + /** + * Registers the given string token in the underlying parser and returns the + * TokenId of that token. If the same token string is given multiple times, + * the same TokenId is returned. 
The token is only registered once in the + * parser. + * + * @param token is the token that should be registered. + * @return the TokenId associated with this token. + */ + TokenId registerToken(const std::string &token); + + /** + * Unregisters the token with the given TokenId from the parser. Note that + * the token will only be unregistered if unregisterToken() has been called + * as many times as registerToken() for the same token. + * + * @param id is the id of the token returned by registerToken() that should + * be unregistered. + */ + void unregisterToken(TokenId id); +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ */ + diff --git a/src/core/parser/stack/TokenStack.cpp b/src/core/parser/stack/TokenStack.cpp new file mode 100644 index 0000000..ac1d94e --- /dev/null +++ b/src/core/parser/stack/TokenStack.cpp @@ -0,0 +1,45 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "TokenStack.hpp" + +namespace ousia { +namespace parser_stack { + +void TokenStack::pushTokens(const std::vector<SyntaxDescriptor> &tokens) +{ + stack.push_back(tokens); +} + +void TokenStack::popTokens() { stack.pop_back(); } + +TokenSet TokenStack::tokens() const +{ + if (stack.empty() && parentStack != nullptr) { + return parentStack->tokens(); + } + + TokenSet res; + for (const SyntaxDescriptor &descr : stack.back()) { + descr.insertIntoTokenSet(res); + } + return res; +} +} +} + diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp new file mode 100644 index 0000000..f2e7edc --- /dev/null +++ b/src/core/parser/stack/TokenStack.hpp @@ -0,0 +1,112 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenStack.hpp + * + * Contains the TokenStack class used for collecting the currently enabled user + * defined tokens on a per-field basis. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ +#define _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ + +#include <memory> +#include <vector> + +#include <core/common/Token.hpp> +#include <core/model/Syntax.hpp> + +namespace ousia { +namespace parser_stack { + +/** + * The TokenStack class is used by the Stack class to collect all currently + * enabled user defined tokens. + */ +class TokenStack { +private: + /** + * Shared pointer at the parent TokenStack instance. May be nullptr, in + * which case no parent TokenStack instance exists. + */ + const TokenStack *parentStack; + + /** + * Stack containing vectors of TokenSyntaxDescriptor instances as given by + * the user. + */ + std::vector<std::vector<SyntaxDescriptor>> stack; + + /** + * Constructor of the TokenStack class. + * + * @param parentStack is a pointer at the underlying parentStack instance + * to which calls should be forwarded if no data has been pushed onto this + * stack instance. + */ + TokenStack(const TokenStack *parentStack) : parentStack(parentStack) {} + +public: + /** + * Default constructor of the TokenStack class with no reference at a parent + * stack. + */ + TokenStack() : TokenStack(nullptr) {} + + /** + * Constructor of the TokenStack class with a reference at a parent + * TokenStack instance. + * + * @param parentStack is a reference at a parent TokenStack instance. If no + * data has yet been pushed onto this instance, calls will be forwarded to + * the parent stack. + */ + TokenStack(const TokenStack &parentStack) : TokenStack(&parentStack) {} + + /** + * Pushes a list of SyntaxDescriptor instances onto the internal stack. + * + * @param tokens is a list of SyntaxDescriptor instances that should be + * stored on the stack. + */ + void pushTokens(const std::vector<SyntaxDescriptor> &tokens); + + /** + * Removes the previously pushed list of tokens from the stack. 
+ */ + void popTokens(); + + /** + * Returns a set containing all currently enabled tokens. The set of enabled + * tokens are those tokens that were pushed last onto the stack. This set + * has to be passed to the TokenizedData instance in order to gather all + * tokens that are currently possible. + * + * @return a set of tokens containing all the Tokens that are currently + * possible. + */ + TokenSet tokens() const; +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ */ + diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index b62f684..73bcf62 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -32,7 +32,7 @@ namespace parser_stack { /* TypesystemHandler */ -bool TypesystemHandler::start(Variant::mapType &args) +bool TypesystemHandler::startCommand(Variant::mapType &args) { // Create the typesystem instance Rooted<Typesystem> typesystem = @@ -63,7 +63,7 @@ void TypesystemHandler::end() { scope().pop(logger()); } /* TypesystemEnumHandler */ -bool TypesystemEnumHandler::start(Variant::mapType &args) +bool TypesystemEnumHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -91,17 +91,17 @@ void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, /* TypesystemStructHandler */ -bool TypesystemStructHandler::start(Variant::mapType &args) +bool TypesystemStructHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); // Fetch the arguments used for creating this type - const std::string &name = args["name"].asString(); + const std::string &structNmae = args["name"].asString(); const std::string &parent = args["parent"].asString(); // Fetch the current typesystem and create the struct node Rooted<Typesystem> typesystem = scope().selectOrThrow<Typesystem>(); - Rooted<StructType> structType = typesystem->createStructType(name); + Rooted<StructType> structType = 
typesystem->createStructType(structNmae); structType->setLocation(location()); // Try to resolve the parent type and set it as parent structure @@ -124,18 +124,18 @@ void TypesystemStructHandler::end() { scope().pop(logger()); } /* TypesystemStructFieldHandler */ -bool TypesystemStructFieldHandler::start(Variant::mapType &args) +bool TypesystemStructFieldHandler::startCommand(Variant::mapType &args) { // Read the argument values - const std::string &name = args["name"].asString(); + const std::string &fieldName = args["name"].asString(); const std::string &type = args["type"].asString(); const Variant &defaultValue = args["default"]; const bool optional = !(defaultValue.isObject() && defaultValue.asObject() == nullptr); Rooted<StructType> structType = scope().selectOrThrow<StructType>(); - Rooted<Attribute> attribute = - structType->createAttribute(name, defaultValue, optional, logger()); + Rooted<Attribute> attribute = structType->createAttribute( + fieldName, defaultValue, optional, logger()); attribute->setLocation(location()); // Try to resolve the type and default value @@ -163,17 +163,17 @@ bool TypesystemStructFieldHandler::start(Variant::mapType &args) /* TypesystemConstantHandler */ -bool TypesystemConstantHandler::start(Variant::mapType &args) +bool TypesystemConstantHandler::startCommand(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); // Read the argument values - const std::string &name = args["name"].asString(); + const std::string &constantName = args["name"].asString(); const std::string &type = args["type"].asString(); const Variant &value = args["value"]; Rooted<Typesystem> typesystem = scope().selectOrThrow<Typesystem>(); - Rooted<Constant> constant = typesystem->createConstant(name, value); + Rooted<Constant> constant = typesystem->createConstant(constantName, value); constant->setLocation(location()); // Try to resolve the type diff --git a/src/core/parser/stack/TypesystemHandler.hpp 
b/src/core/parser/stack/TypesystemHandler.hpp index 85494f1..0773a3a 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -43,7 +43,7 @@ class TypesystemHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -67,7 +67,7 @@ class TypesystemEnumHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -114,7 +114,7 @@ class TypesystemStructHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; void end() override; /** @@ -139,7 +139,7 @@ class TypesystemStructFieldHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; /** * Creates a new instance of the TypesystemStructFieldHandler. @@ -162,7 +162,7 @@ class TypesystemConstantHandler : public StaticHandler { public: using StaticHandler::StaticHandler; - bool start(Variant::mapType &args) override; + bool startCommand(Variant::mapType &args) override; /** * Creates a new instance of the TypesystemConstantHandler. diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..f322a88 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -33,6 +33,7 @@ #include <limits> #include <vector> #include <utility> +#include <unordered_map> #include <core/common/Location.hpp> @@ -43,6 +44,9 @@ namespace ousia { * a delta compression. 
*/ class SourceOffsetVector { +public: + using OffsPair = std::pair<SourceOffset, SourceOffset>; + private: /** * Type used for representing the length of a character. @@ -82,9 +86,12 @@ private: std::vector<SourceOffset> offsets; /** + * Map used to store discontinuities in the character offsets. + */ + std::unordered_map<size_t, OffsPair> gaps; + + /** * Last position given as "end" position in the storeOffset() method. - * Used to adapt the length of the previous element in case start and end - * positions do not match. */ SourceOffset lastEnd; @@ -105,19 +112,22 @@ public: // Make sure (end - start) is smaller than MAX_LEN assert(end - start < MAX_LEN); - // Adapt the length of the previous character in case there is a gap - if (!lens.empty() && start > lastEnd) { - lens.back() += start - lastEnd; - } - lastEnd = end; - // Store an absolute offset every OFFSET_INTERVAL elements if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) { offsets.push_back(start); } - // Store the length - lens.push_back(end - start); + // Adapt the length of the previous character in case there is a gap + if (!lens.empty() && start > lastEnd) { + // There is a discontinuity, store the given offsets in the "gaps" + // map + gaps[lens.size()] = OffsPair(start, end); + lens.push_back(MAX_LEN); + } else { + // Store the length + lens.push_back(end - start); + } + lastEnd = end; } /** @@ -127,14 +137,13 @@ public: * read. * @return a pair containing start and end source offset. 
*/ - std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) + OffsPair loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); if (idx > 0 && idx == count) { auto offs = loadOffset(count - 1); - return std::pair<SourceOffset, SourceOffset>(offs.second, - offs.second); + return OffsPair(offs.second, offs.second); } // Calculate the start index in the lens vector and in the offsets @@ -146,18 +155,66 @@ public: assert(idx < count); assert(offsetIdx < offsets.size()); + // If the length of the last character is MAX_LEN, the position is + // stored in the "gaps" list + if (lens[idx] == MAX_LEN) { + auto it = gaps.find(idx); + assert(it != gaps.end()); + return it->second; + } + // Sum over the length starting with the start offset SourceOffset start = offsets[offsetIdx]; for (size_t i = sumStartIdx; i < idx; i++) { - start += lens[i]; + if (lens[i] == MAX_LEN) { + auto it = gaps.find(i); + assert(it != gaps.end()); + start = it->second.first; + } else { + start += lens[i]; + } } - return std::pair<SourceOffset, SourceOffset>(start, start + lens[idx]); + return OffsPair(start, start + lens[idx]); } /** * Returns the number of characters for which offsets are stored. */ - size_t size() { return lens.size(); } + size_t size() const { return lens.size(); } + + /** + * Trims the length of the TokenizedData instance to the given length. + * Removes all token matches that lie within the trimmed region. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) + { + if (length < size()) { + lens.resize(length); + if (length > 0) { + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + lastEnd = loadOffset(length - 1).second; + } else { + offsets.clear(); + gaps.clear(); + lastEnd = 0; + } + } + } + + /** + * Resets the SourceOffsetVector to the state it had when it was + * constructed. 
+ */ + void clear() + { + lens.clear(); + offsets.clear(); + gaps.clear(); + lastEnd = 0; + } }; } diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenId type) noexcept + TokenId id) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != Tokens::Empty) { + if (node->id != Tokens::Empty) { return false; } // Otherwise just set the type to the given type. - node->type = type; + node->id = id; return true; } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != Tokens::Empty || node->children.size() > 1) && + if ((node->id != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept } // If the node type is already Tokens::Empty, we cannot do anything here - if (node->type == Tokens::Empty) { + if (node->id == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. 
Set the // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = Tokens::Empty; + node->id = Tokens::Empty; return true; } @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept } node = it->second.get(); } - return node->type; + return node->id; } } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@ #include <limits> #include <unordered_map> -#include "Token.hpp" +#include <core/common/Token.hpp> namespace ousia { @@ -75,10 +75,9 @@ public: ChildMap children; /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. + * Id of the token represented by this node. */ - TokenId type; + TokenId id; /** * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param type is the descriptor that should be set for this token. + * @param id is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, TokenId type) noexcept; + bool registerToken(const std::string &token, TokenId id) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..d8a8b37 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -26,6 +26,11 @@ #include "TokenizedData.hpp" namespace ousia { +/** + * Maximum token length. 
+ */ +constexpr TokenLength MaxTokenLength = std::numeric_limits<TokenLength>::max(); + namespace { /** * Structure used to represent the position of a token in the internal @@ -48,6 +53,11 @@ struct TokenMark { TokenLength len; /** + * Specifies whether the token is special or not. + */ + bool special; + + /** * Constructor of the TokenMark structure, initializes all members with the * given values. * @@ -55,9 +65,10 @@ struct TokenMark { * @param bufStart is the start position of the TokenMark in the internal * character buffer. * @param len is the length of the token. + * @param special modifies the sort order, special tokens are prefered. */ - TokenMark(TokenId id, size_t bufStart, TokenLength len) - : bufStart(bufStart), id(id), len(len) + TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) + : bufStart(bufStart), id(id), len(len), special(special) { } @@ -72,7 +83,8 @@ struct TokenMark { TokenMark(size_t bufStart) : bufStart(bufStart), id(Tokens::Empty), - len(std::numeric_limits<TokenLength>::max()) + len(MaxTokenLength), + special(true) { } @@ -86,8 +98,22 @@ struct TokenMark { */ friend bool operator<(const TokenMark &m1, const TokenMark &m2) { - return (m1.bufStart < m2.bufStart) || - (m1.bufStart == m2.bufStart && m1.len > m2.len); + // Prefer the mark with the smaller bufStart + if (m1.bufStart < m2.bufStart) { + return true; + } + + // Special handling for marks with the same bufStart + if (m1.bufStart == m2.bufStart) { + // If exactly one of the two marks is special, return true if this + // one is special + if (m1.special != m2.special) { + return m1.special; + } + // Otherwise prefer longer marks + return m1.len > m2.len; + } + return false; } }; } @@ -110,9 +136,9 @@ private: std::vector<char> buf; /** - * Vector containing all token marks. + * Buffset storing the "protected" flag of the character data. 
*/ - std::vector<TokenMark> marks; + std::vector<bool> protectedChars; /** * Vector storing all the character offsets efficiently. @@ -120,9 +146,34 @@ private: SourceOffsetVector offsets; /** + * Vector containing all token marks. + */ + mutable std::vector<TokenMark> marks; + + /** + * Position of the first linebreak in a sequence of linebreaks. + */ + size_t firstLinebreak; + + /** + * Current indentation level. + */ + uint16_t currentIndentation; + + /** + * Last indentation level. + */ + uint16_t lastIndentation; + + /** + * Number of linebreaks without any content between them. + */ + uint16_t numLinebreaks; + + /** * Flag indicating whether the internal "marks" vector is sorted. */ - bool sorted; + mutable bool sorted; public: /** @@ -132,7 +183,7 @@ public: * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ - TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); } /** * Appends a complete string to the internal character buffer and extends @@ -140,22 +191,22 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. 
*/ - size_t append(const std::string &data, SourceOffset offsStart) - { // Append the data to the internal buffer - buf.insert(buf.end(), data.begin(), data.end()); - - // Extend the text regions, interpolate the source position (this may - // yield incorrect results) - const size_t size = buf.size(); - for (SourceOffset offs = offsStart; offs < offsStart + data.size(); - offs++) { - offsets.storeOffset(offs, offs + 1); + size_t append(const std::string &data, SourceOffset offsStart, bool protect) + { + for (size_t i = 0; i < data.size(); i++) { + if (offsStart != InvalidSourceOffset) { + append(data[i], offsStart + i, offsStart + i + 1, protect); + } else { + append(data[i], InvalidSourceOffset, InvalidSourceOffset, + protect); + } } - - return size; + return size(); } /** @@ -165,16 +216,86 @@ public: * @param c is the character that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. * @param offsEnd is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. 
*/ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect) { // Add the character to the list and store the location of the character // in the source file buf.push_back(c); + protectedChars.push_back(protect); offsets.storeOffset(offsStart, offsEnd); - return buf.size(); + + // Insert special tokens + const size_t size = buf.size(); + const bool isWhitespace = Utils::isWhitespace(c); + const bool isLinebreak = Utils::isLinebreak(c); + + // Handle linebreaks + if (isLinebreak) { + // Mark linebreaks as linebreak + mark(Tokens::Newline, size - 1, 1, false); + + // The linebreak sequence started at the previous character + if (numLinebreaks == 0) { + firstLinebreak = size - 1; + } + + // Reset the indentation + currentIndentation = 0; + + // Increment the number of linebreaks + numLinebreaks++; + + const size_t markStart = firstLinebreak; + const size_t markLength = size - firstLinebreak; + + // Issue two consecutive linebreaks as paragraph token + if (numLinebreaks == 2) { + mark(Tokens::Paragraph, markStart, markLength, false); + } + + // Issue three consecutive linebreaks as paragraph token + if (numLinebreaks >= 3) { + mark(Tokens::Section, markStart, markLength, false); + } + } else if (isWhitespace) { + // Count the whitespace characters at the beginning of the line + if (numLinebreaks > 0) { + // Implement the UNIX/Pyhton rule for tabs: Tabs extend to the + // next multiple of eight. 
+ if (c == '\t') { + currentIndentation = (currentIndentation + 8) & ~7; + } else { + currentIndentation++; + } + } + } + + // Issue indent and unindent tokens + if (!isWhitespace && numLinebreaks > 0) { + // Issue a larger indentation than that in the previous line as + // "Indent" token + if (currentIndentation > lastIndentation) { + mark(Tokens::Indent, size - 1, 0, true); + } + + // Issue a smaller indentation than that in the previous line as + // "Dedent" token + if (currentIndentation < lastIndentation) { + mark(Tokens::Dedent, size - 1, 0, true); + } + + // Reset the internal state machine + lastIndentation = currentIndentation; + numLinebreaks = 0; + } + + return size; } /** @@ -184,11 +305,12 @@ public: * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. + * @param special tags the mark as "special", prefering it in the sort order */ - void mark(TokenId id, size_t bufStart, TokenLength len) + void mark(TokenId id, size_t bufStart, TokenLength len, bool special) { // Push the new instance back onto the list - marks.emplace_back(id, bufStart, len); + marks.emplace_back(id, bufStart, len, special); // Update the sorted flag as soon as more than one element is in the // list @@ -212,9 +334,13 @@ public: * @return true if a token was returned, false if no more tokens are * available. */ - bool next(Token &token, WhitespaceMode mode, - const std::unordered_set<TokenId> &tokens, size_t &cursor) + bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, + TokenizedDataCursor &cursor) const { + // Some variables for convenient access + size_t &bufPos = cursor.bufPos; + size_t &markPos = cursor.markPos; + // Sort the "marks" vector if it has not been sorted yet. 
if (!sorted) { std::sort(marks.begin(), marks.end()); @@ -222,10 +348,11 @@ public: } // Fetch the next larger TokenMark instance, make sure the token is in - // the "enabled" list - auto it = - std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); - while (it != marks.end() && tokens.count(it->id) == 0) { + // the "enabled" list and within the buffer range + auto it = std::lower_bound(marks.begin() + markPos, marks.end(), + TokenMark(bufPos)); + while (it != marks.end() && (tokens.count(it->id) == 0 || + it->bufStart + it->len > buf.size())) { it++; } @@ -236,15 +363,15 @@ public: // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data. - if (cursor < end && cursor < buf.size()) { + if (bufPos < end && bufPos < buf.size()) { switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( - Tokens::Data, std::string(&buf[cursor], end - cursor), + Tokens::Data, std::string(&buf[bufPos], end - bufPos), SourceLocation(sourceId, - offsets.loadOffset(cursor).first, + offsets.loadOffset(bufPos).first, offsets.loadOffset(end).first)); - cursor = end; + bufPos = end; return true; } case WhitespaceMode::TRIM: @@ -254,30 +381,35 @@ public: size_t stringStart; size_t stringEnd; std::string content; + const char *cBuf = &buf[bufPos]; + auto filter = [cBuf, this](size_t i) -> bool { + return Utils::isWhitespace(cBuf[i]) && + !protectedChars[i]; + }; if (mode == WhitespaceMode::TRIM) { - content = Utils::trim(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::trim(cBuf, end - bufPos, stringStart, + stringEnd, filter); } else { - content = Utils::collapse(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::collapse( + cBuf, end - bufPos, stringStart, stringEnd, filter); } // If the resulting string is empty (only whitespaces), // abort if (content.empty()) { - cursor = end; + bufPos = end; break; } // Calculate the 
absolute positions and return the token - stringStart += cursor; - stringEnd += cursor; + stringStart += bufPos; + stringEnd += bufPos; token = Token( Tokens::Data, content, SourceLocation(sourceId, offsets.loadOffset(stringStart).first, offsets.loadOffset(stringEnd).first)); - cursor = end; + bufPos = end; return true; } } @@ -286,14 +418,18 @@ public: // If start equals end, we're currently directly at a token // instance. Return this token and advance the cursor to the end of // the token. - if (cursor == end && it != marks.end()) { + if (bufPos == end && it != marks.end()) { const size_t tokenStart = it->bufStart; const size_t tokenEnd = it->bufStart + it->len; token = Token( it->id, std::string(&buf[tokenStart], it->len), SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, offsets.loadOffset(tokenEnd).first)); - cursor = tokenEnd; + + // Update the cursor, consume the token by incrementing the marks + // pos counter + bufPos = tokenEnd; + markPos = it - marks.begin() + 1; return true; } @@ -304,11 +440,64 @@ public: } /** + * Resets the TokenizedDataImpl instance to the state it had when it was + * constructred. + */ + void clear() + { + buf.clear(); + protectedChars.clear(); + offsets.clear(); + marks.clear(); + firstLinebreak = 0; + currentIndentation = 0; + lastIndentation = 0; + numLinebreaks = 1; // Assume the stream starts with a linebreak + sorted = true; + } + + /** + * Trims the length of the TokenizedDataImpl instance to the given length. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) + { + if (length < size()) { + buf.resize(length); + protectedChars.resize(length); + offsets.trim(length); + } + } + + /** * Returns the current size of the internal buffer. * * @return the size of the internal character buffer. 
*/ - size_t getSize() { return buf.size(); } + size_t size() const { return buf.size(); } + + /** + * Returns true if no data is in the data buffer. + * + * @return true if the "buf" instance has no data. + */ + bool empty() const { return buf.empty(); } + + /** + * Returns the current location of all data in the buffer. + * + * @return the location of the entire data represented by this instance. + */ + SourceLocation getLocation() const + { + if (empty()) { + return SourceLocation{sourceId}; + } + return SourceLocation{sourceId, offsets.loadOffset(0).first, + offsets.loadOffset(size()).second}; + } }; /* Class TokenizedData */ @@ -316,50 +505,90 @@ public: TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} TokenizedData::TokenizedData(SourceId sourceId) - : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) + : impl(std::make_shared<TokenizedDataImpl>(sourceId)) { } +TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart, + SourceId sourceId) + : TokenizedData(sourceId) +{ + append(data, offsStart); +} + TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, + bool protect) { - return impl->append(data, offsStart); + return impl->append(data, offsStart, protect); } size_t TokenizedData::append(char c, SourceOffset offsStart, - SourceOffset offsEnd) + SourceOffset offsEnd, bool protect) { - return impl->append(c, offsStart, offsEnd); + return impl->append(c, offsStart, offsEnd, protect); } void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->getSize() - len, len); + impl->mark(id, impl->size() - len, len, false); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) { - impl->mark(id, bufStart, len); + impl->mark(id, bufStart, len, false); } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { 
impl->clear(); } + +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const { - return impl->next(token, mode, tokens, cursor); + return impl->getLocation(); } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +TokenizedDataReader TokenizedData::reader() const { - // Copy the current cursor position to not update the actual cursor position - // if the operation was not successful - size_t cursorCopy = cursor; - if (!impl->next(token, mode, tokens, cursorCopy) || - token.id != Tokens::Data) { - return false; - } + return TokenizedDataReader(impl, TokenizedDataCursor(), + TokenizedDataCursor()); +} + +/* Class TokenizedDataReader */ - // There is indeed a text token, update the internal cursor position - cursor = cursorCopy; - return true; +TokenizedDataReader::TokenizedDataReader( + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const +{ + return readCursor.bufPos >= impl->size(); +} + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + peekCursor = readCursor; + return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, peekCursor); } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..bc937f2 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -37,40 +37,48 @@ #include 
<core/common/Location.hpp> #include <core/common/Whitespace.hpp> - -#include "Token.hpp" +#include <core/common/Token.hpp> namespace ousia { // Forward declaration class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork; /** - * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. + * Internally used structure representing a cursor within the TokenizedData + * stream. */ -class TokenizedData { -private: +struct TokenizedDataCursor { /** - * Shared pointer pointing at the internal data. This data is shared when - * copying TokenizedData instances, which corresponds to forking a - * TokenizedData instance. + * Position within the byte buffer. */ - std::shared_ptr<TokenizedDataImpl> impl; + size_t bufPos; /** - * Contains all currently enabled token ids. + * Position within the token mark buffer. */ - std::unordered_set<TokenId> tokens; + size_t markPos; /** - * Position from which the last element was read from the internal buffer. - * This information is not shared with the other instances of TokenizedData - * pointing at the same location. + * Default constructor. The resulting cursor points at the beginning of the + * stream. + */ + TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * The data stored in TokenizedData + */ +class TokenizedData { +private: + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. */ - size_t cursor; + std::shared_ptr<TokenizedDataImpl> impl; public: /** @@ -88,6 +96,18 @@ public: TokenizedData(SourceId sourceId); /** + * Creates a new instance of TokenizedData, takes a SourceId and an initial + * string buffer. 
+ * + * @param data is the string that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @param sourceId is the source identifier that should be used for + * constructing the location when returning tokens. + */ + TokenizedData(const std::string &data, SourceOffset offsStart = 0, + SourceId sourceId = InvalidSourceId); + + /** * Destructor. Needs to be defined explicitly for freeing a shared pointer * of the incomplete TokenizedDataImpl type. */ @@ -101,10 +121,13 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart = 0); + size_t append(const std::string &data, SourceOffset offsStart = 0, + bool protect = false); /** * Appends a single character to the internal character buffer. @@ -112,10 +135,13 @@ public: * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect = false); /** * Stores a token ending at the last character of the current buffer. 
@@ -136,54 +162,194 @@ public: void mark(TokenId id, size_t bufStart, TokenLength len); /** - * Enables a single token id. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Resets the TokenizedData instance to the state it had when it was + * constructred. + */ + void clear(); + + /** + * Trims the length of the TokenizedData instance to the given length. Note + * that this function does not remove any token matches for performance + * reasons, it merely renders them incaccessible. Appending new data after + * calling trim will make the token marks accessible again. Thus this method + * should be the last function called to modify the data buffer and the + * token marks. * - * @param id is the TokenId of the token that should be enabled. + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length); + + /** + * Returns the number of characters currently represented by this + * TokenizedData instance. */ - void enableToken(TokenId id) { tokens.insert(id); } + size_t size() const; /** - * Enables a set of token ids. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Returns true if the TokenizedData instance is empty, false otherwise. * - * @param ids is the TokenId of the token that should be enabled. + * @return true if not data is stored inside the TokenizedData instance. */ - void enableToken(const std::unordered_set<TokenId> &ids) - { - tokens.insert(ids.begin(), ids.end()); - } + bool empty() const; + + /** + * Returns the location of the entire TokenizedData instance. + * + * @return the location of the entire data represented by this instance. 
+ */ + SourceLocation getLocation() const; + + /** + * Returns a TokenizedDataReader instance that can be used to access the + * data. + * + * @return a new TokenizedDataReader instance pointing at the beginning of + * the internal buffer. + */ + TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader + */ +class TokenizedDataReader { +private: + friend TokenizedData; + + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. + */ + std::shared_ptr<const TokenizedDataImpl> impl; + + /** + * Position from which the last element was read from the internal buffer. + */ + TokenizedDataCursor readCursor; + + /** + * Position from which the last element was peeked from the internal buffer. + */ + TokenizedDataCursor peekCursor; + +protected: + /** + * Protected constructor of TokenizedDataReader, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader. + * + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor); + +public: + /** + * Returns a new TokenizedDataReaderFork from which tokens and text can be + * read without advancing this reader instance. + */ + TokenizedDataReaderFork fork(); + + /** + * Returns true if this TokenizedData instance is at the end. + * + * @return true if the end of the TokenizedData instance has been reached. + */ + bool atEnd() const; /** * Stores the next token in the given token reference, returns true if the - * operation was successful, false if there are no more tokens. + * operation was successful, false if there are no more tokens. 
Advances the + * internal cursor and re * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ - bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool read(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM); /** - * Stores the next text token in the given token reference, returns true if - * the operation was successful (there was indeed a text token), false if - * the next token is not a text token or there were no more tokens. + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ - bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool peek(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::TRIM); + + /** + * Consumes the peeked tokens, the read cursor will now be at the position + * of the peek cursor. + */ + void consumePeek() { readCursor = peekCursor; } + + /** + * Resets the peek cursor to the position of the read cursor. 
+ */ + void resetPeek() { peekCursor = readCursor; } +}; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader + */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: + friend TokenizedDataReader; + + /** + * Reference pointing at the parent TokenizedDataReader to which changes may + * be commited. + */ + TokenizedDataReader &parent; + + /** + * Private constructor of TokenizedDataReaderFork, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader and a reference at the parent TokenizedDataReader. + * + * @param parent is the TokenizedDataReader instance to which the current + * read/peek progress may be commited. + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReaderFork(TokenizedDataReader &parent, + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) + { + } + +public: + /** + * Commits the read/peek progress to the underlying parent. 
+ */ + void commit() { parent = *this; } }; } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..8d540a6 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@ #include <core/common/CharReader.hpp> #include <core/common/Exceptions.hpp> #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include "TokenizedData.hpp" #include "Tokenizer.hpp" namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch { Token token; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; + size_t dataStartOffset; /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. + * Set to true if the matched token is a primary token. */ - size_t textEnd; + bool primary; /** * Constructor of the TokenMatch class. */ - TokenMatch() : textLength(0), textEnd(0) {} + TokenMatch() : dataStartOffset(0), primary(false) {} /** * Returns true if this TokenMatch instance actually represents a match. + * + * @return true if the TokenMatch actually has a match. + */ + bool hasMatch() const { return token.id != Tokens::Empty; } + + /** + * Returns the length of the matched token. + * + * @return the length of the token string. */ - bool hasMatch() { return token.id != Tokens::Empty; } + size_t size() const { return token.content.size(); } }; /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private: size_t start; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. 
*/ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; + size_t dataStartOffset; public: /** * Constructor of the TokenLookup class. * * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. + * @param start is the start position in the source file. + * @param dataStartOffset is the current length of the TokenizedData buffer. */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t dataStartOffset) + : node(node), start(start), dataStartOffset(dataStartOffset) { } /** * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). + * character. If a complete token is matched, stores the match in the given + * TokenMatch reference and returns true. * * @param c is the character that should be appended to the current prefix. * @param lookups is a list to which new TokeLookup instances are added -- @@ -123,73 +122,48 @@ public: * Tokenizer. * @param end is the end byte offset of the current character. * @param sourceId is the source if of this file. + * @return true if a token was matched, false otherwise. 
*/ - void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, - const std::vector<std::string> &tokens, SourceOffset end, - SourceId sourceId) + bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<Tokenizer::TokenDescriptor> &tokens, + SourceOffset end, SourceId sourceId) { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node + // Set to true once a token has been matched + bool res = false; + + // Check whether we can continue the current token path, if not, abort auto it = node->children.find(c); if (it == node->children.end()) { - return; + return res; } // Check whether the new node represents a complete token a whether it // is longer than the current token. If yes, replace the current token. node = it->second.get(); - if (node->type != Tokens::Empty) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - Token{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } + if (node->id != Tokens::Empty) { + const Tokenizer::TokenDescriptor &descr = tokens[node->id]; + match.token = Token(node->id, descr.string, + SourceLocation(sourceId, start, end)); + match.dataStartOffset = dataStartOffset; + match.primary = descr.primary; + res = true; } // If this state can possibly be advanced, store it in the states list. if (!node->children.empty()) { lookups.emplace_back(*this); } + return res; } }; - -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. 
- */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.id = Tokens::Data; -} } /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template <typename TextHandler, bool read> -bool Tokenizer::next(CharReader &reader, Token &token) +template <bool read> +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -199,45 +173,63 @@ bool Tokenizer::next(CharReader &reader, Token &token) // Prepare the lookups in the token trie const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; + TokenMatch bestMatch; std::vector<TokenLookup> lookups; std::vector<TokenLookup> nextLookups; - // Instantiate the text handler - TextHandler textHandler; - // Peek characters from the reader and try to advance the current token tree // cursor char c; + const size_t initialDataSize = data.size(); size_t charStart = reader.getPeekOffset(); const SourceId sourceId = reader.getSourceId(); while (reader.peek(c)) { const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; + const size_t dataStartOffset = data.size(); // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); + if (!bestMatch.hasMatch() || 
!bestMatch.primary) { + lookups.emplace_back(root, charStart, dataStartOffset); } // Try to advance all other lookups with the new character + TokenMatch match; for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + // Continue if the current lookup + if (!lookup.advance(c, nextLookups, match, tokens, charEnd, + sourceId)) { + continue; + } + + // Replace the best match with longest token + if (match.size() > bestMatch.size()) { + bestMatch = match; + } + + // If the matched token is a non-primary token -- mark the match in + // the TokenizedData list + if (!match.primary) { + data.mark(match.token.id, data.size() - match.size() + 1, + match.size()); + } } - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { + + // If a token has been found and the token is a primary token, check + // whether we have to abort, otherwise if we have a non-primary match, + // reset it once it can no longer be advanced + if (bestMatch.hasMatch() && nextLookups.empty()) { + if (bestMatch.primary) { break; + } else { + bestMatch = TokenMatch{}; } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); } + // Record all incomming characters + data.append(c, charStart, charEnd); + + // Swap the lookups and the nextLookups list lookups = std::move(nextLookups); nextLookups.clear(); @@ -246,60 +238,57 @@ bool Tokenizer::next(CharReader &reader, Token &token) charStart = charEnd; } - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildDataToken(textHandler, match, sourceId); + // If we found data, emit a corresponding data token + if (data.size() > initialDataSize && + (!bestMatch.hasMatch() || !bestMatch.primary || + bestMatch.dataStartOffset > initialDataSize)) { 
+ // If we have a "bestMatch" wich starts after text data has started, + // trim the TokenizedData to this offset + if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) { + data.trim(bestMatch.dataStartOffset); + } + + // Create a token containing the data location + bestMatch.token = Token{data.getLocation()}; + } else if (bestMatch.hasMatch() && bestMatch.primary && + bestMatch.dataStartOffset == initialDataSize) { + data.trim(initialDataSize); } // Move the read/peek cursor to the end of the token, abort if an error // happens while doing so - if (match.hasMatch()) { + if (bestMatch.hasMatch()) { // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { + if (bestMatch.token.location.getEnd() == InvalidSourceOffset) { throw OusiaException{"Token end position offset out of range"}; } // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); + const size_t end = bestMatch.token.location.getEnd(); if (read) { reader.seek(end); } else { reader.seekPeekCursor(end); } - token = match.token; + + token = bestMatch.token; } else { token = Token{}; } - return match.hasMatch(); + return bestMatch.hasMatch(); } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingWhitespaceHandler, true>(reader, token); - case WhitespaceMode::TRIM: - return next<TrimmingWhitespaceHandler, true>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingWhitespaceHandler, true>(reader, token); - } - return false; + return next<true>(reader, token, data); } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingWhitespaceHandler, false>(reader, token); - case 
WhitespaceMode::TRIM: - return next<TrimmingWhitespaceHandler, false>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingWhitespaceHandler, false>(reader, token); - } - return false; + return next<false>(reader, token, data); } -TokenId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token, bool primary) { // Abort if an empty token should be registered if (token.empty()) { @@ -309,8 +298,8 @@ TokenId Tokenizer::registerToken(const std::string &token) // Search for a new slot in the tokens list TokenId type = Tokens::Empty; for (size_t i = nextTokenId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; + if (!tokens[i].valid()) { + tokens[i] = TokenDescriptor(token, primary); type = i; break; } @@ -320,62 +309,47 @@ TokenId Tokenizer::registerToken(const std::string &token) // override the special token type handles if (type == Tokens::Empty) { type = tokens.size(); - if (type == Tokens::Data || type == Tokens::Empty) { + if (type >= Tokens::MaxTokenId) { throw OusiaException{"Token type ids depleted!"}; } - tokens.emplace_back(token); + tokens.emplace_back(token, primary); } nextTokenId = type + 1; - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list + // Try to register the token in the trie -- if this fails, remove it from + // the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; + tokens[type] = TokenDescriptor(); nextTokenId = type; return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id) { // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenId = type; + if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { + tokens[id] = TokenDescriptor(); + nextTokenId = id; return 
true; } return false; } -std::string Tokenizer::getTokenString(TokenId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const { - whitespaceMode = mode; + if (id < tokens.size()) { + return tokens[id]; + } + return EmptyTokenDescriptor; } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } - /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool Tokenizer::next<PreservingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<TrimmingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<CollapsingWhitespaceHandler, false>( - CharReader &reader, Token &token); -template bool Tokenizer::next<PreservingWhitespaceHandler, true>( - CharReader &reader, Token &token); -template bool Tokenizer::next<TrimmingWhitespaceHandler, true>( - CharReader &reader, Token &token); -template bool Tokenizer::next<CollapsingWhitespaceHandler, true>( - CharReader &reader, Token &token); +template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &); +template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &); } diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. 
* * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -28,44 +28,80 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include <set> +#include <cstdint> #include <string> #include <vector> #include <core/common/Location.hpp> -#include <core/common/Whitespace.hpp> +#include <core/common/Token.hpp> -#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; +class TokenizedData; /** * The Tokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * Tokenizer always tries to extract the longest possible token from the - * tokenizer. + * CharReader. It allows to register and unregister tokens while parsing. Note + * that the Tokenizer always tries to extract the longest possible token from + * the tokenizer. Tokens can be registered as primary or non-primary token. If + * a Token is registered as a primary token, it is returned as a single Token + * instance if it occurs. In the non-primary case the token is returned as part + * of a segmented TokenizedData instance. */ class Tokenizer { -private: +public: /** - * Internally used token trie. This object holds all registered tokens. + * Internally used structure describing a registered token. */ - TokenTrie trie; + struct TokenDescriptor { + /** + * String describing the token. + */ + std::string string; + + /** + * Set to true if this token is primary. + */ + bool primary; + + /** + * Constructor of the TokenDescriptor class. + * + * @param string is the string representation of the registered token. + * @param primary specifies whether the token is a primary token that + * should be returned as a single token, or a secondary token, that + * should be returned as part of TokenizedData. 
+ */ + TokenDescriptor(const std::string &string, bool primary) + : string(string), primary(primary) + { + } + + /** + * Default constructor. + */ + TokenDescriptor() : primary(false) {} + + /** + * Returns true if the TokenDescriptor represents a valid token. + */ + bool valid() { return !string.empty(); } + }; +private: /** - * Flag defining whether whitespaces should be preserved or not. + * Internally used token trie. This object holds all registered tokens. */ - WhitespaceMode whitespaceMode; + TokenTrie trie; /** * Vector containing all registered token types. */ - std::vector<std::string> tokens; + std::vector<TokenDescriptor> tokens; /** * Next index in the tokens list where to search for a new token id. @@ -74,90 +110,78 @@ private: /** * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. + * function is templated in order to force optimized code generation for + * both reading and peeking. * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. + * @tparam read specifies whether the method should read the token or just + * peek. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return false if the end of the stream has been reached, true otherwise. */ - template <typename TextHandler, bool read> - bool next(CharReader &reader, Token &token); + template <bool read> + bool next(CharReader &reader, Token &token, TokenizedData &data); public: /** * Constructor of the Tokenizer class. 
- * - * @param whitespaceMode specifies how whitespace should be handled. */ - Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(); /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. + * Registers the given string as a token. Returns a unique identifier + * describing the registered token. * * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if + * @param primary specifies whether the token is a primary token -- if true, + * the token will be returned as a single, standalone token. Otherwise the + * token will be returned as part of a "TokenizedData" structure. + * @return a unique identifier for the registered token or Tokens::Empty if * an error occured. */ - TokenId registerToken(const std::string &token); + TokenId registerToken(const std::string &token, bool primary = true); /** * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The - *TokenId - * must have been returned by registerToken. + * TokenId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). + * because the token with the given TokenId was already unregistered). */ - bool unregisterToken(TokenId type); + bool unregisterToken(TokenId id); /** * Returns the token that was registered under the given TokenId id or - *an - * empty string if an invalid TokenId id is given. + * an empty string if an invalid TokenId id is given. * - * @param type is the TokenId id for which the corresponding token - *string + * @param id is the TokenId for which the corresponding TokenDescriptor * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. 
- */ - std::string getTokenString(TokenId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. + * @return the registered TokenDescriptor or an invalid TokenDescriptor if + * the given TokenId is invalid. */ - WhitespaceMode getWhitespaceMode(); + const TokenDescriptor& lookupToken(TokenId id) const; /** * Reads a new token from the CharReader and stores it in the given - * Token instance. + * Token instance. If the token has the id Tokens::Data, use the "getData" + * method to fetch a reference at the underlying TokenizedData instance + * storing the data. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } |