diff options
author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2015-02-15 21:56:04 +0100 |
---|---|---|
committer | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2015-02-15 21:56:04 +0100 |
commit | d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch) | |
tree | 8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/core | |
parent | 40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff) | |
parent | 4f2872d9968aec93bebff90d1238347c8a364949 (diff) |
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/core')
42 files changed, 4044 insertions, 1969 deletions
diff --git a/src/core/CodeTokenizer.cpp b/src/core/CodeTokenizer.cpp deleted file mode 100644 index d65c514..0000000 --- a/src/core/CodeTokenizer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <cassert> - -#include "CodeTokenizer.hpp" - -namespace ousia { - -Token CodeTokenizer::constructToken(const Token &t) -{ - std::string content = buf.str(); - buf.str(std::string()); - return Token{ - returnTokenId, content, - SourceLocation{t.location.getSourceId(), startToken.location.getStart(), - t.location.getEnd()}}; -} - -void CodeTokenizer::buffer(const Token &t) { buf << t.content; } - -bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) -{ - auto it = descriptors.find(t.tokenId); - CodeTokenMode mode = CodeTokenMode::NONE; - if (it != descriptors.end()) { - mode = it->second.mode; - } - - switch (state) { - case CodeTokenizerState::NORMAL: - switch (mode) { - case CodeTokenMode::STRING_START_END: - state = CodeTokenizerState::IN_STRING; - break; - case CodeTokenMode::BLOCK_COMMENT_START: - state = CodeTokenizerState::IN_BLOCK_COMMENT; - break; - case CodeTokenMode::LINE_COMMENT: - state = CodeTokenizerState::IN_LINE_COMMENT; - break; - case CodeTokenMode::LINEBREAK: - if (!ignoreLinebreaks) { - peeked.push_back( - {it->second.id, t.content, t.location}); 
- } - return !ignoreLinebreaks; - default: - bool empty = true; - if (t.tokenId == TOKEN_TEXT) { - int begin = -1; - for (size_t c = 0; c < t.content.length(); c++) { - bool isWhitespace = - t.content[c] == ' ' || t.content[c] == '\t'; - if (begin < 0) { - // if we have not yet set our beginning, - // we wait for the first - // non-whitespace-character to set it. - if (!isWhitespace) { - begin = c; - } - } else { - // if we have set our beginning, we wait for the - // first whitespace character, which marks the - // end of the current word. - if (isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - SourceLocation{ - t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getStart() + c}}); - begin = -1; - empty = false; - } - } - } - if (begin >= 0) { - peeked.push_back(Token{ - TOKEN_TEXT, t.content.substr(begin), - SourceLocation{t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getEnd()}}); - empty = false; - } - } else { - empty = false; - peeked.push_back(t); - } - return !empty; - } - startToken = t; - returnTokenId = it->second.id; - return false; - case CodeTokenizerState::IN_LINE_COMMENT: - switch (mode) { - case CodeTokenMode::LINEBREAK: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_BLOCK_COMMENT: - switch (mode) { - case CodeTokenMode::BLOCK_COMMENT_END: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_STRING: - switch (mode) { - case CodeTokenMode::ESCAPE: - if (escaped) { - buffer(t); - } - escaped = !escaped; - return false; - case CodeTokenMode::STRING_START_END: - if (escaped) { - buffer(t); - 
escaped = false; - return false; - } else { - peeked.push_back(constructToken(t)); - state = CodeTokenizerState::NORMAL; - return true; - } - default: - if (escaped) { - // TODO: handle escaped characters? - escaped = false; - } - buffer(t); - return false; - } - } - assert(false); - return false; -} -} diff --git a/src/core/CodeTokenizer.hpp b/src/core/CodeTokenizer.hpp deleted file mode 100644 index 154f949..0000000 --- a/src/core/CodeTokenizer.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file CodeTokenizer.hpp - - * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) - */ -#ifndef _OUSIA_CODE_TOKENIZER_HPP_ -#define _OUSIA_CODE_TOKENIZER_HPP_ - -#include <map> -#include <sstream> - -#include <core/common/CharReader.hpp> -#include "Tokenizer.hpp" - -namespace ousia { - -/* - * This enum contains all special Token the CodeTokenizer supports, namely: - * - * 1.) An ambigous Tokens - in post programming languages single-quotes ' or - * double-quotes " - to delimit string tokens. - * 2.) A start token for line comments, which would e.g. be // in Java. - * 3.) A start token for a block comment - * 4.) An end token for a block comment. - * 5.) A linebreak token - * 6.) The escape token, which would e.g. be \ in java. 
- */ -enum class CodeTokenMode { - STRING_START_END, - LINE_COMMENT, - BLOCK_COMMENT_START, - BLOCK_COMMENT_END, - LINEBREAK, - ESCAPE, - NONE -}; - -/** - * A CodeTokenDescriptor defines the id the user likes to have returned for - * a Token of the mode specified, e.g. if you want to get the id 4 for a - * String Token the corresponding CodeTokenDescriptor would be inizialized - * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; - */ -struct CodeTokenDescriptor { - CodeTokenMode mode; - int id; - - CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} -}; - -/** - * The CodeTokenizer is a finite state machine with the states NORMAL, being - * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. - */ -enum class CodeTokenizerState { - NORMAL, - IN_BLOCK_COMMENT, - IN_LINE_COMMENT, - IN_STRING -}; - -/** - * The purpose of a CodeTokenizer is to make it easier to parse classical - * programming Code. It adds the following features to a regular Tokenizer: - * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens - * for the opening delimiter, the text and the closing delimiter. - * 2.) Escaping in String tokens. - * 3.) Comment Tokens (for line comments as well as block comments) - */ -class CodeTokenizer : public Tokenizer { -private: - std::map<int, CodeTokenDescriptor> descriptors; - CodeTokenizerState state; - std::stringstream buf; - Token startToken; - int returnTokenId; - bool escaped = false; - - Token constructToken(const Token &t); - void buffer(const Token &t); - -protected: - bool doPrepare(const Token &t, std::deque<Token> &peeked) override; - -public: - /** - * If you do not want comment tokens to be returned you can set this to - * true. - */ - bool ignoreComments = false; - /** - * If you do not want linebreaks to be returned you can set this to true. 
- */ - bool ignoreLinebreaks = false; - - /** - * - * @param input a CharReader containing the input for this tokenizer, as - * with a regular tokenizer. - * @param root a TokenTreeNode representing the root of the TokenTree. - * Please note that you have to specify all tokenIDs here that you use - * in the descriptors map. - * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. - * In this way you can specify the meaning of certain Tokens. Say you - * specified the Token "//" with the id 1 in the TokenTree. Then you could - * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map - * and this CodeTokenizer would recognize the token "//" as starting a - * line comment. - */ - CodeTokenizer(CharReader &input, const TokenTreeNode &root, - std::map<int, CodeTokenDescriptor> descriptors) - : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) - { - } -}; -} - -#endif diff --git a/src/core/Tokenizer.cpp b/src/core/Tokenizer.cpp deleted file mode 100644 index ab4735a..0000000 --- a/src/core/Tokenizer.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ - -#include <sstream> - -#include "Tokenizer.hpp" - -namespace ousia { - -static std::map<char, TokenTreeNode> buildChildren( - const std::map<std::string, int> &inputs) -{ - std::map<char, TokenTreeNode> children; - std::map<char, std::map<std::string, int>> nexts; - - for (auto &e : inputs) { - const std::string &s = e.first; - const int id = e.second; - if (s.empty()) { - continue; - } - char start = s[0]; - const std::string suffix = s.substr(1); - if (nexts.find(start) != nexts.end()) { - nexts[start].insert(std::make_pair(suffix, id)); - } else { - nexts.insert(std::make_pair( - start, std::map<std::string, int>{{suffix, id}})); - } - } - - for (auto &n : nexts) { - children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); - } - - return children; -} - -static int buildId(const std::map<std::string, int> &inputs) -{ - int tokenId = TOKEN_NONE; - for (auto &e : inputs) { - if (e.first.empty()) { - if (tokenId != TOKEN_NONE) { - throw TokenizerException{std::string{"Ambigous token found: "} + - std::to_string(e.second)}; - } else { - tokenId = e.second; - } - } - } - return tokenId; -} - -TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs) - : children(buildChildren(inputs)), tokenId(buildId(inputs)) -{ -} - -Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) - : input(input), root(root) -{ -} - -bool Tokenizer::prepare() -{ - std::stringstream buffer; - char c; - SourcePosition start = input.getOffset(); - bool bufEmpty = true; - while (input.peek(c)) { - if (root.children.find(c) != root.children.end()) { - // if there might be a special token, keep peeking forward - // until we find the token (or we don't). - TokenTreeNode const *n = &root; - std::stringstream tBuf; - int match = TOKEN_NONE; - while (true) { - tBuf << c; - n = &(n->children.at(c)); - if (n->tokenId != TOKEN_NONE) { - match = n->tokenId; - // from here on we found a token. If we have something - // in our buffer already, we end the search now. 
- if (!bufEmpty) { - break; - } else { - // if we want to return this token ( = we have nothing - // in our buffer yet) we look greedily for the longest - // possible token we can construct. - input.consumePeek(); - } - } - if (!input.peek(c)) { - // if we are at the end we break off the search. - break; - } - if (n->children.find(c) == n->children.end()) { - // if we do not find a possible continuation anymore, - // break off the search. - break; - } - } - //reset the peek pointer to the last valid position. - input.resetPeek(); - // check if we did indeed find a special token. - if (match != TOKEN_NONE) { - if (bufEmpty) { - // if we did not have text before, construct that token. - if (doPrepare( - Token{match, tBuf.str(), input.getLocation(start)}, - peeked)) { - return true; - } else { - start = input.getOffset(); - continue; - } - } else { - // otherwise we return the text before the token. - if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked)) { - return true; - } else{ - //we need to clear the buffer here. After all the token - //corresponding to this buffer segment is already - //constructed. - buffer.str(std::string()); - bufEmpty = true; - start = input.getOffset(); - continue; - } - } - } else{ - //if we found nothing, read at least one character. 
- input.peek(c); - } - } - buffer << c; - bufEmpty = false; - input.consumePeek(); - } - if (!bufEmpty) { - return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked); - } - return false; -} - -bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) -{ - peeked.push_back(t); - return true; -} - -bool Tokenizer::next(Token &t) -{ - if (peeked.empty()) { - if (!prepare()) { - return false; - } - } - t = peeked.front(); - peeked.pop_front(); - resetPeek(); - return true; -} - -bool Tokenizer::peek(Token &t) -{ - if (peekCursor >= peeked.size()) { - if (!prepare()) { - return false; - } - } - t = peeked[peekCursor]; - peekCursor++; - return true; -} - -void Tokenizer::resetPeek() { peekCursor = 0; } - -void Tokenizer::consumePeek() -{ - while (peekCursor > 0) { - peeked.pop_front(); - peekCursor--; - } -} -} diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp deleted file mode 100644 index 50e458c..0000000 --- a/src/core/Tokenizer.hpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _OUSIA_TOKENIZER_HPP_ -#define _OUSIA_TOKENIZER_HPP_ - -#include <cstdint> -#include <deque> -#include <istream> -#include <map> - -#include <core/common/CharReader.hpp> - -namespace ousia { - -/** - * This exception is currently only thrown if errors are made during the - * initialization of the Tokenizer. Have a closer look at the documentation - * of the TokenTreeNode constructor for more information. - */ -class TokenizerException : public std::exception { -public: - const std::string msg; - - TokenizerException(const std::string &msg) : msg(msg){}; - - virtual const char *what() const noexcept override { return msg.c_str(); } -}; - -/** - * The Tokenizer internally uses a TokenTree to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * The TokenTree is a construct that structures all special tokens this - * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then - * the TokenTree would look like this: - * - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * - * Every node in the TokenTree is a valid end state that has a $ attached to it. - * During the search algorithm the Tokenizer goes through the tree and stores - * the last valid position. If a character follows that does not lead to a new - * node in the TokenTree the search ends (and starts again at this character). - * The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - * - * It is also not allowed that tokens have common middle parts but varying - * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and - * the input string "abc". 
In that case we start looking for "abd" at the - * start, won't find it, wenn we hit "c" and start the scanning process - * anew. Thus the "bc" token is not found. - * - * For most (well-behaved) tokenization schemes this is not the case, - * though. - */ -class TokenTreeNode { -public: - const std::map<char, TokenTreeNode> children; - const int tokenId; - - /** - * The TokenTreeNode constructor builds a TokenTree from the given token - * specifications. The node returned by this constructor then is the root of - * said TokenTree. - * @param inputs Specifications of tokens in map form. Each specification - * is a tuple of the text that should be matched and some unique ID (>= 0) - * that is returned to you if that Token is found in the text. - * An example for such a map would be - * { - * { "#" , 1}, - * { "##", 2}, - * { "/" , 3} - * } - * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE - * (-1) and TOKEN_TEXT (-2). - */ - TokenTreeNode(const std::map<std::string, int> &inputs); -}; - -/** - * This is a reserved constant for the empty token. - */ -static const int TOKEN_NONE = -1; -/** - * This is a reserved constant for every part of the input text that is not a - * specified token. - */ -static const int TOKEN_TEXT = -2; - -/** - * A token for us is identified by an integer tokenID (either one of the - * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). - * Additionally we return the matched text (which should only be really - * interesting in case of TOKEN_TEXT tokens) and the position in the input text. - */ -struct Token { - int tokenId; - std::string content; - SourceLocation location; - - Token(int tokenId, std::string content, SourceLocation location) - : tokenId(tokenId), - content(content), - location(location) - { - } - - Token() : tokenId(TOKEN_NONE) {} -}; - -/** - * A Tokenizer has the purpose of subdividing an input text into tokens. 
In our - * definition here we distinguish between two kinds of tokens: - * 1.) User-specified tokens that match a fixed text. - * 2.) Any other text between those tokens. - * The user might want to specify the tokens '#{' and '#}' for example, because - * they have some meaning in her code. The user sets the IDs to 1 and 2. - * Given the input text - * "some text #{ special command #} some text" - * the tokenizer would return the tokens: - * 1.) "some text " with the id TOKEN_TEXT (-2). - * 2.) "#{" with the id 1. - * 3.) " special command " with the id TOKEN_TEXT (-2). - * 4.) "#}" with the id 2. - * 5.) " some text" with the id TOKEN_TEXT (-2). - * This makes the subsequent parsing of files of a specific type easier. - * Note that in case of tokens with that are prefixes of other tokens the - * longest possible match is returned. - */ -class Tokenizer { -private: - CharReader &input; - const TokenTreeNode &root; - std::deque<Token> peeked; - unsigned int peekCursor = 0; - - bool prepare(); - -protected: - /** - * This method is an interface to build multiple tokens from a single one in - * derived classes. This might be interesting if you want to implement - * further logic on text tokens or similar applications. - * - * @param t a Token the "basic" tokenizer found. - * @param peeked a reference to the deque containing all temporary Tokens. - * You are supposed to append your tokens there. In the trivial case you just - * put the given Token on top of the deque. - * @return false if no token was appended to the deque (meaning that you want - * to ignore the given token explicitly) and true in all other cases. - */ - virtual bool doPrepare(const Token &t, std::deque<Token> &peeked); - -public: - /** - * @param input The input of a Tokenizer is given in the form of a - * CharReader. Please refer to the respective documentation. 
- * @param root This is meant to be the root of a TokenTree giving the - * specification of user-defined tokens this Tokenizer should recognize. - * The Tokenizer promises to not change the TokenTree such that you can - * re-use the same specification for multiple inputs. - * Please refer to the TokenTreeNode documentation for more information. - */ - Tokenizer(CharReader &input, const TokenTreeNode &root); - - /** - * The next method consumes one Token from the input stream and gives - * it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool next(Token &t); - /** - * The peek method does not consume the next Token but buffers it and - * shows it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool peek(Token &t); - - /** - * Resets the peek pointer to the current position in the stream (to the - * beginning of the buffer). - */ - void resetPeek(); - - /** - * Clears the peek buffer, such that all peeked Tokens are consumed. 
- */ - void consumePeek(); - - const CharReader &getInput() const { return input; } - - CharReader &getInput() { return input; } -}; -} - -#endif diff --git a/src/core/common/Argument.cpp b/src/core/common/Argument.cpp index bfe74a4..b10fad3 100644 --- a/src/core/common/Argument.cpp +++ b/src/core/common/Argument.cpp @@ -302,10 +302,10 @@ bool Arguments::validateMap(Variant::mapType &map, Logger &logger, } else { if (ignoreUnknown) { logger.note(std::string("Ignoring argument \"") + e.first + - std::string("\"")); + std::string("\""), e.second); } else { logger.error(std::string("Unknown argument \"") + e.first + - std::string("\"")); + std::string("\""), e.second); ok = false; } } diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 563fe2a..f8b53c6 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -18,19 +18,13 @@ #include <algorithm> #include <cctype> -#include <limits> #include <string> #include "Utils.hpp" +#include "WhitespaceHandler.hpp" namespace ousia { -std::string Utils::trim(const std::string &s) -{ - std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::isIdentifier(const std::string &name) { bool first = true; @@ -43,7 +37,27 @@ bool Utils::isIdentifier(const std::string &name) } first = false; } - return true; + return !first; +} + +bool Utils::isIdentifierOrEmpty(const std::string &name) +{ + return name.empty() || isIdentifier(name); +} + +bool Utils::isNamespacedIdentifier(const std::string &name) +{ + bool first = true; + for (char c : name) { + if (first && !isIdentifierStartCharacter(c)) { + return false; + } + if (!first && (!isIdentifierCharacter(c) && c != ':')) { + return false; + } + first = (c == ':'); + } + return !first; } bool Utils::hasNonWhitepaceChar(const std::string &s) @@ -94,5 +108,29 @@ std::string Utils::extractFileExtension(const std::string &filename) } return std::string{}; } + +std::string 
Utils::trim(const std::string &s) +{ + std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +bool Utils::startsWith(const std::string &s, const std::string &prefix) +{ + return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; +} + +bool Utils::endsWith(const std::string &s, const std::string &suffix) +{ + return suffix.size() <= s.size() && + s.substr(s.size() - suffix.size(), suffix.size()) == suffix; +} } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 2c8a5b3..b5a54fc 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -74,16 +74,45 @@ public: } /** - * Returns true if the given character is in [A-Za-z][A-Za-z0-9_-]* + * Returns true if the given string is in + * \code{.txt} + * [A-Za-z][A-Za-z0-9_-]* + * \endCode + * + * @param name is the string that should be tested. + * @return true if the string matches the regular expression given above, + * false otherwise. */ static bool isIdentifier(const std::string &name); /** + * Returns true if the given string is an identifier or an empty string. + */ + static bool isIdentifierOrEmpty(const std::string &name); + + /** + * Returns true if the given string is in + * \code{.txt} + * ([A-Za-z][A-Za-z0-9_-]*)(:[A-Za-z][A-Za-z0-9_-]*)* + * \endCode + * + * @param name is the string that should be tested. + * @return true if the string matches the regular expression given above, + * false otherwise. + */ + static bool isNamespacedIdentifier(const std::string &name); + + /** + * Returns true if the given character is a linebreak character. + */ + static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } + + /** * Returns true if the given character is a whitespace character. 
*/ static bool isWhitespace(const char c) { - return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); + return (c == ' ') || (c == '\t') || isLinebreak(c); } /** @@ -95,11 +124,6 @@ public: static bool hasNonWhitepaceChar(const std::string &s); /** - * Returns true if the given character is a whitespace character. - */ - static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } - - /** * Removes whitespace at the beginning and the end of the given string. * * @param s is the string that should be trimmed. @@ -120,8 +144,25 @@ public: template <class T, class Filter> static std::pair<size_t, size_t> trim(const T &s, Filter f) { + return trim(s, s.size(), f); + } + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param len is the number of elements in the container. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template <class T, class Filter> + static std::pair<size_t, size_t> trim(const T &s, size_t len, Filter f) + { size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { + for (size_t i = 0; i < len; i++) { if (!f(s[i])) { start = i; break; @@ -129,7 +170,7 @@ public: } size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) { + for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) { if (!f(s[i])) { end = i + 1; break; @@ -145,6 +186,15 @@ public: } /** + * Collapses the whitespaces in the given string (trims the string and + * replaces all whitespace characters by a single one). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. 
+ */ + static std::string collapse(const std::string &s); + + /** * Turns the elements of a collection into a string separated by the * given delimiter. * @@ -205,6 +255,24 @@ public: static std::string extractFileExtension(const std::string &filename); /** + * Checks whether the given string starts with the given prefix. + * + * @param s is the string. + * @param prefix is the string which should be checked for being a prefix of + * s. + */ + static bool startsWith(const std::string &s, const std::string &prefix); + + /** + * Checks whether the given string ends with the given suffix. + * + * @param s is the string. + * @param suffix is the string which should be checked for being a suffix of + * s. + */ + static bool endsWith(const std::string &s, const std::string &suffix); + + /** * Hash functional to be used for enum classes. * See http://stackoverflow.com/a/24847480/2188211 */ diff --git a/src/core/common/Variant.hpp b/src/core/common/Variant.hpp index 6eae7e1..ddd17d7 100644 --- a/src/core/common/Variant.hpp +++ b/src/core/common/Variant.hpp @@ -884,6 +884,21 @@ public: } /** + * If the value of the variant already is a string, the markAsMagic function + * marks this string as a "magic" value (a variant which might also be an + * identifier). Throws an exception if the variant is not a string or magic + * value. + */ + void markAsMagic() + { + if (getType() == VariantType::STRING) { + meta.setType(VariantType::MAGIC); + return; + } + throw TypeException{getType(), VariantType::STRING}; + } + + /** * Returns the value of the Variant as boolean, performs type conversion. * * @return the Variant value converted to a boolean value. @@ -1146,10 +1161,7 @@ public: * * @retun true if the */ - bool hasLocation() const - { - return meta.hasLocation(); - } + bool hasLocation() const { return meta.hasLocation(); } /** * Unpacks ans returns the stored source location. Note that the returned @@ -1158,10 +1170,7 @@ public: * * @return the stored SourceLocation. 
*/ - SourceLocation getLocation() const - { - return meta.getLocation(); - } + SourceLocation getLocation() const { return meta.getLocation(); } /** * Packs the given source location and stores it in the metadata. Not all diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp index 3f02226..fb93ad0 100644 --- a/src/core/common/VariantReader.cpp +++ b/src/core/common/VariantReader.cpp @@ -495,7 +495,7 @@ std::pair<bool, Variant::boolType> VariantReader::parseBool(CharReader &reader, bool val = false; CharReaderFork readerFork = reader.fork(); LoggerFork loggerFork = logger.fork(); - auto res = parseToken(readerFork, loggerFork, {}); + auto res = parseToken(readerFork, loggerFork, std::unordered_set<char>{}); if (res.first) { bool valid = false; if (res.second == "true") { diff --git a/src/core/common/VariantReader.hpp b/src/core/common/VariantReader.hpp index 1232f6e..44132a0 100644 --- a/src/core/common/VariantReader.hpp +++ b/src/core/common/VariantReader.hpp @@ -322,7 +322,7 @@ public: */ static std::pair<bool, Variant> parseTyped( VariantType type, CharReader &reader, Logger &logger, - const std::unordered_set<char> &delims = {}); + const std::unordered_set<char> &delims = std::unordered_set<char>{}); /** * Tries to parse an instance of the given type from the given string. The * called method is one of the parse methods defined here and adheres to the diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp new file mode 100644 index 0000000..72a2291 --- /dev/null +++ b/src/core/common/Whitespace.hpp @@ -0,0 +1,60 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Whitespace.hpp + * + * Contains the WhitespaceMode enum used in various places. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HPP_ +#define _OUSIA_WHITESPACE_HPP_ + +#include <string> +#include <utility> + +namespace ousia { + +/** + * Enum specifying the whitespace handling mode of the tokenizer and the + * parsers. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +} + +#endif /* _OUSIA_WHITESPACE_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp new file mode 100644 index 0000000..ed52ea3 --- /dev/null +++ b/src/core/common/WhitespaceHandler.hpp @@ -0,0 +1,284 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file WhitespaceHandler.hpp + * + * Contains the WhitespaceHandler classes which are used in multiple places to + * trim, compact or preserve whitespaces while at the same time maintaining the + * position information associated with the input strings. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ +#define _OUSIA_WHITESPACE_HANDLER_HPP_ + +#include <string> +#include <vector> + +#include "Utils.hpp" + +namespace ousia { + +/** + * WhitespaceHandler is a base class that can be used to collect text on a + * character-by-character basis. Note that this class and its descendants are + * hoped to be inlined by the compiler (and used in conjunction with templates), + * thus they are fully defined inside this header. + */ +class WhitespaceHandler { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the WhitespaceHandler base class. Initializes the start and + * end position with zeros. + */ + WhitespaceHandler() : textStart(0), textEnd(0) {} + + /** + * Returns true if this whitespace handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } + + /** + * Returns the content of the WhitespaceHandler as string. + */ + std::string toString() const + { + return std::string(textBuf.data(), textBuf.size()); + } +}; + +/** + * The PreservingWhitespaceHandler class preserves all characters unmodified, + * including whitespace characters. 
+ */ +class PreservingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd); + } + + /** + * Static version of PreservingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + */ + static void append(char c, size_t start, size_t end, + std::vector<char> &textBuf, size_t &textStart, + size_t &textEnd) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/** + * The TrimmingWhitespaceHandler class trims all whitespace characters at the beginning + * and the end of a text section but leaves all other characters unmodified, + * including whitespace characters. + */ +class TrimmingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. + */ + std::vector<char> whitespaceBuf; + + /** + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the beginning and end of the text. + * + * @param c is the character that should be appended to the internal buffer. 
+ * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); + } + + /** + * Static version of TrimmingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param whitespaceBuf is a reference at the buffer for storing whitespace + * characters. + */ + static void append(char c, size_t start, size_t end, + std::vector<char> &textBuf, size_t &textStart, + size_t &textEnd, std::vector<char> &whitespaceBuf) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/** + * The CollapsingWhitespaceHandler trims whitespace characters at the beginning and end of the + * text and reduces multiple whitespace characters to a single blank. + */ +class CollapsingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Flag set to true if a whitespace character was reached. + */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. 
+ * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); + } + + /** + * Static version of CollapsingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param hasWhitespace is a reference at the "hasWhitespace" flag. + */ + static void append(char c, size_t start, size_t end, + std::vector<char> &textBuf, size_t &textStart, + size_t &textEnd, bool &hasWhitespace) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; + +/** + * Function that can be used to append the given buffer (e.g. a string or a + * vector) to the whitespace handler. + * + * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. + * @tparam Buffer is an iterable type. + * @param handler is the handler to which the characters of the Buffer should be + * appended. + * @param buf is the buffer from which the characters should be read. + * @param start is the start byte offset. Each character is counted as one byte. 
+ */ +template <typename WhitespaceHandler, typename Buffer> +inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, + size_t start) +{ + for (auto elem : buf) { + handler.append(elem, start, start + 1); + start++; + } +} +} + +#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ + diff --git a/src/core/model/Node.cpp b/src/core/model/Node.cpp index 39ee2e4..ce15cad 100644 --- a/src/core/model/Node.cpp +++ b/src/core/model/Node.cpp @@ -448,7 +448,7 @@ bool Node::doValidate(Logger &logger) const { return true; } bool Node::validateName(Logger &logger) const { - if (!Utils::isIdentifier(name)) { + if (!Utils::isIdentifierOrEmpty(name)) { logger.error(type()->name + std::string(" name \"") + name + std::string("\" is not a valid identifier"), this); diff --git a/src/core/model/Typesystem.cpp b/src/core/model/Typesystem.cpp index 506bd31..df2b9fb 100644 --- a/src/core/model/Typesystem.cpp +++ b/src/core/model/Typesystem.cpp @@ -21,7 +21,6 @@ #include <core/common/RttiBuilder.hpp> #include <core/common/Utils.hpp> #include <core/common/VariantConverter.hpp> -#include <core/common/VariantReader.hpp> namespace ousia { @@ -68,65 +67,6 @@ bool Type::build(Variant &data, Logger &logger) const return build(data, logger, NullMagicCallback); } -std::pair<bool, Variant> Type::read(CharReader &reader, Logger &logger, - const std::unordered_set<char> &delims) -{ - // try all variant types of this type and use the first successful one. 
- Variant v; - bool success = false; - for (auto t : getVariantTypes()) { - auto res = VariantReader::parseTyped(t, reader, logger, delims); - if (res.first) { - v = res.second; - success = true; - break; - } - } - - if (!success) { - return std::make_pair(false, Variant{}); - } - if (!build(v, logger)) { - return std::make_pair(false, Variant{}); - } - return std::make_pair(true, v); -} - -std::pair<bool, Variant> Type::read(const std::string &str, Logger &logger, - SourceId sourceId, size_t offs) -{ - // try all variant types of this type and use the first successful one. - Variant v; - bool success = false; - std::vector<LoggerFork> forks; - auto vts = getVariantTypes(); - for (auto vt : vts) { - forks.emplace_back(logger.fork()); - auto res = - VariantReader::parseTyped(vt, str, forks.back(), sourceId, offs); - if (res.first) { - v = res.second; - success = true; - forks.back().commit(); - break; - } - } - - if (!success) { - logger.error("Could not read data with any of the possible types:"); - for (size_t t = 0; t < forks.size(); t++) { - logger.note(std::string(Variant::getTypeName(vts[t])) + ":", - SourceLocation{}, MessageMode::NO_CONTEXT); - forks[t].commit(); - } - return std::make_pair(false, Variant{}); - } - if (!build(v, logger)) { - return std::make_pair(false, Variant{}); - } - return std::make_pair(true, v); -} - bool Type::doCheckIsa(Handle<const Type> type) const { return false; } bool Type::checkIsa(Handle<const Type> type) const diff --git a/src/core/model/Typesystem.hpp b/src/core/model/Typesystem.hpp index ca4f206..8079578 100644 --- a/src/core/model/Typesystem.hpp +++ b/src/core/model/Typesystem.hpp @@ -59,7 +59,27 @@ class SystemTypesystem; */ class Type : public Node { public: - enum class MagicCallbackResult { NOT_FOUND, FOUND_INVALID, FOUND_VALID }; + /** + * Enum describing the result of the MagicCallback. + */ + enum class MagicCallbackResult { + /** + * A magic value with the given name could not be resolved. 
+ */ + NOT_FOUND, + + /** + * A magic value with the given name could be resolved, but is of the + * wrong type. + */ + FOUND_INVALID, + + /** + * A magic value with the given name could be resolved and is of the + * correct type. + */ + FOUND_VALID + }; /** * Callback function called when a variant with "magic" value is reached. @@ -70,7 +90,9 @@ public: * to which the value of the looked up constant should be written. * @param type is a const pointer at the type. TODO: Replace this with a * "ConstHandle". - * @return true if a constant was found, false otherwise. + * @return a MagicCallbackResult describing whether the magic value could + * not be resolved, could be resolved but is of the wrong type or could be + * resolved and is of the correct type. */ using MagicCallback = std::function<MagicCallbackResult(Variant &data, const Type *type)>; @@ -169,32 +191,6 @@ public: bool build(Variant &data, Logger &logger) const; /** - * Tries to parse an instance of this type from the given stream. - * - * @param reader is a reference to the CharReader instance which is - * the source for the character data. The reader will be positioned - * at the end of the type instance (or the delimiting character). - * @param delims is a set of characters which will terminate the typed - * instance if the according parser uses delimiting characters. - * These characters are not included in the result. May not be nullptr. - */ - std::pair<bool, Variant> read(CharReader &reader, Logger &logger, - const std::unordered_set<char> &delims = {}); - - /** - * Tries to parse an instance of this type from the given string. - * - * @param str is the string from which the value should be read. - * @param sourceId is an optional descriptor of the source file from which - * the element is being read. - * @param offs is the by offset in the source file at which the string - * starts. 
- */ - std::pair<bool, Variant> read(const std::string &str, Logger &logger, - SourceId sourceId = InvalidSourceId, - size_t offs = 0); - - /** * Returns true if and only if the given Variant adheres to this Type. In * essence this just calls the build method on a copy of the input Variant. * @@ -230,23 +226,6 @@ public: { return this->getParent().cast<Typesystem>(); } - - /** - * Returns the VariantTypes whose instances are proper input for building an - * instance of this type. - * More specifically: Every returned VariantType T should be such that: - * If a string s can be parsed according to T to a Variant v then the call - * build(v, logger) should only fail (return false) if the variant content - * does not adhere to the specific type specification. But it should be a - * properly typed input for build. - * The order of the types returned by this function determines the order in - * which a parser should try to interpret an input string s. - * - * @return the VariantTypes that arethe basis for parsing an instance of - *this - * type. - */ - virtual std::vector<VariantType> getVariantTypes() const = 0; }; /** @@ -287,16 +266,6 @@ public: * @return a variant containing an empty string. */ Variant create() const override { return Variant{""}; } - - /** - * Returns the String VariantType. - * - * @return the String VariantType. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::STRING}; - } }; /** @@ -336,16 +305,6 @@ public: * @return the integer value zero. */ Variant create() const override { return Variant{0}; } - - /** - * Returns the Int VariantType. - * - * @return the Int VariantType. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::INT}; - } }; /** @@ -385,16 +344,6 @@ public: * @return the double value zero. */ Variant create() const override { return Variant{0.0}; } - - /** - * Returns the Double VariantType. - * - * @return the Double VariantType. 
- */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::DOUBLE}; - } }; /** @@ -434,16 +383,6 @@ public: * @return a Variant with the boolean value false. */ Variant create() const override { return Variant{false}; } - - /** - * Returns the bool VariantType. - * - * @return the bool VariantType. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::BOOL}; - } }; /** @@ -483,16 +422,6 @@ public: * @return a Variant with the cardinality value "any". */ Variant create() const override { return Variant{Cardinality::any()}; } - - /** - * Returns the cardinality VariantType. - * - * @return the cardinality VariantType. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::CARDINALITY}; - } }; /** @@ -609,16 +538,6 @@ public: * name. Throws a LoggableException if the string does not exist. */ Ordinal valueOf(const std::string &name) const; - - /** - * Returns the int and string VariantTypes. - * - * @return the int and string VariantTypes. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::INT, VariantType::STRING}; - } }; /** @@ -1054,15 +973,6 @@ public: * @return true if the requested attribute name exists, false otherwise. */ bool hasAttribute(const std::string &name) const; - /** - * Returns the array and map VariantTypes. - * - * @return the array and map VariantTypes. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::MAP}; - } }; /** @@ -1128,15 +1038,6 @@ public: * @return Rooted reference pointing at the innerType. */ Rooted<Type> getInnerType() { return innerType; } - /** - * Returns the array VariantType. - * - * @return the array VariantType. - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::ARRAY}; - } }; /** @@ -1175,20 +1076,6 @@ public: * @return a Variant instance with nullptr value. 
*/ Variant create() const override; - /** - * Returns all parseable VariantTypes (bool, int, double, array, map, - *cardinality, object, string). - * - * @return all parseable VariantTypes (bool, int, double, array, map, - *cardinality, object, string). - */ - std::vector<VariantType> getVariantTypes() const override - { - return {VariantType::BOOL, VariantType::INT, - VariantType::DOUBLE, VariantType::ARRAY, - VariantType::MAP, VariantType::CARDINALITY, - VariantType::OBJECT, VariantType::STRING}; - } }; /** @@ -1576,4 +1463,4 @@ extern const Rtti SystemTypesystem; } } -#endif /* _OUSIA_MODEL_TYPESYSTEM_HPP_ */
\ No newline at end of file +#endif /* _OUSIA_MODEL_TYPESYSTEM_HPP_ */ diff --git a/src/core/parser/ParserScope.cpp b/src/core/parser/ParserScope.cpp index 3929abf..ce3dc94 100644 --- a/src/core/parser/ParserScope.cpp +++ b/src/core/parser/ParserScope.cpp @@ -351,8 +351,7 @@ bool ParserScope::resolveType(const std::string &name, Handle<Node> owner, return resolveType(Utils::split(name, '.'), owner, logger, resultCallback); } -bool ParserScope::resolveValue(Variant &data, Handle<Type> type, - Handle<Node> owner, Logger &logger) +bool ParserScope::resolveValue(Variant &data, Handle<Type> type, Logger &logger) { return type->build( data, logger, @@ -408,7 +407,7 @@ bool ParserScope::resolveTypeWithValue(const std::vector<std::string> &path, [=](Handle<Node> resolved, Handle<Node> owner, Logger &logger) mutable { if (resolved != nullptr) { Rooted<Type> type = resolved.cast<Type>(); - scope.resolveValue(*valuePtr, type, owner, logger); + scope.resolveValue(*valuePtr, type, logger); } // Call the result callback with the type diff --git a/src/core/parser/ParserScope.hpp b/src/core/parser/ParserScope.hpp index 58fc037..185b845 100644 --- a/src/core/parser/ParserScope.hpp +++ b/src/core/parser/ParserScope.hpp @@ -702,13 +702,11 @@ public: * (even in inner structures). The data will be passed to the "build" * function of the given type. * @param type is the Typesystem type the data should be interpreted with. - * @param owner is the node for which the resolution takes place. * @param logger is the logger instance into which resolution problems * should be logged. * @return true if the value was successfully built. 
*/ - bool resolveValue(Variant &data, Handle<Type> type, Handle<Node> owner, - Logger &logger); + bool resolveValue(Variant &data, Handle<Type> type, Logger &logger); /** * Resolves a type and makes sure the corresponding value is of the correct diff --git a/src/core/parser/ParserStack.cpp b/src/core/parser/ParserStack.cpp deleted file mode 100644 index 1265851..0000000 --- a/src/core/parser/ParserStack.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <sstream> - -#include <core/common/Utils.hpp> -#include <core/common/Exceptions.hpp> -#include <core/model/Project.hpp> - -#include "ParserScope.hpp" -#include "ParserStack.hpp" - -namespace ousia { - -/* A default handler */ - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. 
- */ -class DefaultHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override {} - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DefaultHandler{handlerData}; - } -}; - -/* Class Handler */ - -void Handler::data(const std::string &data, int field) -{ - if (Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class ParserStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set<std::string> &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStack::ParserStack( - ParserContext &ctx, - const std::multimap<std::string, const ParserState *> &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStack::deduceState() -{ - // Assemble all states - std::vector<const ParserState *> states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector<const ParserState *> possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", 
*state, *state, SourceLocation{}}); - stack.emplace(handler); - return true; -} - -std::set<std::string> ParserStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set<std::string> res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStack::currentCommandName() -{ - return stack.empty() ? std::string{} : stack.top()->name(); -} - -const ParserState *ParserStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? 
targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr<Handler> inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/ParserStack.hpp b/src/core/parser/ParserStack.hpp deleted file mode 100644 index efc4e4a..0000000 --- a/src/core/parser/ParserStack.hpp +++ /dev/null @@ -1,361 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file ParserStack.hpp - * - * Helper classes for document or description parsers. Contains the ParserStack - * class, which is an pushdown automaton responsible for accepting commands in - * the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STACK_HPP_ -#define _OUSIA_PARSER_STACK_HPP_ - -#include <cstdint> - -#include <map> -#include <memory> -#include <set> -#include <stack> -#include <vector> - -#include <core/common/Variant.hpp> -#include <core/common/Logger.hpp> -#include <core/common/Argument.hpp> - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * Struct collecting all the data that is being passed to a Handler instance. - */ -struct HandlerData { - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. - */ - ParserContext &ctx; - - /** - * Contains the name of the tag that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. 
- */ - HandlerData(ParserContext &ctx, std::string name, const ParserState &state, - const ParserState &parentState, const SourceLocation location) - : ctx(ctx), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class Handler { -private: - /** - * Structure containing the internal handler data. - */ - const HandlerData handlerData; - -public: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - Handler(const HandlerData &handlerData) : handlerData(handlerData){}; - - /** - * Virtual destructor. - */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context() { return handlerData.ctx; } - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name() { return handlerData.name; } - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope() { return handlerData.ctx.getScope(); } - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager() { return handlerData.ctx.getManager(); } - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger() { return handlerData.ctx.getLogger(); } - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. 
- */ - Rooted<Project> project() { return handlerData.ctx.getProject(); } - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state() { return handlerData.state; } - - /** - * Reference at the ParserState descriptor of the parent state of the state - * for which this Handler was created. Set to ParserStates::None if there - * is no parent state. - * - * @return a const reference at the parent state of the constructing - * ParserState descriptor. - */ - const ParserState &parentState() { return handlerData.parentState; } - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location() { return handlerData.location; } - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. - */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. 
- */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The ParserStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap<std::string, const ParserState *> &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack<std::shared_ptr<Handler>> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set<std::string> expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStack class. - * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStack(ParserContext &ctx, - const std::multimap<std::string, const ParserState *> &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. 
- * @param logger is the logger instance to which error messages should be - * written. - * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState &currentState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * Note that the passed map will be modified. - * @param location is the location in the source file at which the command - * starts. - */ - void start(const std::string &name, Variant::mapType &args, - const SourceLocation &location = SourceLocation{}); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * @param location is the location in the source file at which the command - * starts. - */ - void start(std::string name, - const Variant::mapType &args = Variant::mapType{}, - const SourceLocation &location = SourceLocation{}); - - /** - * Function called whenever a command ends. - */ - void end(); - - /** - * Function that should be called whenever data is available for the - * command. - * - * @param data is the data that should be passed to the handler. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). 
- */ - void data(const std::string &data, int field = 0); - - /** - * Returns a reference to the parser context the parser stack is currently - * working on. - * - * @return a reference to the parser context. - */ - ParserContext &getContext() { return ctx; } -}; -} - -#endif /* _OUSIA_PARSER_STACK_HPP_ */ - diff --git a/src/core/parser/generic/GenericParser.cpp b/src/core/parser/generic/GenericParser.cpp deleted file mode 100644 index e69de29..0000000 --- a/src/core/parser/generic/GenericParser.cpp +++ /dev/null diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp new file mode 100644 index 0000000..6ebc549 --- /dev/null +++ b/src/core/parser/stack/Callbacks.cpp @@ -0,0 +1,23 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "Callbacks.hpp" + +namespace ousia { +} + diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp new file mode 100644 index 0000000..9c61000 --- /dev/null +++ b/src/core/parser/stack/Callbacks.hpp @@ -0,0 +1,99 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Callbacks.hpp + * + * Contains an interface defining the callbacks that can be directed from a + * StateHandler to the StateStack, and from the StateStack to + * the actual parser. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_CALLBACKS_HPP_ +#define _OUSIA_PARSER_STACK_CALLBACKS_HPP_ + +#include <string> + +#include <core/common/Whitespace.hpp> + +namespace ousia { +namespace parser_stack { + +/** + * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. + */ +class Callbacks { +public: + /** + * Virtual descructor. + */ + virtual ~Callbacks() {}; + + /** + * Sets the whitespace mode that specifies how string data should be + * processed. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. 
+ */ + virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + + /** + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + virtual void registerToken(const std::string &token) = 0; + + /** + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + virtual void unregisterToken(const std::string &token) = 0; +}; + +/** + * Interface defining the callback functions that can be passed from a + * StateStack to the underlying parser. + */ +class ParserCallbacks : public Callbacks { + /** + * Checks whether the given token is supported by the parser. The parser + * returns true, if the token is supported, false if this token cannot be + * registered. Note that parsers that do not support the registration of + * tokens at all should always return "true". + * + * @param token is the token that should be checked for support. + * @return true if the token is generally supported (or the parser does not + * support registering tokens at all), false if the token is not supported, + * because e.g. it is a reserved token or it interferes with other tokens. + */ + virtual bool supportsToken(const std::string &token) = 0; +}; + +} +} + +#endif /* _OUSIA_PARSER_STACK_CALLBACKS_HPP_ */ + diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index 3647db3..d514701 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -16,28 +16,35 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -#include "DocumentHandler.hpp" - #include <algorithm> #include <core/common/RttiBuilder.hpp> #include <core/common/Utils.hpp> +#include <core/common/VariantReader.hpp> #include <core/model/Document.hpp> #include <core/model/Domain.hpp> +#include <core/model/Project.hpp> #include <core/model/Typesystem.hpp> #include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> + +#include "DocumentHandler.hpp" +#include "State.hpp" namespace ousia { +namespace parser_stack { /* DocumentHandler */ -void DocumentHandler::start(Variant::mapType &args) +bool DocumentHandler::start(Variant::mapType &args) { Rooted<Document> document = - project()->createDocument(args["name"].asString()); + context().getProject()->createDocument(args["name"].asString()); document->setLocation(location()); scope().push(document); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void DocumentHandler::end() { scope().pop(); } @@ -48,7 +55,7 @@ void DocumentChildHandler::preamble(Handle<Node> parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField) { - // check if the parent in the structure tree was an explicit field + // Check if the parent in the structure tree was an explicit field // reference. inField = parentNode->isa(&RttiTypes::DocumentField); if (inField) { @@ -56,10 +63,11 @@ void DocumentChildHandler::preamble(Handle<Node> parentNode, parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); } else { - // if it wasn't an explicit reference, we use the default field. + // If it wasn't an explicit reference, we use the default field. fieldName = DEFAULT_FIELD_NAME; } - // reference the parent entity explicitly. + + // Reference the parent entity explicitly. 
parent = nullptr; if (parentNode->isa(&RttiTypes::StructuredEntity)) { parent = static_cast<DocumentEntity *>( @@ -70,17 +78,13 @@ void DocumentChildHandler::preamble(Handle<Node> parentNode, } } -static void createPath(const std::string &firstFieldName, - const NodeVector<Node> &path, DocumentEntity *&parent) +static void createPath(const NodeVector<Node> &path, DocumentEntity *&parent, + size_t p0 = 1) { - // add the first element - parent = static_cast<DocumentEntity *>( - parent->createChildStructuredEntity(path[0].cast<StructuredClass>(), - Variant::mapType{}, firstFieldName, - "").get()); - + // TODO (@benjamin): These should be pushed onto the scope and poped once + // the scope is left. Otherwise stuff may not be correclty resolved. size_t S = path.size(); - for (size_t p = 2; p < S; p = p + 2) { + for (size_t p = p0; p < S; p = p + 2) { parent = static_cast<DocumentEntity *>( parent->createChildStructuredEntity( path[p].cast<StructuredClass>(), Variant::mapType{}, @@ -88,18 +92,19 @@ static void createPath(const std::string &firstFieldName, } } -static void createPath(const NodeVector<Node> &path, DocumentEntity *&parent) +static void createPath(const std::string &firstFieldName, + const NodeVector<Node> &path, DocumentEntity *&parent) { - size_t S = path.size(); - for (size_t p = 1; p < S; p = p + 2) { - parent = static_cast<DocumentEntity *>( - parent->createChildStructuredEntity( - path[p].cast<StructuredClass>(), Variant::mapType{}, - path[p - 1]->getName(), "").get()); - } + // Add the first element + parent = static_cast<DocumentEntity *>( + parent->createChildStructuredEntity(path[0].cast<StructuredClass>(), + Variant::mapType{}, firstFieldName, + "").get()); + + createPath(path, parent, 2); } -void DocumentChildHandler::start(Variant::mapType &args) +bool DocumentChildHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); Rooted<Node> parentNode = scope().selectOrThrow( @@ -112,7 +117,7 @@ void 
DocumentChildHandler::start(Variant::mapType &args) preamble(parentNode, fieldName, parent, inField); - // try to find a FieldDescriptor for the given tag if we are not in a + // Try to find a FieldDescriptor for the given tag if we are not in a // field already. This does _not_ try to construct transparent paths // in between. if (!inField && parent != nullptr && @@ -121,7 +126,7 @@ void DocumentChildHandler::start(Variant::mapType &args) new DocumentField(parentNode->getManager(), name(), parentNode)}; field->setLocation(location()); scope().push(field); - return; + return true; } // Otherwise create a new StructuredEntity @@ -187,27 +192,39 @@ void DocumentChildHandler::start(Variant::mapType &args) } entity->setLocation(location()); scope().push(entity); + return true; } void DocumentChildHandler::end() { scope().pop(); } -std::pair<bool, Variant> DocumentChildHandler::convertData( - Handle<FieldDescriptor> field, Logger &logger, const std::string &data) +bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field, + Variant &data, Logger &logger) { - // if the content is supposed to be of type string, we can finish - // directly. - auto vts = field->getPrimitiveType()->getVariantTypes(); - if (std::find(vts.begin(), vts.end(), VariantType::STRING) != vts.end()) { - return std::make_pair(true, Variant::fromString(data)); + bool valid = true; + Rooted<Type> type = field->getPrimitiveType(); + + // If the content is supposed to be of type string, we only need to check + // for "magic" values -- otherwise just call the "parseGenericString" + // function on the string data + if (type->isa(&RttiTypes::StringType)) { + const std::string &str = data.asString(); + // TODO: Referencing constants with "." 
separator should also work + if (Utils::isIdentifier(str)) { + data.markAsMagic(); + } + } else { + // Parse the string as generic string, assign the result + auto res = VariantReader::parseGenericString( + data.asString(), logger, data.getLocation().getSourceId(), + data.getLocation().getStart()); + data = res.second; } - // then try to parse the content using the type specification. - auto res = field->getPrimitiveType()->read( - data, logger, location().getSourceId(), location().getStart()); - return res; + // Now try to resolve the value for the primitive type + return valid && scope().resolveValue(data, type, logger); } -void DocumentChildHandler::data(const std::string &data, int fieldIdx) +bool DocumentChildHandler::data(Variant &data) { Rooted<Node> parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, @@ -222,11 +239,10 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) Rooted<Descriptor> desc = strctParent->getDescriptor(); // The parent from which we need to connect to the primitive content. Rooted<Node> parentClass; - /* - * We distinguish two cases here: One for fields that are given. - */ + + // We distinguish two cases here: One for fields that are given. if (inField) { - // retrieve the actual FieldDescriptor + // Retrieve the actual FieldDescriptor Rooted<FieldDescriptor> field = desc->getFieldDescriptor(fieldName); if (field == nullptr) { logger().error( @@ -234,75 +250,102 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) fieldName + "\" exists in descriptor\"" + desc->getName() + "\".", location()); - return; + return false; } - // if it is a primitive field directly, try to parse the content. + // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { - auto res = convertData(field, logger(), data); - // add it as primitive content. 
- if (res.first) { - strctParent->createChildDocumentPrimitive(res.second, - fieldName); + // Add it as primitive content. + if (!convertData(field, data, logger())) { + return false; } - return; + + strctParent->createChildDocumentPrimitive(data, fieldName); + return true; } - // if it is not primitive we need to connect via transparent elements + // If it is not primitive we need to connect via transparent elements // and default fields. parentClass = field; } else { - // in case of default fields we need to construct via default fields + // In case of default fields we need to construct via default fields // and maybe transparent elements. parentClass = desc; } - /* - * Search through all permitted default fields of the parent class that - * allow primitive content at this point and could be constructed via - * transparent intermediate entities. - * We then try to parse the data using the type specified by the respective - * field. If that does not work we proceed to the next possible field. - */ - // retrieve all default fields at this point. + + // Search through all permitted default fields of the parent class that + // allow primitive content at this point and could be constructed via + // transparent intermediate entities. + + // Retrieve all default fields at this point, either from the field + // descriptor or the structured class NodeVector<FieldDescriptor> defaultFields; if (inField) { defaultFields = parentClass.cast<FieldDescriptor>()->getDefaultFields(); } else { defaultFields = parentClass.cast<StructuredClass>()->getDefaultFields(); } + + // Try to parse the data using the type specified by the respective field. + // If that does not work we proceed to the next possible field. std::vector<LoggerFork> forks; for (auto field : defaultFields) { - // then try to parse the content using the type specification. + // Then try to parse the content using the type specification. 
forks.emplace_back(logger().fork()); - auto res = convertData(field, forks.back(), data); - if (res.first) { - forks.back().commit(); - // if that worked, construct the necessary path. - if (inField) { - NodeVector<Node> path = - parentClass.cast<FieldDescriptor>()->pathTo(field, - logger()); - createPath(fieldName, path, strctParent); - } else { - auto pathRes = desc->pathTo(field, logger()); - assert(pathRes.second); - createPath(pathRes.first, strctParent); - } - // then create the primitive element. - strctParent->createChildDocumentPrimitive(res.second); - return; + if (!convertData(field, data, forks.back())) { + continue; } + + // The conversion worked, commit any possible warnings + forks.back().commit(); + + // Construct the necessary path + if (inField) { + NodeVector<Node> path = + parentClass.cast<FieldDescriptor>()->pathTo(field, logger()); + createPath(fieldName, path, strctParent); + } else { + auto pathRes = desc->pathTo(field, logger()); + assert(pathRes.second); + createPath(pathRes.first, strctParent); + } + + // Then create the primitive element + strctParent->createChildDocumentPrimitive(data); + return true; } - logger().error("Could not read data with any of the possible fields:"); + + // No field was found that might take the data -- dump the error messages + // from the loggers + logger().error("Could not read data with any of the possible fields:", + SourceLocation{}, MessageMode::NO_CONTEXT); size_t f = 0; for (auto field : defaultFields) { - logger().note(Utils::join(field->path(), ".") + ":", SourceLocation{}, - MessageMode::NO_CONTEXT); + logger().note(std::string("Field ") + Utils::join(field->path(), ".") + + std::string(":"), + SourceLocation{}, MessageMode::NO_CONTEXT); forks[f].commit(); f++; } + return false; +} + +namespace States { +const State Document = StateBuilder() + .parent(&None) + .createdNodeType(&RttiTypes::Document) + .elementHandler(DocumentHandler::create) + .arguments({Argument::String("name", "")}); + +const 
State DocumentChild = StateBuilder() + .parents({&Document, &DocumentChild}) + .createdNodeTypes({&RttiTypes::StructureNode, + &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}) + .elementHandler(DocumentChildHandler::create); +} } namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder<ousia::DocumentField>("DocumentField").parent(&Node); +const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>( + "DocumentField").parent(&Node); +} } -}
\ No newline at end of file diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index cb124aa..2c474f9 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -19,14 +19,21 @@ /** * @file DocumentHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler instances used for parsing actual documents. This file + * declares to classes: The Document handler which parses the "document" command + * that introduces a new document and the "DocumentChildHandler" which parses + * the actual user defined tags. + * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_DOCUMENT_HANDLER_HPP_ -#define _OUSIA_DOCUMENT_HANDLER_HPP_ +#ifndef _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ +#define _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> +#include <core/model/Node.hpp> + +#include "Handler.hpp" namespace ousia { @@ -35,51 +42,121 @@ class Rtti; class DocumentEntity; class FieldDescriptor; -class DocumentHandler : public Handler { +namespace parser_stack { +/** + * The DocumentHandler class parses the "document" tag that is used to introduce + * a new document. Note that this tag is not mandatory in osml files -- if the + * first command is not a typesystem, domain or any other declarative command, + * the DocumentHandler will be implicitly called. + */ +class DocumentHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the ImportHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. 
+ */ static Handler *create(const HandlerData &handlerData) { return new DocumentHandler{handlerData}; } }; +/** + * Temporary Node that is being pushed onto the ParserScope in order to indicate + * the field the parser is currently in. The name of the Node is stored in the + * "name" field of the parent Node class. + */ class DocumentField : public Node { public: using Node::Node; }; -class DocumentChildHandler : public Handler { +/** + * The DocumentChildHandler class performs the actual parsing of the user + * defined elements in an Ousía document. + */ +class DocumentChildHandler : public StaticHandler { private: + /** + * Code shared by both the start() and the end() method. Checks whether the + * parser currently is in a field and returns the name of this field. + * + * @param parentNode is the next possible parent node (a document, + * a structured entity, an annotation entity or a field). + * @param fieldName is an output parameter to which the name of the current + * field is written (or unchanged if we're not in a field). + * @param parent is an output parameter to which the parent document entity + * will be written. + * @param inField is set to true if we actually are in a field. + */ void preamble(Handle<Node> parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField); - std::pair<bool, Variant> convertData(Handle<FieldDescriptor> field, - Logger &logger, - const std::string &data); + /** + * Tries to convert the given data to the type that is specified in the + * given primitive field. + * + * @param field is the primitive field for which the data is intended. + * @param data is the is the data that should be converted, the result is + * written into this argument as output variable. + * @param logger is the Logger instance to which error messages should be + * written. Needed to allow the convertData function to write to a forked + * Logger instance. + * @return true if the operation was successful, false otherwise. 
+ */ + bool convertData(Handle<FieldDescriptor> field, Variant &data, + Logger &logger); public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; - - void data(const std::string &data, int fieldIdx) override; - + bool data(Variant &data) override; + + /** + * Creates a new instance of the DocumentChildHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new DocumentChildHandler{handlerData}; } }; +namespace States { +/** + * State constant representing the "document" tag. + */ +extern const State Document; + +/** + * State contstant representing any user-defined element within a document. + */ +extern const State DocumentChild; +} + +} + namespace RttiTypes { +/** + * RttiType for the internally used DocumentField class. + */ extern const Rtti DocumentField; } + } -#endif + +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/DomainHandler.cpp b/src/core/parser/stack/DomainHandler.cpp index 6571717..a2c8eec 100644 --- a/src/core/parser/stack/DomainHandler.cpp +++ b/src/core/parser/stack/DomainHandler.cpp @@ -16,29 +16,48 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -#include "DomainHandler.hpp" - #include <core/common/RttiBuilder.hpp> +#include <core/model/Document.hpp> #include <core/model/Domain.hpp> +#include <core/model/Project.hpp> #include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> + +#include "DocumentHandler.hpp" +#include "DomainHandler.hpp" +#include "State.hpp" +#include "TypesystemHandler.hpp" namespace ousia { +namespace parser_stack { /* DomainHandler */ -void DomainHandler::start(Variant::mapType &args) +bool DomainHandler::start(Variant::mapType &args) { - Rooted<Domain> domain = project()->createDomain(args["name"].asString()); + // Create the Domain node + Rooted<Domain> domain = + context().getProject()->createDomain(args["name"].asString()); domain->setLocation(location()); + // If the domain is defined inside a document, add the reference to the + // document + Rooted<Document> document = scope().select<Document>(); + if (document != nullptr) { + document->reference(domain); + } + + // Push the typesystem onto the scope, set the POST_HEAD flag to true scope().push(domain); + scope().setFlag(ParserFlag::POST_HEAD, false); + return true; } void DomainHandler::end() { scope().pop(); } /* DomainStructHandler */ -void DomainStructHandler::start(Variant::mapType &args) +bool DomainStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -63,12 +82,13 @@ void DomainStructHandler::start(Variant::mapType &args) } scope().push(structuredClass); + return true; } void DomainStructHandler::end() { scope().pop(); } /* DomainAnnotationHandler */ -void DomainAnnotationHandler::start(Variant::mapType &args) +bool DomainAnnotationHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -79,13 +99,14 @@ void DomainAnnotationHandler::start(Variant::mapType &args) annotationClass->setLocation(location()); scope().push(annotationClass); + return true; } void DomainAnnotationHandler::end() { scope().pop(); } /* 
DomainAttributesHandler */ -void DomainAttributesHandler::start(Variant::mapType &args) +bool DomainAttributesHandler::start(Variant::mapType &args) { // Fetch the current typesystem and create the struct node Rooted<Descriptor> parent = scope().selectOrThrow<Descriptor>(); @@ -94,13 +115,14 @@ void DomainAttributesHandler::start(Variant::mapType &args) attrDesc->setLocation(location()); scope().push(attrDesc); + return true; } void DomainAttributesHandler::end() { scope().pop(); } /* DomainFieldHandler */ -void DomainFieldHandler::start(Variant::mapType &args) +bool DomainFieldHandler::start(Variant::mapType &args) { FieldDescriptor::FieldType type; if (args["isSubtree"].asBool()) { @@ -116,13 +138,14 @@ void DomainFieldHandler::start(Variant::mapType &args) field->setLocation(location()); scope().push(field); + return true; } void DomainFieldHandler::end() { scope().pop(); } /* DomainFieldRefHandler */ -void DomainFieldRefHandler::start(Variant::mapType &args) +bool DomainFieldRefHandler::start(Variant::mapType &args) { Rooted<Descriptor> parent = scope().selectOrThrow<Descriptor>(); @@ -135,13 +158,14 @@ void DomainFieldRefHandler::start(Variant::mapType &args) field.cast<FieldDescriptor>(), logger); } }); + return true; } void DomainFieldRefHandler::end() {} /* DomainPrimitiveHandler */ -void DomainPrimitiveHandler::start(Variant::mapType &args) +bool DomainPrimitiveHandler::start(Variant::mapType &args) { Rooted<Descriptor> parent = scope().selectOrThrow<Descriptor>(); @@ -167,13 +191,14 @@ void DomainPrimitiveHandler::start(Variant::mapType &args) }); scope().push(field); + return true; } void DomainPrimitiveHandler::end() { scope().pop(); } /* DomainChildHandler */ -void DomainChildHandler::start(Variant::mapType &args) +bool DomainChildHandler::start(Variant::mapType &args) { Rooted<FieldDescriptor> field = scope().selectOrThrow<FieldDescriptor>(); @@ -186,13 +211,12 @@ void DomainChildHandler::start(Variant::mapType &args) child.cast<StructuredClass>()); } 
}); + return true; } -void DomainChildHandler::end() {} - /* DomainParentHandler */ -void DomainParentHandler::start(Variant::mapType &args) +bool DomainParentHandler::start(Variant::mapType &args) { Rooted<StructuredClass> strct = scope().selectOrThrow<StructuredClass>(); @@ -200,12 +224,14 @@ void DomainParentHandler::start(Variant::mapType &args) new DomainParent(strct->getManager(), args["ref"].asString(), strct)}; parent->setLocation(location()); scope().push(parent); + return true; } void DomainParentHandler::end() { scope().pop(); } /* DomainParentFieldHandler */ -void DomainParentFieldHandler::start(Variant::mapType &args) + +bool DomainParentFieldHandler::start(Variant::mapType &args) { Rooted<DomainParent> parentNameNode = scope().selectOrThrow<DomainParent>(); FieldDescriptor::FieldType type; @@ -233,13 +259,12 @@ void DomainParentFieldHandler::start(Variant::mapType &args) field->addChild(strct.cast<StructuredClass>()); } }); + return true; } -void DomainParentFieldHandler::end() {} - /* DomainParentFieldRefHandler */ -void DomainParentFieldRefHandler::start(Variant::mapType &args) +bool DomainParentFieldRefHandler::start(Variant::mapType &args) { Rooted<DomainParent> parentNameNode = scope().selectOrThrow<DomainParent>(); @@ -265,12 +290,104 @@ void DomainParentFieldRefHandler::start(Variant::mapType &args) field->addChild(strct.cast<StructuredClass>()); } }); + return true; } -void DomainParentFieldRefHandler::end() {} +namespace States { +const State Domain = StateBuilder() + .parents({&None, &Document}) + .createdNodeType(&RttiTypes::Domain) + .elementHandler(DomainHandler::create) + .arguments({Argument::String("name")}); + +const State DomainStruct = + StateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::StructuredClass) + .elementHandler(DomainStructHandler::create) + .arguments({Argument::String("name"), + Argument::Cardinality("cardinality", Cardinality::any()), + Argument::Bool("isRoot", false), + Argument::Bool("transparent", 
false), + Argument::String("isa", "")}); + +const State DomainAnnotation = + StateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::AnnotationClass) + .elementHandler(DomainAnnotationHandler::create) + .arguments({Argument::String("name")}); + +const State DomainAttributes = + StateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(DomainAttributesHandler::create) + .arguments({}); + +const State DomainAttribute = + StateBuilder() + .parent(&DomainAttributes) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +const State DomainField = StateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldHandler::create) + .arguments({Argument::String("name", ""), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +const State DomainFieldRef = + StateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldRefHandler::create) + .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); + +const State DomainStructPrimitive = + StateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainPrimitiveHandler::create) + .arguments( + {Argument::String("name", ""), Argument::Bool("isSubtree", false), + Argument::Bool("optional", false), Argument::String("type")}); + +const State DomainStructChild = StateBuilder() + .parent(&DomainField) + .elementHandler(DomainChildHandler::create) + .arguments({Argument::String("ref")}); + +const State DomainStructParent = + StateBuilder() + .parent(&DomainStruct) + .createdNodeType(&RttiTypes::DomainParent) + .elementHandler(DomainParentHandler::create) + 
.arguments({Argument::String("ref")}); + +const State DomainStructParentField = + StateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldHandler::create) + .arguments({Argument::String("name", ""), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +const State DomainStructParentFieldRef = + StateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldRefHandler::create) + .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); +} +} namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder<ousia::DomainParent>("DomainParent").parent(&Node); +const Rtti DomainParent = RttiBuilder<ousia::parser_stack::DomainParent>( + "DomainParent").parent(&Node); } } diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 7398812..76172d6 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -19,26 +19,34 @@ /** * @file DomainHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler classes used for parsing Domain descriptors. This + * includes the "domain" tag and all describing tags below the "domain" tag. 
+ * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_DOMAIN_HANDLER_HPP_ #define _OUSIA_DOMAIN_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> +#include <core/model/Node.hpp> + +#include "Handler.hpp" namespace ousia { // Forward declarations class Rtti; -class DomainHandler : public Handler { -public: - using Handler::Handler; +namespace parser_stack { + +// TODO: Documentation - void start(Variant::mapType &args) override; +class DomainHandler : public StaticHandler { +public: + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -47,12 +55,11 @@ public: } }; -class DomainStructHandler : public Handler { +class DomainStructHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -61,12 +68,11 @@ public: } }; -class DomainAnnotationHandler : public Handler { +class DomainAnnotationHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -75,12 +81,11 @@ public: } }; -class DomainAttributesHandler : public Handler { +class DomainAttributesHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -89,12 +94,11 @@ public: } }; -class DomainFieldHandler : public Handler { +class DomainFieldHandler : public StaticHandler { public: - 
using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -103,12 +107,11 @@ public: } }; -class DomainFieldRefHandler : public Handler { +class DomainFieldRefHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -117,12 +120,11 @@ public: } }; -class DomainPrimitiveHandler : public Handler { +class DomainPrimitiveHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -131,13 +133,11 @@ public: } }; -class DomainChildHandler : public Handler { +class DomainChildHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; - void end() override; + bool start(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -150,16 +150,11 @@ public: using Node::Node; }; -namespace RttiTypes { -extern const Rtti DomainParent; -} - -class DomainParentHandler : public Handler { +class DomainParentHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; static Handler *create(const HandlerData &handlerData) @@ -168,13 +163,11 @@ public: } }; -class DomainParentFieldHandler : public Handler { +class DomainParentFieldHandler : public StaticHandler { public: - using 
Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { @@ -182,18 +175,83 @@ public: } }; -class DomainParentFieldRefHandler : public Handler { +class DomainParentFieldRefHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; static Handler *create(const HandlerData &handlerData) { return new DomainParentFieldRefHandler{handlerData}; } }; + +namespace States { +/** + * State representing a "domain" struct. + */ +extern const State Domain; + +/** + * State representing a "struct" tag within a domain description. + */ +extern const State DomainStruct; + +/** + * State representing an "annotation" tag within a domain description. + */ +extern const State DomainAnnotation; + +/** + * State representing an "attributes" tag within a structure or annotation. + */ +extern const State DomainAttributes; + +/** + * State representing an "attribute" tag within the "attributes". + */ +extern const State DomainAttribute; + +/** + * State representing a "field" tag within a structure or annotation. + */ +extern const State DomainField; + +/** + * State representing a "fieldref" tag within a structure or annotation. + */ +extern const State DomainFieldRef; + +/** + * State representing a "primitive" tag within a structure or annotation. + */ +extern const State DomainStructPrimitive; + +/** + * State representing a "child" tag within a structure or annotation. + */ +extern const State DomainStructChild; + +/** + * State representing a "parent" tag within a structure or annotation. + */ +extern const State DomainStructParent; + +/** + * State representing a "field" tag within a "parent" tag. 
+ */ +extern const State DomainStructParentField; + +/** + * State representing a "fieldRef" tag within a "parent" tag. + */ +extern const State DomainStructParentFieldRef; +} +} + +namespace RttiTypes { +extern const Rtti DomainParent; +} } #endif diff --git a/src/core/parser/stack/GenericParserStates.cpp b/src/core/parser/stack/GenericParserStates.cpp new file mode 100644 index 0000000..69a6e0e --- /dev/null +++ b/src/core/parser/stack/GenericParserStates.cpp @@ -0,0 +1,53 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "DocumentHandler.hpp" +#include "DomainHandler.hpp" +#include "GenericParserStates.hpp" +#include "ImportIncludeHandler.hpp" +#include "TypesystemHandler.hpp" + +namespace ousia { +namespace parser_stack { + +const std::multimap<std::string, const State *> GenericParserStates{ + {"document", &States::Document}, + {"*", &States::DocumentChild}, + {"domain", &States::Domain}, + {"struct", &States::DomainStruct}, + {"annotation", &States::DomainAnnotation}, + {"attributes", &States::DomainAttributes}, + {"attribute", &States::DomainAttribute}, + {"field", &States::DomainField}, + {"fieldRef", &States::DomainFieldRef}, + {"primitive", &States::DomainStructPrimitive}, + {"childRef", &States::DomainStructChild}, + {"parentRef", &States::DomainStructParent}, + {"field", &States::DomainStructParentField}, + {"fieldRef", &States::DomainStructParentFieldRef}, + {"typesystem", &States::Typesystem}, + {"enum", &States::TypesystemEnum}, + {"entry", &States::TypesystemEnumEntry}, + {"struct", &States::TypesystemStruct}, + {"field", &States::TypesystemStructField}, + {"constant", &States::TypesystemConstant}, + {"import", &States::Import}, + {"include", &States::Include}}; +} +} + diff --git a/src/core/parser/generic/GenericParser.hpp b/src/core/parser/stack/GenericParserStates.hpp index 4f29f94..552eee5 100644 --- a/src/core/parser/generic/GenericParser.hpp +++ b/src/core/parser/stack/GenericParserStates.hpp @@ -17,33 +17,33 @@ */ /** - * @file GenericParser.hpp + * @file GenericParserStates.hpp * - * The GenericParser class builds an abstraction layer that separates the - * underlying document format (e.g. osdm or osdmx) from the actual process of - * building the document model. It provides a set of genric functions that - * should be called by the inheriting concrete parser class, e.g. indicating a - * command with parameters, the start/end of a field or the start/end of an - * annotation. 
The GenericParser maintains an internal stack of - * ParserStateHandlers and relays the commands to the elements of this stack. + * Contains a multimap which maps between tag/command names to the corresponding + * state descriptors. This multimap is used to initialize the push down + * automaton residing inside the "Stack" class. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_GENERIC_PARSER_HPP_ -#define _OUSIA_GENERIC_PARSER_HPP_ +#ifndef _OUSIA_PARSER_STACK_GENERIC_PARSER_STATES_HPP_ +#define _OUSIA_PARSER_STACK_GENERIC_PARSER_STATES_HPP_ -#include <core/parser/Parseer.hpp> +#include <string> +#include <map> namespace ousia { +namespace parser_stack { -class GenericParser : public Parser { - - - -}; +// Forward declarations +class State; +/** + * Map between tagnames and references to the corresponding State instances. + */ +extern const std::multimap<std::string, const State *> GenericParserStates; +} } -#endif _OUSIA_GENERIC_PARSER_HPP_ +#endif /* _OUSIA_PARSER_STACK_GENERIC_PARSER_STATES_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp new file mode 100644 index 0000000..bf5d4ea --- /dev/null +++ b/src/core/parser/stack/Handler.cpp @@ -0,0 +1,254 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <core/common/Exceptions.hpp> +#include <core/common/Logger.hpp> +#include <core/parser/ParserContext.hpp> + +#include "Callbacks.hpp" +#include "Handler.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerData */ + +HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/ + const std::string &name, const State &state, + const SourceLocation &location) + : ctx(ctx), + /*callbacks(callbacks),*/ + name(name), + state(state), + location(location) +{ +} + +/* Class Handler */ + +Handler::Handler(const HandlerData &handlerData) + : handlerData(handlerData), internalLogger(nullptr) +{ +} + +Handler::~Handler() {} + +ParserContext &Handler::context() { return handlerData.ctx; } + +ParserScope &Handler::scope() { return handlerData.ctx.getScope(); } + +Manager &Handler::manager() { return handlerData.ctx.getManager(); } + +Logger &Handler::logger() +{ + if (internalLogger != nullptr) { + return *internalLogger; + } + return handlerData.ctx.getLogger(); +} + +const SourceLocation &Handler::location() const { return handlerData.location; } + +const std::string &Handler::name() const { return handlerData.name; } + +void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ +} + +void Handler::registerToken(const std::string &token) +{ + /*handlerData.callbacks.registerToken(token);*/ +} + +void Handler::unregisterToken(const std::string &token) +{ + /*handlerData.callbacks.unregisterToken(token);*/ +} + +const std::string &Handler::getName() const { return name(); } + +const State &Handler::getState() const { return handlerData.state; } + +void Handler::setLogger(Logger &logger) { internalLogger = &logger; } + +void Handler::resetLogger() { internalLogger = nullptr; } + +const SourceLocation &Handler::getLocation() const { return location(); } + +/* Class EmptyHandler */ + +bool EmptyHandler::start(Variant::mapType &args) +{ + // Just 
accept anything + return true; +} + +void EmptyHandler::end() +{ + // Do nothing if a command ends +} + +bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) +{ + // Accept any field + return true; +} + +void EmptyHandler::fieldEnd() +{ + // Do not handle fields +} + +bool EmptyHandler::annotationStart(const Variant &className, + Variant::mapType &args) +{ + // Accept any data + return true; +} + +bool EmptyHandler::annotationEnd(const Variant &className, + const Variant &elementName) +{ + // Accept any annotation + return true; +} + +bool EmptyHandler::data(Variant &data) +{ + // Support any data + return true; +} + +Handler *EmptyHandler::create(const HandlerData &handlerData) +{ + return new EmptyHandler(handlerData); +} + +/* Class StaticHandler */ + +bool StaticHandler::start(Variant::mapType &args) +{ + // Do nothing in the default implementation, accept anything + return true; +} + +void StaticHandler::end() +{ + // Do nothing here +} + +bool StaticHandler::fieldStart(bool &isDefault, size_t fieldIdx) +{ + // Return true if either the default field is requested or the field index + // is zero. 
This simulates that there is exactly one field (a default field) + if (fieldIdx == 0) { + isDefault = true; + return true; + } + return false; +} + +void StaticHandler::fieldEnd() +{ + // Do nothing here +} + +bool StaticHandler::annotationStart(const Variant &className, + Variant::mapType &args) +{ + // No annotations supported + return false; +} + +bool StaticHandler::annotationEnd(const Variant &className, + const Variant &elementName) +{ + // No annotations supported + return false; +} + +bool StaticHandler::data(Variant &data) +{ + logger().error("Did not expect any data here", data); + return false; +} + +/* Class StaticFieldHandler */ + +StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, + const std::string &argName) + : StaticHandler(handlerData), argName(argName), handled(false) +{ +} + +bool StaticFieldHandler::start(Variant::mapType &args) +{ + if (!argName.empty()) { + auto it = args.find(argName); + if (it != args.end() && !it->second.toString().empty()) { + handled = true; + doHandle(it->second, args); + return true; + } + } + + this->args = args; + return true; +} + +void StaticFieldHandler::end() +{ + if (!handled) { + if (!argName.empty()) { + logger().error(std::string("Required argument \"") + argName + + std::string("\" is missing."), + location()); + } else { + logger().error("Command requires data, but no data given", + location()); + } + } +} + +bool StaticFieldHandler::data(Variant &data) +{ + // Call the doHandle function if this has not been done before + if (!handled) { + handled = true; + doHandle(data, args); + return true; + } + + // The doHandle function was already called, print an error message + logger().error( + std::string("Found data, but the corresponding argument \"") + argName + + std::string("\" was already specified"), + data); + + // Print the location at which the attribute was originally specified + auto it = args.find(argName); + if (it != args.end()) { + logger().note(std::string("Attribute was 
specified here:"), it->second); + } + return false; +} +} +} + diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp new file mode 100644 index 0000000..7cda7a4 --- /dev/null +++ b/src/core/parser/stack/Handler.hpp @@ -0,0 +1,421 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef _OUSIA_PARSER_STACK_HANDLER_HPP_ +#define _OUSIA_PARSER_STACK_HANDLER_HPP_ + +#include <string> + +#include <core/common/Location.hpp> +#include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp> + +namespace ousia { + +// Forward declarations +class ParserScope; +class ParserContext; +class Logger; + +namespace parser_stack { + +// More forward declarations +class Callbacks; +class State; + +/** + * Class collecting all the data that is being passed to a Handler + * instance. + */ +class HandlerData { +public: + /** + * Reference to the ParserContext instance that should be used to resolve + * references to nodes in the Graph. + */ + ParserContext &ctx; + + /** + * Reference at an instance of the Callbacks class, used for + * modifying the behaviour of the parser (like registering tokens, setting + * the data type or changing the whitespace handling mode). + */ + // Callbacks &callbacks; + + /** + * Contains the name of the command that is being handled. 
+ */ + std::string name; + + /** + * Contains the current state of the state machine. + */ + const State &state; + + /** + * Current source code location. + */ + SourceLocation location; + + /** + * Constructor of the HandlerData class. + * + * @param ctx is the parser context the handler should be executed in. + * @param callbacks is an instance of Callbacks used to notify + * the parser about certain state changes. + * @param name is the name of the command that is being handled. + * @param state is the state this handler was called for. + * @param location is the location at which the handler is created. + */ + HandlerData(ParserContext &ctx, + /*Callbacks &callbacks,*/ const std::string &name, + const State &state, const SourceLocation &location); +}; + +/** + * The Handler class provides a context for handling a generic stack element. + * It has to be overridden and registered in the StateStack class to form + * handlers for concrete XML tags. + */ +class Handler { +private: + /** + * Structure containing the internal handler data. + */ + const HandlerData handlerData; + + /** + * Reference at the current logger. If not nullptr, this will override the + * logger from the ParserContext specified in the handlerData. + */ + Logger *internalLogger; + +protected: + /** + * Constructor of the Handler class. + * + * @param handlerData is a structure containing all data being passed to the + * handler. + */ + Handler(const HandlerData &handlerData); + + /** + * Returns a reference at the ParserContext. + * + * @return a reference at the ParserContext. + */ + ParserContext &context(); + + /** + * Returns a reference at the ParserScope instance. + * + * @return a reference at the ParserScope instance. + */ + ParserScope &scope(); + + /** + * Returns a reference at the Manager instance which manages all nodes. + * + * @return a reference at the Manager instance. + */ + Manager &manager(); + + /** + * Returns a reference at the Logger instance used for logging error + * messages. 
+ * + * @return a reference at the Logger instance. + */ + Logger &logger(); + + /** + * Returns the location of the element in the source file, for which this + * Handler was created. + * + * @return the location of the Handler in the source file. + */ + const SourceLocation &location() const; + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name() const; + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); + + /** + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Calls the corresponding function in the Callbacks instance. + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + void registerToken(const std::string &token); + + /** + * Calls the corresponding function in the Callbacks instance. + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + void unregisterToken(const std::string &token); + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &getName() const; + + /** + * Reference at the State descriptor for which this Handler was created. + * + * @return a const reference at the constructing State descriptor. + */ + const State &getState() const; + + /** + * Sets the internal logger to the given logger instance. 
+ * + * @param logger is the Logger instance to which the logger should be set. + */ + void setLogger(Logger &logger); + + /** + * Resets the logger instance to the logger instance provided in the + * ParserContext. + */ + void resetLogger(); + + /** + * Returns the location of the element in the source file, for which this + * Handler was created. + * + * @return the location of the Handler in the source file. + */ + const SourceLocation &getLocation() const; + + /** + * Called when the command that was specified in the constructor is + * instantiated. + * + * @param args is a map from strings to variants (argument name and value). + * @return true if the handler was successful in starting the element it + * represents, false otherwise. + */ + virtual bool start(Variant::mapType &args) = 0; + + /** + * Called before the command for which this handler is defined ends (is + * forever removed from the stack). + */ + virtual void end() = 0; + + /** + * Called when a new field starts, while the handler is active. This + * function should return true if the field is supported, false otherwise. + * No error should be logged if the field cannot be started, the caller will + * take care of that (since it is always valid to start a default field, + * even though the corresponding structure does not have a field, as long as + * no data is fed into the field). + * + * @param isDefault is set to true if the field that is being started is the + * default/tree field. The handler should set the value of this variable to + * true if the referenced field is indeed the default field. + * @param fieldIdx is the numerical index of the field. + */ + virtual bool fieldStart(bool &isDefault, size_t fieldIdx) = 0; + + /** + * Called when a previously opened field ends, while the handler is active. + * Note that a "fieldStart" and "fieldEnd" are always called alternately. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever an annotation starts while this handler is active. 
The + * function should return true if starting the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the name in the source code. + * @param args is a map from strings to variants (argument name and value). + * @return true if the mentioned annotation could be started here, false + * if an error occurred. + */ + virtual bool annotationStart(const Variant &className, + Variant::mapType &args) = 0; + + /** + * Called whenever an annotation ends while this handler is active. The + * function should return true if ending the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the class name in the source code. + * @param elementName is a string variant containing the name of the + * element and the location of the element name in the source code. + * @return true if the mentioned annotation could be ended here, false if + * an error occurred. + */ + virtual bool annotationEnd(const Variant &className, + const Variant &elementName) = 0; + + /** + * Called whenever raw data (in the form of a string) is available for the + * Handler instance. Should return true if the data could be handled, false + * otherwise. + * + * @param data is a string variant containing the character data and its + * location. + * @return true if the data could be handled, false otherwise. + */ + virtual bool data(Variant &data) = 0; +}; + +/** + * HandlerConstructor is a function pointer type used to create concrete + * instances of the Handler class. + * + * @param handlerData is the data that should be passed to the new handler + * instance. + * @return a newly created handler instance. 
+ */ +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * The EmptyHandler class is used in case no element handler is specified in + * the State descriptor. It just accepts all data and does nothing. + */ +class EmptyHandler : public Handler { +protected: + using Handler::Handler; + +public: + bool start(Variant::mapType &args) override; + void end() override; + bool fieldStart(bool &isDefault, size_t fieldIdx) override; + void fieldEnd() override; + bool annotationStart(const Variant &className, + Variant::mapType &args) override; + bool annotationEnd(const Variant &className, + const Variant &elementName) override; + bool data(Variant &data) override; + + /** + * Creates an instance of the EmptyHandler class. + */ + static Handler *create(const HandlerData &handlerData); +}; + +/** + * The StaticHandler class is used to handle predefined commands which + * support neither annotations nor multiple fields. Child classes can decide + * whether a single data field should be used. + */ +class StaticHandler : public Handler { +protected: + using Handler::Handler; + +public: + bool start(Variant::mapType &args) override; + void end() override; + bool fieldStart(bool &isDefault, size_t fieldIdx) override; + void fieldEnd() override; + bool annotationStart(const Variant &className, + Variant::mapType &args) override; + bool annotationEnd(const Variant &className, + const Variant &elementName) override; + bool data(Variant &data) override; +}; + +/** + * The StaticFieldHandler class is used to handle predefined commands which + * support neither annotations nor multiple fields. Additionally, it captures a + * data entry from a single default field. + */ +class StaticFieldHandler : public StaticHandler { +private: + /** + * Set to the name of the data argument that should be used instead of the + * data field, if no data field is given. + */ + std::string argName; + + /** + * Set to true, once the "doHandle" function has been called. 
+ */ + bool handled; + + /** + * Map containing the arguments given in the start function. + */ + Variant::mapType args; + +protected: + /** + * Constructor of the StaticFieldHandler class. + * + * @param handlerData is a structure containing the internal data that + * should be stored inside the handler. + * @param argName is the name of the data argument that -- if present -- should be used + * instead of the data field. If empty, data is not captured from the + * arguments. If both data in the data field and the argument are given, + * this results in an error. + */ + StaticFieldHandler(const HandlerData &handlerData, + const std::string &argName); + + /** + * Function that should be overridden in order to handle the field data and + * the other arguments. This function is not called if no data was given. + * + * @param fieldData is the captured field data. + * @param args are the arguments that were given in the "start" function. + */ + virtual void doHandle(const Variant &fieldData, + Variant::mapType &args) = 0; + +public: + bool start(Variant::mapType &args) override; + void end() override; + bool data(Variant &data) override; +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/ImportIncludeHandler.cpp b/src/core/parser/stack/ImportIncludeHandler.cpp index 94ee82d..d1ea97d 100644 --- a/src/core/parser/stack/ImportIncludeHandler.cpp +++ b/src/core/parser/stack/ImportIncludeHandler.cpp @@ -16,50 +16,22 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -#include "ImportIncludeHandler.hpp" - +#include <core/model/RootNode.hpp> #include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> -namespace ousia { - -/* ImportIncludeHandler */ - -void ImportIncludeHandler::start(Variant::mapType &args) -{ - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); -} +#include "DomainHandler.hpp" +#include "DocumentHandler.hpp" +#include "ImportIncludeHandler.hpp" +#include "State.hpp" +#include "TypesystemHandler.hpp" -void ImportIncludeHandler::data(const std::string &data, int field) -{ - if (srcInArgs) { - logger().error("\"src\" attribute has already been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); -} +namespace ousia { +namespace parser_stack { /* ImportHandler */ -void ImportHandler::start(Variant::mapType &args) -{ - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } -} - -void ImportHandler::end() +void ImportHandler::doHandle(const Variant &fieldData, Variant::mapType &args) { // Fetch the last node and check whether an import is valid at this // position @@ -75,8 +47,9 @@ void ImportHandler::end() // Perform the actual import, register the imported node within the leaf // node - Rooted<Node> imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); + Rooted<Node> imported = context().import( + fieldData.asString(), args["type"].asString(), args["rel"].asString(), + leafRootNode->getReferenceTypes()); if (imported != nullptr) { leafRootNode->reference(imported); } @@ -84,13 +57,26 @@ void ImportHandler::end() /* IncludeHandler */ -void IncludeHandler::start(Variant::mapType &args) +void IncludeHandler::doHandle(const Variant &fieldData, Variant::mapType 
&args) { - ImportIncludeHandler::start(args); + context().include(fieldData.asString(), args["type"].asString(), + args["rel"].asString(), {&RttiTypes::Node}); } -void IncludeHandler::end() -{ - context().include(src, type, rel, {&RttiTypes::Node}); +namespace States { +const State Import = + StateBuilder() + .parents({&Document, &Typesystem, &Domain}) + .elementHandler(ImportHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +const State Include = + StateBuilder() + .parent(&All) + .elementHandler(IncludeHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); +} } } diff --git a/src/core/parser/stack/ImportIncludeHandler.hpp b/src/core/parser/stack/ImportIncludeHandler.hpp index b0767be..6168639 100644 --- a/src/core/parser/stack/ImportIncludeHandler.hpp +++ b/src/core/parser/stack/ImportIncludeHandler.hpp @@ -19,6 +19,9 @@ /** * @file ImportIncludeHandler.hpp * + * Contains the conceptually similar handlers for the "include" and "import" + * commands. + * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -26,51 +29,78 @@ #define _OUSIA_IMPORT_INCLUDE_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> - -namespace ousia { -class ImportIncludeHandler : public Handler { -protected: - bool srcInArgs = false; - std::string rel; - std::string type; - std::string src; +#include "Handler.hpp" -public: - using Handler::Handler; - - void start(Variant::mapType &args) override; - - void data(const std::string &data, int field) override; -}; +namespace ousia { +namespace parser_stack { -class ImportHandler : public ImportIncludeHandler { +/** + * The ImportHandler is responsible for handling the "import" command. An import + * creates a reference to a specified file. The specified file is parsed (if + * this has not already been done) outside of the context of the current file. 
+ * If the specified resource has already been parsed, a reference to the already + * parsed file is inserted. Imports are only possible before no other content + * has been parsed. + */ +class ImportHandler : public StaticFieldHandler { public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override; - - void end() override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + Variant::mapType &args) override; + + /** + * Creates a new instance of the ImportHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { - return new ImportHandler{handlerData}; + return new ImportHandler{handlerData, "src"}; } }; -class IncludeHandler : public ImportIncludeHandler { +/** + * The IncludeHandler is responsible for handling the "include" command. The + * included file is parsed in the context of the current file and will change + * the content that is currently being parsed. Includes are possible at (almost) + * any position in the source file. + */ +class IncludeHandler : public StaticFieldHandler { public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override; - - void end() override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + Variant::mapType &args) override; + + /** + * Creates a new instance of the IncludeHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. 
+ */ static Handler *create(const HandlerData &handlerData) { - return new IncludeHandler{handlerData}; + return new IncludeHandler{handlerData, "src"}; } }; + +namespace States { +/** + * State representing the "import" command. + */ +extern const State Import; + +/** + * State representing the "include" command. + */ +extern const State Include; +} + +} } #endif diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp new file mode 100644 index 0000000..47f7d2c --- /dev/null +++ b/src/core/parser/stack/Stack.cpp @@ -0,0 +1,550 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <sstream> + +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> +#include <core/common/Exceptions.hpp> +#include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> + +#include "Handler.hpp" +#include "Stack.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerInfo */ + +HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} + +HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler) + : handler(handler), + fieldIdx(0), + valid(true), + implicit(false), + inField(false), + inDefaultField(false), + inImplicitDefaultField(false), + inValidField(false), + hadDefaultField(false) +{ +} + +HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, + bool inDefaultField, bool inImplicitDefaultField, + bool inValidField) + : handler(nullptr), + fieldIdx(0), + valid(valid), + implicit(implicit), + inField(inField), + inDefaultField(inDefaultField), + inImplicitDefaultField(inImplicitDefaultField), + inValidField(inValidField), + hadDefaultField(false) +{ +} + +HandlerInfo::~HandlerInfo() +{ + // Do nothing +} + +void HandlerInfo::fieldStart(bool isDefault, bool isImplicit, bool isValid) +{ + inField = true; + inDefaultField = isDefault || isImplicit; + inImplicitDefaultField = isImplicit; + inValidField = isValid; + hadDefaultField = hadDefaultField || inDefaultField; + fieldIdx++; +} + +void HandlerInfo::fieldEnd() +{ + inField = false; + inDefaultField = false; + inImplicitDefaultField = false; + inValidField = false; +} + +/** + * Stub instance of HandlerInfo containing no handler information. + */ +static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; + +/* Helper functions */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + * + * @param name is the name of the command for which no state transition is + * found. + * @param expected is a set containing the names of the expected commands. 
+ */ +static LoggableException buildInvalidCommandException( + const std::string &name, const std::set<std::string> &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +/* Class Stack */ + +Stack::Stack(ParserContext &ctx, + const std::multimap<std::string, const State *> &states) + : ctx(ctx), states(states) +{ + // If the scope instance is not empty we need to deduce the current parser + // state + if (!ctx.getScope().isEmpty()) { + deduceState(); + } +} + +Stack::~Stack() +{ + while (!stack.empty()) { + // Fetch the topmost stack element + HandlerInfo &info = currentInfo(); + + // It is an error if we're still in a field of an element while the + // Stack instance is destroyed. Log that + if (handlersValid()) { + if (info.inField && !info.implicit && + !info.inImplicitDefaultField) { + logger().error( + std::string("Reached end of stream, but command \"") + + info.handler->getName() + + "\" has not ended yet. 
Command was started here:", + info.handler->getLocation()); + } + } + + // Remove the command from the stack + endCurrentHandler(); + } +} + +void Stack::deduceState() +{ + // Assemble all states + std::vector<const State *> states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector<const State *> possibleStates = + StateDeductor(ctx.getScope().getStackTypeSignature(), states).deduce(); + if (possibleStates.size() != 1U) { + throw LoggableException( + "Error while including file: Cannot deduce parser state."); + } + + // Switch to this state by creating a handler, but do not call its start + // function + const State &state = *possibleStates[0]; + HandlerConstructor ctor = + state.elementHandler ? state.elementHandler : EmptyHandler::create; + + std::shared_ptr<Handler> handler = + std::shared_ptr<Handler>{ctor({ctx, "", state, SourceLocation{}})}; + stack.emplace_back(handler); + + // Set the correct flags for this implicit handler + HandlerInfo &info = currentInfo(); + info.implicit = true; + info.fieldStart(true, false, true); +} + +std::set<std::string> Stack::expectedCommands() +{ + const State *currentState = &(this->currentState()); + std::set<std::string> res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const State &Stack::currentState() +{ + return stack.empty() ? States::None : stack.back().handler->getState(); +} + +std::string Stack::currentCommandName() +{ + return stack.empty() ? 
std::string{} : stack.back().handler->getName(); +} + +const State *Stack::findTargetState(const std::string &name) +{ + const State *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const StateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&States::All)) { + return it->second; + } + } + + return nullptr; +} + +const State *Stack::findTargetStateOrWildcard(const std::string &name) +{ + // Try to find the target state with the given name, if none is found, try + // find a matching "*" state. + State const *targetState = findTargetState(name); + if (targetState == nullptr) { + return findTargetState("*"); + } + return targetState; +} + +HandlerInfo &Stack::currentInfo() +{ + return stack.empty() ? EmptyHandlerInfo : stack.back(); +} +HandlerInfo &Stack::lastInfo() +{ + return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; +} + +void Stack::endCurrentHandler() +{ + if (!stack.empty()) { + // Fetch the handler info for the current top-level element + HandlerInfo &info = stack.back(); + + // Do not call any callback functions while the stack is marked as + // invalid or this is an elment marked as "implicit" + if (!info.implicit && handlersValid()) { + // Make sure the fieldEnd handler is called if the element still + // is in a field + if (info.inField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // Call the "end" function of the corresponding Handler instance + info.handler->end(); + } + + // Remove the element from the stack + stack.pop_back(); + } +} + +bool Stack::ensureHandlerIsInField() +{ + // If the current handler is not in a field (and actually has a handler) + // try to start a default field + HandlerInfo &info = currentInfo(); + if (!info.inField && info.handler != nullptr) { + // Abort if the element already had a default field + if (info.hadDefaultField) { + return false; + } + + // Try to start 
a new default field, abort if this did not work + bool isDefault = true; + if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { + info.handler->fieldEnd(); + endCurrentHandler(); + return false; + } + + // Mark the field as started + info.fieldStart(true, true, true); + } + return true; +} + +bool Stack::handlersValid() +{ + for (auto it = stack.crbegin(); it != stack.crend(); it++) { + if (!it->valid) { + return false; + } + } + return true; +} + +Logger &Stack::logger() { return ctx.getLogger(); } + +void Stack::command(const Variant &name, const Variant::mapType &args) +{ + // Make sure the given identifier is valid (preventing "*" from being + // malicously passed to this function) + if (!Utils::isNamespacedIdentifier(name.asString())) { + throw LoggableException(std::string("Invalid identifier \"") + + name.asString() + std::string("\""), + name); + } + + while (true) { + // Try to find a target state for the given command, if none can be + // found and the current command does not have an open field, then try + // to create an empty default field, otherwise this is an exception + const State *targetState = findTargetStateOrWildcard(name.asString()); + if (targetState == nullptr) { + if (!currentInfo().inField) { + endCurrentHandler(); + continue; + } else { + throw buildInvalidCommandException(name.asString(), + expectedCommands()); + } + } + + // Make sure we're currently inside a field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // Fork the logger. We do not want any validation errors to skip + LoggerFork loggerFork = logger().fork(); + + // Instantiate the handler and push it onto the stack + HandlerConstructor ctor = targetState->elementHandler + ? 
targetState->elementHandler + : EmptyHandler::create; + std::shared_ptr<Handler> handler{ + ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + stack.emplace_back(handler); + + // Fetch the HandlerInfo for the parent element and the current element + HandlerInfo &parentInfo = lastInfo(); + HandlerInfo &info = currentInfo(); + + // Call the "start" method of the handler, store the result of the start + // method as the validity of the handler -- do not call the start method + // if the stack is currently invalid (as this may cause further, + // unwanted errors) + bool validStack = handlersValid(); + info.valid = false; + if (validStack) { + // Canonicalize the arguments (if this has not already been done), + // allow additional arguments + Variant::mapType canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + + handler->setLogger(loggerFork); + try { + info.valid = handler->start(canonicalArgs); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + handler->resetLogger(); + } + + // We started the command within an implicit default field and it is not + // valid -- remove both the new handler and the parent field from the + // stack + if (!info.valid && parentInfo.inImplicitDefaultField) { + endCurrentHandler(); + endCurrentHandler(); + continue; + } + + // If we ended up here, starting the command may or may not have worked, + // but after all, we cannot unroll the stack any further. Update the + // "valid" flag, commit any potential error messages and return. 
+ info.valid = parentInfo.valid && info.valid; + loggerFork.commit(); + return; + } +} + +void Stack::data(const Variant &data) +{ + while (true) { + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException("No command here to receive data."); + } + + // Fetch the current command handler information + HandlerInfo &info = currentInfo(); + + // Make sure the current handler has an open field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // If this field should not get any data, log an error and do not call + // the "data" handler + if (!info.inValidField) { + logger().error("Did not expect any data here", data); + } + + if (handlersValid() && info.inValidField) { + // Fork the logger and set it as temporary logger for the "start" + // method. We only want to keep error messages if this was not a try + // to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + Variant dataCopy = data; + valid = info.handler->data(dataCopy); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid + // flag. 
+ loggerFork.commit(); + } + + // There was no reason to unroll the stack any further, so continue + return; + } +} + +void Stack::fieldStart(bool isDefault) +{ + // Make sure the current handler stack is not empty + if (stack.empty()) { + throw LoggableException( + "No command for which a field could be started"); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (info.inField) { + logger().error( + "Got field start, but there is no command for which to start the " + "field."); + return; + } + + // Copy the isDefault flag to a local variable, the fieldStart method will + // write into this variable + bool defaultField = isDefault; + + // Do not call the "fieldStart" function if we're in an invalid subtree + bool valid = false; + if (handlersValid()) { + try { + valid = info.handler->fieldStart(defaultField, info.fieldIdx); + } + catch (LoggableException ex) { + logger().log(ex); + } + if (!valid && !defaultField) { + logger().error( + std::string("Cannot start a new field here (index ") + + std::to_string(info.fieldIdx + 1) + + std::string("), field does not exist")); + } + } + + // Mark the field as started + info.fieldStart(defaultField, false, valid); +} + +void Stack::fieldEnd() +{ + // Make sure the current handler stack is not empty + if (stack.empty()) { + throw LoggableException("No command for which a field could be ended"); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (!info.inField) { + logger().error( + "Got field end, but there is no command for which to end the " + "field."); + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + try { + info.handler->fieldEnd(); + } + catch (LoggableException ex) { + logger().log(ex); + } + } + + // This command no longer is in a field + info.fieldEnd(); + + // As soon as 
this command had a default field, remove it from the stack + if (info.hadDefaultField) { + endCurrentHandler(); + } +} + +void Stack::annotationStart(const Variant &className, const Variant &args) +{ + // TODO +} + +void Stack::annotationEnd(const Variant &className, const Variant &elementName) +{ + // TODO +} + +void Stack::token(Variant token) +{ + // TODO +} +} +} + diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp new file mode 100644 index 0000000..76eefd9 --- /dev/null +++ b/src/core/parser/stack/Stack.hpp @@ -0,0 +1,341 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Stack.hpp + * + * Helper classes for document or description parsers. Contains the + * Stack class, which is an pushdown automaton responsible for + * accepting commands in the correct order and calling specified handlers. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_STACK_HPP_ +#define _OUSIA_PARSER_STACK_STACK_HPP_ + +#include <cstdint> + +#include <map> +#include <memory> +#include <set> +#include <vector> + +#include <core/common/Variant.hpp> +#include <core/parser/Parser.hpp> + +namespace ousia { + +// Forward declarations +class ParserContext; +class Logger; + +namespace parser_stack { + +// Forward declarations +class Handler; +class State; + +/** + * The HandlerInfo class is used internally by the stack to associate additional + * (mutable) data with a handler instance. + */ +class HandlerInfo { +public: + /** + * Pointer pointing at the actual handler instance. + */ + std::shared_ptr<Handler> handler; + + /** + * Next field index to be passed to the "fieldStart" function of the Handler + * class. + */ + size_t fieldIdx; + + /** + * Set to true if the handler is valid (which is the case if the "start" + * method has returned true). If the handler is invalid, no more calls are + * directed at it until it can be removed from the stack. + */ + bool valid : 1; + + /** + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handler currently is in a field. + */ + bool inField : 1; + + /** + * Set to true if the handler currently is in the default field. + */ + bool inDefaultField : 1; + + /** + * Set to true if the handler currently is in an implicitly started default + * field. + */ + bool inImplicitDefaultField : 1; + + /** + * Set to false if this field is only opened pro-forma and does not accept + * any data. Otherwise set to true. + */ + bool inValidField : 1; + + /** + * Set to true, if the default field was already started. + */ + bool hadDefaultField : 1; + + /** + * Default constructor of the HandlerInfo class. 
+ */ + HandlerInfo(); + /** + * Constructor of the HandlerInfo class, allows to set all flags manually. + */ + HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField, bool inValidField); + + /** + * Constructor of the HandlerInfo class, taking a shared_ptr to the handler + * to which additional information should be attached. + */ + HandlerInfo(std::shared_ptr<Handler> handler); + + /** + * Destructor of the HandlerInfo class (to allow Handler to be forward + * declared). + */ + ~HandlerInfo(); + + /** + * Updates the "field" flags according to a "fieldStart" event. + */ + void fieldStart(bool isDefault, bool isImplicit, bool isValid); + + /** + * Updates the "fields" flags according to a "fieldEnd" event. + */ + void fieldEnd(); +}; + +/** + * The Stack class is a pushdown automaton responsible for turning a command + * stream into a tree of Node instances. It does so by following a state + * transition graph and creating a set of Handler instances, which are placed + * on the stack. + */ +class Stack { +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap<std::string, const State *> &states; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::vector<HandlerInfo> stack; + + /** + * Return the reference in the Logger instance stored within the context. + */ + Logger &logger(); + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set<std::string> expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state. + * + * @param name is the name of the requested command. 
+ * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetState(const std::string &name); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files, as the Parser of the included file needs to be brought to + * an equivalent state as the one in the including file. + */ + void deduceState(); + + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo &currentInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + */ + void endCurrentHandler(); + + /** + * Tries to start a default field for the current handler, if currently the + * handler is not inside a field and did not have a default field yet. + * + * @return true if the handler is inside a field, false if no field could + * be started. + */ + bool ensureHandlerIsInField(); + + /** + * Returns true if all handlers on the stack are currently valid, or false + * if at least one handler is invalid. + * + * @return true if all handlers on the stack are valid. + */ + bool handlersValid(); + +public: + /** + * Creates a new instance of the Stack class. 
+ * + * @param ctx is the parser context the parser stack is working on. + * @param states is a map containing the command names and pointers at the + * corresponding State instances. + */ + Stack(ParserContext &ctx, + const std::multimap<std::string, const State *> &states); + + /** + * Destructor of the Stack class. + */ + ~Stack(); + + /** + * Returns the state the Stack instance currently is in. + * + * @return the state of the currently active Handler instance or States::None + * if no handler is on the stack. + */ + const State &currentState(); + + /** + * Returns the command name that is currently being handled. + * + * @return the name of the command currently being handled by the active + * Handler instance or an empty string if no handler is currently active. + */ + std::string currentCommandName(); + + /** + * Function that should be called whenever a new command is reached. + * + * @param name is the name of the command (including the namespace + * separator ':') and its corresponding location. Must be a string variant. + * @param args is a map containing the arguments that were passed to the + * command. + */ + void command(const Variant &name, const Variant::mapType &args); + + /** + * Function that should be called whenever character data is found in the + * input stream. May only be called if there currently is a command on the + * stack. + * + * @param data is a string variant containing the data that has been found. + */ + void data(const Variant &data); + + /** + * Function that should be called whenever a new field starts. Fields of the + * same command may not be separated by calls to data or annotations. Doing + * so will result in a LoggableException. + * + * @param isDefault should be set to true if the started field explicitly + * is the default field. + */ + void fieldStart(bool isDefault); + + /** + * Function that should be called whenever a field ends. Calling this + * function if there is no field to end will result in a LoggableException. 
+ */ + void fieldEnd(); + + /** + * Function that should be called whenever an annotation starts. + * + * @param className is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + */ + void annotationStart(const Variant &className, const Variant &args); + + /** + * Function that should be called whenever an annotation ends. + * + * @param className is the name of the annotation class that was ended. + * @param elementName is the name of the annotation that was ended. + */ + void annotationEnd(const Variant &className, const Variant &elementName); + + /** + * Function that should be called whenever a previously registered token + * is found in the input stream. + * + * @param token is a string variant containing the token that was encountered. + */ + void token(Variant token); +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_STACK_HPP_ */ + diff --git a/src/core/parser/ParserState.cpp b/src/core/parser/stack/State.cpp index f635d86..d72f533 100644 --- a/src/core/parser/ParserState.cpp +++ b/src/core/parser/stack/State.cpp @@ -16,88 +16,97 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -#include "ParserState.hpp" +#include "State.hpp" namespace ousia { +namespace parser_stack { -/* Class ParserState */ +/* Class State */ -ParserState::ParserState() : elementHandler(nullptr) {} +State::State() : elementHandler(nullptr) {} -ParserState::ParserState(ParserStateSet parents, Arguments arguments, +State::State(StateSet parents, Arguments arguments, RttiSet createdNodeTypes, - HandlerConstructor elementHandler) + HandlerConstructor elementHandler, + bool supportsAnnotations) : parents(parents), arguments(arguments), createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) + elementHandler(elementHandler), + supportsAnnotations(supportsAnnotations) { } -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) +State::State(const StateBuilder &builder) + : State(builder.build()) { } -/* Class ParserStateBuilder */ +/* Class StateBuilder */ -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) +StateBuilder &StateBuilder::copy(const State &state) { this->state = state; return *this; } -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) +StateBuilder &StateBuilder::parent(const State *parent) { - state.parents = ParserStateSet{parent}; + state.parents = StateSet{parent}; return *this; } -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) +StateBuilder &StateBuilder::parents(const StateSet &parents) { state.parents = parents; return *this; } -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) +StateBuilder &StateBuilder::arguments(const Arguments &arguments) { state.arguments = arguments; return *this; } -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) +StateBuilder &StateBuilder::createdNodeType(const Rtti *type) { state.createdNodeTypes = RttiSet{type}; return *this; } -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) +StateBuilder 
&StateBuilder::createdNodeTypes(const RttiSet &types) { state.createdNodeTypes = types; return *this; } -ParserStateBuilder &ParserStateBuilder::elementHandler( +StateBuilder &StateBuilder::elementHandler( HandlerConstructor elementHandler) { state.elementHandler = elementHandler; return *this; } -const ParserState &ParserStateBuilder::build() const { return state; } +StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations) +{ + state.supportsAnnotations = supportsAnnotations; + return *this; +} -/* Class ParserStateDeductor */ +const State &StateBuilder::build() const { return state; } -ParserStateDeductor::ParserStateDeductor( +/* Class StateDeductor */ + +StateDeductor::StateDeductor( std::vector<const Rtti *> signature, - std::vector<const ParserState *> states) + std::vector<const State *> states) : tbl(signature.size()), signature(std::move(signature)), states(std::move(states)) { } -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) +bool StateDeductor::isActive(size_t d, const State *s) { // Lookup the "active" state of (d, s), if it was not already set // (e.second is true) we'll have to calculate it @@ -123,7 +132,7 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s) // Check whether any of the parent nodes were active -- either for // the previous element (if this one is generative) or for the // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { + for (const State *parent : s->parents) { if ((isGenerative && isActive(d - 1, parent)) || isActive(d, parent)) { res = true; @@ -136,9 +145,9 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s) return res; } -std::vector<const ParserState *> ParserStateDeductor::deduce() +std::vector<const State *> StateDeductor::deduce() { - std::vector<const ParserState *> res; + std::vector<const State *> res; if (!signature.empty()) { const size_t D = signature.size(); for (auto s : states) { @@ -153,9 
+162,10 @@ std::vector<const ParserState *> ParserStateDeductor::deduce() /* Constant initializations */ -namespace ParserStates { -const ParserState All; -const ParserState None; +namespace States { +const State All; +const State None; +} } } diff --git a/src/core/parser/ParserState.hpp b/src/core/parser/stack/State.hpp index 6487fdd..4766235 100644 --- a/src/core/parser/ParserState.hpp +++ b/src/core/parser/stack/State.hpp @@ -17,10 +17,10 @@ */ /** - * @file ParserState.hpp + * @file State.hpp * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of + * Defines the State class used within the ParserStack pushdown + * automaton and the StateBuilder class for convenient construction of * such classes. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) @@ -33,12 +33,14 @@ #include <core/common/Rtti.hpp> #include <core/common/Argument.hpp> +#include <core/common/Whitespace.hpp> namespace ousia { +namespace parser_stack { // Forward declarations -class ParserStateBuilder; -class ParserState; +class StateBuilder; +class State; class HandlerData; class Handler; using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); @@ -47,17 +49,17 @@ using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); * Set of pointers of parser states -- used for specifying a set of parent * states. */ -using ParserStateSet = std::unordered_set<const ParserState *>; +using StateSet = std::unordered_set<const State *>; /** - * Class used for the complete specification of a ParserState. Stores possible + * Class used for the complete specification of a State. Stores possible * parent states, state handlers and arguments to be passed to that state. */ -struct ParserState { +struct State { /** * Vector containing all possible parent states. */ - ParserStateSet parents; + StateSet parents; /** * Descriptor of the arguments that should be passed to the handler. 
@@ -66,8 +68,8 @@ struct ParserState { /** * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. + * State. This information is needed for Parsers to reconstruct the + * current State from a given ParserScope when a file is included. */ RttiSet createdNodeTypes; @@ -79,109 +81,119 @@ struct ParserState { HandlerConstructor elementHandler; /** + * Set to true if this handler does support annotations. This is almost + * always false (e.g. all description handlers), except for document + * element handlers. + */ + bool supportsAnnotations; + + /** * Default constructor, initializes the handlers with nullptr. */ - ParserState(); + State(); /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. + * Constructor taking values for all fields. Use the StateBuilder + * class for a more convenient construction of State instances. * * @param parents is a vector containing all possible parent states. * @param arguments is a descriptor of arguments that should be passed to * the handler. * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope + * may be created in this State. This information is needed for + * Parsers to reconstruct the current State from a given ParserScope * when a file is included. * @param elementHandler is a pointer at a function which creates a new * concrete Handler instance for the elements described by this state. May * be nullptr in which case no handler instance is created. + * @param supportsAnnotations specifies whether annotations are supported + * here at all. 
*/ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, + State(StateSet parents, Arguments arguments = Arguments{}, RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false); /** - * Creates this ParserState from the given ParserStateBuilder instance. + * Creates this State from the given StateBuilder instance. */ - ParserState(const ParserStateBuilder &builder); + State(const StateBuilder &builder); }; /** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. + * The StateBuilder class is a class used for conveniently building new + * State instances. */ -class ParserStateBuilder { +class StateBuilder { private: /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. + * State instance that is currently being built by the + * StateBuilder. */ - ParserState state; + State state; public: /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. + * Copies the State instance and uses it as internal state. Overrides + * all changes made by the StateBuilder. * * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &copy(const ParserState &state); + StateBuilder &copy(const State &state); /** * Sets the possible parent states to the single given parent element. * - * @param parent is a pointer at the parent ParserState instance that should + * @param parent is a pointer at the parent State instance that should * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining.
*/ - ParserStateBuilder &parent(const ParserState *parent); + StateBuilder &parent(const State *parent); /** - * Sets the ParserState instances in the given ParserStateSet as the list of + * Sets the State instances in the given StateSet as the list of * supported parent states. * - * @param parents is a set of pointers at ParserState instances that should + * @param parents is a set of pointers at State instances that should * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &parents(const ParserStateSet &parents); + StateBuilder &parents(const StateSet &parents); /** * Sets the arguments that should be passed to the parser state handler to * those given as argument. * * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method + * should be parsed to a Handler for this State. + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &arguments(const Arguments &arguments); + StateBuilder &arguments(const Arguments &arguments); /** * Sets the Node types this state may produce to the given Rtti descriptor. * * @param type is the Rtti descriptor of the Type that may be produced by * this state. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &createdNodeType(const Rtti *type); + StateBuilder &createdNodeType(const Rtti *type); /** * Sets the Node types this state may produce to the given Rtti descriptors. * * @param types is a set of Rtti descriptors of the Types that may be * produced by this state. 
- * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); + StateBuilder &createdNodeTypes(const RttiSet &types); /** * Sets the constructor for the element handler. The constructor creates a @@ -191,31 +203,42 @@ public: * * @param elementHandler is the HandlerConstructor that should create a * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); + StateBuilder &elementHandler(HandlerConstructor elementHandler); /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. + * Sets the state of the "supportsAnnotations" flags (default value is + * false) * - * @return the built ParserState. + * @param supportsAnnotations should be set to true, if annotations are + * supported for the handlers associated with this document. + * @return a reference at this StateBuilder instance for method + * chaining. */ - const ParserState &build() const; + StateBuilder &supportsAnnotations(bool supportsAnnotations); + + /** + * Returns a reference at the internal State instance that was built + * using the StateBuilder. + * + * @return the built State. + */ + const State &build() const; }; /** - * Class used to deduce the ParserState a Parser is currently in based on the + * Class used to deduce the State a Parser is currently in based on the * types of the Nodes that currently are on the ParserStack. Uses dynamic * programming in order to solve this problem. */ -class ParserStateDeductor { +class StateDeductor { public: /** * Type containing the dynamic programming table. 
*/ - using Table = std::vector<std::unordered_map<const ParserState *, bool>>; + using Table = std::vector<std::unordered_map<const State *, bool>>; private: /** @@ -231,7 +254,7 @@ private: /** * List of states that should be checked for being active. */ - const std::vector<const ParserState *> states; + const std::vector<const State *> states; /** * Used internally to check whether the given parser stack s may have been @@ -239,20 +262,20 @@ private: * * @param d is the signature element. * @param s is the parser state. - * @return true if the the given ParserState may have been active. + * @return true if the the given State may have been active. */ - bool isActive(size_t d, const ParserState *s); + bool isActive(size_t d, const State *s); public: /** - * Constructor of the ParserStateDeductor class. + * Constructor of the StateDeductor class. * * @param signature a Node type signature describing the types of the nodes * which currently reside on e.g. the ParserScope stack. * @param states is a list of states that should be checked. */ - ParserStateDeductor(std::vector<const Rtti *> signature, - std::vector<const ParserState *> states); + StateDeductor(std::vector<const Rtti *> signature, + std::vector<const State *> states); /** * Selects all active states from the given states. Only considers those @@ -260,23 +283,24 @@ public: * * @return a list of states that may actually have been active. */ - std::vector<const ParserState *> deduce(); + std::vector<const State *> deduce(); }; /** - * The ParserStates namespace contains all the global state constants used + * The States namespace contains all the global state constants used * in the ParserStack class. */ -namespace ParserStates { +namespace States { /** * State representing all states. */ -extern const ParserState All; +extern const State All; /** * State representing the initial state. 
*/ -extern const ParserState None; +extern const State None; +} } } diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index 2cc7dfb..8fd9525 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -16,32 +16,46 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include "TypesystemHandler.hpp" - #include <core/model/Typesystem.hpp> +#include <core/model/Domain.hpp> #include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> + +#include "DomainHandler.hpp" +#include "State.hpp" +#include "TypesystemHandler.hpp" namespace ousia { +namespace parser_stack { /* TypesystemHandler */ -void TypesystemHandler::start(Variant::mapType &args) +bool TypesystemHandler::start(Variant::mapType &args) { // Create the typesystem instance Rooted<Typesystem> typesystem = - project()->createTypesystem(args["name"].asString()); + context().getProject()->createTypesystem(args["name"].asString()); typesystem->setLocation(location()); + // If the typesystem is defined inside a domain, add a reference to the + // typesystem to the domain + Rooted<Domain> domain = scope().select<Domain>(); + if (domain != nullptr) { + domain->reference(typesystem); + } + // Push the typesystem onto the scope, set the POST_HEAD flag to true scope().push(typesystem); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void TypesystemHandler::end() { scope().pop(); } /* TypesystemEnumHandler */ -void TypesystemEnumHandler::start(Variant::mapType &args) +bool TypesystemEnumHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -52,33 +66,24 @@ void TypesystemEnumHandler::start(Variant::mapType &args) enumType->setLocation(location()); scope().push(enumType); + + return true; } void TypesystemEnumHandler::end() { scope().pop(); } /* TypesystemEnumEntryHandler */ -void 
TypesystemEnumEntryHandler::start(Variant::mapType &args) {} - -void TypesystemEnumEntryHandler::end() +void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, + Variant::mapType &args) { Rooted<EnumType> enumType = scope().selectOrThrow<EnumType>(); - enumType->addEntry(entry, logger()); -} - -void TypesystemEnumEntryHandler::data(const std::string &data, int field) -{ - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - entry.append(data); + enumType->addEntry(fieldData.asString(), logger()); } /* TypesystemStructHandler */ -void TypesystemStructHandler::start(Variant::mapType &args) +bool TypesystemStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -103,13 +108,15 @@ void TypesystemStructHandler::start(Variant::mapType &args) }); } scope().push(structType); + + return true; } void TypesystemStructHandler::end() { scope().pop(); } /* TypesystemStructFieldHandler */ -void TypesystemStructFieldHandler::start(Variant::mapType &args) +bool TypesystemStructFieldHandler::start(Variant::mapType &args) { // Read the argument values const std::string &name = args["name"].asString(); @@ -142,13 +149,13 @@ void TypesystemStructFieldHandler::start(Variant::mapType &args) } }); } -} -void TypesystemStructFieldHandler::end() {} + return true; +} /* TypesystemConstantHandler */ -void TypesystemConstantHandler::start(Variant::mapType &args) +bool TypesystemConstantHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -169,7 +176,51 @@ void TypesystemConstantHandler::start(Variant::mapType &args) constant.cast<Constant>()->setType(type.cast<Type>(), logger); } }); + + return true; } -void TypesystemConstantHandler::end() {} +namespace States { +const State Typesystem = StateBuilder() + .parents({&None, &Domain}) + .createdNodeType(&RttiTypes::Typesystem) + .elementHandler(TypesystemHandler::create) + 
.arguments({Argument::String("name", "")}); + +const State TypesystemEnum = StateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::EnumType) + .elementHandler(TypesystemEnumHandler::create) + .arguments({Argument::String("name")}); + +const State TypesystemEnumEntry = + StateBuilder() + .parent(&TypesystemEnum) + .elementHandler(TypesystemEnumEntryHandler::create) + .arguments({}); + +const State TypesystemStruct = + StateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(TypesystemStructHandler::create) + .arguments({Argument::String("name"), Argument::String("parent", "")}); + +const State TypesystemStructField = + StateBuilder() + .parent(&TypesystemStruct) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +const State TypesystemConstant = + StateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::Constant) + .elementHandler(TypesystemConstantHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("value")}); +} } +} + diff --git a/src/core/parser/stack/TypesystemHandler.hpp b/src/core/parser/stack/TypesystemHandler.hpp index 76a7bc9..85494f1 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -19,6 +19,9 @@ /** * @file TypesystemHandler.hpp * + * Contains the Handler classes used to parse Typesystem descriptions. The + * Handlers parse all the tags found below and including the "typesystem" tag. + * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -26,96 +29,180 @@ #define _OUSIA_TYPESYSTEM_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> + +#include "Handler.hpp" namespace ousia { +namespace parser_stack { -class TypesystemHandler : public Handler { +/** + * Handles the occurance of the "typesystem" tag. 
Creates a new Typesystem + * instance and places it on the ParserScope. + */ +class TypesystemHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemHandler{handlerData}; } }; -class TypesystemEnumHandler : public Handler { +/** + * Handles the occurance of the "enum" tag. Creates a new EnumType instance and + * places it on the ParserScope. + */ +class TypesystemEnumHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemEnumHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemEnumHandler{handlerData}; } }; -class TypesystemEnumEntryHandler : public Handler { +/** + * Handles the occurance of the "entry" tag within an "enum" tag. Creates a new + * EnumType instance and places it on the ParserScope. 
+ */ +class TypesystemEnumEntryHandler : public StaticFieldHandler { public: - using Handler::Handler; + using StaticFieldHandler::StaticFieldHandler; - std::string entry; - - void start(Variant::mapType &args) override; - - void end() override; - - void data(const std::string &data, int field) override; + void doHandle(const Variant &fieldData, Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemEnumEntryHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { - return new TypesystemEnumEntryHandler{handlerData}; + return new TypesystemEnumEntryHandler{handlerData, "name"}; } }; -class TypesystemStructHandler : public Handler { +/** + * Handles the occurance of the "struct" tag within a typesystem description. + * Creates a new StructType instance and places it on the ParserScope. + */ +class TypesystemStructHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemStructHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructHandler{handlerData}; } }; -class TypesystemStructFieldHandler : public Handler { +/** + * Handles the occurance of the "field" tag within a typesystem structure + * description. Places a new Attribute instance in the StructType instance + * that is currently at the top of the scope. 
+ */ +class TypesystemStructFieldHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemStructFieldHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructFieldHandler{handlerData}; } }; -class TypesystemConstantHandler : public Handler { +/** + * Handles the occurance of the "constant" tag within a typesystem structure + * description. Places a new Constant instance in the current typesystem. + */ +class TypesystemConstantHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemConstantHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemConstantHandler{handlerData}; } }; + +namespace States { +/** + * State representing the "typesystem" tag. + */ +extern const State Typesystem; +/** + * State representing the "enum" tag within a typesystem. + */ +extern const State TypesystemEnum; +/** + * State representing the "entry" tag within an enum. + */ +extern const State TypesystemEnumEntry; +/** + * State representing the "struct" tag within a typesystem. + */ +extern const State TypesystemStruct; +/** + * State representing the "field" tag within a typesystem structure. 
+ */ +extern const State TypesystemStructField; +/** + * State representing the "constant" tag within a typesystem. + */ +extern const State TypesystemConstant; +} +} } #endif diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/core/parser/utils/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class DynamicTokenTree */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, we're screwed. 
+ if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. 
Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/core/parser/utils/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include <cstdint> +#include <memory> +#include <limits> +#include <unordered_map> + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. 
+ */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (3) + * \endcode + * + * Where the number indicates the corresponding token type identifier (0 marks nodes without a token). + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters to the corresponding child nodes. + */ + ChildMap children; + + /** + * Type id of the token that ends at this node. Set to EmptyToken if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the type with EmptyToken. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token is empty or already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the token type id that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree.
Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp new file mode 100644 index 0000000..3c8177d --- /dev/null +++ b/src/core/parser/utils/Tokenizer.cpp @@ -0,0 +1,381 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <memory> +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Exceptions.hpp> +#include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> + +#include "Tokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + Token token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. 
+ */ + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the Token instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * Tokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<std::string> &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + Token{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/** + * Transforms the given token into a text token containing the extracted + * text. 
+ * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class Tokenizer */ + +Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template <typename TextHandler, bool read> +bool Tokenizer::next(CharReader &reader, Token &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector<TokenLookup> lookups; + std::vector<TokenLookup> nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character 
+ for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = Token{}; + } + return match.hasMatch(); +} + +bool Tokenizer::read(CharReader &reader, Token &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingWhitespaceHandler, true>(reader, token); + case WhitespaceMode::TRIM: + return next<TrimmingWhitespaceHandler, true>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingWhitespaceHandler, true>(reader, token); + } + return false; +} + +bool Tokenizer::peek(CharReader &reader, Token &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingWhitespaceHandler, false>(reader, token); + case WhitespaceMode::TRIM: + return 
next<TrimmingWhitespaceHandler, false>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingWhitespaceHandler, false>(reader, token); + } + return false; +} + +TokenTypeId Tokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool Tokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string Tokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool Tokenizer::next<PreservingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool 
Tokenizer::next<TrimmingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool Tokenizer::next<CollapsingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool Tokenizer::next<PreservingWhitespaceHandler, true>( + CharReader &reader, Token &token); +template bool Tokenizer::next<TrimmingWhitespaceHandler, true>( + CharReader &reader, Token &token); +template bool Tokenizer::next<CollapsingWhitespaceHandler, true>( + CharReader &reader, Token &token); +} + diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp new file mode 100644 index 0000000..6b4e116 --- /dev/null +++ b/src/core/parser/utils/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Tokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <set> +#include <string> +#include <vector> + +#include <core/common/Location.hpp> +#include <core/common/Whitespace.hpp> + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The Token structure describes a token discovered by the Tokenizer. + */ +struct Token { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : type(EmptyToken) {} + + /** + * Constructor of the Token struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + Token(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * The Tokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * Tokenizer always tries to extract the longest possible token from the + * tokenizer. 
+ */ +class Tokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector<std::string> tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(CharReader &reader, Token &token); + +public: + /** + * Constructor of the Tokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. 
The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * Token instance. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, Token &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, Token &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + |