From efe60ac3c3a8725ac71329c0bb19fa9d9c58f399 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:42:05 +0100 Subject: Moved specific file format parsers to formats/ folder, moved old tokenizer to css code (this is the only place where it is actually used) --- src/core/CodeTokenizer.cpp | 169 ------------- src/core/CodeTokenizer.hpp | 136 ---------- src/core/Tokenizer.cpp | 204 --------------- src/core/Tokenizer.hpp | 227 ----------------- src/core/parser/ParserStack.cpp | 216 ---------------- src/core/parser/ParserStack.hpp | 361 --------------------------- src/core/parser/ParserState.cpp | 161 ------------ src/core/parser/ParserState.hpp | 284 --------------------- src/core/parser/generic/ParserState.cpp | 161 ++++++++++++ src/core/parser/generic/ParserState.hpp | 284 +++++++++++++++++++++ src/core/parser/generic/ParserStateStack.cpp | 216 ++++++++++++++++ src/core/parser/generic/ParserStateStack.hpp | 361 +++++++++++++++++++++++++++ 12 files changed, 1022 insertions(+), 1758 deletions(-) delete mode 100644 src/core/CodeTokenizer.cpp delete mode 100644 src/core/CodeTokenizer.hpp delete mode 100644 src/core/Tokenizer.cpp delete mode 100644 src/core/Tokenizer.hpp delete mode 100644 src/core/parser/ParserStack.cpp delete mode 100644 src/core/parser/ParserStack.hpp delete mode 100644 src/core/parser/ParserState.cpp delete mode 100644 src/core/parser/ParserState.hpp create mode 100644 src/core/parser/generic/ParserState.cpp create mode 100644 src/core/parser/generic/ParserState.hpp create mode 100644 src/core/parser/generic/ParserStateStack.cpp create mode 100644 src/core/parser/generic/ParserStateStack.hpp (limited to 'src/core') diff --git a/src/core/CodeTokenizer.cpp b/src/core/CodeTokenizer.cpp deleted file mode 100644 index d65c514..0000000 --- a/src/core/CodeTokenizer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - 
it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include "CodeTokenizer.hpp" - -namespace ousia { - -Token CodeTokenizer::constructToken(const Token &t) -{ - std::string content = buf.str(); - buf.str(std::string()); - return Token{ - returnTokenId, content, - SourceLocation{t.location.getSourceId(), startToken.location.getStart(), - t.location.getEnd()}}; -} - -void CodeTokenizer::buffer(const Token &t) { buf << t.content; } - -bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - auto it = descriptors.find(t.tokenId); - CodeTokenMode mode = CodeTokenMode::NONE; - if (it != descriptors.end()) { - mode = it->second.mode; - } - - switch (state) { - case CodeTokenizerState::NORMAL: - switch (mode) { - case CodeTokenMode::STRING_START_END: - state = CodeTokenizerState::IN_STRING; - break; - case CodeTokenMode::BLOCK_COMMENT_START: - state = CodeTokenizerState::IN_BLOCK_COMMENT; - break; - case CodeTokenMode::LINE_COMMENT: - state = CodeTokenizerState::IN_LINE_COMMENT; - break; - case CodeTokenMode::LINEBREAK: - if (!ignoreLinebreaks) { - peeked.push_back( - {it->second.id, t.content, t.location}); - } - return !ignoreLinebreaks; - default: - bool empty = true; - if (t.tokenId == TOKEN_TEXT) { - int begin = -1; - for (size_t c = 0; c < t.content.length(); c++) { - bool isWhitespace = - t.content[c] == ' ' || t.content[c] == '\t'; - if (begin < 0) { - // if we have not yet set our beginning, - // we wait for the first - // non-whitespace-character to set it. 
- if (!isWhitespace) { - begin = c; - } - } else { - // if we have set our beginning, we wait for the - // first whitespace character, which marks the - // end of the current word. - if (isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - SourceLocation{ - t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getStart() + c}}); - begin = -1; - empty = false; - } - } - } - if (begin >= 0) { - peeked.push_back(Token{ - TOKEN_TEXT, t.content.substr(begin), - SourceLocation{t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getEnd()}}); - empty = false; - } - } else { - empty = false; - peeked.push_back(t); - } - return !empty; - } - startToken = t; - returnTokenId = it->second.id; - return false; - case CodeTokenizerState::IN_LINE_COMMENT: - switch (mode) { - case CodeTokenMode::LINEBREAK: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_BLOCK_COMMENT: - switch (mode) { - case CodeTokenMode::BLOCK_COMMENT_END: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_STRING: - switch (mode) { - case CodeTokenMode::ESCAPE: - if (escaped) { - buffer(t); - } - escaped = !escaped; - return false; - case CodeTokenMode::STRING_START_END: - if (escaped) { - buffer(t); - escaped = false; - return false; - } else { - peeked.push_back(constructToken(t)); - state = CodeTokenizerState::NORMAL; - return true; - } - default: - if (escaped) { - // TODO: handle escaped characters? 
- escaped = false; - } - buffer(t); - return false; - } - } - assert(false); - return false; -} -} diff --git a/src/core/CodeTokenizer.hpp b/src/core/CodeTokenizer.hpp deleted file mode 100644 index 154f949..0000000 --- a/src/core/CodeTokenizer.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file CodeTokenizer.hpp - - * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) - */ -#ifndef _OUSIA_CODE_TOKENIZER_HPP_ -#define _OUSIA_CODE_TOKENIZER_HPP_ - -#include -#include - -#include -#include "Tokenizer.hpp" - -namespace ousia { - -/* - * This enum contains all special Token the CodeTokenizer supports, namely: - * - * 1.) An ambigous Tokens - in post programming languages single-quotes ' or - * double-quotes " - to delimit string tokens. - * 2.) A start token for line comments, which would e.g. be // in Java. - * 3.) A start token for a block comment - * 4.) An end token for a block comment. - * 5.) A linebreak token - * 6.) The escape token, which would e.g. be \ in java. - */ -enum class CodeTokenMode { - STRING_START_END, - LINE_COMMENT, - BLOCK_COMMENT_START, - BLOCK_COMMENT_END, - LINEBREAK, - ESCAPE, - NONE -}; - -/** - * A CodeTokenDescriptor defines the id the user likes to have returned for - * a Token of the mode specified, e.g. 
if you want to get the id 4 for a - * String Token the corresponding CodeTokenDescriptor would be inizialized - * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; - */ -struct CodeTokenDescriptor { - CodeTokenMode mode; - int id; - - CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} -}; - -/** - * The CodeTokenizer is a finite state machine with the states NORMAL, being - * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. - */ -enum class CodeTokenizerState { - NORMAL, - IN_BLOCK_COMMENT, - IN_LINE_COMMENT, - IN_STRING -}; - -/** - * The purpose of a CodeTokenizer is to make it easier to parse classical - * programming Code. It adds the following features to a regular Tokenizer: - * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens - * for the opening delimiter, the text and the closing delimiter. - * 2.) Escaping in String tokens. - * 3.) Comment Tokens (for line comments as well as block comments) - */ -class CodeTokenizer : public Tokenizer { -private: - std::map descriptors; - CodeTokenizerState state; - std::stringstream buf; - Token startToken; - int returnTokenId; - bool escaped = false; - - Token constructToken(const Token &t); - void buffer(const Token &t); - -protected: - bool doPrepare(const Token &t, std::deque &peeked) override; - -public: - /** - * If you do not want comment tokens to be returned you can set this to - * true. - */ - bool ignoreComments = false; - /** - * If you do not want linebreaks to be returned you can set this to true. - */ - bool ignoreLinebreaks = false; - - /** - * - * @param input a CharReader containing the input for this tokenizer, as - * with a regular tokenizer. - * @param root a TokenTreeNode representing the root of the TokenTree. - * Please note that you have to specify all tokenIDs here that you use - * in the descriptors map. - * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. 
- * In this way you can specify the meaning of certain Tokens. Say you - * specified the Token "//" with the id 1 in the TokenTree. Then you could - * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map - * and this CodeTokenizer would recognize the token "//" as starting a - * line comment. - */ - CodeTokenizer(CharReader &input, const TokenTreeNode &root, - std::map descriptors) - : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) - { - } -}; -} - -#endif diff --git a/src/core/Tokenizer.cpp b/src/core/Tokenizer.cpp deleted file mode 100644 index ab4735a..0000000 --- a/src/core/Tokenizer.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include "Tokenizer.hpp" - -namespace ousia { - -static std::map buildChildren( - const std::map &inputs) -{ - std::map children; - std::map> nexts; - - for (auto &e : inputs) { - const std::string &s = e.first; - const int id = e.second; - if (s.empty()) { - continue; - } - char start = s[0]; - const std::string suffix = s.substr(1); - if (nexts.find(start) != nexts.end()) { - nexts[start].insert(std::make_pair(suffix, id)); - } else { - nexts.insert(std::make_pair( - start, std::map{{suffix, id}})); - } - } - - for (auto &n : nexts) { - children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); - } - - return children; -} - -static int buildId(const std::map &inputs) -{ - int tokenId = TOKEN_NONE; - for (auto &e : inputs) { - if (e.first.empty()) { - if (tokenId != TOKEN_NONE) { - throw TokenizerException{std::string{"Ambigous token found: "} + - std::to_string(e.second)}; - } else { - tokenId = e.second; - } - } - } - return tokenId; -} - -TokenTreeNode::TokenTreeNode(const std::map &inputs) - : children(buildChildren(inputs)), tokenId(buildId(inputs)) -{ -} - -Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) - : input(input), root(root) -{ -} - -bool Tokenizer::prepare() -{ - std::stringstream buffer; - char c; - SourcePosition start = input.getOffset(); - bool bufEmpty = true; - while (input.peek(c)) { - if (root.children.find(c) != root.children.end()) { - // if there might be a special token, keep peeking forward - // until we find the token (or we don't). - TokenTreeNode const *n = &root; - std::stringstream tBuf; - int match = TOKEN_NONE; - while (true) { - tBuf << c; - n = &(n->children.at(c)); - if (n->tokenId != TOKEN_NONE) { - match = n->tokenId; - // from here on we found a token. If we have something - // in our buffer already, we end the search now. 
- if (!bufEmpty) { - break; - } else { - // if we want to return this token ( = we have nothing - // in our buffer yet) we look greedily for the longest - // possible token we can construct. - input.consumePeek(); - } - } - if (!input.peek(c)) { - // if we are at the end we break off the search. - break; - } - if (n->children.find(c) == n->children.end()) { - // if we do not find a possible continuation anymore, - // break off the search. - break; - } - } - //reset the peek pointer to the last valid position. - input.resetPeek(); - // check if we did indeed find a special token. - if (match != TOKEN_NONE) { - if (bufEmpty) { - // if we did not have text before, construct that token. - if (doPrepare( - Token{match, tBuf.str(), input.getLocation(start)}, - peeked)) { - return true; - } else { - start = input.getOffset(); - continue; - } - } else { - // otherwise we return the text before the token. - if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked)) { - return true; - } else{ - //we need to clear the buffer here. After all the token - //corresponding to this buffer segment is already - //constructed. - buffer.str(std::string()); - bufEmpty = true; - start = input.getOffset(); - continue; - } - } - } else{ - //if we found nothing, read at least one character. 
- input.peek(c); - } - } - buffer << c; - bufEmpty = false; - input.consumePeek(); - } - if (!bufEmpty) { - return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked); - } - return false; -} - -bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - peeked.push_back(t); - return true; -} - -bool Tokenizer::next(Token &t) -{ - if (peeked.empty()) { - if (!prepare()) { - return false; - } - } - t = peeked.front(); - peeked.pop_front(); - resetPeek(); - return true; -} - -bool Tokenizer::peek(Token &t) -{ - if (peekCursor >= peeked.size()) { - if (!prepare()) { - return false; - } - } - t = peeked[peekCursor]; - peekCursor++; - return true; -} - -void Tokenizer::resetPeek() { peekCursor = 0; } - -void Tokenizer::consumePeek() -{ - while (peekCursor > 0) { - peeked.pop_front(); - peekCursor--; - } -} -} diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp deleted file mode 100644 index 50e458c..0000000 --- a/src/core/Tokenizer.hpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_TOKENIZER_HPP_ -#define _OUSIA_TOKENIZER_HPP_ - -#include -#include -#include -#include - -#include - -namespace ousia { - -/** - * This exception is currently only thrown if errors are made during the - * initialization of the Tokenizer. 
Have a closer look at the documentation - * of the TokenTreeNode constructor for more information. - */ -class TokenizerException : public std::exception { -public: - const std::string msg; - - TokenizerException(const std::string &msg) : msg(msg){}; - - virtual const char *what() const noexcept override { return msg.c_str(); } -}; - -/** - * The Tokenizer internally uses a TokenTree to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * The TokenTree is a construct that structures all special tokens this - * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then - * the TokenTree would look like this: - * - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * - * Every node in the TokenTree is a valid end state that has a $ attached to it. - * During the search algorithm the Tokenizer goes through the tree and stores - * the last valid position. If a character follows that does not lead to a new - * node in the TokenTree the search ends (and starts again at this character). - * The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - * - * It is also not allowed that tokens have common middle parts but varying - * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and - * the input string "abc". In that case we start looking for "abd" at the - * start, won't find it, wenn we hit "c" and start the scanning process - * anew. Thus the "bc" token is not found. - * - * For most (well-behaved) tokenization schemes this is not the case, - * though. 
- */ -class TokenTreeNode { -public: - const std::map children; - const int tokenId; - - /** - * The TokenTreeNode constructor builds a TokenTree from the given token - * specifications. The node returned by this constructor then is the root of - * said TokenTree. - * @param inputs Specifications of tokens in map form. Each specification - * is a tuple of the text that should be matched and some unique ID (>= 0) - * that is returned to you if that Token is found in the text. - * An example for such a map would be - * { - * { "#" , 1}, - * { "##", 2}, - * { "/" , 3} - * } - * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE - * (-1) and TOKEN_TEXT (-2). - */ - TokenTreeNode(const std::map &inputs); -}; - -/** - * This is a reserved constant for the empty token. - */ -static const int TOKEN_NONE = -1; -/** - * This is a reserved constant for every part of the input text that is not a - * specified token. - */ -static const int TOKEN_TEXT = -2; - -/** - * A token for us is identified by an integer tokenID (either one of the - * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). - * Additionally we return the matched text (which should only be really - * interesting in case of TOKEN_TEXT tokens) and the position in the input text. - */ -struct Token { - int tokenId; - std::string content; - SourceLocation location; - - Token(int tokenId, std::string content, SourceLocation location) - : tokenId(tokenId), - content(content), - location(location) - { - } - - Token() : tokenId(TOKEN_NONE) {} -}; - -/** - * A Tokenizer has the purpose of subdividing an input text into tokens. In our - * definition here we distinguish between two kinds of tokens: - * 1.) User-specified tokens that match a fixed text. - * 2.) Any other text between those tokens. - * The user might want to specify the tokens '#{' and '#}' for example, because - * they have some meaning in her code. The user sets the IDs to 1 and 2. 
- * Given the input text - * "some text #{ special command #} some text" - * the tokenizer would return the tokens: - * 1.) "some text " with the id TOKEN_TEXT (-2). - * 2.) "#{" with the id 1. - * 3.) " special command " with the id TOKEN_TEXT (-2). - * 4.) "#}" with the id 2. - * 5.) " some text" with the id TOKEN_TEXT (-2). - * This makes the subsequent parsing of files of a specific type easier. - * Note that in case of tokens with that are prefixes of other tokens the - * longest possible match is returned. - */ -class Tokenizer { -private: - CharReader &input; - const TokenTreeNode &root; - std::deque peeked; - unsigned int peekCursor = 0; - - bool prepare(); - -protected: - /** - * This method is an interface to build multiple tokens from a single one in - * derived classes. This might be interesting if you want to implement - * further logic on text tokens or similar applications. - * - * @param t a Token the "basic" tokenizer found. - * @param peeked a reference to the deque containing all temporary Tokens. - * You are supposed to append your tokens there. In the trivial case you just - * put the given Token on top of the deque. - * @return false if no token was appended to the deque (meaning that you want - * to ignore the given token explicitly) and true in all other cases. - */ - virtual bool doPrepare(const Token &t, std::deque &peeked); - -public: - /** - * @param input The input of a Tokenizer is given in the form of a - * CharReader. Please refer to the respective documentation. - * @param root This is meant to be the root of a TokenTree giving the - * specification of user-defined tokens this Tokenizer should recognize. - * The Tokenizer promises to not change the TokenTree such that you can - * re-use the same specification for multiple inputs. - * Please refer to the TokenTreeNode documentation for more information. 
- */ - Tokenizer(CharReader &input, const TokenTreeNode &root); - - /** - * The next method consumes one Token from the input stream and gives - * it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool next(Token &t); - /** - * The peek method does not consume the next Token but buffers it and - * shows it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool peek(Token &t); - - /** - * Resets the peek pointer to the current position in the stream (to the - * beginning of the buffer). - */ - void resetPeek(); - - /** - * Clears the peek buffer, such that all peeked Tokens are consumed. - */ - void consumePeek(); - - const CharReader &getInput() const { return input; } - - CharReader &getInput() { return input; } -}; -} - -#endif diff --git a/src/core/parser/ParserStack.cpp b/src/core/parser/ParserStack.cpp deleted file mode 100644 index 1265851..0000000 --- a/src/core/parser/ParserStack.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include -#include -#include - -#include "ParserScope.hpp" -#include "ParserStack.hpp" - -namespace ousia { - -/* A default handler */ - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. - */ -class DefaultHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override {} - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DefaultHandler{handlerData}; - } -}; - -/* Class Handler */ - -void Handler::data(const std::string &data, int field) -{ - if (Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class ParserStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? 
std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStack::ParserStack( - ParserContext &ctx, - const std::multimap &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStack::deduceState() -{ - // Assemble all states - std::vector states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); - stack.emplace(handler); - return true; -} - -std::set ParserStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStack::currentCommandName() -{ - return stack.empty() ? 
std::string{} : stack.top()->name(); -} - -const ParserState *ParserStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? 
targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/ParserStack.hpp b/src/core/parser/ParserStack.hpp deleted file mode 100644 index efc4e4a..0000000 --- a/src/core/parser/ParserStack.hpp +++ /dev/null @@ -1,361 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStack.hpp - * - * Helper classes for document or description parsers. Contains the ParserStack - * class, which is an pushdown automaton responsible for accepting commands in - * the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STACK_HPP_ -#define _OUSIA_PARSER_STACK_HPP_ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * Struct collecting all the data that is being passed to a Handler instance. - */ -struct HandlerData { - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. - */ - ParserContext &ctx; - - /** - * Contains the name of the tag that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. 
- */ - HandlerData(ParserContext &ctx, std::string name, const ParserState &state, - const ParserState &parentState, const SourceLocation location) - : ctx(ctx), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class Handler { -private: - /** - * Structure containing the internal handler data. - */ - const HandlerData handlerData; - -public: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - Handler(const HandlerData &handlerData) : handlerData(handlerData){}; - - /** - * Virtual destructor. - */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context() { return handlerData.ctx; } - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name() { return handlerData.name; } - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope() { return handlerData.ctx.getScope(); } - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager() { return handlerData.ctx.getManager(); } - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger() { return handlerData.ctx.getLogger(); } - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. 
- */ - Rooted project() { return handlerData.ctx.getProject(); } - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state() { return handlerData.state; } - - /** - * Reference at the ParserState descriptor of the parent state of the state - * for which this Handler was created. Set to ParserStates::None if there - * is no parent state. - * - * @return a const reference at the parent state of the constructing - * ParserState descriptor. - */ - const ParserState &parentState() { return handlerData.parentState; } - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location() { return handlerData.location; } - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. - */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. 
- */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The ParserStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStack class. - * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStack(ParserContext &ctx, - const std::multimap &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. - * @param logger is the logger instance to which error messages should be - * written. 
- * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState &currentState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * Note that the passed map will be modified. - * @param location is the location in the source file at which the command - * starts. - */ - void start(const std::string &name, Variant::mapType &args, - const SourceLocation &location = SourceLocation{}); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * @param location is the location in the source file at which the command - * starts. - */ - void start(std::string name, - const Variant::mapType &args = Variant::mapType{}, - const SourceLocation &location = SourceLocation{}); - - /** - * Function called whenever a command ends. - */ - void end(); - - /** - * Function that should be called whenever data is available for the - * command. - * - * @param data is the data that should be passed to the handler. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - void data(const std::string &data, int field = 0); - - /** - * Returns a reference to the parser context the parser stack is currently - * working on.
- * - * @return a reference to the parser context. - */ - ParserContext &getContext() { return ctx; } -}; -} - -#endif /* _OUSIA_PARSER_STACK_HPP_ */ - diff --git a/src/core/parser/ParserState.cpp b/src/core/parser/ParserState.cpp deleted file mode 100644 index f635d86..0000000 --- a/src/core/parser/ParserState.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "ParserState.hpp" - -namespace ousia { - -/* Class ParserState */ - -ParserState::ParserState() : elementHandler(nullptr) {} - -ParserState::ParserState(ParserStateSet parents, Arguments arguments, - RttiSet createdNodeTypes, - HandlerConstructor elementHandler) - : parents(parents), - arguments(arguments), - createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) -{ -} - -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) -{ -} - -/* Class ParserStateBuilder */ - -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) -{ - this->state = state; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) -{ - state.parents = ParserStateSet{parent}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) -{ - state.parents = parents; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) -{ - state.arguments = arguments; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) -{ - state.createdNodeTypes = RttiSet{type}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) -{ - state.createdNodeTypes = types; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::elementHandler( - HandlerConstructor elementHandler) -{ - state.elementHandler = elementHandler; - return *this; -} - -const ParserState &ParserStateBuilder::build() const { return state; } - -/* Class ParserStateDeductor */ - -ParserStateDeductor::ParserStateDeductor( - std::vector signature, - std::vector states) - : tbl(signature.size()), - signature(std::move(signature)), - states(std::move(states)) -{ -} - -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) -{ - // Lookup the "active" state of (d, s), if it was not already set - // (e.second is true) we'll have to calculate it - auto e 
= tbl[d].emplace(s, false); - bool &res = e.first->second; - if (!e.second) { - return res; - } - - // Check whether this node is generative (may have produced the Node - // described by the current Signature element) - bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); - - if (isGenerative && d == 0) { - // End of recursion -- the last signature element is reached and the - // node was generative - res = true; - } else { - // Try repetition of this node - if (isGenerative && isActive(d - 1, s)) { - res = true; - } else { - // Check whether any of the parent nodes were active -- either for - // the previous element (if this one is generative) or for the - // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { - if ((isGenerative && isActive(d - 1, parent)) || - isActive(d, parent)) { - res = true; - break; - } - } - } - } - - return res; -} - -std::vector ParserStateDeductor::deduce() -{ - std::vector res; - if (!signature.empty()) { - const size_t D = signature.size(); - for (auto s : states) { - if (signature[D - 1]->isOneOf(s->createdNodeTypes) && - isActive(D - 1, s)) { - res.push_back(s); - } - } - } - return res; -} - -/* Constant initializations */ - -namespace ParserStates { -const ParserState All; -const ParserState None; -} -} - diff --git a/src/core/parser/ParserState.hpp b/src/core/parser/ParserState.hpp deleted file mode 100644 index 6487fdd..0000000 --- a/src/core/parser/ParserState.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserState.hpp - * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of - * such classes. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_HPP_ -#define _OUSIA_PARSER_STATE_HPP_ - -#include - -#include -#include - -namespace ousia { - -// Forward declarations -class ParserStateBuilder; -class ParserState; -class HandlerData; -class Handler; -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * Set of pointers of parser states -- used for specifying a set of parent - * states. - */ -using ParserStateSet = std::unordered_set; - -/** - * Class used for the complete specification of a ParserState. Stores possible - * parent states, state handlers and arguments to be passed to that state. - */ -struct ParserState { - /** - * Vector containing all possible parent states. - */ - ParserStateSet parents; - - /** - * Descriptor of the arguments that should be passed to the handler. - */ - Arguments arguments; - - /** - * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. - */ - RttiSet createdNodeTypes; - - /** - * Pointer at a function which creates a new concrete Handler instance for - * the elements described by this state. May be nullptr in which case no - * handler instance is created. 
- */ - HandlerConstructor elementHandler; - - /** - * Default constructor, initializes the handlers with nullptr. - */ - ParserState(); - - /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. - * - * @param parents is a vector containing all possible parent states. - * @param arguments is a descriptor of arguments that should be passed to - * the handler. - * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope - * when a file is included. - * @param elementHandler is a pointer at a function which creates a new - * concrete Handler instance for the elements described by this state. May - * be nullptr in which case no handler instance is created. - */ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); - - /** - * Creates this ParserState from the given ParserStateBuilder instance. - */ - ParserState(const ParserStateBuilder &builder); -}; - -/** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. - */ -class ParserStateBuilder { -private: - /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. - */ - ParserState state; - -public: - /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. - * - * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &copy(const ParserState &state); - - /** - * Sets the possible parent states to the single given parent element.
- * - * @param parent is a pointer at the parent ParserState instance that should - * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parent(const ParserState *parent); - - /** - * Sets the ParserState instances in the given ParserStateSet as the list of - * supported parent states. - * - * @param parents is a set of pointers at ParserState instances that should - * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parents(const ParserStateSet &parents); - - /** - * Sets the arguments that should be passed to the parser state handler to - * those given as argument. - * - * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &arguments(const Arguments &arguments); - - /** - * Sets the Node types this state may produce to the given Rtti descriptor. - * - * @param type is the Rtti descriptor of the Type that may be produced by - * this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeType(const Rtti *type); - - /** - * Sets the Node types this state may produce to the given Rtti descriptors. - * - * @param types is a set of Rtti descriptors of the Types that may be - * produced by this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); - - /** - * Sets the constructor for the element handler. The constructor creates a - * new concrete Handler instance for the elements described by this state. - * May be nullptr in which case no handler instance is created (this is - * the default value). 
- * - * @param elementHandler is the HandlerConstructor that should create a - * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); - - /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. - * - * @return the built ParserState. - */ - const ParserState &build() const; -}; - -/** - * Class used to deduce the ParserState a Parser is currently in based on the - * types of the Nodes that currently are on the ParserStack. Uses dynamic - * programming in order to solve this problem. - */ -class ParserStateDeductor { -public: - /** - * Type containing the dynamic programming table. - */ - using Table = std::vector>; - -private: - /** - * Dynamic programming table. - */ - Table tbl; - - /** - * Signature given in the constructor. - */ - const std::vector signature; - - /** - * List of states that should be checked for being active. - */ - const std::vector states; - - /** - * Used internally to check whether the given parser stack s may have been - * active for signature element d. - * - * @param d is the signature element. - * @param s is the parser state. - * @return true if the the given ParserState may have been active. - */ - bool isActive(size_t d, const ParserState *s); - -public: - /** - * Constructor of the ParserStateDeductor class. - * - * @param signature a Node type signature describing the types of the nodes - * which currently reside on e.g. the ParserScope stack. - * @param states is a list of states that should be checked. - */ - ParserStateDeductor(std::vector signature, - std::vector states); - - /** - * Selects all active states from the given states. Only considers those - * states that may have produced the last signature element. - * - * @return a list of states that may actually have been active. 
- */ - std::vector deduce(); -}; - -/** - * The ParserStates namespace contains all the global state constants used - * in the ParserStack class. - */ -namespace ParserStates { -/** - * State representing all states. - */ -extern const ParserState All; - -/** - * State representing the initial state. - */ -extern const ParserState None; -} -} - -#endif /* _OUSIA_PARSER_STATE_HPP_ */ - diff --git a/src/core/parser/generic/ParserState.cpp b/src/core/parser/generic/ParserState.cpp new file mode 100644 index 0000000..f635d86 --- /dev/null +++ b/src/core/parser/generic/ParserState.cpp @@ -0,0 +1,161 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "ParserState.hpp" + +namespace ousia { + +/* Class ParserState */ + +ParserState::ParserState() : elementHandler(nullptr) {} + +ParserState::ParserState(ParserStateSet parents, Arguments arguments, + RttiSet createdNodeTypes, + HandlerConstructor elementHandler) + : parents(parents), + arguments(arguments), + createdNodeTypes(createdNodeTypes), + elementHandler(elementHandler) +{ +} + +ParserState::ParserState(const ParserStateBuilder &builder) + : ParserState(builder.build()) +{ +} + +/* Class ParserStateBuilder */ + +ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) +{ + this->state = state; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) +{ + state.parents = ParserStateSet{parent}; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) +{ + state.parents = parents; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) +{ + state.arguments = arguments; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) +{ + state.createdNodeTypes = RttiSet{type}; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) +{ + state.createdNodeTypes = types; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::elementHandler( + HandlerConstructor elementHandler) +{ + state.elementHandler = elementHandler; + return *this; +} + +const ParserState &ParserStateBuilder::build() const { return state; } + +/* Class ParserStateDeductor */ + +ParserStateDeductor::ParserStateDeductor( + std::vector signature, + std::vector states) + : tbl(signature.size()), + signature(std::move(signature)), + states(std::move(states)) +{ +} + +bool ParserStateDeductor::isActive(size_t d, const ParserState *s) +{ + // Lookup the "active" state of (d, s), if it was not already set + // (e.second is true) we'll have to calculate it + auto e 
= tbl[d].emplace(s, false); + bool &res = e.first->second; + if (!e.second) { + return res; + } + + // Check whether this node is generative (may have produced the Node + // described by the current Signature element) + bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); + + if (isGenerative && d == 0) { + // End of recursion -- the last signature element is reached and the + // node was generative + res = true; + } else { + // Try repetition of this node + if (isGenerative && isActive(d - 1, s)) { + res = true; + } else { + // Check whether any of the parent nodes were active -- either for + // the previous element (if this one is generative) or for the + // current element (assuming this node was not generative) + for (const ParserState *parent : s->parents) { + if ((isGenerative && isActive(d - 1, parent)) || + isActive(d, parent)) { + res = true; + break; + } + } + } + } + + return res; +} + +std::vector ParserStateDeductor::deduce() +{ + std::vector res; + if (!signature.empty()) { + const size_t D = signature.size(); + for (auto s : states) { + if (signature[D - 1]->isOneOf(s->createdNodeTypes) && + isActive(D - 1, s)) { + res.push_back(s); + } + } + } + return res; +} + +/* Constant initializations */ + +namespace ParserStates { +const ParserState All; +const ParserState None; +} +} + diff --git a/src/core/parser/generic/ParserState.hpp b/src/core/parser/generic/ParserState.hpp new file mode 100644 index 0000000..6487fdd --- /dev/null +++ b/src/core/parser/generic/ParserState.hpp @@ -0,0 +1,284 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file ParserState.hpp + * + * Defines the ParserState class used within the ParserStack pushdown + * automaton and the ParserStateBuilder class for convenient construction of + * such classes. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_HPP_ +#define _OUSIA_PARSER_STATE_HPP_ + +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class ParserStateBuilder; +class ParserState; +class HandlerData; +class Handler; +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * Set of pointers of parser states -- used for specifying a set of parent + * states. + */ +using ParserStateSet = std::unordered_set; + +/** + * Class used for the complete specification of a ParserState. Stores possible + * parent states, state handlers and arguments to be passed to that state. + */ +struct ParserState { + /** + * Vector containing all possible parent states. + */ + ParserStateSet parents; + + /** + * Descriptor of the arguments that should be passed to the handler. + */ + Arguments arguments; + + /** + * Set containing the types of the nodes that may be created in this + * ParserState. This information is needed for Parsers to reconstruct the + * current ParserState from a given ParserScope when a file is included. + */ + RttiSet createdNodeTypes; + + /** + * Pointer at a function which creates a new concrete Handler instance for + * the elements described by this state. May be nullptr in which case no + * handler instance is created. 
+ */ + HandlerConstructor elementHandler; + + /** + * Default constructor, initializes the handlers with nullptr. + */ + ParserState(); + + /** + * Constructor taking values for all fields. Use the ParserStateBuilder + * class for a more convenient construction of ParserState instances. + * + * @param parents is a vector containing all possible parent states. + * @param arguments is a descriptor of arguments that should be passed to + * the handler. + * @param createdNodeTypes is a set containing the types of the nodes tha + * may be created in this ParserState. This information is needed for + * Parsers to reconstruct the current ParserState from a given ParserScope + * when a file is included. + * @param elementHandler is a pointer at a function which creates a new + * concrete Handler instance for the elements described by this state. May + * be nullptr in which case no handler instance is created. + */ + ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr); + + /** + * Creates this ParserState from the given ParserStateBuilder instance. + */ + ParserState(const ParserStateBuilder &builder); +}; + +/** + * The ParserStateBuilder class is a class used for conveniently building new + * ParserState instances. + */ +class ParserStateBuilder { +private: + /** + * ParserState instance that is currently being built by the + * ParserStateBuilder. + */ + ParserState state; + +public: + /** + * Copies the ParserState instance and uses it as internal state. Overrides + * all changes made by the ParserStateBuilder. + * + * @param state is the state that should be copied. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &copy(const ParserState &state); + + /** + * Sets the possible parent states to the single given parent element.
+ * + * @param parent is a pointer at the parent ParserState instance that should + * be the possible parent state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &parent(const ParserState *parent); + + /** + * Sets the ParserState instances in the given ParserStateSet as the list of + * supported parent states. + * + * @param parents is a set of pointers at ParserState instances that should + * be the possible parent states. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &parents(const ParserStateSet &parents); + + /** + * Sets the arguments that should be passed to the parser state handler to + * those given as argument. + * + * @param arguments is the Arguments instance describing the Arguments that + * should be parsed to a Handler for this ParserState. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &arguments(const Arguments &arguments); + + /** + * Sets the Node types this state may produce to the given Rtti descriptor. + * + * @param type is the Rtti descriptor of the Type that may be produced by + * this state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &createdNodeType(const Rtti *type); + + /** + * Sets the Node types this state may produce to the given Rtti descriptors. + * + * @param types is a set of Rtti descriptors of the Types that may be + * produced by this state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &createdNodeTypes(const RttiSet &types); + + /** + * Sets the constructor for the element handler. The constructor creates a + * new concrete Handler instance for the elements described by this state. + * May be nullptr in which case no handler instance is created (this is + * the default value). 
+ * + * @param elementHandler is the HandlerConstructor that should create a + * new Handler instance. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); + + /** + * Returns a reference at the internal ParserState instance that was built + * using the ParserStateBuilder. + * + * @return the built ParserState. + */ + const ParserState &build() const; +}; + +/** + * Class used to deduce the ParserState a Parser is currently in based on the + * types of the Nodes that currently are on the ParserStack. Uses dynamic + * programming in order to solve this problem. + */ +class ParserStateDeductor { +public: + /** + * Type containing the dynamic programming table. + */ + using Table = std::vector>; + +private: + /** + * Dynamic programming table. + */ + Table tbl; + + /** + * Signature given in the constructor. + */ + const std::vector signature; + + /** + * List of states that should be checked for being active. + */ + const std::vector states; + + /** + * Used internally to check whether the given parser stack s may have been + * active for signature element d. + * + * @param d is the signature element. + * @param s is the parser state. + * @return true if the the given ParserState may have been active. + */ + bool isActive(size_t d, const ParserState *s); + +public: + /** + * Constructor of the ParserStateDeductor class. + * + * @param signature a Node type signature describing the types of the nodes + * which currently reside on e.g. the ParserScope stack. + * @param states is a list of states that should be checked. + */ + ParserStateDeductor(std::vector signature, + std::vector states); + + /** + * Selects all active states from the given states. Only considers those + * states that may have produced the last signature element. + * + * @return a list of states that may actually have been active. 
+ */ + std::vector deduce(); +}; + +/** + * The ParserStates namespace contains all the global state constants used + * in the ParserStack class. + */ +namespace ParserStates { +/** + * State representing all states. + */ +extern const ParserState All; + +/** + * State representing the initial state. + */ +extern const ParserState None; +} +} + +#endif /* _OUSIA_PARSER_STATE_HPP_ */ + diff --git a/src/core/parser/generic/ParserStateStack.cpp b/src/core/parser/generic/ParserStateStack.cpp new file mode 100644 index 0000000..1265851 --- /dev/null +++ b/src/core/parser/generic/ParserStateStack.cpp @@ -0,0 +1,216 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "ParserScope.hpp" +#include "ParserStack.hpp" + +namespace ousia { + +/* A default handler */ + +/** + * The DefaultHandler class is used in case no element handler is specified in + * the ParserState descriptor. 
+ */ +class DefaultHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override {} + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DefaultHandler{handlerData}; + } +}; + +/* Class Handler */ + +void Handler::data(const std::string &data, int field) +{ + if (Utils::hasNonWhitepaceChar(data)) { + logger().error("Expected command but found character data."); + } +} + +/* Class ParserStack */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + */ +static LoggableException InvalidCommand(const std::string &name, + const std::set &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +ParserStack::ParserStack( + ParserContext &ctx, + const std::multimap &states) + : ctx(ctx), states(states) +{ +} + +bool ParserStack::deduceState() +{ + // Assemble all states + std::vector states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector possibleStates = + ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) + .deduce(); + if (possibleStates.size() != 1) { + ctx.getLogger().error( + "Error while including file: Cannot deduce parser state."); + return false; + } + + // Switch to this state by creating a dummy handler + const ParserState *state = possibleStates[0]; + Handler *handler = + DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); + stack.emplace(handler); + return true; +} + +std::set 
ParserStack::expectedCommands() +{ + const ParserState *currentState = &(this->currentState()); + std::set res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const ParserState &ParserStack::currentState() +{ + return stack.empty() ? ParserStates::None : stack.top()->state(); +} + +std::string ParserStack::currentCommandName() +{ + return stack.empty() ? std::string{} : stack.top()->name(); +} + +const ParserState *ParserStack::findTargetState(const std::string &name) +{ + const ParserState *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const ParserStateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&ParserStates::All)) { + return it->second; + } + } + + return nullptr; +} + +void ParserStack::start(const std::string &name, Variant::mapType &args, + const SourceLocation &location) +{ + ParserState const *targetState = findTargetState(name); +// TODO: Andreas, please improve this. +// if (!Utils::isIdentifier(name)) { +// throw LoggableException(std::string("Invalid identifier \"") + name + +// std::string("\"")); +// } + + if (targetState == nullptr) { + targetState = findTargetState("*"); + } + if (targetState == nullptr) { + throw InvalidCommand(name, expectedCommands()); + } + + // Fetch the associated constructor + HandlerConstructor ctor = targetState->elementHandler + ? 
targetState->elementHandler + : DefaultHandler::create; + + // Canonicalize the arguments, allow additional arguments + targetState->arguments.validateMap(args, ctx.getLogger(), true); + + // Instantiate the handler and call its start function + Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); + handler->start(args); + stack.emplace(handler); +} + +void ParserStack::start(std::string name, const Variant::mapType &args, + const SourceLocation &location) +{ + Variant::mapType argsCopy(args); + start(name, argsCopy); +} + +void ParserStack::end() +{ + // Check whether the current command could be ended + if (stack.empty()) { + throw LoggableException{"No command to end."}; + } + + // Remove the current HandlerInstance from the stack + std::shared_ptr inst{stack.top()}; + stack.pop(); + + // Call the end function of the last Handler + inst->end(); +} + +void ParserStack::data(const std::string &data, int field) +{ + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException{"No command to receive data."}; + } + + // Pass the data to the current Handler instance + stack.top()->data(data, field); +} +} + diff --git a/src/core/parser/generic/ParserStateStack.hpp b/src/core/parser/generic/ParserStateStack.hpp new file mode 100644 index 0000000..efc4e4a --- /dev/null +++ b/src/core/parser/generic/ParserStateStack.hpp @@ -0,0 +1,361 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file ParserStateStack.hpp
+ *
+ * Helper classes for document or description parsers. Contains the ParserStack
+ * class, which is a pushdown automaton responsible for accepting commands in
+ * the correct order and calling specified handlers.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_PARSER_STACK_HPP_
+#define _OUSIA_PARSER_STACK_HPP_
+
+// NOTE(review): the header names of the bare `#include` lines below appear to
+// have been stripped when this patch was extracted -- restore them from the
+// repository before compiling.
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "Parser.hpp"
+#include "ParserContext.hpp"
+#include "ParserState.hpp"
+
+namespace ousia {
+
+/**
+ * Struct collecting all the data that is being passed to a Handler instance.
+ */
+struct HandlerData {
+    /**
+     * Reference to the ParserContext instance that should be used to resolve
+     * references to nodes in the Graph.
+     */
+    ParserContext &ctx;
+
+    /**
+     * Contains the name of the tag that is being handled.
+     */
+    const std::string name;
+
+    /**
+     * Contains the current state of the state machine.
+     */
+    const ParserState &state;
+
+    /**
+     * Contains the state of the state machine when the parent node was handled.
+     */
+    const ParserState &parentState;
+
+    /**
+     * Current source code location.
+     */
+    const SourceLocation location;
+
+    /**
+     * Constructor of the HandlerData class.
+     *
+     * @param ctx is the parser context the handler should be executed in.
+     * @param name is the name of the tag that is being handled.
+     * @param state is the state this handler was called for.
+     * @param parentState is the state of the parent command.
+     * @param location is the location at which the handler is created.
+     */
+    HandlerData(ParserContext &ctx, std::string name, const ParserState &state,
+                const ParserState &parentState, const SourceLocation location)
+        : ctx(ctx),
+          name(std::move(name)),
+          state(state),
+          parentState(parentState),
+          location(location){};
+};
+
+/**
+ * The handler class provides a context for handling an XML tag. It has to be
+ * overridden and registered in the StateStack class to form handlers for
+ * concrete XML tags.
+ */
+class Handler {
+private:
+    /**
+     * Structure containing the internal handler data.
+     */
+    const HandlerData handlerData;
+
+public:
+    /**
+     * Constructor of the Handler class.
+     *
+     * @param handlerData is a structure containing all data being passed to
+     * the handler.
+     */
+    Handler(const HandlerData &handlerData) : handlerData(handlerData){};
+
+    /**
+     * Virtual destructor.
+     */
+    virtual ~Handler(){};
+
+    /**
+     * Returns a reference at the ParserContext.
+     *
+     * @return a reference at the ParserContext.
+     */
+    ParserContext &context() { return handlerData.ctx; }
+
+    /**
+     * Returns the command name for which the handler was created.
+     *
+     * @return a const reference at the command name.
+     */
+    const std::string &name() { return handlerData.name; }
+
+    /**
+     * Returns a reference at the ParserScope instance.
+     *
+     * @return a reference at the ParserScope instance.
+     */
+    ParserScope &scope() { return handlerData.ctx.getScope(); }
+
+    /**
+     * Returns a reference at the Manager instance which manages all nodes.
+     *
+     * @return a reference at the Manager instance.
+     */
+    Manager &manager() { return handlerData.ctx.getManager(); }
+
+    /**
+     * Returns a reference at the Logger instance used for logging error
+     * messages.
+     *
+     * @return a reference at the Logger instance.
+     */
+    Logger &logger() { return handlerData.ctx.getLogger(); }
+
+    /**
+     * Returns a reference at the Project Node, representing the project into
+     * which the file is currently being parsed.
+     *
+     * NOTE(review): the return type reads as a bare `Rooted`; its template
+     * argument was presumably lost in extraction -- confirm against the
+     * repository.
+     *
+     * @return a reference at the Project Node.
+     */
+    Rooted project() { return handlerData.ctx.getProject(); }
+
+    /**
+     * Reference at the ParserState descriptor for which this Handler was
+     * created.
+     *
+     * @return a const reference at the constructing ParserState descriptor.
+     */
+    const ParserState &state() { return handlerData.state; }
+
+    /**
+     * Reference at the ParserState descriptor of the parent state of the state
+     * for which this Handler was created. Set to ParserStates::None if there
+     * is no parent state.
+     *
+     * @return a const reference at the parent state of the constructing
+     * ParserState descriptor.
+     */
+    const ParserState &parentState() { return handlerData.parentState; }
+
+    /**
+     * Returns the current location in the source file.
+     *
+     * @return the current location in the source file.
+     */
+    SourceLocation location() { return handlerData.location; }
+
+    /**
+     * Called when the command that was specified in the constructor is
+     * instantiated.
+     *
+     * @param args is a map from strings to variants (argument name and value).
+     */
+    virtual void start(Variant::mapType &args) = 0;
+
+    /**
+     * Called whenever the command for which this handler is defined ends.
+     */
+    virtual void end() = 0;
+
+    /**
+     * Called whenever raw data (in the form of a string) is available for the
+     * Handler instance. The default implementation logs an error if the
+     * received data contains non-whitespace characters.
+     *
+     * @param data is the character data that is available for the Handler
+     * instance.
+     * @param field is the field number (the interpretation of this value
+     * depends on the format that is being parsed).
+     */
+    virtual void data(const std::string &data, int field);
+};
+
+/**
+ * HandlerConstructor is a function pointer type used to create concrete
+ * instances of the Handler class.
+ *
+ * @param handlerData is the data that should be passed to the new handler
+ * instance.
+ * @return a newly created handler instance.
+ */
+using HandlerConstructor = Handler *(*)(const HandlerData &handlerData);
+
+/**
+ * The ParserStack class is a pushdown automaton responsible for turning a
+ * command stream into a tree of Node instances.
+ */
+class ParserStack {
+private:
+    /**
+     * Reference at the parser context.
+     */
+    ParserContext &ctx;
+
+    /**
+     * Map containing all registered command names and the corresponding
+     * state descriptors. Only a reference is stored, so the map has to
+     * outlive the ParserStack instance.
+     */
+    const std::multimap<std::string, const ParserState *> &states;
+
+    /**
+     * Internal stack used for managing the currently active Handler instances.
+     */
+    std::stack<std::shared_ptr<Handler>> stack;
+
+    /**
+     * Used internally to get all expected command names for the current state.
+     * This function is used to build error messages.
+     *
+     * @return a set of strings containing the names of the expected commands.
+     */
+    std::set<std::string> expectedCommands();
+
+    /**
+     * Returns the target state for a command with the given name that can be
+     * reached from the current state.
+     *
+     * @param name is the name of the requested command.
+     * @return nullptr if no target state was found, a pointer at the target
+     * state otherwise.
+     */
+    const ParserState *findTargetState(const std::string &name);
+
+public:
+    /**
+     * Creates a new instance of the ParserStack class.
+     *
+     * @param ctx is the parser context the parser stack is working on.
+     * @param states is a map containing the command names and pointers at the
+     * corresponding ParserState instances.
+     */
+    ParserStack(ParserContext &ctx,
+                const std::multimap<std::string, const ParserState *> &states);
+
+    /**
+     * Tries to reconstruct the parser state from the Scope instance of the
+     * ParserContext given in the constructor. This functionality is needed for
+     * including files, as the Parser of the included file needs to be brought
+     * to an equivalent state as the one in the including file. Error messages
+     * are written to the logger of the ParserContext.
+     *
+     * @return true if the operation was successful, false otherwise.
+     */
+    bool deduceState();
+
+    /**
+     * Returns the state the ParserStack instance currently is in.
+     *
+     * @return the state of the currently active Handler instance or
+     * ParserStates::None if no handler is on the stack.
+     */
+    const ParserState &currentState();
+
+    /**
+     * Returns the command name that is currently being handled.
+     *
+     * @return the name of the command currently being handled by the active
+     * Handler instance or an empty string if no handler is currently active.
+     */
+    std::string currentCommandName();
+
+    /**
+     * Function that should be called whenever a new command starts.
+     *
+     * @param name is the name of the command.
+     * @param args is a map from strings to variants (argument name and value).
+     * Note that the passed map will be modified.
+     * @param location is the location in the source file at which the command
+     * starts.
+     * @throws LoggableException if the command is not valid in the current
+     * state.
+     */
+    void start(const std::string &name, Variant::mapType &args,
+               const SourceLocation &location = SourceLocation{});
+
+    /**
+     * Function that should be called whenever a new command starts. In
+     * contrast to the other overload the passed argument map is not modified.
+     *
+     * @param name is the name of the command.
+     * @param args is a map from strings to variants (argument name and value).
+     * @param location is the location in the source file at which the command
+     * starts.
+     * @throws LoggableException if the command is not valid in the current
+     * state.
+     */
+    void start(std::string name,
+               const Variant::mapType &args = Variant::mapType{},
+               const SourceLocation &location = SourceLocation{});
+
+    /**
+     * Function called whenever a command ends.
+     *
+     * @throws LoggableException if there is no command that could be ended.
+     */
+    void end();
+
+    /**
+     * Function that should be called whenever data is available for the
+     * command.
+     *
+     * @param data is the data that should be passed to the handler.
+     * @param field is the field number (the interpretation of this value
+     * depends on the format that is being parsed).
+     * @throws LoggableException if there is no command that could receive the
+     * data.
+     */
+    void data(const std::string &data, int field = 0);
+
+    /**
+     * Returns a reference to the parser context the parser stack is currently
+     * working on.
+     *
+     * @return a reference to the parser context.
+     */
+    ParserContext &getContext() { return ctx; }
+};
+}
+
+#endif /* _OUSIA_PARSER_STACK_HPP_ */
+
-- cgit v1.2.3