From efe60ac3c3a8725ac71329c0bb19fa9d9c58f399 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:42:05 +0100 Subject: Moved specific file format parsers to formats/ folder, moved old tokenizer to css code (this is the only place where it is actually used) --- src/core/CodeTokenizer.cpp | 169 --- src/core/CodeTokenizer.hpp | 136 --- src/core/Tokenizer.cpp | 204 ---- src/core/Tokenizer.hpp | 227 ---- src/core/parser/ParserStack.cpp | 216 ---- src/core/parser/ParserStack.hpp | 361 ------- src/core/parser/ParserState.cpp | 161 --- src/core/parser/ParserState.hpp | 284 ----- src/core/parser/generic/ParserState.cpp | 161 +++ src/core/parser/generic/ParserState.hpp | 284 +++++ src/core/parser/generic/ParserStateStack.cpp | 216 ++++ src/core/parser/generic/ParserStateStack.hpp | 361 +++++++ src/formats/osdmx/OsdmxParser.cpp | 1435 ++++++++++++++++++++++++++ src/formats/osdmx/OsdmxParser.hpp | 55 + src/plugins/css/CodeTokenizer.cpp | 169 +++ src/plugins/css/CodeTokenizer.hpp | 136 +++ src/plugins/css/Tokenizer.cpp | 204 ++++ src/plugins/css/Tokenizer.hpp | 227 ++++ src/plugins/xml/XmlParser.cpp | 1435 -------------------------- src/plugins/xml/XmlParser.hpp | 55 - test/core/CodeTokenizerTest.cpp | 100 -- test/core/TokenizerTest.cpp | 118 --- test/formats/osdmx/OsdmxParserTest.cpp | 314 ++++++ test/plugins/css/CodeTokenizerTest.cpp | 100 ++ test/plugins/css/TokenizerTest.cpp | 118 +++ test/plugins/xml/XmlParserTest.cpp | 314 ------ 26 files changed, 3780 insertions(+), 3780 deletions(-) delete mode 100644 src/core/CodeTokenizer.cpp delete mode 100644 src/core/CodeTokenizer.hpp delete mode 100644 src/core/Tokenizer.cpp delete mode 100644 src/core/Tokenizer.hpp delete mode 100644 src/core/parser/ParserStack.cpp delete mode 100644 src/core/parser/ParserStack.hpp delete mode 100644 src/core/parser/ParserState.cpp delete mode 100644 src/core/parser/ParserState.hpp create mode 100644 src/core/parser/generic/ParserState.cpp create mode 100644 
src/core/parser/generic/ParserState.hpp create mode 100644 src/core/parser/generic/ParserStateStack.cpp create mode 100644 src/core/parser/generic/ParserStateStack.hpp create mode 100644 src/formats/osdmx/OsdmxParser.cpp create mode 100644 src/formats/osdmx/OsdmxParser.hpp create mode 100644 src/plugins/css/CodeTokenizer.cpp create mode 100644 src/plugins/css/CodeTokenizer.hpp create mode 100644 src/plugins/css/Tokenizer.cpp create mode 100644 src/plugins/css/Tokenizer.hpp delete mode 100644 src/plugins/xml/XmlParser.cpp delete mode 100644 src/plugins/xml/XmlParser.hpp delete mode 100644 test/core/CodeTokenizerTest.cpp delete mode 100644 test/core/TokenizerTest.cpp create mode 100644 test/formats/osdmx/OsdmxParserTest.cpp create mode 100644 test/plugins/css/CodeTokenizerTest.cpp create mode 100644 test/plugins/css/TokenizerTest.cpp delete mode 100644 test/plugins/xml/XmlParserTest.cpp diff --git a/src/core/CodeTokenizer.cpp b/src/core/CodeTokenizer.cpp deleted file mode 100644 index d65c514..0000000 --- a/src/core/CodeTokenizer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include "CodeTokenizer.hpp" - -namespace ousia { - -Token CodeTokenizer::constructToken(const Token &t) -{ - std::string content = buf.str(); - buf.str(std::string()); - return Token{ - returnTokenId, content, - SourceLocation{t.location.getSourceId(), startToken.location.getStart(), - t.location.getEnd()}}; -} - -void CodeTokenizer::buffer(const Token &t) { buf << t.content; } - -bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - auto it = descriptors.find(t.tokenId); - CodeTokenMode mode = CodeTokenMode::NONE; - if (it != descriptors.end()) { - mode = it->second.mode; - } - - switch (state) { - case CodeTokenizerState::NORMAL: - switch (mode) { - case CodeTokenMode::STRING_START_END: - state = CodeTokenizerState::IN_STRING; - break; - case CodeTokenMode::BLOCK_COMMENT_START: - state = CodeTokenizerState::IN_BLOCK_COMMENT; - break; - case CodeTokenMode::LINE_COMMENT: - state = CodeTokenizerState::IN_LINE_COMMENT; - break; - case CodeTokenMode::LINEBREAK: - if (!ignoreLinebreaks) { - peeked.push_back( - {it->second.id, t.content, t.location}); - } - return !ignoreLinebreaks; - default: - bool empty = true; - if (t.tokenId == TOKEN_TEXT) { - int begin = -1; - for (size_t c = 0; c < t.content.length(); c++) { - bool isWhitespace = - t.content[c] == ' ' || t.content[c] == '\t'; - if (begin < 0) { - // if we have not yet set our beginning, - // we wait for the first - // non-whitespace-character to set it. - if (!isWhitespace) { - begin = c; - } - } else { - // if we have set our beginning, we wait for the - // first whitespace character, which marks the - // end of the current word. 
- if (isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - SourceLocation{ - t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getStart() + c}}); - begin = -1; - empty = false; - } - } - } - if (begin >= 0) { - peeked.push_back(Token{ - TOKEN_TEXT, t.content.substr(begin), - SourceLocation{t.location.getSourceId(), - t.location.getStart() + begin, - t.location.getEnd()}}); - empty = false; - } - } else { - empty = false; - peeked.push_back(t); - } - return !empty; - } - startToken = t; - returnTokenId = it->second.id; - return false; - case CodeTokenizerState::IN_LINE_COMMENT: - switch (mode) { - case CodeTokenMode::LINEBREAK: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_BLOCK_COMMENT: - switch (mode) { - case CodeTokenMode::BLOCK_COMMENT_END: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_STRING: - switch (mode) { - case CodeTokenMode::ESCAPE: - if (escaped) { - buffer(t); - } - escaped = !escaped; - return false; - case CodeTokenMode::STRING_START_END: - if (escaped) { - buffer(t); - escaped = false; - return false; - } else { - peeked.push_back(constructToken(t)); - state = CodeTokenizerState::NORMAL; - return true; - } - default: - if (escaped) { - // TODO: handle escaped characters? 
- escaped = false; - } - buffer(t); - return false; - } - } - assert(false); - return false; -} -} diff --git a/src/core/CodeTokenizer.hpp b/src/core/CodeTokenizer.hpp deleted file mode 100644 index 154f949..0000000 --- a/src/core/CodeTokenizer.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file CodeTokenizer.hpp - - * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) - */ -#ifndef _OUSIA_CODE_TOKENIZER_HPP_ -#define _OUSIA_CODE_TOKENIZER_HPP_ - -#include -#include - -#include -#include "Tokenizer.hpp" - -namespace ousia { - -/* - * This enum contains all special Token the CodeTokenizer supports, namely: - * - * 1.) An ambigous Tokens - in post programming languages single-quotes ' or - * double-quotes " - to delimit string tokens. - * 2.) A start token for line comments, which would e.g. be // in Java. - * 3.) A start token for a block comment - * 4.) An end token for a block comment. - * 5.) A linebreak token - * 6.) The escape token, which would e.g. be \ in java. - */ -enum class CodeTokenMode { - STRING_START_END, - LINE_COMMENT, - BLOCK_COMMENT_START, - BLOCK_COMMENT_END, - LINEBREAK, - ESCAPE, - NONE -}; - -/** - * A CodeTokenDescriptor defines the id the user likes to have returned for - * a Token of the mode specified, e.g. 
if you want to get the id 4 for a - * String Token the corresponding CodeTokenDescriptor would be inizialized - * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; - */ -struct CodeTokenDescriptor { - CodeTokenMode mode; - int id; - - CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} -}; - -/** - * The CodeTokenizer is a finite state machine with the states NORMAL, being - * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. - */ -enum class CodeTokenizerState { - NORMAL, - IN_BLOCK_COMMENT, - IN_LINE_COMMENT, - IN_STRING -}; - -/** - * The purpose of a CodeTokenizer is to make it easier to parse classical - * programming Code. It adds the following features to a regular Tokenizer: - * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens - * for the opening delimiter, the text and the closing delimiter. - * 2.) Escaping in String tokens. - * 3.) Comment Tokens (for line comments as well as block comments) - */ -class CodeTokenizer : public Tokenizer { -private: - std::map descriptors; - CodeTokenizerState state; - std::stringstream buf; - Token startToken; - int returnTokenId; - bool escaped = false; - - Token constructToken(const Token &t); - void buffer(const Token &t); - -protected: - bool doPrepare(const Token &t, std::deque &peeked) override; - -public: - /** - * If you do not want comment tokens to be returned you can set this to - * true. - */ - bool ignoreComments = false; - /** - * If you do not want linebreaks to be returned you can set this to true. - */ - bool ignoreLinebreaks = false; - - /** - * - * @param input a CharReader containing the input for this tokenizer, as - * with a regular tokenizer. - * @param root a TokenTreeNode representing the root of the TokenTree. - * Please note that you have to specify all tokenIDs here that you use - * in the descriptors map. - * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. 
- * In this way you can specify the meaning of certain Tokens. Say you - * specified the Token "//" with the id 1 in the TokenTree. Then you could - * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map - * and this CodeTokenizer would recognize the token "//" as starting a - * line comment. - */ - CodeTokenizer(CharReader &input, const TokenTreeNode &root, - std::map descriptors) - : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) - { - } -}; -} - -#endif diff --git a/src/core/Tokenizer.cpp b/src/core/Tokenizer.cpp deleted file mode 100644 index ab4735a..0000000 --- a/src/core/Tokenizer.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include "Tokenizer.hpp" - -namespace ousia { - -static std::map buildChildren( - const std::map &inputs) -{ - std::map children; - std::map> nexts; - - for (auto &e : inputs) { - const std::string &s = e.first; - const int id = e.second; - if (s.empty()) { - continue; - } - char start = s[0]; - const std::string suffix = s.substr(1); - if (nexts.find(start) != nexts.end()) { - nexts[start].insert(std::make_pair(suffix, id)); - } else { - nexts.insert(std::make_pair( - start, std::map{{suffix, id}})); - } - } - - for (auto &n : nexts) { - children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); - } - - return children; -} - -static int buildId(const std::map &inputs) -{ - int tokenId = TOKEN_NONE; - for (auto &e : inputs) { - if (e.first.empty()) { - if (tokenId != TOKEN_NONE) { - throw TokenizerException{std::string{"Ambigous token found: "} + - std::to_string(e.second)}; - } else { - tokenId = e.second; - } - } - } - return tokenId; -} - -TokenTreeNode::TokenTreeNode(const std::map &inputs) - : children(buildChildren(inputs)), tokenId(buildId(inputs)) -{ -} - -Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) - : input(input), root(root) -{ -} - -bool Tokenizer::prepare() -{ - std::stringstream buffer; - char c; - SourcePosition start = input.getOffset(); - bool bufEmpty = true; - while (input.peek(c)) { - if (root.children.find(c) != root.children.end()) { - // if there might be a special token, keep peeking forward - // until we find the token (or we don't). - TokenTreeNode const *n = &root; - std::stringstream tBuf; - int match = TOKEN_NONE; - while (true) { - tBuf << c; - n = &(n->children.at(c)); - if (n->tokenId != TOKEN_NONE) { - match = n->tokenId; - // from here on we found a token. If we have something - // in our buffer already, we end the search now. 
- if (!bufEmpty) { - break; - } else { - // if we want to return this token ( = we have nothing - // in our buffer yet) we look greedily for the longest - // possible token we can construct. - input.consumePeek(); - } - } - if (!input.peek(c)) { - // if we are at the end we break off the search. - break; - } - if (n->children.find(c) == n->children.end()) { - // if we do not find a possible continuation anymore, - // break off the search. - break; - } - } - //reset the peek pointer to the last valid position. - input.resetPeek(); - // check if we did indeed find a special token. - if (match != TOKEN_NONE) { - if (bufEmpty) { - // if we did not have text before, construct that token. - if (doPrepare( - Token{match, tBuf.str(), input.getLocation(start)}, - peeked)) { - return true; - } else { - start = input.getOffset(); - continue; - } - } else { - // otherwise we return the text before the token. - if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked)) { - return true; - } else{ - //we need to clear the buffer here. After all the token - //corresponding to this buffer segment is already - //constructed. - buffer.str(std::string()); - bufEmpty = true; - start = input.getOffset(); - continue; - } - } - } else{ - //if we found nothing, read at least one character. 
- input.peek(c); - } - } - buffer << c; - bufEmpty = false; - input.consumePeek(); - } - if (!bufEmpty) { - return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, - peeked); - } - return false; -} - -bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - peeked.push_back(t); - return true; -} - -bool Tokenizer::next(Token &t) -{ - if (peeked.empty()) { - if (!prepare()) { - return false; - } - } - t = peeked.front(); - peeked.pop_front(); - resetPeek(); - return true; -} - -bool Tokenizer::peek(Token &t) -{ - if (peekCursor >= peeked.size()) { - if (!prepare()) { - return false; - } - } - t = peeked[peekCursor]; - peekCursor++; - return true; -} - -void Tokenizer::resetPeek() { peekCursor = 0; } - -void Tokenizer::consumePeek() -{ - while (peekCursor > 0) { - peeked.pop_front(); - peekCursor--; - } -} -} diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp deleted file mode 100644 index 50e458c..0000000 --- a/src/core/Tokenizer.hpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_TOKENIZER_HPP_ -#define _OUSIA_TOKENIZER_HPP_ - -#include -#include -#include -#include - -#include - -namespace ousia { - -/** - * This exception is currently only thrown if errors are made during the - * initialization of the Tokenizer. 
Have a closer look at the documentation - * of the TokenTreeNode constructor for more information. - */ -class TokenizerException : public std::exception { -public: - const std::string msg; - - TokenizerException(const std::string &msg) : msg(msg){}; - - virtual const char *what() const noexcept override { return msg.c_str(); } -}; - -/** - * The Tokenizer internally uses a TokenTree to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * The TokenTree is a construct that structures all special tokens this - * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then - * the TokenTree would look like this: - * - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * - * Every node in the TokenTree is a valid end state that has a $ attached to it. - * During the search algorithm the Tokenizer goes through the tree and stores - * the last valid position. If a character follows that does not lead to a new - * node in the TokenTree the search ends (and starts again at this character). - * The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - * - * It is also not allowed that tokens have common middle parts but varying - * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and - * the input string "abc". In that case we start looking for "abd" at the - * start, won't find it, wenn we hit "c" and start the scanning process - * anew. Thus the "bc" token is not found. - * - * For most (well-behaved) tokenization schemes this is not the case, - * though. 
- */ -class TokenTreeNode { -public: - const std::map children; - const int tokenId; - - /** - * The TokenTreeNode constructor builds a TokenTree from the given token - * specifications. The node returned by this constructor then is the root of - * said TokenTree. - * @param inputs Specifications of tokens in map form. Each specification - * is a tuple of the text that should be matched and some unique ID (>= 0) - * that is returned to you if that Token is found in the text. - * An example for such a map would be - * { - * { "#" , 1}, - * { "##", 2}, - * { "/" , 3} - * } - * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE - * (-1) and TOKEN_TEXT (-2). - */ - TokenTreeNode(const std::map &inputs); -}; - -/** - * This is a reserved constant for the empty token. - */ -static const int TOKEN_NONE = -1; -/** - * This is a reserved constant for every part of the input text that is not a - * specified token. - */ -static const int TOKEN_TEXT = -2; - -/** - * A token for us is identified by an integer tokenID (either one of the - * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). - * Additionally we return the matched text (which should only be really - * interesting in case of TOKEN_TEXT tokens) and the position in the input text. - */ -struct Token { - int tokenId; - std::string content; - SourceLocation location; - - Token(int tokenId, std::string content, SourceLocation location) - : tokenId(tokenId), - content(content), - location(location) - { - } - - Token() : tokenId(TOKEN_NONE) {} -}; - -/** - * A Tokenizer has the purpose of subdividing an input text into tokens. In our - * definition here we distinguish between two kinds of tokens: - * 1.) User-specified tokens that match a fixed text. - * 2.) Any other text between those tokens. - * The user might want to specify the tokens '#{' and '#}' for example, because - * they have some meaning in her code. The user sets the IDs to 1 and 2. 
- * Given the input text - * "some text #{ special command #} some text" - * the tokenizer would return the tokens: - * 1.) "some text " with the id TOKEN_TEXT (-2). - * 2.) "#{" with the id 1. - * 3.) " special command " with the id TOKEN_TEXT (-2). - * 4.) "#}" with the id 2. - * 5.) " some text" with the id TOKEN_TEXT (-2). - * This makes the subsequent parsing of files of a specific type easier. - * Note that in case of tokens with that are prefixes of other tokens the - * longest possible match is returned. - */ -class Tokenizer { -private: - CharReader &input; - const TokenTreeNode &root; - std::deque peeked; - unsigned int peekCursor = 0; - - bool prepare(); - -protected: - /** - * This method is an interface to build multiple tokens from a single one in - * derived classes. This might be interesting if you want to implement - * further logic on text tokens or similar applications. - * - * @param t a Token the "basic" tokenizer found. - * @param peeked a reference to the deque containing all temporary Tokens. - * You are supposed to append your tokens there. In the trivial case you just - * put the given Token on top of the deque. - * @return false if no token was appended to the deque (meaning that you want - * to ignore the given token explicitly) and true in all other cases. - */ - virtual bool doPrepare(const Token &t, std::deque &peeked); - -public: - /** - * @param input The input of a Tokenizer is given in the form of a - * CharReader. Please refer to the respective documentation. - * @param root This is meant to be the root of a TokenTree giving the - * specification of user-defined tokens this Tokenizer should recognize. - * The Tokenizer promises to not change the TokenTree such that you can - * re-use the same specification for multiple inputs. - * Please refer to the TokenTreeNode documentation for more information. 
- */ - Tokenizer(CharReader &input, const TokenTreeNode &root); - - /** - * The next method consumes one Token from the input stream and gives - * it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool next(Token &t); - /** - * The peek method does not consume the next Token but buffers it and - * shows it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool peek(Token &t); - - /** - * Resets the peek pointer to the current position in the stream (to the - * beginning of the buffer). - */ - void resetPeek(); - - /** - * Clears the peek buffer, such that all peeked Tokens are consumed. - */ - void consumePeek(); - - const CharReader &getInput() const { return input; } - - CharReader &getInput() { return input; } -}; -} - -#endif diff --git a/src/core/parser/ParserStack.cpp b/src/core/parser/ParserStack.cpp deleted file mode 100644 index 1265851..0000000 --- a/src/core/parser/ParserStack.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include -#include -#include - -#include "ParserScope.hpp" -#include "ParserStack.hpp" - -namespace ousia { - -/* A default handler */ - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. - */ -class DefaultHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override {} - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DefaultHandler{handlerData}; - } -}; - -/* Class Handler */ - -void Handler::data(const std::string &data, int field) -{ - if (Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class ParserStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? 
std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStack::ParserStack( - ParserContext &ctx, - const std::multimap &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStack::deduceState() -{ - // Assemble all states - std::vector states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); - stack.emplace(handler); - return true; -} - -std::set ParserStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStack::currentCommandName() -{ - return stack.empty() ? 
std::string{} : stack.top()->name(); -} - -const ParserState *ParserStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? 
targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/ParserStack.hpp b/src/core/parser/ParserStack.hpp deleted file mode 100644 index efc4e4a..0000000 --- a/src/core/parser/ParserStack.hpp +++ /dev/null @@ -1,361 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStack.hpp - * - * Helper classes for document or description parsers. Contains the ParserStack - * class, which is an pushdown automaton responsible for accepting commands in - * the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STACK_HPP_ -#define _OUSIA_PARSER_STACK_HPP_ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * Struct collecting all the data that is being passed to a Handler instance. - */ -struct HandlerData { - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. - */ - ParserContext &ctx; - - /** - * Contains the name of the tag that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. 
- */ - HandlerData(ParserContext &ctx, std::string name, const ParserState &state, - const ParserState &parentState, const SourceLocation location) - : ctx(ctx), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class Handler { -private: - /** - * Structure containing the internal handler data. - */ - const HandlerData handlerData; - -public: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - Handler(const HandlerData &handlerData) : handlerData(handlerData){}; - - /** - * Virtual destructor. - */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context() { return handlerData.ctx; } - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name() { return handlerData.name; } - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope() { return handlerData.ctx.getScope(); } - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager() { return handlerData.ctx.getManager(); } - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger() { return handlerData.ctx.getLogger(); } - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. 
- */ - Rooted project() { return handlerData.ctx.getProject(); } - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state() { return handlerData.state; } - - /** - * Reference at the ParserState descriptor of the parent state of the state - * for which this Handler was created. Set to ParserStates::None if there - * is no parent state. - * - * @return a const reference at the parent state of the constructing - * ParserState descriptor. - */ - const ParserState &parentState() { return handlerData.parentState; } - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location() { return handlerData.location; } - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. - */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. 
- */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The ParserStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStack class. - * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStack(ParserContext &ctx, - const std::multimap &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. - * @param logger is the logger instance to which error messages should be - * written. 
- * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState ¤tState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * Note that the passed map will be modified. - * @param location is the location in the source file at which the command - * starts. - */ - void start(const std::string &name, Variant::mapType &args, - const SourceLocation &location = SourceLocation{}); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * @param location is the location in the source file at which the command - * starts. - */ - void start(std::string name, - const Variant::mapType &args = Variant::mapType{}, - const SourceLocation &location = SourceLocation{}); - - /** - * Function called whenever a command ends. - */ - void end(); - - /** - * Function that should be called whenever data is available for the - * command. - * - * @param data is the data that should be passed to the handler. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - void data(const std::string &data, int field = 0); - - /** - * Returns a reference to the parser context the parser stack is currently - * working on. 
- * - * @return a reference to the parser context. - */ - ParserContext &getContext() { return ctx; } -}; -} - -#endif /* _OUSIA_PARSER_STACK_HPP_ */ - diff --git a/src/core/parser/ParserState.cpp b/src/core/parser/ParserState.cpp deleted file mode 100644 index f635d86..0000000 --- a/src/core/parser/ParserState.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "ParserState.hpp" - -namespace ousia { - -/* Class ParserState */ - -ParserState::ParserState() : elementHandler(nullptr) {} - -ParserState::ParserState(ParserStateSet parents, Arguments arguments, - RttiSet createdNodeTypes, - HandlerConstructor elementHandler) - : parents(parents), - arguments(arguments), - createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) -{ -} - -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) -{ -} - -/* Class ParserStateBuilder */ - -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) -{ - this->state = state; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) -{ - state.parents = ParserStateSet{parent}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) -{ - state.parents = parents; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) -{ - state.arguments = arguments; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) -{ - state.createdNodeTypes = RttiSet{type}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) -{ - state.createdNodeTypes = types; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::elementHandler( - HandlerConstructor elementHandler) -{ - state.elementHandler = elementHandler; - return *this; -} - -const ParserState &ParserStateBuilder::build() const { return state; } - -/* Class ParserStateDeductor */ - -ParserStateDeductor::ParserStateDeductor( - std::vector signature, - std::vector states) - : tbl(signature.size()), - signature(std::move(signature)), - states(std::move(states)) -{ -} - -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) -{ - // Lookup the "active" state of (d, s), if it was not already set - // (e.second is true) we'll have to calculate it - auto e 
= tbl[d].emplace(s, false); - bool &res = e.first->second; - if (!e.second) { - return res; - } - - // Check whether this node is generative (may have produced the Node - // described by the current Signature element) - bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); - - if (isGenerative && d == 0) { - // End of recursion -- the last signature element is reached and the - // node was generative - res = true; - } else { - // Try repetition of this node - if (isGenerative && isActive(d - 1, s)) { - res = true; - } else { - // Check whether any of the parent nodes were active -- either for - // the previous element (if this one is generative) or for the - // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { - if ((isGenerative && isActive(d - 1, parent)) || - isActive(d, parent)) { - res = true; - break; - } - } - } - } - - return res; -} - -std::vector ParserStateDeductor::deduce() -{ - std::vector res; - if (!signature.empty()) { - const size_t D = signature.size(); - for (auto s : states) { - if (signature[D - 1]->isOneOf(s->createdNodeTypes) && - isActive(D - 1, s)) { - res.push_back(s); - } - } - } - return res; -} - -/* Constant initializations */ - -namespace ParserStates { -const ParserState All; -const ParserState None; -} -} - diff --git a/src/core/parser/ParserState.hpp b/src/core/parser/ParserState.hpp deleted file mode 100644 index 6487fdd..0000000 --- a/src/core/parser/ParserState.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserState.hpp - * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of - * such classes. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_HPP_ -#define _OUSIA_PARSER_STATE_HPP_ - -#include - -#include -#include - -namespace ousia { - -// Forward declarations -class ParserStateBuilder; -class ParserState; -class HandlerData; -class Handler; -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * Set of pointers of parser states -- used for specifying a set of parent - * states. - */ -using ParserStateSet = std::unordered_set; - -/** - * Class used for the complete specification of a ParserState. Stores possible - * parent states, state handlers and arguments to be passed to that state. - */ -struct ParserState { - /** - * Vector containing all possible parent states. - */ - ParserStateSet parents; - - /** - * Descriptor of the arguments that should be passed to the handler. - */ - Arguments arguments; - - /** - * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. - */ - RttiSet createdNodeTypes; - - /** - * Pointer at a function which creates a new concrete Handler instance for - * the elements described by this state. May be nullptr in which case no - * handler instance is created. 
- */ - HandlerConstructor elementHandler; - - /** - * Default constructor, initializes the handlers with nullptr. - */ - ParserState(); - - /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. - * - * @param parents is a vector containing all possible parent states. - * @param arguments is a descriptor of arguments that should be passed to - * the handler. - * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope - * when a file is included. - * @param elementHandler is a pointer at a function which creates a new - * concrete Handler instance for the elements described by this state. May - * be nullptr in which case no handler instance is created. - */ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); - - /** - * Creates this ParserState from the given ParserStateBuilder instance. - */ - ParserState(const ParserStateBuilder &builder); -}; - -/** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. - */ -class ParserStateBuilder { -private: - /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. - */ - ParserState state; - -public: - /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. - * - * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder ©(const ParserState &state); - - /** - * Sets the possible parent states to the single given parent element. 
- * - * @param parent is a pointer at the parent ParserState instance that should - * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parent(const ParserState *parent); - - /** - * Sets the ParserState instances in the given ParserStateSet as the list of - * supported parent states. - * - * @param parents is a set of pointers at ParserState instances that should - * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parents(const ParserStateSet &parents); - - /** - * Sets the arguments that should be passed to the parser state handler to - * those given as argument. - * - * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &arguments(const Arguments &arguments); - - /** - * Sets the Node types this state may produce to the given Rtti descriptor. - * - * @param type is the Rtti descriptor of the Type that may be produced by - * this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeType(const Rtti *type); - - /** - * Sets the Node types this state may produce to the given Rtti descriptors. - * - * @param types is a set of Rtti descriptors of the Types that may be - * produced by this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); - - /** - * Sets the constructor for the element handler. The constructor creates a - * new concrete Handler instance for the elements described by this state. - * May be nullptr in which case no handler instance is created (this is - * the default value). 
- * - * @param elementHandler is the HandlerConstructor that should create a - * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); - - /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. - * - * @return the built ParserState. - */ - const ParserState &build() const; -}; - -/** - * Class used to deduce the ParserState a Parser is currently in based on the - * types of the Nodes that currently are on the ParserStack. Uses dynamic - * programming in order to solve this problem. - */ -class ParserStateDeductor { -public: - /** - * Type containing the dynamic programming table. - */ - using Table = std::vector>; - -private: - /** - * Dynamic programming table. - */ - Table tbl; - - /** - * Signature given in the constructor. - */ - const std::vector signature; - - /** - * List of states that should be checked for being active. - */ - const std::vector states; - - /** - * Used internally to check whether the given parser stack s may have been - * active for signature element d. - * - * @param d is the signature element. - * @param s is the parser state. - * @return true if the the given ParserState may have been active. - */ - bool isActive(size_t d, const ParserState *s); - -public: - /** - * Constructor of the ParserStateDeductor class. - * - * @param signature a Node type signature describing the types of the nodes - * which currently reside on e.g. the ParserScope stack. - * @param states is a list of states that should be checked. - */ - ParserStateDeductor(std::vector signature, - std::vector states); - - /** - * Selects all active states from the given states. Only considers those - * states that may have produced the last signature element. - * - * @return a list of states that may actually have been active. 
- */ - std::vector deduce(); -}; - -/** - * The ParserStates namespace contains all the global state constants used - * in the ParserStack class. - */ -namespace ParserStates { -/** - * State representing all states. - */ -extern const ParserState All; - -/** - * State representing the initial state. - */ -extern const ParserState None; -} -} - -#endif /* _OUSIA_PARSER_STATE_HPP_ */ - diff --git a/src/core/parser/generic/ParserState.cpp b/src/core/parser/generic/ParserState.cpp new file mode 100644 index 0000000..f635d86 --- /dev/null +++ b/src/core/parser/generic/ParserState.cpp @@ -0,0 +1,161 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "ParserState.hpp" + +namespace ousia { + +/* Class ParserState */ + +ParserState::ParserState() : elementHandler(nullptr) {} + +ParserState::ParserState(ParserStateSet parents, Arguments arguments, + RttiSet createdNodeTypes, + HandlerConstructor elementHandler) + : parents(parents), + arguments(arguments), + createdNodeTypes(createdNodeTypes), + elementHandler(elementHandler) +{ +} + +ParserState::ParserState(const ParserStateBuilder &builder) + : ParserState(builder.build()) +{ +} + +/* Class ParserStateBuilder */ + +ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) +{ + this->state = state; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) +{ + state.parents = ParserStateSet{parent}; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) +{ + state.parents = parents; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) +{ + state.arguments = arguments; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) +{ + state.createdNodeTypes = RttiSet{type}; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) +{ + state.createdNodeTypes = types; + return *this; +} + +ParserStateBuilder &ParserStateBuilder::elementHandler( + HandlerConstructor elementHandler) +{ + state.elementHandler = elementHandler; + return *this; +} + +const ParserState &ParserStateBuilder::build() const { return state; } + +/* Class ParserStateDeductor */ + +ParserStateDeductor::ParserStateDeductor( + std::vector signature, + std::vector states) + : tbl(signature.size()), + signature(std::move(signature)), + states(std::move(states)) +{ +} + +bool ParserStateDeductor::isActive(size_t d, const ParserState *s) +{ + // Lookup the "active" state of (d, s), if it was not already set + // (e.second is true) we'll have to calculate it + auto e 
= tbl[d].emplace(s, false); + bool &res = e.first->second; + if (!e.second) { + return res; + } + + // Check whether this node is generative (may have produced the Node + // described by the current Signature element) + bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); + + if (isGenerative && d == 0) { + // End of recursion -- the last signature element is reached and the + // node was generative + res = true; + } else { + // Try repetition of this node + if (isGenerative && isActive(d - 1, s)) { + res = true; + } else { + // Check whether any of the parent nodes were active -- either for + // the previous element (if this one is generative) or for the + // current element (assuming this node was not generative) + for (const ParserState *parent : s->parents) { + if ((isGenerative && isActive(d - 1, parent)) || + isActive(d, parent)) { + res = true; + break; + } + } + } + } + + return res; +} + +std::vector ParserStateDeductor::deduce() +{ + std::vector res; + if (!signature.empty()) { + const size_t D = signature.size(); + for (auto s : states) { + if (signature[D - 1]->isOneOf(s->createdNodeTypes) && + isActive(D - 1, s)) { + res.push_back(s); + } + } + } + return res; +} + +/* Constant initializations */ + +namespace ParserStates { +const ParserState All; +const ParserState None; +} +} + diff --git a/src/core/parser/generic/ParserState.hpp b/src/core/parser/generic/ParserState.hpp new file mode 100644 index 0000000..6487fdd --- /dev/null +++ b/src/core/parser/generic/ParserState.hpp @@ -0,0 +1,284 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file ParserState.hpp + * + * Defines the ParserState class used within the ParserStack pushdown + * automaton and the ParserStateBuilder class for convenient construction of + * such classes. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_HPP_ +#define _OUSIA_PARSER_STATE_HPP_ + +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class ParserStateBuilder; +class ParserState; +class HandlerData; +class Handler; +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * Set of pointers of parser states -- used for specifying a set of parent + * states. + */ +using ParserStateSet = std::unordered_set; + +/** + * Class used for the complete specification of a ParserState. Stores possible + * parent states, state handlers and arguments to be passed to that state. + */ +struct ParserState { + /** + * Vector containing all possible parent states. + */ + ParserStateSet parents; + + /** + * Descriptor of the arguments that should be passed to the handler. + */ + Arguments arguments; + + /** + * Set containing the types of the nodes that may be created in this + * ParserState. This information is needed for Parsers to reconstruct the + * current ParserState from a given ParserScope when a file is included. + */ + RttiSet createdNodeTypes; + + /** + * Pointer at a function which creates a new concrete Handler instance for + * the elements described by this state. May be nullptr in which case no + * handler instance is created. 
+ */ + HandlerConstructor elementHandler; + + /** + * Default constructor, initializes the handlers with nullptr. + */ + ParserState(); + + /** + * Constructor taking values for all fields. Use the ParserStateBuilder + * class for a more convenient construction of ParserState instances. + * + * @param parents is a vector containing all possible parent states. + * @param arguments is a descriptor of arguments that should be passed to + * the handler. + * @param createdNodeTypes is a set containing the types of the nodes tha + * may be created in this ParserState. This information is needed for + * Parsers to reconstruct the current ParserState from a given ParserScope + * when a file is included. + * @param elementHandler is a pointer at a function which creates a new + * concrete Handler instance for the elements described by this state. May + * be nullptr in which case no handler instance is created. + */ + ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr); + + /** + * Creates this ParserState from the given ParserStateBuilder instance. + */ + ParserState(const ParserStateBuilder &builder); +}; + +/** + * The ParserStateBuilder class is a class used for conveniently building new + * ParserState instances. + */ +class ParserStateBuilder { +private: + /** + * ParserState instance that is currently being built by the + * ParserStateBuilder. + */ + ParserState state; + +public: + /** + * Copies the ParserState instance and uses it as internal state. Overrides + * all changes made by the ParserStateBuilder. + * + * @param state is the state that should be copied. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder ©(const ParserState &state); + + /** + * Sets the possible parent states to the single given parent element. 
+ * + * @param parent is a pointer at the parent ParserState instance that should + * be the possible parent state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &parent(const ParserState *parent); + + /** + * Sets the ParserState instances in the given ParserStateSet as the list of + * supported parent states. + * + * @param parents is a set of pointers at ParserState instances that should + * be the possible parent states. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &parents(const ParserStateSet &parents); + + /** + * Sets the arguments that should be passed to the parser state handler to + * those given as argument. + * + * @param arguments is the Arguments instance describing the Arguments that + * should be parsed to a Handler for this ParserState. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &arguments(const Arguments &arguments); + + /** + * Sets the Node types this state may produce to the given Rtti descriptor. + * + * @param type is the Rtti descriptor of the Type that may be produced by + * this state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &createdNodeType(const Rtti *type); + + /** + * Sets the Node types this state may produce to the given Rtti descriptors. + * + * @param types is a set of Rtti descriptors of the Types that may be + * produced by this state. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &createdNodeTypes(const RttiSet &types); + + /** + * Sets the constructor for the element handler. The constructor creates a + * new concrete Handler instance for the elements described by this state. + * May be nullptr in which case no handler instance is created (this is + * the default value). 
+ * + * @param elementHandler is the HandlerConstructor that should create a + * new Handler instance. + * @return a reference at this ParserStateBuilder instance for method + * chaining. + */ + ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); + + /** + * Returns a reference at the internal ParserState instance that was built + * using the ParserStateBuilder. + * + * @return the built ParserState. + */ + const ParserState &build() const; +}; + +/** + * Class used to deduce the ParserState a Parser is currently in based on the + * types of the Nodes that currently are on the ParserStack. Uses dynamic + * programming in order to solve this problem. + */ +class ParserStateDeductor { +public: + /** + * Type containing the dynamic programming table. + */ + using Table = std::vector>; + +private: + /** + * Dynamic programming table. + */ + Table tbl; + + /** + * Signature given in the constructor. + */ + const std::vector signature; + + /** + * List of states that should be checked for being active. + */ + const std::vector states; + + /** + * Used internally to check whether the given parser stack s may have been + * active for signature element d. + * + * @param d is the signature element. + * @param s is the parser state. + * @return true if the the given ParserState may have been active. + */ + bool isActive(size_t d, const ParserState *s); + +public: + /** + * Constructor of the ParserStateDeductor class. + * + * @param signature a Node type signature describing the types of the nodes + * which currently reside on e.g. the ParserScope stack. + * @param states is a list of states that should be checked. + */ + ParserStateDeductor(std::vector signature, + std::vector states); + + /** + * Selects all active states from the given states. Only considers those + * states that may have produced the last signature element. + * + * @return a list of states that may actually have been active. 
+ */ + std::vector deduce(); +}; + +/** + * The ParserStates namespace contains all the global state constants used + * in the ParserStack class. + */ +namespace ParserStates { +/** + * State representing all states. + */ +extern const ParserState All; + +/** + * State representing the initial state. + */ +extern const ParserState None; +} +} + +#endif /* _OUSIA_PARSER_STATE_HPP_ */ + diff --git a/src/core/parser/generic/ParserStateStack.cpp b/src/core/parser/generic/ParserStateStack.cpp new file mode 100644 index 0000000..1265851 --- /dev/null +++ b/src/core/parser/generic/ParserStateStack.cpp @@ -0,0 +1,216 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "ParserScope.hpp" +#include "ParserStack.hpp" + +namespace ousia { + +/* A default handler */ + +/** + * The DefaultHandler class is used in case no element handler is specified in + * the ParserState descriptor. 
+ */ +class DefaultHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override {} + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DefaultHandler{handlerData}; + } +}; + +/* Class Handler */ + +void Handler::data(const std::string &data, int field) +{ + if (Utils::hasNonWhitepaceChar(data)) { + logger().error("Expected command but found character data."); + } +} + +/* Class ParserStack */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + */ +static LoggableException InvalidCommand(const std::string &name, + const std::set &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +ParserStack::ParserStack( + ParserContext &ctx, + const std::multimap &states) + : ctx(ctx), states(states) +{ +} + +bool ParserStack::deduceState() +{ + // Assemble all states + std::vector states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector possibleStates = + ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) + .deduce(); + if (possibleStates.size() != 1) { + ctx.getLogger().error( + "Error while including file: Cannot deduce parser state."); + return false; + } + + // Switch to this state by creating a dummy handler + const ParserState *state = possibleStates[0]; + Handler *handler = + DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); + stack.emplace(handler); + return true; +} + +std::set 
ParserStack::expectedCommands() +{ + const ParserState *currentState = &(this->currentState()); + std::set res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const ParserState &ParserStack::currentState() +{ + return stack.empty() ? ParserStates::None : stack.top()->state(); +} + +std::string ParserStack::currentCommandName() +{ + return stack.empty() ? std::string{} : stack.top()->name(); +} + +const ParserState *ParserStack::findTargetState(const std::string &name) +{ + const ParserState *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const ParserStateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&ParserStates::All)) { + return it->second; + } + } + + return nullptr; +} + +void ParserStack::start(const std::string &name, Variant::mapType &args, + const SourceLocation &location) +{ + ParserState const *targetState = findTargetState(name); +// TODO: Andreas, please improve this. +// if (!Utils::isIdentifier(name)) { +// throw LoggableException(std::string("Invalid identifier \"") + name + +// std::string("\"")); +// } + + if (targetState == nullptr) { + targetState = findTargetState("*"); + } + if (targetState == nullptr) { + throw InvalidCommand(name, expectedCommands()); + } + + // Fetch the associated constructor + HandlerConstructor ctor = targetState->elementHandler + ? 
targetState->elementHandler + : DefaultHandler::create; + + // Canonicalize the arguments, allow additional arguments + targetState->arguments.validateMap(args, ctx.getLogger(), true); + + // Instantiate the handler and call its start function + Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); + handler->start(args); + stack.emplace(handler); +} + +void ParserStack::start(std::string name, const Variant::mapType &args, + const SourceLocation &location) +{ + Variant::mapType argsCopy(args); + start(name, argsCopy, location); +} + +void ParserStack::end() +{ + // Check whether the current command could be ended + if (stack.empty()) { + throw LoggableException{"No command to end."}; + } + + // Remove the current HandlerInstance from the stack + std::shared_ptr inst{stack.top()}; + stack.pop(); + + // Call the end function of the last Handler + inst->end(); +} + +void ParserStack::data(const std::string &data, int field) +{ + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException{"No command to receive data."}; + } + + // Pass the data to the current Handler instance + stack.top()->data(data, field); +} +} + diff --git a/src/core/parser/generic/ParserStateStack.hpp b/src/core/parser/generic/ParserStateStack.hpp new file mode 100644 index 0000000..efc4e4a --- /dev/null +++ b/src/core/parser/generic/ParserStateStack.hpp @@ -0,0 +1,361 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file ParserStack.hpp + * + * Helper classes for document or description parsers. Contains the ParserStack + * class, which is an pushdown automaton responsible for accepting commands in + * the correct order and calling specified handlers. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_HPP_ +#define _OUSIA_PARSER_STACK_HPP_ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "Parser.hpp" +#include "ParserContext.hpp" +#include "ParserState.hpp" + +namespace ousia { + +/** + * Struct collecting all the data that is being passed to a Handler instance. + */ +struct HandlerData { + /** + * Reference to the ParserContext instance that should be used to resolve + * references to nodes in the Graph. + */ + ParserContext &ctx; + + /** + * Contains the name of the tag that is being handled. + */ + const std::string name; + + /** + * Contains the current state of the state machine. + */ + const ParserState &state; + + /** + * Contains the state of the state machine when the parent node was handled. + */ + const ParserState &parentState; + + /** + * Current source code location. + */ + const SourceLocation location; + + /** + * Constructor of the HandlerData class. + * + * @param ctx is the parser context the handler should be executed in. + * @param name is the name of the string. + * @param state is the state this handler was called for. + * @param parentState is the state of the parent command. + * @param location is the location at which the handler is created. 
+ */ + HandlerData(ParserContext &ctx, std::string name, const ParserState &state, + const ParserState &parentState, const SourceLocation location) + : ctx(ctx), + name(std::move(name)), + state(state), + parentState(parentState), + location(location){}; +}; + +/** + * The handler class provides a context for handling an XML tag. It has to be + * overridden and registered in the StateStack class to form handlers for + * concrete XML tags. + */ +class Handler { +private: + /** + * Structure containing the internal handler data. + */ + const HandlerData handlerData; + +public: + /** + * Constructor of the Handler class. + * + * @param handlerData is a structure containing all data being passed to + * the handler. + */ + Handler(const HandlerData &handlerData) : handlerData(handlerData){}; + + /** + * Virtual destructor. + */ + virtual ~Handler(){}; + + /** + * Returns a reference at the ParserContext. + * + * @return a reference at the ParserContext. + */ + ParserContext &context() { return handlerData.ctx; } + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name() { return handlerData.name; } + + /** + * Returns a reference at the ParserScope instance. + * + * @return a reference at the ParserScope instance. + */ + ParserScope &scope() { return handlerData.ctx.getScope(); } + + /** + * Returns a reference at the Manager instance which manages all nodes. + * + * @return a reference at the Manager instance. + */ + Manager &manager() { return handlerData.ctx.getManager(); } + + /** + * Returns a reference at the Logger instance used for logging error + * messages. + * + * @return a reference at the Logger instance. + */ + Logger &logger() { return handlerData.ctx.getLogger(); } + + /** + * Returns a reference at the Project Node, representing the project into + * which the file is currently being parsed. + * + * @return a reference at the Project Node.
+ */ + Rooted project() { return handlerData.ctx.getProject(); } + + /** + * Reference at the ParserState descriptor for which this Handler was + * created. + * + * @return a const reference at the constructing ParserState descriptor. + */ + const ParserState &state() { return handlerData.state; } + + /** + * Reference at the ParserState descriptor of the parent state of the state + * for which this Handler was created. Set to ParserStates::None if there + * is no parent state. + * + * @return a const reference at the parent state of the constructing + * ParserState descriptor. + */ + const ParserState &parentState() { return handlerData.parentState; } + + /** + * Returns the current location in the source file. + * + * @return the current location in the source file. + */ + SourceLocation location() { return handlerData.location; } + + /** + * Called when the command that was specified in the constructor is + * instantiated. + * + * @param args is a map from strings to variants (argument name and value). + */ + virtual void start(Variant::mapType &args) = 0; + + /** + * Called whenever the command for which this handler is defined ends. + */ + virtual void end() = 0; + + /** + * Called whenever raw data (in the form of a string) is available for the + * Handler instance. In the default implementation an error is logged if the + * received data contains non-whitespace characters. + * + * @param data is a reference to the character data that is available for + * the Handler instance. + * @param field is the field number (the interpretation of this value + * depends on the format that is being parsed). + */ + virtual void data(const std::string &data, int field); +}; + +/** + * HandlerConstructor is a function pointer type used to create concrete + * instances of the Handler class. + * + * @param handlerData is the data that should be passed to the new handler + * instance. + * @return a newly created handler instance.
+ */ +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * The ParserStack class is a pushdown automaton responsible for turning a + * command stream into a tree of Node instances. + */ +class ParserStack { +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap &states; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::stack> stack; + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from for the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + *state + * otherwise. + */ + const ParserState *findTargetState(const std::string &name); + +public: + /** + * Creates a new instance of the ParserStack class. + * + * @param ctx is the parser context the parser stack is working on. + * @param states is a map containing the command names and pointers at the + * corresponding ParserState instances. + */ + ParserStack(ParserContext &ctx, + const std::multimap &states); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + + an equivalent state as the one in the including file. + * + * @param scope is the ParserScope instance from which the ParserState + * should be reconstructed. + * @param logger is the logger instance to which error messages should be + * written. 
+ * @return true if the operation was successful, false otherwise. + */ + bool deduceState(); + + /** + * Returns the state the ParserStack instance currently is in. + * + * @return the state of the currently active Handler instance or + * ParserStates::None if no handler is on the stack. + */ + const ParserState &currentState(); + + /** + * Returns the command name that is currently being handled. + * + * @return the name of the command currently being handled by the active + * Handler instance or an empty string if no handler is currently active. + */ + std::string currentCommandName(); + + /** + * Function that should be called whenever a new command starts. + * + * @param name is the name of the command. + * @param args is a map from strings to variants (argument name and value). + * Note that the passed map will be modified. + * @param location is the location in the source file at which the command + * starts. + */ + void start(const std::string &name, Variant::mapType &args, + const SourceLocation &location = SourceLocation{}); + + /** + * Function that should be called whenever a new command starts. + * + * @param name is the name of the command. + * @param args is a map from strings to variants (argument name and value). + * @param location is the location in the source file at which the command + * starts. + */ + void start(std::string name, + const Variant::mapType &args = Variant::mapType{}, + const SourceLocation &location = SourceLocation{}); + + /** + * Function called whenever a command ends. + */ + void end(); + + /** + * Function that should be called whenever data is available for the + * command. + * + * @param data is the data that should be passed to the handler. + * @param field is the field number (the interpretation of this value + * depends on the format that is being parsed). + */ + void data(const std::string &data, int field = 0); + + /** + * Returns a reference to the parser context the parser stack is currently + * working on.
+ * + * @return a reference to the parser context. + */ + ParserContext &getContext() { return ctx; } +}; +} + +#endif /* _OUSIA_PARSER_STACK_HPP_ */ + diff --git a/src/formats/osdmx/OsdmxParser.cpp b/src/formats/osdmx/OsdmxParser.cpp new file mode 100644 index 0000000..c46d9de --- /dev/null +++ b/src/formats/osdmx/OsdmxParser.cpp @@ -0,0 +1,1435 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "XmlParser.hpp" + +namespace ousia { + +/* HeadNode Helper class */ + +namespace { +class HeadNode : public Node { +public: + using Node::Node; +}; +} + +namespace RttiTypes { +static Rtti HeadNode = RttiBuilder("HeadNode"); +} + +/* Element Handler Classes */ + +class DocumentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted document = + project()->createDocument(args["name"].asString()); + document->setLocation(location()); + scope().push(document); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentHandler{handlerData}; + } +}; + +class DocumentField : public Node { +public: + DocumentField(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DocumentField = + RttiBuilder("DocumentField").parent(&Node); +} + +class DocumentChildHandler : public Handler { +public: + using Handler::Handler; + + void preamble(Handle parentNode, std::string &fieldName, + DocumentEntity *&parent, bool &inField) + { + // check if the parent in the structure tree was an explicit field + // reference. + inField = parentNode->isa(&RttiTypes::DocumentField); + if (inField) { + fieldName = parentNode->getName(); + parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); + } else { + // if it wasn't an explicit reference, we use the default field. + fieldName = DEFAULT_FIELD_NAME; + } + // reference the parent entity explicitly. 
+ parent = nullptr; + if (parentNode->isa(&RttiTypes::StructuredEntity)) { + parent = static_cast( + parentNode.cast().get()); + } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { + parent = static_cast( + parentNode.cast().get()); + } + } + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::Document, &RttiTypes::StructuredEntity, + &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // try to find a FieldDescriptor for the given tag if we are not in a + // field already. + // TODO: Consider fields of transparent classes + if (!inField && parent != nullptr && + parent->getDescriptor()->hasField(name())) { + Rooted field{new DocumentField( + parentNode->getManager(), fieldName, parentNode)}; + field->setLocation(location()); + scope().push(field); + return; + } + + // Otherwise create a new StructuredEntity + // TODO: Consider Anchors and AnnotationEntities + Rooted strct = scope().resolve( + Utils::split(name(), ':'), logger()); + if (strct == nullptr) { + // if we could not resolve the name, throw an exception. + throw LoggableException( + std::string("\"") + name() + "\" could not be resolved.", + location()); + } + + std::string name; + auto it = args.find("name"); + if (it != args.end()) { + name = it->second.asString(); + args.erase(it); + } + + Rooted entity; + if (parentNode->isa(&RttiTypes::Document)) { + entity = parentNode.cast()->createRootStructuredEntity( + strct, args, name); + } else { + // calculate a path if transparent entities are needed in between. 
+ auto path = parent->getDescriptor()->pathTo(strct); + if (path.empty()) { + throw LoggableException( + std::string("An instance of \"") + strct->getName() + + "\" is not allowed as child of an instance of \"" + + parent->getDescriptor()->getName() + "\"", + location()); + } + + // create all transparent entities until the last field. + for (size_t p = 1; p < path.size() - 1; p = p + 2) { + parent = static_cast( + parent->createChildStructuredEntity( + path[p].cast(), + Variant::mapType{}, path[p - 1]->getName(), + "").get()); + } + entity = parent->createChildStructuredEntity(strct, args, fieldName, + name); + } + entity->setLocation(location()); + scope().push(entity); + } + + void end() override { scope().pop(); } + + void data(const std::string &data, int fieldIdx) override + { + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // retrieve the correct FieldDescriptor. + // TODO: Consider fields of transparent classes + Rooted desc = parent->getDescriptor(); + Rooted field = desc->getFieldDescriptor(fieldName); + if (field == nullptr) { + logger().error( + std::string("Can't handle data because no field with name \"") + + fieldName + "\" exists in descriptor\"" + desc->getName() + + "\".", + location()); + return; + } + if (!field->isPrimitive()) { + logger().error(std::string("Can't handle data because field \"") + + fieldName + "\" of descriptor \"" + + desc->getName() + "\" is not primitive!", + location()); + return; + } + + // try to parse the content. + auto res = VariantReader::parseGenericString( + data, logger(), location().getSourceId(), location().getStart()); + if (!res.first) { + return; + } + // try to convert it to the correct type. 
+ if (!field->getPrimitiveType()->build(res.second, logger())) { + return; + } + // add it as primitive content. + parent->createChildDocumentPrimitive(res.second, fieldName); + } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentChildHandler{handlerData}; + } +}; + +class TypesystemHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Create the typesystem instance + Rooted typesystem = + project()->createTypesystem(args["name"].asString()); + typesystem->setLocation(location()); + + // Push the typesystem onto the scope, set the POST_HEAD flag to false + scope().push(typesystem); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemHandler{handlerData}; + } +}; + +class TypesystemEnumHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the current typesystem and create the enum node + Rooted typesystem = scope().selectOrThrow(); + Rooted enumType = + typesystem->createEnumType(args["name"].asString()); + enumType->setLocation(location()); + + scope().push(enumType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumHandler{handlerData}; + } +}; + +class TypesystemEnumEntryHandler : public Handler { +public: + using Handler::Handler; + + std::string entry; + + void start(Variant::mapType &args) override {} + + void end() override + { + Rooted enumType = scope().selectOrThrow(); + enumType->addEntry(entry, logger()); + } + + void data(const std::string &data, int field) override + { + if (field != 0) { + // TODO: This should be stored in the HandlerData + logger().error("Enum entry only has one field."); + return; + } +
entry.append(data); + } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumEntryHandler{handlerData}; + } +}; + +class TypesystemStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the arguments used for creating this type + const std::string &name = args["name"].asString(); + const std::string &parent = args["parent"].asString(); + + // Fetch the current typesystem and create the struct node + Rooted typesystem = scope().selectOrThrow(); + Rooted structType = typesystem->createStructType(name); + structType->setLocation(location()); + + // Try to resolve the parent type and set it as parent structure + if (!parent.empty()) { + scope().resolve( + parent, structType, logger(), + [](Handle parent, Handle structType, + Logger &logger) { + if (parent != nullptr) { + structType.cast()->setParentStructure( + parent.cast(), logger); + } + }); + } + scope().push(structType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructHandler{handlerData}; + } +}; + +class TypesystemStructFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &defaultValue = args["default"]; + const bool optional = + !(defaultValue.isObject() && defaultValue.asObject() == nullptr); + + Rooted structType = scope().selectOrThrow(); + Rooted attribute = + structType->createAttribute(name, defaultValue, optional, logger()); + attribute->setLocation(location()); + + // Try to resolve the type and default value + if (optional) { + scope().resolveTypeWithValue( + type, attribute, attribute->getDefaultValue(), logger(), + [](Handle type, Handle 
attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } else { + scope().resolveType( + type, attribute, logger(), + [](Handle type, Handle attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructFieldHandler{handlerData}; + } +}; + +class TypesystemConstantHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &value = args["value"]; + + Rooted typesystem = scope().selectOrThrow(); + Rooted constant = typesystem->createConstant(name, value); + constant->setLocation(location()); + + // Try to resolve the type + scope().resolveTypeWithValue( + type, constant, constant->getValue(), logger(), + [](Handle type, Handle constant, Logger &logger) { + if (type != nullptr) { + constant.cast()->setType(type.cast(), + logger); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemConstantHandler{handlerData}; + } +}; + +/* + * Domain Handlers + */ + +class DomainHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted domain = + project()->createDomain(args["name"].asString()); + domain->setLocation(location()); + + scope().push(domain); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainHandler{handlerData}; + } +}; + +class DomainStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + 
scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted structuredClass = domain->createStructuredClass( + args["name"].asString(), args["cardinality"].asCardinality(), + nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); + structuredClass->setLocation(location()); + + const std::string &isa = args["isa"].asString(); + if (!isa.empty()) { + scope().resolve( + isa, structuredClass, logger(), + [](Handle superclass, Handle structuredClass, + Logger &logger) { + if (superclass != nullptr) { + structuredClass.cast()->setSuperclass( + superclass.cast(), logger); + } + }); + } + + scope().push(structuredClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainStructHandler{handlerData}; + } +}; + +class DomainAnnotationHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted annotationClass = + domain->createAnnotationClass(args["name"].asString()); + annotationClass->setLocation(location()); + + scope().push(annotationClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAnnotationHandler{handlerData}; + } +}; + +class DomainAttributesHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Fetch the parent from the scope and obtain its attributes descriptor + Rooted parent = scope().selectOrThrow(); + + Rooted attrDesc = parent->getAttributesDescriptor(); + attrDesc->setLocation(location()); + + scope().push(attrDesc); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAttributesHandler{handlerData}; + } +}; + +class DomainFieldHandler : public Handler { +public: +
using Handler::Handler; + + void start(Variant::mapType &args) override + { + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createFieldDescriptor( + type, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldHandler{handlerData}; + } +}; + +class DomainFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + scope().resolve( + name, parent, logger(), + [](Handle field, Handle parent, Logger &logger) { + if (field != nullptr) { + parent.cast()->addFieldDescriptor( + field.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldRefHandler{handlerData}; + } +}; + +class DomainPrimitiveHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createPrimitiveFieldDescriptor( + nullptr, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + const std::string &type = args["type"].asString(); + scope().resolve( + type, field, logger(), + [](Handle type, Handle field, Logger &logger) { + if (type != nullptr) { + field.cast()->setPrimitiveType( + type.cast()); + } + }); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainPrimitiveHandler{handlerData}; + } +}; + +class DomainChildHandler : 
public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted field = + scope().selectOrThrow(); + + const std::string &ref = args["ref"].asString(); + scope().resolve( + ref, field, logger(), + [](Handle child, Handle field, Logger &logger) { + if (child != nullptr) { + field.cast()->addChild( + child.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainChildHandler{handlerData}; + } +}; + +class DomainParent : public Node { +public: + DomainParent(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DomainParent = + RttiBuilder("DomainParent").parent(&Node); +} + +class DomainParentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted strct = + scope().selectOrThrow(); + + Rooted parent{new DomainParent( + strct->getManager(), args["name"].asString(), strct)}; + parent->setLocation(location()); + scope().push(parent); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentHandler{handlerData}; + } +}; + +class DomainParentFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + const std::string &name = args["name"].asString(); + const bool optional = args["optional"].asBool(); + Rooted strct = + parentNameNode->getParent().cast(); + + // resolve the parent, create the declared field and add the declared + // StructuredClass as child to it. 
+ scope().resolve( + parentNameNode->getName(), strct, logger(), + [type, name, optional](Handle parent, Handle strct, + Logger &logger) { + if (parent != nullptr) { + Rooted field = + parent.cast()->createFieldDescriptor( + type, name, optional); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldHandler{handlerData}; + } +}; + +class DomainParentFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + Rooted strct = + parentNameNode->getParent().cast(); + auto loc = location(); + + // resolve the parent, get the referenced field and add the declared + // StructuredClass as child to it. + scope().resolve(parentNameNode->getName(), strct, logger(), + [name, loc](Handle parent, + Handle strct, + Logger &logger) { + if (parent != nullptr) { + auto res = parent.cast()->resolve( + &RttiTypes::FieldDescriptor, name); + if (res.size() != 1) { + logger.error( + std::string("Could not find referenced field ") + name, + loc); + return; + } + Rooted field = + res[0].node.cast(); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldRefHandler{handlerData}; + } +}; + +/* + * Import and Include Handler + */ + +class ImportIncludeHandler : public Handler { +public: + using Handler::Handler; + + bool srcInArgs = false; + std::string rel; + std::string type; + std::string src; + + void start(Variant::mapType &args) override + { + rel = args["rel"].asString(); + type = args["type"].asString(); + src = args["src"].asString(); + srcInArgs = !src.empty(); + } + + void data(const std::string &data, int field) override + { + if (srcInArgs) { + logger().error("\"src\" attribute has already 
been set"); + return; + } + if (field != 0) { + logger().error("Command has only one field."); + return; + } + src.append(data); + } +}; + +class ImportHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + + // Make sure imports are still possible + if (scope().getFlag(ParserFlag::POST_HEAD)) { + logger().error("Imports must be listed before other commands.", + location()); + return; + } + } + + void end() override + { + // Fetch the last node and check whether an import is valid at this + // position + Rooted leaf = scope().getLeaf(); + if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { + logger().error( + "Import not supported here, must be inside a document, domain " + "or typesystem command.", + location()); + return; + } + Rooted leafRootNode = leaf.cast(); + + // Perform the actual import, register the imported node within the leaf + // node + Rooted imported = + context().import(src, type, rel, leafRootNode->getReferenceTypes()); + if (imported != nullptr) { + leafRootNode->reference(imported); + } + } + + static Handler *create(const HandlerData &handlerData) + { + return new ImportHandler{handlerData}; + } +}; + +class IncludeHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + } + + void end() override + { + context().include(src, type, rel, {&RttiTypes::Node}); + } + + static Handler *create(const HandlerData &handlerData) + { + return new IncludeHandler{handlerData}; + } +}; + +namespace ParserStates { +/* Document states */ +static const ParserState Document = + ParserStateBuilder() + .parent(&None) + .createdNodeType(&RttiTypes::Document) + .elementHandler(DocumentHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState DocumentChild = + 
ParserStateBuilder() + .parents({&Document, &DocumentChild}) + .createdNodeTypes({&RttiTypes::StructureNode, + &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}) + .elementHandler(DocumentChildHandler::create); + +/* Domain states */ +static const ParserState Domain = ParserStateBuilder() + .parents({&None, &Document}) + .createdNodeType(&RttiTypes::Domain) + .elementHandler(DomainHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStruct = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::StructuredClass) + .elementHandler(DomainStructHandler::create) + .arguments({Argument::String("name"), + Argument::Cardinality("cardinality", Cardinality::any()), + Argument::Bool("isRoot", false), + Argument::Bool("transparent", false), + Argument::String("isa", "")}); + +static const ParserState DomainAnnotation = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::AnnotationClass) + .elementHandler(DomainAnnotationHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainAttributes = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(DomainAttributesHandler::create) + .arguments({}); + +static const ParserState DomainAttribute = + ParserStateBuilder() + .parent(&DomainAttributes) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState DomainField = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainFieldRef = + ParserStateBuilder() + 
.parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +static const ParserState DomainStructPrimitive = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainPrimitiveHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("optional", false), + Argument::String("type")}); + +static const ParserState DomainStructChild = + ParserStateBuilder() + .parent(&DomainField) + .elementHandler(DomainChildHandler::create) + .arguments({Argument::String("ref")}); + +static const ParserState DomainStructParent = + ParserStateBuilder() + .parent(&DomainStruct) + .createdNodeType(&RttiTypes::DomainParent) + .elementHandler(DomainParentHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStructParentField = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainStructParentFieldRef = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +/* Typesystem states */ +static const ParserState Typesystem = + ParserStateBuilder() + .parents({&None, &Domain}) + .createdNodeType(&RttiTypes::Typesystem) + .elementHandler(TypesystemHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState TypesystemEnum = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::EnumType) + 
.elementHandler(TypesystemEnumHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState TypesystemEnumEntry = + ParserStateBuilder() + .parent(&TypesystemEnum) + .elementHandler(TypesystemEnumEntryHandler::create) + .arguments({}); + +static const ParserState TypesystemStruct = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(TypesystemStructHandler::create) + .arguments({Argument::String("name"), Argument::String("parent", "")}); + +static const ParserState TypesystemStructField = + ParserStateBuilder() + .parent(&TypesystemStruct) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState TypesystemConstant = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::Constant) + .elementHandler(TypesystemConstantHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("value")}); + +/* Special states for import and include */ +static const ParserState Import = + ParserStateBuilder() + .parents({&Document, &Typesystem, &Domain}) + .elementHandler(ImportHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const ParserState Include = + ParserStateBuilder() + .parent(&All) + .elementHandler(IncludeHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const std::multimap XmlStates{ + {"document", &Document}, + {"*", &DocumentChild}, + {"domain", &Domain}, + {"struct", &DomainStruct}, + {"annotation", &DomainAnnotation}, + {"attributes", &DomainAttributes}, + {"attribute", &DomainAttribute}, + {"field", &DomainField}, + {"fieldRef", &DomainFieldRef}, + {"primitive", &DomainStructPrimitive}, + {"child", 
&DomainStructChild}, + {"parent", &DomainStructParent}, + {"field", &DomainStructParentField}, + {"fieldRef", &DomainStructParentFieldRef}, + {"typesystem", &Typesystem}, + {"enum", &TypesystemEnum}, + {"entry", &TypesystemEnumEntry}, + {"struct", &TypesystemStruct}, + {"field", &TypesystemStructField}, + {"constant", &TypesystemConstant}, + {"import", &Import}, + {"include", &Include}}; +} + +/** + * Structue containing the private data that is being passed to the + * XML-Handlers. + */ +struct XMLUserData { + /** + * Containing the depth of the current XML file + */ + size_t depth; + + /** + * Reference at the ParserStack instance. + */ + ParserStack *stack; + + /** + * Reference at the CharReader instance. + */ + CharReader *reader; + + /** + * Constructor of the XMLUserData struct. + * + * @param stack is a pointer at the ParserStack instance. + * @param reader is a pointer at the CharReader instance. + */ + XMLUserData(ParserStack *stack, CharReader *reader) + : depth(0), stack(stack), reader(reader) + { + } +}; + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class ScopedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS + * from the expat library. Throws a parser exception if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. 
+ */ + ~ScopedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/* Adapter Expat -> ParserStack */ + +static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the parser stack and the associated user data + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + // Fetch the current location in the XML file + size_t offs = XML_GetCurrentByteIndex(p); + + // Build the source location and update the default location of the + // current + // logger instance + SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; + stack->getContext().getLogger().setDefaultLocation(loc); + return loc; +} + +enum class XMLAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +static std::map reconstructXMLAttributeOffsets( + CharReader &reader, SourceLocation location) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + size_t offs = location.getStart(); + if (!location.isValid() || offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... 
(and we + // can assume the XML is valid as it was already read by expat) + XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XMLAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::SEARCH_ATTR; + } + break; + case XMLAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XMLAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XMLAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XMLAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XMLAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + attrName.str(std::string{}); + state = XMLAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. 
Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XMLAttributeState::IN_ATTR_NAME; + } + } + break; + case XMLAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, start anew + state = XMLAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +static void xmlStartElementHandler(void *p, const XML_Char *name, + const XML_Char **attrs) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + SourceLocation loc = syncLoggerPosition(parser); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map offs = + reconstructXMLAttributeOffsets(*userData->reader, loc); + + // Assemble the arguments + Variant::mapType args; + + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = offs.find(key); + if (it != offs.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + args.emplace(key, value.second); + } + + // Call the start function + std::string nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->start(std::string(name), args, loc); + } + + // Increment the current depth + userData->depth++; +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + syncLoggerPosition(parser); + + // Decrement the current depth + userData->depth--; + + // Call the end function + std::string 
nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->end(); + } +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + } +} + +/* Class XmlParser */ + +void XmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Create the parser stack instance, if we're starting on a non-empty scope, + // try to deduce the parser state + ParserStack stack(ctx, ParserStates::XmlStates); + if (!ctx.getScope().isEmpty()) { + if (!stack.deduceState()) { + return; + } + } + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, &data); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw LoggableException{ + "Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + // Fetch the xml parser byte offset + size_t offs = XML_GetCurrentByteIndex(&p); + + // Throw a corresponding exception + XML_Error code = XML_GetErrorCode(&p); + std::string msg = 
std::string{XML_ErrorString(code)}; + throw LoggableException{"XML: " + msg, + SourceLocation{ctx.getSourceId(), offs}}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} +} + diff --git a/src/formats/osdmx/OsdmxParser.hpp b/src/formats/osdmx/OsdmxParser.hpp new file mode 100644 index 0000000..c8b6302 --- /dev/null +++ b/src/formats/osdmx/OsdmxParser.hpp @@ -0,0 +1,55 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file XmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_XML_PARSER_HPP_ +#define _OUSIA_XML_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * The XmlParser class implements parsing the various types of Ousía XML + * documents using the expat stream XML parser. + */ +class XmlParser : public Parser { +protected: + /** + * Parses the given input stream as XML file and returns the parsed + * top-level node. + * + * @param reader is the CharReader from which the input should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. 
+ */ + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_XML_PARSER_HPP_ */ + diff --git a/src/plugins/css/CodeTokenizer.cpp b/src/plugins/css/CodeTokenizer.cpp new file mode 100644 index 0000000..d65c514 --- /dev/null +++ b/src/plugins/css/CodeTokenizer.cpp @@ -0,0 +1,169 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include "CodeTokenizer.hpp" + +namespace ousia { + +Token CodeTokenizer::constructToken(const Token &t) +{ + std::string content = buf.str(); + buf.str(std::string()); + return Token{ + returnTokenId, content, + SourceLocation{t.location.getSourceId(), startToken.location.getStart(), + t.location.getEnd()}}; +} + +void CodeTokenizer::buffer(const Token &t) { buf << t.content; } + +bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked) +{ + auto it = descriptors.find(t.tokenId); + CodeTokenMode mode = CodeTokenMode::NONE; + if (it != descriptors.end()) { + mode = it->second.mode; + } + + switch (state) { + case CodeTokenizerState::NORMAL: + switch (mode) { + case CodeTokenMode::STRING_START_END: + state = CodeTokenizerState::IN_STRING; + break; + case CodeTokenMode::BLOCK_COMMENT_START: + state = CodeTokenizerState::IN_BLOCK_COMMENT; + break; + case CodeTokenMode::LINE_COMMENT: + state = CodeTokenizerState::IN_LINE_COMMENT; + break; + case CodeTokenMode::LINEBREAK: 
+ if (!ignoreLinebreaks) { + peeked.push_back( + {it->second.id, t.content, t.location}); + } + return !ignoreLinebreaks; + default: + bool empty = true; + if (t.tokenId == TOKEN_TEXT) { + int begin = -1; + for (size_t c = 0; c < t.content.length(); c++) { + bool isWhitespace = + t.content[c] == ' ' || t.content[c] == '\t'; + if (begin < 0) { + // if we have not yet set our beginning, + // we wait for the first + // non-whitespace-character to set it. + if (!isWhitespace) { + begin = c; + } + } else { + // if we have set our beginning, we wait for the + // first whitespace character, which marks the + // end of the current word. + if (isWhitespace) { + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin, (int)c - begin), + SourceLocation{ + t.location.getSourceId(), + t.location.getStart() + begin, + t.location.getStart() + c}}); + begin = -1; + empty = false; + } + } + } + if (begin >= 0) { + peeked.push_back(Token{ + TOKEN_TEXT, t.content.substr(begin), + SourceLocation{t.location.getSourceId(), + t.location.getStart() + begin, + t.location.getEnd()}}); + empty = false; + } + } else { + empty = false; + peeked.push_back(t); + } + return !empty; + } + startToken = t; + returnTokenId = it->second.id; + return false; + case CodeTokenizerState::IN_LINE_COMMENT: + switch (mode) { + case CodeTokenMode::LINEBREAK: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_BLOCK_COMMENT: + switch (mode) { + case CodeTokenMode::BLOCK_COMMENT_END: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_STRING: + switch (mode) { + case CodeTokenMode::ESCAPE: + if (escaped) { + buffer(t); + } + escaped = !escaped; + 
return false; + case CodeTokenMode::STRING_START_END: + if (escaped) { + buffer(t); + escaped = false; + return false; + } else { + peeked.push_back(constructToken(t)); + state = CodeTokenizerState::NORMAL; + return true; + } + default: + if (escaped) { + // TODO: handle escaped characters? + escaped = false; + } + buffer(t); + return false; + } + } + assert(false); + return false; +} +} diff --git a/src/plugins/css/CodeTokenizer.hpp b/src/plugins/css/CodeTokenizer.hpp new file mode 100644 index 0000000..154f949 --- /dev/null +++ b/src/plugins/css/CodeTokenizer.hpp @@ -0,0 +1,136 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file CodeTokenizer.hpp + + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) + */ +#ifndef _OUSIA_CODE_TOKENIZER_HPP_ +#define _OUSIA_CODE_TOKENIZER_HPP_ + +#include +#include + +#include +#include "Tokenizer.hpp" + +namespace ousia { + +/* + * This enum contains all special Token the CodeTokenizer supports, namely: + * + * 1.) An ambigous Tokens - in post programming languages single-quotes ' or + * double-quotes " - to delimit string tokens. + * 2.) A start token for line comments, which would e.g. be // in Java. + * 3.) A start token for a block comment + * 4.) An end token for a block comment. + * 5.) A linebreak token + * 6.) The escape token, which would e.g. be \ in java. 
+ */ +enum class CodeTokenMode { + STRING_START_END, + LINE_COMMENT, + BLOCK_COMMENT_START, + BLOCK_COMMENT_END, + LINEBREAK, + ESCAPE, + NONE +}; + +/** + * A CodeTokenDescriptor defines the id the user likes to have returned for + * a Token of the mode specified, e.g. if you want to get the id 4 for a + * String Token the corresponding CodeTokenDescriptor would be inizialized + * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; + */ +struct CodeTokenDescriptor { + CodeTokenMode mode; + int id; + + CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} +}; + +/** + * The CodeTokenizer is a finite state machine with the states NORMAL, being + * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. + */ +enum class CodeTokenizerState { + NORMAL, + IN_BLOCK_COMMENT, + IN_LINE_COMMENT, + IN_STRING +}; + +/** + * The purpose of a CodeTokenizer is to make it easier to parse classical + * programming Code. It adds the following features to a regular Tokenizer: + * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens + * for the opening delimiter, the text and the closing delimiter. + * 2.) Escaping in String tokens. + * 3.) Comment Tokens (for line comments as well as block comments) + */ +class CodeTokenizer : public Tokenizer { +private: + std::map descriptors; + CodeTokenizerState state; + std::stringstream buf; + Token startToken; + int returnTokenId; + bool escaped = false; + + Token constructToken(const Token &t); + void buffer(const Token &t); + +protected: + bool doPrepare(const Token &t, std::deque &peeked) override; + +public: + /** + * If you do not want comment tokens to be returned you can set this to + * true. + */ + bool ignoreComments = false; + /** + * If you do not want linebreaks to be returned you can set this to true. + */ + bool ignoreLinebreaks = false; + + /** + * + * @param input a CharReader containing the input for this tokenizer, as + * with a regular tokenizer. 
+ * @param root a TokenTreeNode representing the root of the TokenTree. + * Please note that you have to specify all tokenIDs here that you use + * in the descriptors map. + * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. + * In this way you can specify the meaning of certain Tokens. Say you + * specified the Token "//" with the id 1 in the TokenTree. Then you could + * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map + * and this CodeTokenizer would recognize the token "//" as starting a + * line comment. + */ + CodeTokenizer(CharReader &input, const TokenTreeNode &root, + std::map descriptors) + : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) + { + } +}; +} + +#endif diff --git a/src/plugins/css/Tokenizer.cpp b/src/plugins/css/Tokenizer.cpp new file mode 100644 index 0000000..ab4735a --- /dev/null +++ b/src/plugins/css/Tokenizer.cpp @@ -0,0 +1,204 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include "Tokenizer.hpp" + +namespace ousia { + +static std::map buildChildren( + const std::map &inputs) +{ + std::map children; + std::map> nexts; + + for (auto &e : inputs) { + const std::string &s = e.first; + const int id = e.second; + if (s.empty()) { + continue; + } + char start = s[0]; + const std::string suffix = s.substr(1); + if (nexts.find(start) != nexts.end()) { + nexts[start].insert(std::make_pair(suffix, id)); + } else { + nexts.insert(std::make_pair( + start, std::map{{suffix, id}})); + } + } + + for (auto &n : nexts) { + children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); + } + + return children; +} + +static int buildId(const std::map &inputs) +{ + int tokenId = TOKEN_NONE; + for (auto &e : inputs) { + if (e.first.empty()) { + if (tokenId != TOKEN_NONE) { + throw TokenizerException{std::string{"Ambigous token found: "} + + std::to_string(e.second)}; + } else { + tokenId = e.second; + } + } + } + return tokenId; +} + +TokenTreeNode::TokenTreeNode(const std::map &inputs) + : children(buildChildren(inputs)), tokenId(buildId(inputs)) +{ +} + +Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) + : input(input), root(root) +{ +} + +bool Tokenizer::prepare() +{ + std::stringstream buffer; + char c; + SourcePosition start = input.getOffset(); + bool bufEmpty = true; + while (input.peek(c)) { + if (root.children.find(c) != root.children.end()) { + // if there might be a special token, keep peeking forward + // until we find the token (or we don't). + TokenTreeNode const *n = &root; + std::stringstream tBuf; + int match = TOKEN_NONE; + while (true) { + tBuf << c; + n = &(n->children.at(c)); + if (n->tokenId != TOKEN_NONE) { + match = n->tokenId; + // from here on we found a token. If we have something + // in our buffer already, we end the search now. 
+ if (!bufEmpty) { + break; + } else { + // if we want to return this token ( = we have nothing + // in our buffer yet) we look greedily for the longest + // possible token we can construct. + input.consumePeek(); + } + } + if (!input.peek(c)) { + // if we are at the end we break off the search. + break; + } + if (n->children.find(c) == n->children.end()) { + // if we do not find a possible continuation anymore, + // break off the search. + break; + } + } + //reset the peek pointer to the last valid position. + input.resetPeek(); + // check if we did indeed find a special token. + if (match != TOKEN_NONE) { + if (bufEmpty) { + // if we did not have text before, construct that token. + if (doPrepare( + Token{match, tBuf.str(), input.getLocation(start)}, + peeked)) { + return true; + } else { + start = input.getOffset(); + continue; + } + } else { + // otherwise we return the text before the token. + if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, + peeked)) { + return true; + } else{ + //we need to clear the buffer here. After all the token + //corresponding to this buffer segment is already + //constructed. + buffer.str(std::string()); + bufEmpty = true; + start = input.getOffset(); + continue; + } + } + } else{ + //if we found nothing, read at least one character. 
+ input.peek(c); + } + } + buffer << c; + bufEmpty = false; + input.consumePeek(); + } + if (!bufEmpty) { + return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, + peeked); + } + return false; +} + +bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) +{ + peeked.push_back(t); + return true; +} + +bool Tokenizer::next(Token &t) +{ + if (peeked.empty()) { + if (!prepare()) { + return false; + } + } + t = peeked.front(); + peeked.pop_front(); + resetPeek(); + return true; +} + +bool Tokenizer::peek(Token &t) +{ + if (peekCursor >= peeked.size()) { + if (!prepare()) { + return false; + } + } + t = peeked[peekCursor]; + peekCursor++; + return true; +} + +void Tokenizer::resetPeek() { peekCursor = 0; } + +void Tokenizer::consumePeek() +{ + while (peekCursor > 0) { + peeked.pop_front(); + peekCursor--; + } +} +} diff --git a/src/plugins/css/Tokenizer.hpp b/src/plugins/css/Tokenizer.hpp new file mode 100644 index 0000000..50e458c --- /dev/null +++ b/src/plugins/css/Tokenizer.hpp @@ -0,0 +1,227 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_TOKENIZER_HPP_ +#define _OUSIA_TOKENIZER_HPP_ + +#include +#include +#include +#include + +#include + +namespace ousia { + +/** + * This exception is currently only thrown if errors are made during the + * initialization of the Tokenizer. 
Have a closer look at the documentation + * of the TokenTreeNode constructor for more information. + */ +class TokenizerException : public std::exception { +public: + const std::string msg; + + TokenizerException(const std::string &msg) : msg(msg){}; + + virtual const char *what() const noexcept override { return msg.c_str(); } +}; + +/** + * The Tokenizer internally uses a TokenTree to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * The TokenTree is a construct that structures all special tokens this + * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then + * the TokenTree would look like this: + * + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * + * Every node in the TokenTree is a valid end state that has a $ attached to it. + * During the search algorithm the Tokenizer goes through the tree and stores + * the last valid position. If a character follows that does not lead to a new + * node in the TokenTree the search ends (and starts again at this character). + * The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + * + * It is also not allowed that tokens have common middle parts but varying + * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and + * the input string "abc". In that case we start looking for "abd" at the + * start, won't find it, wenn we hit "c" and start the scanning process + * anew. Thus the "bc" token is not found. + * + * For most (well-behaved) tokenization schemes this is not the case, + * though. 
+ */ +class TokenTreeNode { +public: + const std::map children; + const int tokenId; + + /** + * The TokenTreeNode constructor builds a TokenTree from the given token + * specifications. The node returned by this constructor then is the root of + * said TokenTree. + * @param inputs Specifications of tokens in map form. Each specification + * is a tuple of the text that should be matched and some unique ID (>= 0) + * that is returned to you if that Token is found in the text. + * An example for such a map would be + * { + * { "#" , 1}, + * { "##", 2}, + * { "/" , 3} + * } + * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE + * (-1) and TOKEN_TEXT (-2). + */ + TokenTreeNode(const std::map &inputs); +}; + +/** + * This is a reserved constant for the empty token. + */ +static const int TOKEN_NONE = -1; +/** + * This is a reserved constant for every part of the input text that is not a + * specified token. + */ +static const int TOKEN_TEXT = -2; + +/** + * A token for us is identified by an integer tokenID (either one of the + * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). + * Additionally we return the matched text (which should only be really + * interesting in case of TOKEN_TEXT tokens) and the position in the input text. + */ +struct Token { + int tokenId; + std::string content; + SourceLocation location; + + Token(int tokenId, std::string content, SourceLocation location) + : tokenId(tokenId), + content(content), + location(location) + { + } + + Token() : tokenId(TOKEN_NONE) {} +}; + +/** + * A Tokenizer has the purpose of subdividing an input text into tokens. In our + * definition here we distinguish between two kinds of tokens: + * 1.) User-specified tokens that match a fixed text. + * 2.) Any other text between those tokens. + * The user might want to specify the tokens '#{' and '#}' for example, because + * they have some meaning in her code. The user sets the IDs to 1 and 2. 
+ * Given the input text + * "some text #{ special command #} some text" + * the tokenizer would return the tokens: + * 1.) "some text " with the id TOKEN_TEXT (-2). + * 2.) "#{" with the id 1. + * 3.) " special command " with the id TOKEN_TEXT (-2). + * 4.) "#}" with the id 2. + * 5.) " some text" with the id TOKEN_TEXT (-2). + * This makes the subsequent parsing of files of a specific type easier. + * Note that in case of tokens with that are prefixes of other tokens the + * longest possible match is returned. + */ +class Tokenizer { +private: + CharReader &input; + const TokenTreeNode &root; + std::deque peeked; + unsigned int peekCursor = 0; + + bool prepare(); + +protected: + /** + * This method is an interface to build multiple tokens from a single one in + * derived classes. This might be interesting if you want to implement + * further logic on text tokens or similar applications. + * + * @param t a Token the "basic" tokenizer found. + * @param peeked a reference to the deque containing all temporary Tokens. + * You are supposed to append your tokens there. In the trivial case you just + * put the given Token on top of the deque. + * @return false if no token was appended to the deque (meaning that you want + * to ignore the given token explicitly) and true in all other cases. + */ + virtual bool doPrepare(const Token &t, std::deque &peeked); + +public: + /** + * @param input The input of a Tokenizer is given in the form of a + * CharReader. Please refer to the respective documentation. + * @param root This is meant to be the root of a TokenTree giving the + * specification of user-defined tokens this Tokenizer should recognize. + * The Tokenizer promises to not change the TokenTree such that you can + * re-use the same specification for multiple inputs. + * Please refer to the TokenTreeNode documentation for more information. 
+ */ + Tokenizer(CharReader &input, const TokenTreeNode &root); + + /** + * The next method consumes one Token from the input stream and gives + * it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. + * @return true if a next token was found and false if the input is at its + * end. + */ + bool next(Token &t); + /** + * The peek method does not consume the next Token but buffers it and + * shows it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. + * @return true if a next token was found and false if the input is at its + * end. + */ + bool peek(Token &t); + + /** + * Resets the peek pointer to the current position in the stream (to the + * beginning of the buffer). + */ + void resetPeek(); + + /** + * Clears the peek buffer, such that all peeked Tokens are consumed. + */ + void consumePeek(); + + const CharReader &getInput() const { return input; } + + CharReader &getInput() { return input; } +}; +} + +#endif diff --git a/src/plugins/xml/XmlParser.cpp b/src/plugins/xml/XmlParser.cpp deleted file mode 100644 index c46d9de..0000000 --- a/src/plugins/xml/XmlParser.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "XmlParser.hpp" - -namespace ousia { - -/* HeadNode Helper class */ - -namespace { -class HeadNode : public Node { -public: - using Node::Node; -}; -} - -namespace RttiTypes { -static Rtti HeadNode = RttiBuilder("HeadNode"); -} - -/* Element Handler Classes */ - -class DocumentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted document = - project()->createDocument(args["name"].asString()); - document->setLocation(location()); - scope().push(document); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentHandler{handlerData}; - } -}; - -class DocumentField : public Node { -public: - DocumentField(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder("DocumentField").parent(&Node); -} - -class DocumentChildHandler : public Handler { -public: - using Handler::Handler; - - void preamble(Handle parentNode, std::string &fieldName, - DocumentEntity *&parent, bool &inField) - { - // check if the parent in the structure tree was an explicit field - // reference. - inField = parentNode->isa(&RttiTypes::DocumentField); - if (inField) { - fieldName = parentNode->getName(); - parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); - } else { - // if it wasn't an explicit reference, we use the default field. - fieldName = DEFAULT_FIELD_NAME; - } - // reference the parent entity explicitly. 
- parent = nullptr; - if (parentNode->isa(&RttiTypes::StructuredEntity)) { - parent = static_cast( - parentNode.cast().get()); - } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { - parent = static_cast( - parentNode.cast().get()); - } - } - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::Document, &RttiTypes::StructuredEntity, - &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // try to find a FieldDescriptor for the given tag if we are not in a - // field already. - // TODO: Consider fields of transparent classes - if (!inField && parent != nullptr && - parent->getDescriptor()->hasField(name())) { - Rooted field{new DocumentField( - parentNode->getManager(), fieldName, parentNode)}; - field->setLocation(location()); - scope().push(field); - return; - } - - // Otherwise create a new StructuredEntity - // TODO: Consider Anchors and AnnotationEntities - Rooted strct = scope().resolve( - Utils::split(name(), ':'), logger()); - if (strct == nullptr) { - // if we could not resolve the name, throw an exception. - throw LoggableException( - std::string("\"") + name() + "\" could not be resolved.", - location()); - } - - std::string name; - auto it = args.find("name"); - if (it != args.end()) { - name = it->second.asString(); - args.erase(it); - } - - Rooted entity; - if (parentNode->isa(&RttiTypes::Document)) { - entity = parentNode.cast()->createRootStructuredEntity( - strct, args, name); - } else { - // calculate a path if transparent entities are needed in between. 
- auto path = parent->getDescriptor()->pathTo(strct); - if (path.empty()) { - throw LoggableException( - std::string("An instance of \"") + strct->getName() + - "\" is not allowed as child of an instance of \"" + - parent->getDescriptor()->getName() + "\"", - location()); - } - - // create all transparent entities until the last field. - for (size_t p = 1; p < path.size() - 1; p = p + 2) { - parent = static_cast( - parent->createChildStructuredEntity( - path[p].cast(), - Variant::mapType{}, path[p - 1]->getName(), - "").get()); - } - entity = parent->createChildStructuredEntity(strct, args, fieldName, - name); - } - entity->setLocation(location()); - scope().push(entity); - } - - void end() override { scope().pop(); } - - void data(const std::string &data, int fieldIdx) override - { - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // retrieve the correct FieldDescriptor. - // TODO: Consider fields of transparent classes - Rooted desc = parent->getDescriptor(); - Rooted field = desc->getFieldDescriptor(fieldName); - if (field == nullptr) { - logger().error( - std::string("Can't handle data because no field with name \"") + - fieldName + "\" exists in descriptor\"" + desc->getName() + - "\".", - location()); - return; - } - if (!field->isPrimitive()) { - logger().error(std::string("Can't handle data because field \"") + - fieldName + "\" of descriptor \"" + - desc->getName() + "\" is not primitive!", - location()); - return; - } - - // try to parse the content. - auto res = VariantReader::parseGenericString( - data, logger(), location().getSourceId(), location().getStart()); - if (!res.first) { - return; - } - // try to convert it to the correct type. 
- if (!field->getPrimitiveType()->build(res.second, logger())) { - return; - } - // add it as primitive content. - parent->createChildDocumentPrimitive(res.second, fieldName); - } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentChildHandler{handlerData}; - } -}; - -class TypesystemHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Create the typesystem instance - Rooted typesystem = - project()->createTypesystem(args["name"].asString()); - typesystem->setLocation(location()); - - // Push the typesystem onto the scope, set the POST_HEAD flag to true - scope().push(typesystem); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemHandler{handlerData}; - } -}; - -class TypesystemEnumHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the current typesystem and create the enum node - Rooted typesystem = scope().selectOrThrow(); - Rooted enumType = - typesystem->createEnumType(args["name"].asString()); - enumType->setLocation(location()); - - scope().push(enumType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumHandler{handlerData}; - } -}; - -class TypesystemEnumEntryHandler : public Handler { -public: - using Handler::Handler; - - std::string entry; - - void start(Variant::mapType &args) override {} - - void end() override - { - Rooted enumType = scope().selectOrThrow(); - enumType->addEntry(entry, logger()); - } - - void data(const std::string &data, int field) override - { - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - 
entry.append(data); - } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumEntryHandler{handlerData}; - } -}; - -class TypesystemStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the arguments used for creating this type - const std::string &name = args["name"].asString(); - const std::string &parent = args["parent"].asString(); - - // Fetch the current typesystem and create the struct node - Rooted typesystem = scope().selectOrThrow(); - Rooted structType = typesystem->createStructType(name); - structType->setLocation(location()); - - // Try to resolve the parent type and set it as parent structure - if (!parent.empty()) { - scope().resolve( - parent, structType, logger(), - [](Handle parent, Handle structType, - Logger &logger) { - if (parent != nullptr) { - structType.cast()->setParentStructure( - parent.cast(), logger); - } - }); - } - scope().push(structType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructHandler{handlerData}; - } -}; - -class TypesystemStructFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &defaultValue = args["default"]; - const bool optional = - !(defaultValue.isObject() && defaultValue.asObject() == nullptr); - - Rooted structType = scope().selectOrThrow(); - Rooted attribute = - structType->createAttribute(name, defaultValue, optional, logger()); - attribute->setLocation(location()); - - // Try to resolve the type and default value - if (optional) { - scope().resolveTypeWithValue( - type, attribute, attribute->getDefaultValue(), logger(), - [](Handle type, Handle 
attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } else { - scope().resolveType( - type, attribute, logger(), - [](Handle type, Handle attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructFieldHandler{handlerData}; - } -}; - -class TypesystemConstantHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &value = args["value"]; - - Rooted typesystem = scope().selectOrThrow(); - Rooted constant = typesystem->createConstant(name, value); - constant->setLocation(location()); - - // Try to resolve the type - scope().resolveTypeWithValue( - type, constant, constant->getValue(), logger(), - [](Handle type, Handle constant, Logger &logger) { - if (type != nullptr) { - constant.cast()->setType(type.cast(), - logger); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemConstantHandler{handlerData}; - } -}; - -/* - * Domain Handlers - */ - -class DomainHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted domain = - project()->createDomain(args["name"].asString()); - domain->setLocation(location()); - - scope().push(domain); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainHandler{handlerData}; - } -}; - -class DomainStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - 
scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted structuredClass = domain->createStructuredClass( - args["name"].asString(), args["cardinality"].asCardinality(), - nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); - structuredClass->setLocation(location()); - - const std::string &isa = args["isa"].asString(); - if (!isa.empty()) { - scope().resolve( - isa, structuredClass, logger(), - [](Handle superclass, Handle structuredClass, - Logger &logger) { - if (superclass != nullptr) { - structuredClass.cast()->setSuperclass( - superclass.cast(), logger); - } - }); - } - - scope().push(structuredClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainStructHandler{handlerData}; - } -}; - -class DomainAnnotationHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted annotationClass = - domain->createAnnotationClass(args["name"].asString()); - annotationClass->setLocation(location()); - - scope().push(annotationClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAnnotationHandler{handlerData}; - } -}; - -class DomainAttributesHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Fetch the current typesystem and create the struct node - Rooted parent = scope().selectOrThrow(); - - Rooted attrDesc = parent->getAttributesDescriptor(); - attrDesc->setLocation(location()); - - scope().push(attrDesc); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAttributesHandler{handlerData}; - } -}; - -class DomainFieldHandler : public Handler { -public: - 
using Handler::Handler; - - void start(Variant::mapType &args) override - { - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createFieldDescriptor( - type, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldHandler{handlerData}; - } -}; - -class DomainFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - scope().resolve( - name, parent, logger(), - [](Handle field, Handle parent, Logger &logger) { - if (field != nullptr) { - parent.cast()->addFieldDescriptor( - field.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldRefHandler{handlerData}; - } -}; - -class DomainPrimitiveHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createPrimitiveFieldDescriptor( - nullptr, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - const std::string &type = args["type"].asString(); - scope().resolve( - type, field, logger(), - [](Handle type, Handle field, Logger &logger) { - if (type != nullptr) { - field.cast()->setPrimitiveType( - type.cast()); - } - }); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainPrimitiveHandler{handlerData}; - } -}; - -class DomainChildHandler : 
public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted field = - scope().selectOrThrow(); - - const std::string &ref = args["ref"].asString(); - scope().resolve( - ref, field, logger(), - [](Handle child, Handle field, Logger &logger) { - if (child != nullptr) { - field.cast()->addChild( - child.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainChildHandler{handlerData}; - } -}; - -class DomainParent : public Node { -public: - DomainParent(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder("DomainParent").parent(&Node); -} - -class DomainParentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted strct = - scope().selectOrThrow(); - - Rooted parent{new DomainParent( - strct->getManager(), args["name"].asString(), strct)}; - parent->setLocation(location()); - scope().push(parent); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentHandler{handlerData}; - } -}; - -class DomainParentFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - const std::string &name = args["name"].asString(); - const bool optional = args["optional"].asBool(); - Rooted strct = - parentNameNode->getParent().cast(); - - // resolve the parent, create the declared field and add the declared - // StructuredClass as child to it. 
- scope().resolve( - parentNameNode->getName(), strct, logger(), - [type, name, optional](Handle parent, Handle strct, - Logger &logger) { - if (parent != nullptr) { - Rooted field = - parent.cast()->createFieldDescriptor( - type, name, optional); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldHandler{handlerData}; - } -}; - -class DomainParentFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - Rooted strct = - parentNameNode->getParent().cast(); - auto loc = location(); - - // resolve the parent, get the referenced field and add the declared - // StructuredClass as child to it. - scope().resolve(parentNameNode->getName(), strct, logger(), - [name, loc](Handle parent, - Handle strct, - Logger &logger) { - if (parent != nullptr) { - auto res = parent.cast()->resolve( - &RttiTypes::FieldDescriptor, name); - if (res.size() != 1) { - logger.error( - std::string("Could not find referenced field ") + name, - loc); - return; - } - Rooted field = - res[0].node.cast(); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldRefHandler{handlerData}; - } -}; - -/* - * Import and Include Handler - */ - -class ImportIncludeHandler : public Handler { -public: - using Handler::Handler; - - bool srcInArgs = false; - std::string rel; - std::string type; - std::string src; - - void start(Variant::mapType &args) override - { - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); - } - - void data(const std::string &data, int field) override - { - if (srcInArgs) { - logger().error("\"src\" attribute has already 
been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); - } -}; - -class ImportHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } - } - - void end() override - { - // Fetch the last node and check whether an import is valid at this - // position - Rooted leaf = scope().getLeaf(); - if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { - logger().error( - "Import not supported here, must be inside a document, domain " - "or typesystem command.", - location()); - return; - } - Rooted leafRootNode = leaf.cast(); - - // Perform the actual import, register the imported node within the leaf - // node - Rooted imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); - if (imported != nullptr) { - leafRootNode->reference(imported); - } - } - - static Handler *create(const HandlerData &handlerData) - { - return new ImportHandler{handlerData}; - } -}; - -class IncludeHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - } - - void end() override - { - context().include(src, type, rel, {&RttiTypes::Node}); - } - - static Handler *create(const HandlerData &handlerData) - { - return new IncludeHandler{handlerData}; - } -}; - -namespace ParserStates { -/* Document states */ -static const ParserState Document = - ParserStateBuilder() - .parent(&None) - .createdNodeType(&RttiTypes::Document) - .elementHandler(DocumentHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = - 
ParserStateBuilder() - .parents({&Document, &DocumentChild}) - .createdNodeTypes({&RttiTypes::StructureNode, - &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() - .parents({&None, &Document}) - .createdNodeType(&RttiTypes::Domain) - .elementHandler(DomainHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::StructuredClass) - .elementHandler(DomainStructHandler::create) - .arguments({Argument::String("name"), - Argument::Cardinality("cardinality", Cardinality::any()), - Argument::Bool("isRoot", false), - Argument::Bool("transparent", false), - Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::AnnotationClass) - .elementHandler(DomainAnnotationHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(DomainAttributesHandler::create) - .arguments({}); - -static const ParserState DomainAttribute = - ParserStateBuilder() - .parent(&DomainAttributes) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = - ParserStateBuilder() - 
.parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainPrimitiveHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("optional", false), - Argument::String("type")}); - -static const ParserState DomainStructChild = - ParserStateBuilder() - .parent(&DomainField) - .elementHandler(DomainChildHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = - ParserStateBuilder() - .parent(&DomainStruct) - .createdNodeType(&RttiTypes::DomainParent) - .elementHandler(DomainParentHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStructParentField = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = - ParserStateBuilder() - .parents({&None, &Domain}) - .createdNodeType(&RttiTypes::Typesystem) - .elementHandler(TypesystemHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState TypesystemEnum = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::EnumType) - 
.elementHandler(TypesystemEnumHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = - ParserStateBuilder() - .parent(&TypesystemEnum) - .elementHandler(TypesystemEnumEntryHandler::create) - .arguments({}); - -static const ParserState TypesystemStruct = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(TypesystemStructHandler::create) - .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = - ParserStateBuilder() - .parent(&TypesystemStruct) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::Constant) - .elementHandler(TypesystemConstantHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = - ParserStateBuilder() - .parents({&Document, &Typesystem, &Domain}) - .elementHandler(ImportHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const ParserState Include = - ParserStateBuilder() - .parent(&All) - .elementHandler(IncludeHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const std::multimap XmlStates{ - {"document", &Document}, - {"*", &DocumentChild}, - {"domain", &Domain}, - {"struct", &DomainStruct}, - {"annotation", &DomainAnnotation}, - {"attributes", &DomainAttributes}, - {"attribute", &DomainAttribute}, - {"field", &DomainField}, - {"fieldRef", &DomainFieldRef}, - {"primitive", &DomainStructPrimitive}, - {"child", 
&DomainStructChild}, - {"parent", &DomainStructParent}, - {"field", &DomainStructParentField}, - {"fieldRef", &DomainStructParentFieldRef}, - {"typesystem", &Typesystem}, - {"enum", &TypesystemEnum}, - {"entry", &TypesystemEnumEntry}, - {"struct", &TypesystemStruct}, - {"field", &TypesystemStructField}, - {"constant", &TypesystemConstant}, - {"import", &Import}, - {"include", &Include}}; -} - -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { - /** - * Containing the depth of the current XML file - */ - size_t depth; - - /** - * Reference at the ParserStack instance. - */ - ParserStack *stack; - - /** - * Reference at the CharReader instance. - */ - CharReader *reader; - - /** - * Constructor of the XMLUserData struct. - * - * @param stack is a pointer at the ParserStack instance. - * @param reader is a pointer at the CharReader instance. - */ - XMLUserData(ParserStack *stack, CharReader *reader) - : depth(0), stack(stack), reader(reader) - { - } -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: - /** - * Internal pointer to the XML_Parser instance. - */ - XML_Parser parser; - -public: - /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS - * from the expat library. Throws a parser exception if the XML parser - * cannot be initialized. - * - * @param encoding is the protocol-defined encoding passed to expat (or - * nullptr if expat should determine the encoding by itself). - */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) - { - parser = XML_ParserCreate(encoding); - if (!parser) { - throw LoggableException{ - "Internal error: Could not create expat XML parser!"}; - } - } - - /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. 
- */ - ~ScopedExpatXmlParser() - { - if (parser) { - XML_ParserFree(parser); - parser = nullptr; - } - } - - /** - * Returns the XML_Parser pointer. - */ - XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ - // Fetch the parser stack and the associated user data - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - // Fetch the current location in the XML file - size_t offs = XML_GetCurrentByteIndex(p); - - // Build the source location and update the default location of the - // current - // logger instance - SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; - stack->getContext().getLogger().setDefaultLocation(loc); - return loc; -} - -enum class XMLAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; - -static std::map reconstructXMLAttributeOffsets( - CharReader &reader, SourceLocation location) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - size_t offs = location.getStart(); - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... 
(and we - // can assume the XML is valid as it was already read by expat) - XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { - return res; - } - - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XMLAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::SEARCH_ATTR; - } - break; - case XMLAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XMLAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XMLAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XMLAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XMLAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - attrName.str(std::string{}); - state = XMLAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. 
Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XMLAttributeState::IN_ATTR_NAME; - } - } - break; - case XMLAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, start anew - state = XMLAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, - const XML_Char **attrs) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - SourceLocation loc = syncLoggerPosition(parser); - - // Read the argument locations -- this is only a stupid and slow hack, - // but it is necessary, as expat doesn't give use the byte offset of the - // arguments. - std::map offs = - reconstructXMLAttributeOffsets(*userData->reader, loc); - - // Assemble the arguments - Variant::mapType args; - - const XML_Char **attr = attrs; - while (*attr) { - // Convert the C string to a std::string - const std::string key{*(attr++)}; - - // Search the location of the key - SourceLocation keyLoc; - auto it = offs.find(key); - if (it != offs.end()) { - keyLoc = it->second; - } - - // Parse the string, pass the location of the key - std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), - keyLoc.getStart()); - args.emplace(key, value.second); - } - - // Call the start function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->start(std::string(name), args, loc); - } - - // Increment the current depth - userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - syncLoggerPosition(parser); - - // Decrement the current depth - userData->depth--; - - // Call the end function - std::string 
nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->end(); - } -} - -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - } -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ - // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; - - // Create the parser stack instance, if we're starting on a non-empty scope, - // try to deduce the parser state - ParserStack stack(ctx, ParserStates::XmlStates); - if (!ctx.getScope().isEmpty()) { - if (!stack.deduceState()) { - return; - } - } - - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); - XML_SetUserData(&p, &data); - XML_UseParserAsHandlerArg(&p); - - // Set the callback functions - XML_SetStartElementHandler(&p, xmlStartElementHandler); - XML_SetEndElementHandler(&p, xmlEndElementHandler); - XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - - // Feed data into expat while there is data to process - constexpr size_t BUFFER_SIZE = 64 * 1024; - while (true) { - // Fetch a buffer from expat for the input data - char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); - if (!buf) { - throw LoggableException{ - "Internal error: XML parser out of memory!"}; - } - - // Read into the buffer - size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - - // Parse the data and handle any XML error - if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { - // Fetch the xml parser byte offset - size_t offs = XML_GetCurrentByteIndex(&p); - - // Throw a corresponding exception - XML_Error code = XML_GetErrorCode(&p); - std::string msg = 
std::string{XML_ErrorString(code)}; - throw LoggableException{"XML: " + msg, - SourceLocation{ctx.getSourceId(), offs}}; - } - - // Abort once there are no more bytes in the stream - if (bytesRead == 0) { - break; - } - } -} -} - diff --git a/src/plugins/xml/XmlParser.hpp b/src/plugins/xml/XmlParser.hpp deleted file mode 100644 index c8b6302..0000000 --- a/src/plugins/xml/XmlParser.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file XmlParser.hpp - * - * Contains the parser responsible for reading Ousía XML Documents (extension - * oxd) and Ousía XML Modules (extension oxm). - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_XML_PARSER_HPP_ -#define _OUSIA_XML_PARSER_HPP_ - -#include - -namespace ousia { - -/** - * The XmlParser class implements parsing the various types of Ousía XML - * documents using the expat stream XML parser. - */ -class XmlParser : public Parser { -protected: - /** - * Parses the given input stream as XML file and returns the parsed - * top-level node. - * - * @param reader is the CharReader from which the input should be read. - * @param ctx is a reference to the ParserContext instance that should be - * used. 
- */ - void doParse(CharReader &reader, ParserContext &ctx) override; -}; - -} - -#endif /* _OUSIA_XML_PARSER_HPP_ */ - diff --git a/test/core/CodeTokenizerTest.cpp b/test/core/CodeTokenizerTest.cpp deleted file mode 100644 index 2d4d5a7..0000000 --- a/test/core/CodeTokenizerTest.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -static const int BLOCK_COMMENT = 30; -static const int LINE_COMMENT = 31; -static const int STRING = 20; -static const int ESCAPE = 21; -static const int LINEBREAK = 21; -static const int CURLY_OPEN = 40; -static const int CURLY_CLOSE = 41; - -TEST(CodeTokenizer, testTokenizer) -{ - CharReader reader{ - "/**\n" // 1 - " * Some Block Comment\n" // 2 - " */\n" // 3 - "var my_string = 'My \\'String\\'';\n" // 4 - "// and a line comment\n" // 5 - "var my_obj = { a = 4;}", 0}; // 6 - // 123456789012345678901234567890123456789 - // 0 1 2 3 - TokenTreeNode root{{{"/*", 1}, - {"*/", 2}, - {"//", 3}, - {"'", 4}, - {"\\", 5}, - {"{", CURLY_OPEN}, - {"}", CURLY_CLOSE}, - {"\n", 6}}}; - std::map descriptors{ - // the block comment start Token has the id 1 and if the Tokenizer - // returns a Block Comment Token that should have the id 10. 
- {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, - {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, - {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, - {4, {CodeTokenMode::STRING_START_END, STRING}}, - {5, {CodeTokenMode::ESCAPE, ESCAPE}}, - {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; - - std::vector expected = { - {BLOCK_COMMENT, "*\n * Some Block Comment\n ", SourceLocation{0, 0, 29}}, - {LINEBREAK, "\n", SourceLocation{0, 29, 30}}, - {TOKEN_TEXT, "var", SourceLocation{0, 30, 33}}, - {TOKEN_TEXT, "my_string", SourceLocation{0, 34, 43}}, - {TOKEN_TEXT, "=", SourceLocation{0, 44, 45}}, - {STRING, "My 'String'", SourceLocation{0, 46, 61}}, - {TOKEN_TEXT, ";", SourceLocation{0, 61, 62}}, - {LINEBREAK, "\n", SourceLocation{0, 62, 63}}, - // this is slightly counter-intuitive but makes sense if you think about - // it: As a line comment is ended by a line break the line break is - // technically still a part of the line comment and thus the ending - // is in the next line. 
- {LINE_COMMENT, " and a line comment", SourceLocation{0, 63, 85}}, - {TOKEN_TEXT, "var", SourceLocation{0, 85, 88}}, - {TOKEN_TEXT, "my_obj", SourceLocation{0, 89, 95}}, - {TOKEN_TEXT, "=", SourceLocation{0, 96, 97}}, - {CURLY_OPEN, "{", SourceLocation{0, 98, 99}}, - {TOKEN_TEXT, "a", SourceLocation{0, 100, 101}}, - {TOKEN_TEXT, "=", SourceLocation{0, 102, 103}}, - {TOKEN_TEXT, "4;", SourceLocation{0, 104, 106}}, - {CURLY_CLOSE, "}", SourceLocation{0, 106, 107}}, - }; - - CodeTokenizer tokenizer{reader, root, descriptors}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - diff --git a/test/core/TokenizerTest.cpp b/test/core/TokenizerTest.cpp deleted file mode 100644 index c53f93d..0000000 --- a/test/core/TokenizerTest.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include - -namespace ousia { -TEST(TokenTreeNode, testConstructor) -{ - TokenTreeNode root{{{"a", 1}, {"aab", 2}, {"aac", 3}, {"abd", 4}}}; - - ASSERT_EQ(-1, root.tokenId); - ASSERT_EQ(1U, root.children.size()); - ASSERT_TRUE(root.children.find('a') != root.children.end()); - - const TokenTreeNode &a = root.children.at('a'); - ASSERT_EQ(1, a.tokenId); - ASSERT_EQ(2U, a.children.size()); - ASSERT_TRUE(a.children.find('a') != a.children.end()); - ASSERT_TRUE(a.children.find('b') != a.children.end()); - - const TokenTreeNode &aa = a.children.at('a'); - ASSERT_EQ(-1, aa.tokenId); - ASSERT_EQ(2U, aa.children.size()); - ASSERT_TRUE(aa.children.find('b') != aa.children.end()); - ASSERT_TRUE(aa.children.find('c') != aa.children.end()); - - const TokenTreeNode &aab = aa.children.at('b'); - ASSERT_EQ(2, aab.tokenId); - ASSERT_EQ(0U, aab.children.size()); - - const TokenTreeNode &aac = aa.children.at('c'); - ASSERT_EQ(3, aac.tokenId); - ASSERT_EQ(0U, aac.children.size()); - - const TokenTreeNode &ab = a.children.at('b'); - ASSERT_EQ(-1, ab.tokenId); - ASSERT_EQ(1U, ab.children.size()); - ASSERT_TRUE(ab.children.find('d') != ab.children.end()); - - const TokenTreeNode &abd = ab.children.at('d'); - ASSERT_EQ(4, abd.tokenId); - ASSERT_EQ(0U, abd.children.size()); -} - -TEST(Tokenizer, testTokenization) -{ - TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; - - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - - std::vector expected = { - {TOKEN_TEXT, "Test", SourceLocation{0, 0, 4}}, - {1, "/", SourceLocation{0, 4, 5}}, - {TOKEN_TEXT, "Test ", SourceLocation{0, 5, 10}}, - {2, "/*", SourceLocation{0, 10, 12}}, - {TOKEN_TEXT, " Block Comment ", SourceLocation{0, 12, 27}}, - {3, "*/", SourceLocation{0, 27, 29}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, 
t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} - -TEST(Tokenizer, testIncompleteTokens) -{ - TokenTreeNode root{{{"ab", 1}, {"c", 2}}}; - - CharReader reader{"ac", 0}; - - std::vector expected = { - {TOKEN_TEXT, "a", SourceLocation{0, 0, 1}}, - {2, "c", SourceLocation{0, 1, 2}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - diff --git a/test/formats/osdmx/OsdmxParserTest.cpp b/test/formats/osdmx/OsdmxParserTest.cpp new file mode 100644 index 0000000..af1ef56 --- /dev/null +++ b/test/formats/osdmx/OsdmxParserTest.cpp @@ -0,0 +1,314 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace ousia { + +namespace RttiTypes { +extern const Rtti Document; +extern const Rtti Domain; +extern const Rtti Typesystem; +} + +struct XmlStandaloneEnvironment : public StandaloneEnvironment { + XmlParser xmlParser; + FileLocator fileLocator; + + XmlStandaloneEnvironment(ConcreteLogger &logger) + : StandaloneEnvironment(logger) + { + fileLocator.addDefaultSearchPaths(); + fileLocator.addUnittestSearchPath("xmlparser"); + + registry.registerDefaultExtensions(); + registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, + {&RttiTypes::Node}, &xmlParser); + registry.registerResourceLocator(&fileLocator); + } +}; + +static TerminalLogger logger(std::cerr, true); + +TEST(XmlParser, mismatchedTag) +{ + XmlStandaloneEnvironment env(logger); + env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); + ASSERT_TRUE(logger.hasError()); +} + +TEST(XmlParser, generic) +{ + XmlStandaloneEnvironment env(logger); + env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); +#ifdef MANAGER_GRAPHVIZ_EXPORT + env.manager.exportGraphviz("xmlDocument.dot"); +#endif +} + +static void checkAttributes(Handle expected, + Handle desc) +{ + if (expected == nullptr) { + ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); + } else { + ASSERT_EQ(expected->getName(), + desc->getAttributesDescriptor()->getName()); + auto &attrs_exp = expected->getAttributes(); + auto &attrs = desc->getAttributesDescriptor()->getAttributes(); + ASSERT_EQ(attrs_exp.size(), attrs.size()); + for (size_t i = 0; i < attrs_exp.size(); i++) { + ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); + ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); + ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); + ASSERT_EQ(attrs_exp[i]->getDefaultValue(), + attrs[i]->getDefaultValue()); + } + } +} + +static void 
checkStructuredClass( + Handle n, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + ASSERT_FALSE(n == nullptr); + Handle sc = n.cast(); + ASSERT_FALSE(sc == nullptr); + ASSERT_EQ(name, sc->getName()); + ASSERT_EQ(domain, sc->getParent()); + ASSERT_EQ(cardinality, sc->getCardinality()); + ASSERT_EQ(transparent, sc->isTransparent()); + ASSERT_EQ(root, sc->hasRootPermission()); + checkAttributes(attributesDescriptor, sc); +} + +static Rooted checkStructuredClass( + const std::string &resolve, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle sc = res[0].node.cast(); + checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, + superclass, transparent, root); + return sc; +} + +static void checkAnnotationClass( + Handle n, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + ASSERT_FALSE(n == nullptr); + Handle ac = n.cast(); + ASSERT_FALSE(ac == nullptr); + ASSERT_EQ(name, ac->getName()); + ASSERT_EQ(domain, ac->getParent()); + checkAttributes(attributesDescriptor, ac); +} + +static Rooted checkAnnotationClass( + const std::string &resolve, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle ac = res[0].node.cast(); + checkAnnotationClass(ac, name, domain, attributesDescriptor); + return ac; +} + +static void checkFieldDescriptor( + Handle n, const std::string &name, Handle parent, + 
NodeVector children, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + ASSERT_FALSE(n == nullptr); + Handle field = n.cast(); + ASSERT_FALSE(field.isNull()); + ASSERT_EQ(name, field->getName()); + ASSERT_EQ(parent, field->getParent()); + ASSERT_EQ(type, field->getFieldType()); + ASSERT_EQ(primitiveType, field->getPrimitiveType()); + ASSERT_EQ(optional, field->isOptional()); + // check the children. + ASSERT_EQ(children.size(), field->getChildren().size()); + for (unsigned int c = 0; c < children.size(); c++) { + ASSERT_EQ(children[c], field->getChildren()[c]); + } +} + +static void checkFieldDescriptor( + Handle desc, Handle parent, + NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); + ASSERT_EQ(1, res.size()); + checkFieldDescriptor(res[0].node, name, parent, children, type, + primitiveType, optional); +} + +static void checkFieldDescriptor( + Handle desc, NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + checkFieldDescriptor(desc, desc, children, name, type, primitiveType, + optional); +} + +TEST(XmlParser, domainParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(book_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + // check the domain node. + Rooted book_domain = book_domain_node.cast(); + ASSERT_EQ("book", book_domain->getName()); + // get the book struct node. 
+ Cardinality single; + single.merge({1}); + Rooted bookAuthor{ + new StructType(book_domain->getManager(), "", nullptr)}; + bookAuthor->addAttribute( + {new Attribute(book_domain->getManager(), "author", + env.project->getSystemTypesystem()->getStringType(), + "")}, + logger); + Rooted book = checkStructuredClass( + "book", "book", book_domain, single, bookAuthor, nullptr, false, true); + // get the chapter struct node. + Rooted chapter = + checkStructuredClass("chapter", "chapter", book_domain); + Rooted section = + checkStructuredClass("section", "section", book_domain); + Rooted subsection = + checkStructuredClass("subsection", "subsection", book_domain); + Rooted paragraph = + checkStructuredClass("paragraph", "paragraph", book_domain, + Cardinality::any(), nullptr, nullptr, true, false); + Rooted text = + checkStructuredClass("text", "text", book_domain, Cardinality::any(), + nullptr, nullptr, true, false); + + // check the FieldDescriptors. + checkFieldDescriptor(book, {chapter, paragraph}); + checkFieldDescriptor(chapter, {section, paragraph}); + checkFieldDescriptor(section, {subsection, paragraph}); + checkFieldDescriptor(subsection, {paragraph}); + checkFieldDescriptor(paragraph, {text}); + checkFieldDescriptor( + text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, + env.project->getSystemTypesystem()->getStringType(), false); + + // check parent handling using the headings domain. + Rooted headings_domain_node = + env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(headings_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted headings_domain = headings_domain_node.cast(); + // now there should be a heading struct. + Rooted heading = + checkStructuredClass("heading", "heading", headings_domain, single, + nullptr, nullptr, true, false); + // which should be a reference to the paragraph descriptor. 
+ checkFieldDescriptor(heading, paragraph, {text}); + // and each struct in the book domain (except for text) should have a + // heading field now. + checkFieldDescriptor(book, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(chapter, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(section, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(subsection, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(paragraph, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + + // check annotation handling using the comments domain. + Rooted comments_domain_node = + env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(comments_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted comments_domain = comments_domain_node.cast(); + // now we should be able to find a comment annotation. + Rooted comment_anno = + checkAnnotationClass("comment", "comment", comments_domain); + // as well as a comment struct + Rooted comment = + checkStructuredClass("comment", "comment", comments_domain); + // and a reply struct + Rooted reply = + checkStructuredClass("reply", "reply", comments_domain); + // check the fields for each of them. + { + std::vector> descs{comment_anno, comment, reply}; + for (auto &d : descs) { + checkFieldDescriptor(d, {paragraph}, "content", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + checkFieldDescriptor(d, {reply}, "replies", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + } + } + // paragraph should have comment as child now as well. + checkFieldDescriptor(paragraph, {text, comment}); + // as should heading, because it references the paragraph default field. 
+ checkFieldDescriptor(heading, paragraph, {text, comment}); +} + +TEST(XmlParser, documentParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); + //TODO: Check result +} +} + diff --git a/test/plugins/css/CodeTokenizerTest.cpp b/test/plugins/css/CodeTokenizerTest.cpp new file mode 100644 index 0000000..2d4d5a7 --- /dev/null +++ b/test/plugins/css/CodeTokenizerTest.cpp @@ -0,0 +1,100 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +namespace ousia { + +static const int BLOCK_COMMENT = 30; +static const int LINE_COMMENT = 31; +static const int STRING = 20; +static const int ESCAPE = 21; +static const int LINEBREAK = 21; +static const int CURLY_OPEN = 40; +static const int CURLY_CLOSE = 41; + +TEST(CodeTokenizer, testTokenizer) +{ + CharReader reader{ + "/**\n" // 1 + " * Some Block Comment\n" // 2 + " */\n" // 3 + "var my_string = 'My \\'String\\'';\n" // 4 + "// and a line comment\n" // 5 + "var my_obj = { a = 4;}", 0}; // 6 + // 123456789012345678901234567890123456789 + // 0 1 2 3 + TokenTreeNode root{{{"/*", 1}, + {"*/", 2}, + {"//", 3}, + {"'", 4}, + {"\\", 5}, + {"{", CURLY_OPEN}, + {"}", CURLY_CLOSE}, + {"\n", 6}}}; + std::map descriptors{ + // the block comment start Token has the id 1 and if the Tokenizer + // returns a Block Comment Token that should have the id 10. + {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, + {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, + {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, + {4, {CodeTokenMode::STRING_START_END, STRING}}, + {5, {CodeTokenMode::ESCAPE, ESCAPE}}, + {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; + + std::vector expected = { + {BLOCK_COMMENT, "*\n * Some Block Comment\n ", SourceLocation{0, 0, 29}}, + {LINEBREAK, "\n", SourceLocation{0, 29, 30}}, + {TOKEN_TEXT, "var", SourceLocation{0, 30, 33}}, + {TOKEN_TEXT, "my_string", SourceLocation{0, 34, 43}}, + {TOKEN_TEXT, "=", SourceLocation{0, 44, 45}}, + {STRING, "My 'String'", SourceLocation{0, 46, 61}}, + {TOKEN_TEXT, ";", SourceLocation{0, 61, 62}}, + {LINEBREAK, "\n", SourceLocation{0, 62, 63}}, + // this is slightly counter-intuitive but makes sense if you think about + // it: As a line comment is ended by a line break the line break is + // technically still a part of the line comment and thus the ending + // is in the next line. 
+ {LINE_COMMENT, " and a line comment", SourceLocation{0, 63, 85}}, + {TOKEN_TEXT, "var", SourceLocation{0, 85, 88}}, + {TOKEN_TEXT, "my_obj", SourceLocation{0, 89, 95}}, + {TOKEN_TEXT, "=", SourceLocation{0, 96, 97}}, + {CURLY_OPEN, "{", SourceLocation{0, 98, 99}}, + {TOKEN_TEXT, "a", SourceLocation{0, 100, 101}}, + {TOKEN_TEXT, "=", SourceLocation{0, 102, 103}}, + {TOKEN_TEXT, "4;", SourceLocation{0, 104, 106}}, + {CURLY_CLOSE, "}", SourceLocation{0, 106, 107}}, + }; + + CodeTokenizer tokenizer{reader, root, descriptors}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} +} + diff --git a/test/plugins/css/TokenizerTest.cpp b/test/plugins/css/TokenizerTest.cpp new file mode 100644 index 0000000..c53f93d --- /dev/null +++ b/test/plugins/css/TokenizerTest.cpp @@ -0,0 +1,118 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +#include + +namespace ousia { +TEST(TokenTreeNode, testConstructor) +{ + TokenTreeNode root{{{"a", 1}, {"aab", 2}, {"aac", 3}, {"abd", 4}}}; + + ASSERT_EQ(-1, root.tokenId); + ASSERT_EQ(1U, root.children.size()); + ASSERT_TRUE(root.children.find('a') != root.children.end()); + + const TokenTreeNode &a = root.children.at('a'); + ASSERT_EQ(1, a.tokenId); + ASSERT_EQ(2U, a.children.size()); + ASSERT_TRUE(a.children.find('a') != a.children.end()); + ASSERT_TRUE(a.children.find('b') != a.children.end()); + + const TokenTreeNode &aa = a.children.at('a'); + ASSERT_EQ(-1, aa.tokenId); + ASSERT_EQ(2U, aa.children.size()); + ASSERT_TRUE(aa.children.find('b') != aa.children.end()); + ASSERT_TRUE(aa.children.find('c') != aa.children.end()); + + const TokenTreeNode &aab = aa.children.at('b'); + ASSERT_EQ(2, aab.tokenId); + ASSERT_EQ(0U, aab.children.size()); + + const TokenTreeNode &aac = aa.children.at('c'); + ASSERT_EQ(3, aac.tokenId); + ASSERT_EQ(0U, aac.children.size()); + + const TokenTreeNode &ab = a.children.at('b'); + ASSERT_EQ(-1, ab.tokenId); + ASSERT_EQ(1U, ab.children.size()); + ASSERT_TRUE(ab.children.find('d') != ab.children.end()); + + const TokenTreeNode &abd = ab.children.at('d'); + ASSERT_EQ(4, abd.tokenId); + ASSERT_EQ(0U, abd.children.size()); +} + +TEST(Tokenizer, testTokenization) +{ + TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; + + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + + std::vector expected = { + {TOKEN_TEXT, "Test", SourceLocation{0, 0, 4}}, + {1, "/", SourceLocation{0, 4, 5}}, + {TOKEN_TEXT, "Test ", SourceLocation{0, 5, 10}}, + {2, "/*", SourceLocation{0, 10, 12}}, + {TOKEN_TEXT, " Block Comment ", SourceLocation{0, 12, 27}}, + {3, "*/", SourceLocation{0, 27, 29}}}; + + Tokenizer tokenizer{reader, root}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, 
t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} + +TEST(Tokenizer, testIncompleteTokens) +{ + TokenTreeNode root{{{"ab", 1}, {"c", 2}}}; + + CharReader reader{"ac", 0}; + + std::vector expected = { + {TOKEN_TEXT, "a", SourceLocation{0, 0, 1}}, + {2, "c", SourceLocation{0, 1, 2}}}; + + Tokenizer tokenizer{reader, root}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} +} + diff --git a/test/plugins/xml/XmlParserTest.cpp b/test/plugins/xml/XmlParserTest.cpp deleted file mode 100644 index af1ef56..0000000 --- a/test/plugins/xml/XmlParserTest.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace ousia { - -namespace RttiTypes { -extern const Rtti Document; -extern const Rtti Domain; -extern const Rtti Typesystem; -} - -struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; - FileLocator fileLocator; - - XmlStandaloneEnvironment(ConcreteLogger &logger) - : StandaloneEnvironment(logger) - { - fileLocator.addDefaultSearchPaths(); - fileLocator.addUnittestSearchPath("xmlparser"); - - registry.registerDefaultExtensions(); - registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); - registry.registerResourceLocator(&fileLocator); - } -}; - -static TerminalLogger logger(std::cerr, true); - -TEST(XmlParser, mismatchedTag) -{ - XmlStandaloneEnvironment env(logger); - env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); - ASSERT_TRUE(logger.hasError()); -} - -TEST(XmlParser, generic) -{ - XmlStandaloneEnvironment env(logger); - env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); -#ifdef MANAGER_GRAPHVIZ_EXPORT - env.manager.exportGraphviz("xmlDocument.dot"); -#endif -} - -static void checkAttributes(Handle expected, - Handle desc) -{ - if (expected == nullptr) { - ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); - } else { - ASSERT_EQ(expected->getName(), - desc->getAttributesDescriptor()->getName()); - auto &attrs_exp = expected->getAttributes(); - auto &attrs = desc->getAttributesDescriptor()->getAttributes(); - ASSERT_EQ(attrs_exp.size(), attrs.size()); - for (size_t i = 0; i < attrs_exp.size(); i++) { - ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); - ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); - ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); - ASSERT_EQ(attrs_exp[i]->getDefaultValue(), - attrs[i]->getDefaultValue()); - } - } -} - -static void 
checkStructuredClass( - Handle n, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - ASSERT_FALSE(n == nullptr); - Handle sc = n.cast(); - ASSERT_FALSE(sc == nullptr); - ASSERT_EQ(name, sc->getName()); - ASSERT_EQ(domain, sc->getParent()); - ASSERT_EQ(cardinality, sc->getCardinality()); - ASSERT_EQ(transparent, sc->isTransparent()); - ASSERT_EQ(root, sc->hasRootPermission()); - checkAttributes(attributesDescriptor, sc); -} - -static Rooted checkStructuredClass( - const std::string &resolve, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle sc = res[0].node.cast(); - checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, - superclass, transparent, root); - return sc; -} - -static void checkAnnotationClass( - Handle n, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - ASSERT_FALSE(n == nullptr); - Handle ac = n.cast(); - ASSERT_FALSE(ac == nullptr); - ASSERT_EQ(name, ac->getName()); - ASSERT_EQ(domain, ac->getParent()); - checkAttributes(attributesDescriptor, ac); -} - -static Rooted checkAnnotationClass( - const std::string &resolve, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle ac = res[0].node.cast(); - checkAnnotationClass(ac, name, domain, attributesDescriptor); - return ac; -} - -static void checkFieldDescriptor( - Handle n, const std::string &name, Handle parent, - 
NodeVector children, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - ASSERT_FALSE(n == nullptr); - Handle field = n.cast(); - ASSERT_FALSE(field.isNull()); - ASSERT_EQ(name, field->getName()); - ASSERT_EQ(parent, field->getParent()); - ASSERT_EQ(type, field->getFieldType()); - ASSERT_EQ(primitiveType, field->getPrimitiveType()); - ASSERT_EQ(optional, field->isOptional()); - // check the children. - ASSERT_EQ(children.size(), field->getChildren().size()); - for (unsigned int c = 0; c < children.size(); c++) { - ASSERT_EQ(children[c], field->getChildren()[c]); - } -} - -static void checkFieldDescriptor( - Handle desc, Handle parent, - NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); - checkFieldDescriptor(res[0].node, name, parent, children, type, - primitiveType, optional); -} - -static void checkFieldDescriptor( - Handle desc, NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - checkFieldDescriptor(desc, desc, children, name, type, primitiveType, - optional); -} - -TEST(XmlParser, domainParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(book_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - // check the domain node. - Rooted book_domain = book_domain_node.cast(); - ASSERT_EQ("book", book_domain->getName()); - // get the book struct node. 
- Cardinality single; - single.merge({1}); - Rooted bookAuthor{ - new StructType(book_domain->getManager(), "", nullptr)}; - bookAuthor->addAttribute( - {new Attribute(book_domain->getManager(), "author", - env.project->getSystemTypesystem()->getStringType(), - "")}, - logger); - Rooted book = checkStructuredClass( - "book", "book", book_domain, single, bookAuthor, nullptr, false, true); - // get the chapter struct node. - Rooted chapter = - checkStructuredClass("chapter", "chapter", book_domain); - Rooted section = - checkStructuredClass("section", "section", book_domain); - Rooted subsection = - checkStructuredClass("subsection", "subsection", book_domain); - Rooted paragraph = - checkStructuredClass("paragraph", "paragraph", book_domain, - Cardinality::any(), nullptr, nullptr, true, false); - Rooted text = - checkStructuredClass("text", "text", book_domain, Cardinality::any(), - nullptr, nullptr, true, false); - - // check the FieldDescriptors. - checkFieldDescriptor(book, {chapter, paragraph}); - checkFieldDescriptor(chapter, {section, paragraph}); - checkFieldDescriptor(section, {subsection, paragraph}); - checkFieldDescriptor(subsection, {paragraph}); - checkFieldDescriptor(paragraph, {text}); - checkFieldDescriptor( - text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, - env.project->getSystemTypesystem()->getStringType(), false); - - // check parent handling using the headings domain. - Rooted headings_domain_node = - env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(headings_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted headings_domain = headings_domain_node.cast(); - // now there should be a heading struct. - Rooted heading = - checkStructuredClass("heading", "heading", headings_domain, single, - nullptr, nullptr, true, false); - // which should be a reference to the paragraph descriptor. 
- checkFieldDescriptor(heading, paragraph, {text}); - // and each struct in the book domain (except for text) should have a - // heading field now. - checkFieldDescriptor(book, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(chapter, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(section, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(subsection, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(paragraph, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - - // check annotation handling using the comments domain. - Rooted comments_domain_node = - env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(comments_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted comments_domain = comments_domain_node.cast(); - // now we should be able to find a comment annotation. - Rooted comment_anno = - checkAnnotationClass("comment", "comment", comments_domain); - // as well as a comment struct - Rooted comment = - checkStructuredClass("comment", "comment", comments_domain); - // and a reply struct - Rooted reply = - checkStructuredClass("reply", "reply", comments_domain); - // check the fields for each of them. - { - std::vector> descs{comment_anno, comment, reply}; - for (auto &d : descs) { - checkFieldDescriptor(d, {paragraph}, "content", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - checkFieldDescriptor(d, {reply}, "replies", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - } - } - // paragraph should have comment as child now as well. - checkFieldDescriptor(paragraph, {text, comment}); - // as should heading, because it references the paragraph default field. 
- checkFieldDescriptor(heading, paragraph, {text, comment}); -} - -TEST(XmlParser, documentParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); - //TODO: Check result -} -} - -- cgit v1.2.3