From ce5ab62b564476dfacba33507f1541166fda2bfb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:40 +0100 Subject: renamed osdm to osml and osdmx to osxml --- src/formats/osdm/OsdmStreamParser.cpp | 640 --------------- src/formats/osdm/OsdmStreamParser.hpp | 351 -------- src/formats/osdmx/OsdmxParser.cpp | 1435 --------------------------------- src/formats/osdmx/OsdmxParser.hpp | 55 -- src/formats/osml/OsmlParser.cpp | 57 ++ src/formats/osml/OsmlParser.hpp | 48 ++ src/formats/osml/OsmlStreamParser.cpp | 640 +++++++++++++++ src/formats/osml/OsmlStreamParser.hpp | 350 ++++++++ src/formats/osxml/OsxmlParser.cpp | 1435 +++++++++++++++++++++++++++++++++ src/formats/osxml/OsxmlParser.hpp | 55 ++ 10 files changed, 2585 insertions(+), 2481 deletions(-) delete mode 100644 src/formats/osdm/OsdmStreamParser.cpp delete mode 100644 src/formats/osdm/OsdmStreamParser.hpp delete mode 100644 src/formats/osdmx/OsdmxParser.cpp delete mode 100644 src/formats/osdmx/OsdmxParser.hpp create mode 100644 src/formats/osml/OsmlParser.cpp create mode 100644 src/formats/osml/OsmlParser.hpp create mode 100644 src/formats/osml/OsmlStreamParser.cpp create mode 100644 src/formats/osml/OsmlStreamParser.hpp create mode 100644 src/formats/osxml/OsxmlParser.cpp create mode 100644 src/formats/osxml/OsxmlParser.hpp (limited to 'src') diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osdm/OsdmStreamParser.cpp deleted file mode 100644 index 8cb8caf..0000000 --- a/src/formats/osdm/OsdmStreamParser.cpp +++ /dev/null @@ -1,640 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include -#include -#include -#include - -#include "OsdmStreamParser.hpp" - -namespace ousia { - -/** - * Plain format default tokenizer. - */ -class PlainFormatTokens : public DynamicTokenizer { -public: - /** - * Id of the backslash token. - */ - TokenTypeId Backslash; - - /** - * Id of the line comment token. - */ - TokenTypeId LineComment; - - /** - * Id of the block comment start token. - */ - TokenTypeId BlockCommentStart; - - /** - * Id of the block comment end token. - */ - TokenTypeId BlockCommentEnd; - - /** - * Id of the field start token. - */ - TokenTypeId FieldStart; - - /** - * Id of the field end token. - */ - TokenTypeId FieldEnd; - - /** - * Registers the plain format tokens in the internal tokenizer. - */ - PlainFormatTokens() - { - Backslash = registerToken("\\"); - LineComment = registerToken("%"); - BlockCommentStart = registerToken("%{"); - BlockCommentEnd = registerToken("}%"); - FieldStart = registerToken("{"); - FieldEnd = registerToken("}"); - } -}; - -static const PlainFormatTokens Tokens; - -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. 
- */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. - */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. 
- */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) - : reader(reader), logger(logger), tokenizer(Tokens) -{ - // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); -} - -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) -{ - bool first = true; - bool hasCharSiceNSSep = false; - std::vector identifier; - size_t end = reader.getPeekOffset(); - char c, c2; - while (reader.peek(c)) { - // Abort if this character is not a valid identifer character - if ((first && Utils::isIdentifierStartCharacter(c)) || - (!first && Utils::isIdentifierCharacter(c))) { - identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && - Utils::isIdentifierStartCharacter(c2)) { - identifier.push_back(c); - } else { - if (c == ':' && allowNSSep) { - logger.error( - "Expected character before and after namespace separator " - "\":\"", - reader); - } - reader.resetPeek(); - break; - } - - // This is no longer the first character - first = false; - - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); - - end = reader.getPeekOffset(); - reader.consumePeek(); - } - - // Return the identifier at its location - Variant res = - Variant::fromString(std::string(identifier.data(), identifier.size())); - res.setLocation({reader.getSourceId(), start, end}); - return res; -} - -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() -{ - // Expect a '{' after the command - reader.consumeWhitespace(); - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\begin", reader); - return State::NONE; - } - - // Parse the name of the command that should be opened - Variant commandName = 
parseIdentifier(reader.getOffset(), true); - if (commandName.asString().empty()) { - logger.error("Expected identifier", commandName); - return State::ERROR; - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - SourceOffset start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Parse the arguments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), true); - - return State::COMMAND; -} - -static bool checkStillInField(const OsdmStreamParser::Command &cmd, - const Variant &endName, Logger &logger) -{ - if (cmd.inField && !cmd.inRangeField) { - logger.error(std::string("\\end in open field of command \"") + - cmd.name.asString() + std::string("\""), - endName); - logger.note(std::string("Open command started here:"), cmd.name); - return true; - } - return false; -} - -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() -{ - // Expect a '{' after the command - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\end", reader); - return State::NONE; - } - - // Fetch the name of the command that should be ended here - Variant name = parseIdentifier(reader.getOffset(), true); - - // Make sure the given command name is not empty - if (name.asString().empty()) { - logger.error("Expected identifier", name); - return State::ERROR; - } - - // Make sure the command name is terminated with a '}' - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Unroll the command stack up to the last range command - while 
(!commands.top().hasRange) { - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - commands.pop(); - } - - // Make sure we're not in an open field of this command - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - - // Special error message if the top-level command is reached - if (commands.size() == 1) { - logger.error(std::string("Cannot end command \"") + name.asString() + - std::string("\" here, no command open"), - name); - return State::ERROR; - } - - // Inform the about command mismatches - const Command &cmd = commands.top(); - if (commands.top().name.asString() != name.asString()) { - logger.error(std::string("Trying to end command \"") + - cmd.name.asString() + - std::string("\", but open command is \"") + - name.asString() + std::string("\""), - name); - logger.note("Last command was opened here:", cmd.name); - return State::ERROR; - } - - // Set the location to the location of the command that was ended, then end - // the current command - location = name.getLocation(); - commands.pop(); - return cmd.inRangeField ? 
State::FIELD_END : State::NONE; -} - -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) -{ - // Parse the arguments using the universal VariantReader - Variant commandArguments; - if (reader.expect('[')) { - auto res = VariantReader::parseObject(reader, logger, ']'); - commandArguments = res.second; - } else { - commandArguments = Variant::mapType{}; - } - - // Insert the parsed name, make sure "name" was not specified in the - // arguments - if (commandArgName.isString()) { - auto res = - commandArguments.asMap().emplace("name", std::move(commandArgName)); - if (!res.second) { - logger.error("Name argument specified multiple times", - SourceLocation{}, MessageMode::NO_CONTEXT); - logger.note("First occurance is here: ", commandArgName); - logger.note("Second occurance is here: ", res.first->second); - } - } - return commandArguments; -} - -void OsdmStreamParser::pushCommand(Variant commandName, - Variant commandArguments, bool hasRange) -{ - // Store the location on the stack - location = commandName.getLocation(); - - // Place the command on the command stack, remove the last commands if we're - // not currently inside a field of these commands - while (!commands.top().inField) { - commands.pop(); - } - commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); -} - -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) -{ - // Parse the commandName as a first identifier - Variant commandName = parseIdentifier(start, true); - if (commandName.asString().empty()) { - logger.error("Empty command name", reader); - return State::NONE; - } - - // Handle the special "begin" and "end" commands - const auto commandNameComponents = - Utils::split(commandName.asString(), ':'); - const bool isBegin = commandNameComponents[0] == "begin"; - const bool isEnd = commandNameComponents[0] == "end"; - if (isBegin || isEnd) { - if (commandNameComponents.size() > 1) { - logger.error( - "Special commands 
\"\\begin\" and \"\\end\" may not contain a " - "namespace separator \":\"", - commandName); - } - if (isBegin) { - return parseBeginCommand(); - } else if (isEnd) { - return parseEndCommand(); - } - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); - - return State::COMMAND; -} - -void OsdmStreamParser::parseBlockComment() -{ - DynamicToken token; - size_t depth = 1; - while (tokenizer.read(reader, token)) { - if (token.type == Tokens.BlockCommentEnd) { - depth--; - if (depth == 0) { - return; - } - } - if (token.type == Tokens.BlockCommentStart) { - depth++; - } - } - - // Issue an error if the file ends while we are in a block comment - logger.error("File ended while being in a block comment", reader); -} - -void OsdmStreamParser::parseLineComment() -{ - char c; - while (reader.read(c)) { - if (c == '\n') { - return; - } - } -} - -bool OsdmStreamParser::checkIssueData(DataHandler &handler) -{ - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); - location = data.getLocation(); - reader.resetPeek(); - return true; - } - return false; -} - -bool OsdmStreamParser::checkIssueFieldStart() -{ - // Fetch the current command, and check whether we're currently inside a - // field of this command - Command &cmd = commands.top(); - if (!cmd.inField) { - // If this is a range command, we're now implicitly inside the field of - // this command -- we'll have to issue a field start command! 
- if (cmd.hasRange) { - cmd.inField = true; - cmd.inRangeField = true; - reader.resetPeek(); - return true; - } - - // This was not a range command, so obviously we're now inside within - // a field of some command -- so unroll the commands stack until a - // command with open field is reached - while (!commands.top().inField) { - commands.pop(); - } - } - return false; -} - -OsdmStreamParser::State OsdmStreamParser::parse() -{ - // Handler for incomming data - DataHandler handler; - - // Read tokens until the outer loop should be left - DynamicToken token; - while (tokenizer.peek(reader, token)) { - const TokenTypeId type = token.type; - - // Special handling for Backslash and Text - if (type == Tokens.Backslash) { - // Before appending anything to the output data or starting a new - // command, check whether FIELD_START has to be issued, as the - // current command is a command with range - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Check whether a command starts now, without advancing the peek - // cursor - char c; - if (!reader.fetchPeek(c)) { - logger.error("Trailing backslash at the end of the file.", - token); - return State::END; - } - - // Try to parse a command - if (Utils::isIdentifierStartCharacter(c)) { - // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { - return State::DATA; - } - - // Parse the actual command - State res = parseCommand(token.location.getStart()); - switch (res) { - case State::ERROR: - throw LoggableException( - "Last error was irrecoverable, ending parsing " - "process"); - case State::NONE: - continue; - default: - return res; - } - } - - // This was not a special character, just append the given character - // to the data buffer, use the escape character start as start - // location and the peek offset as end location - reader.peek(c); // Peek the previously fetched character - handler.append(c, token.location.getStart(), - 
reader.getPeekOffset()); - reader.consumePeek(); - continue; - } else if (type == TextToken) { - // Check whether FIELD_START has to be issued before appending text - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - - reader.consumePeek(); - continue; - } - - // A non-text token was reached, make sure all pending data commands - // have been issued - if (checkIssueData(handler)) { - return State::DATA; - } - - // We will handle the token now, consume the peeked characters - reader.consumePeek(); - - // Update the location to the current token location - location = token.location; - - if (token.type == Tokens.LineComment) { - parseLineComment(); - } else if (token.type == Tokens.BlockCommentStart) { - parseBlockComment(); - } else if (token.type == Tokens.FieldStart) { - Command &cmd = commands.top(); - if (!cmd.inField) { - cmd.inField = true; - return State::FIELD_START; - } - logger.error( - "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", - token); - } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } - } - logger.error( - "Got field end token \"}\", but there is no field to end. 
Did " - "you mean \"\\}\"?", - token); - } else { - logger.error("Unexpected token \"" + token.content + "\"", token); - } - } - - // Issue available data - if (checkIssueData(handler)) { - return State::DATA; - } - - // Make sure all open commands and fields have been ended at the end of the - // stream - while (commands.size() > 1) { - Command &cmd = commands.top(); - if (cmd.inField || cmd.hasRange) { - logger.error("Reached end of stream, but command \"" + - cmd.name.asString() + "\" has not been ended", - cmd.name); - } - commands.pop(); - } - - location = SourceLocation{reader.getSourceId(), reader.getOffset()}; - return State::END; -} - -const Variant &OsdmStreamParser::getCommandName() -{ - return commands.top().name; -} - -const Variant &OsdmStreamParser::getCommandArguments() -{ - return commands.top().arguments; -} -} - diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osdm/OsdmStreamParser.hpp deleted file mode 100644 index 48d8fb7..0000000 --- a/src/formats/osdm/OsdmStreamParser.hpp +++ /dev/null @@ -1,351 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file OsdmStreamParser.hpp - * - * Provides classes for low-level classes for reading the TeX-esque osdm - * format. The class provided here does not build any model objects and does not - * implement the Parser interface. 
- * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ - -#include - -#include - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; -class Logger; -class DataHandler; - -/** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm - * format. The parser is constructed around a "parse" function, which reads data - * from the underlying CharReader until a new state is reached and indicates - * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is - * syntactically valid and tries to recorver from most errors. If an error is - * irrecoverable (this is the case for errors with wrong nesting of commands or - * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. - */ -class OsdmStreamParser { -public: - /** - * Enum used to indicate which state the OsdmStreamParser class is in - * after calling the "parse" function. - */ - enum class State { - /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). - */ - COMMAND, - - /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - - /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. - */ - ENTITY, - - /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). - */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. 
- */ - ANNOTATION_END, - - /** - * State returned if a new field started. The reader assures that the - * current field ends before a new field is started and that the field - * is not started if data has been given outside of a field. The - * field number is set to the current field index. - */ - FIELD_START, - - /** - * State returned if the current field ends. The reader assures that a - * field was actually open. - */ - FIELD_END, - - /** - * The end of the stream has been reached. - */ - END, - - /** - * Returned from internal functions if nothing should be done. - */ - NONE, - - /** - * Returned from internal function to indicate irrecoverable errors. - */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; - - /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). - */ - bool inRangeField; - - /** - * Default constructor. - */ - Command() : hasRange(false), inField(false), inRangeField(false) {} - - /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. 
- */ - Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField) - { - } - }; - -private: - /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. - */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - DynamicTokenizer tokenizer; - - /** - * Stack containing the current commands. - */ - std::stack commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). - */ - Variant data; - - /** - * Contains the location of the last token. - */ - SourceLocation location; - - /** - * Contains the field index of the current command. - */ - size_t fieldIdx; - - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. - */ - void pushCommand(Variant commandName, Variant commandArguments, - bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. 
- * - * @param start is the start byte offset of the command (including the - * backslash) - * @return true if a command was actuall parsed, false otherwise. - */ - State parseCommand(size_t start); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @param handler is the data handler that contains the data that may be - * returned to the user. - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(DataHandler &handler); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - -public: - /** - * Constructor of the OsdmStreamParser class. Attaches the new - * OsdmStreamParser to the given CharReader and Logger instances. - * - * @param reader is the reader instance from which incomming characters - * should be read. - * @param logger is the logger instance to which errors should be written. - */ - OsdmStreamParser(CharReader &reader, Logger &logger); - - /** - * Continues parsing. Returns one of the states defined in the State enum. - * Callers should stop once the State::END state is reached. Use the getter - * functions to get more information about the current state, such as the - * command name or the data or the current field index. - * - * @return the new state the parser has reached. - */ - State parse(); - - /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. - * - * @return a reference at a variant containing the data parsed by the - * "parse" function. 
- */ - const Variant &getData() { return data; } - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing name and location of the - * parsed command. - */ - const Variant &getCommandName(); - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing arguments given to the - * command. - */ - const Variant &getCommandArguments(); - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - SourceLocation &getLocation() { return location; } -}; -} - -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ - diff --git a/src/formats/osdmx/OsdmxParser.cpp b/src/formats/osdmx/OsdmxParser.cpp deleted file mode 100644 index c46d9de..0000000 --- a/src/formats/osdmx/OsdmxParser.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "XmlParser.hpp" - -namespace ousia { - -/* HeadNode Helper class */ - -namespace { -class HeadNode : public Node { -public: - using Node::Node; -}; -} - -namespace RttiTypes { -static Rtti HeadNode = RttiBuilder("HeadNode"); -} - -/* Element Handler Classes */ - -class DocumentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted document = - project()->createDocument(args["name"].asString()); - document->setLocation(location()); - scope().push(document); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentHandler{handlerData}; - } -}; - -class DocumentField : public Node { -public: - DocumentField(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder("DocumentField").parent(&Node); -} - -class DocumentChildHandler : public Handler { -public: - using Handler::Handler; - - void preamble(Handle parentNode, std::string &fieldName, - DocumentEntity *&parent, bool &inField) - { - // check if the parent in the structure tree was an explicit field - // reference. - inField = parentNode->isa(&RttiTypes::DocumentField); - if (inField) { - fieldName = parentNode->getName(); - parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); - } else { - // if it wasn't an explicit reference, we use the default field. - fieldName = DEFAULT_FIELD_NAME; - } - // reference the parent entity explicitly. 
- parent = nullptr; - if (parentNode->isa(&RttiTypes::StructuredEntity)) { - parent = static_cast( - parentNode.cast().get()); - } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { - parent = static_cast( - parentNode.cast().get()); - } - } - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::Document, &RttiTypes::StructuredEntity, - &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // try to find a FieldDescriptor for the given tag if we are not in a - // field already. - // TODO: Consider fields of transparent classes - if (!inField && parent != nullptr && - parent->getDescriptor()->hasField(name())) { - Rooted field{new DocumentField( - parentNode->getManager(), fieldName, parentNode)}; - field->setLocation(location()); - scope().push(field); - return; - } - - // Otherwise create a new StructuredEntity - // TODO: Consider Anchors and AnnotationEntities - Rooted strct = scope().resolve( - Utils::split(name(), ':'), logger()); - if (strct == nullptr) { - // if we could not resolve the name, throw an exception. - throw LoggableException( - std::string("\"") + name() + "\" could not be resolved.", - location()); - } - - std::string name; - auto it = args.find("name"); - if (it != args.end()) { - name = it->second.asString(); - args.erase(it); - } - - Rooted entity; - if (parentNode->isa(&RttiTypes::Document)) { - entity = parentNode.cast()->createRootStructuredEntity( - strct, args, name); - } else { - // calculate a path if transparent entities are needed in between. 
- auto path = parent->getDescriptor()->pathTo(strct); - if (path.empty()) { - throw LoggableException( - std::string("An instance of \"") + strct->getName() + - "\" is not allowed as child of an instance of \"" + - parent->getDescriptor()->getName() + "\"", - location()); - } - - // create all transparent entities until the last field. - for (size_t p = 1; p < path.size() - 1; p = p + 2) { - parent = static_cast( - parent->createChildStructuredEntity( - path[p].cast(), - Variant::mapType{}, path[p - 1]->getName(), - "").get()); - } - entity = parent->createChildStructuredEntity(strct, args, fieldName, - name); - } - entity->setLocation(location()); - scope().push(entity); - } - - void end() override { scope().pop(); } - - void data(const std::string &data, int fieldIdx) override - { - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // retrieve the correct FieldDescriptor. - // TODO: Consider fields of transparent classes - Rooted desc = parent->getDescriptor(); - Rooted field = desc->getFieldDescriptor(fieldName); - if (field == nullptr) { - logger().error( - std::string("Can't handle data because no field with name \"") + - fieldName + "\" exists in descriptor\"" + desc->getName() + - "\".", - location()); - return; - } - if (!field->isPrimitive()) { - logger().error(std::string("Can't handle data because field \"") + - fieldName + "\" of descriptor \"" + - desc->getName() + "\" is not primitive!", - location()); - return; - } - - // try to parse the content. - auto res = VariantReader::parseGenericString( - data, logger(), location().getSourceId(), location().getStart()); - if (!res.first) { - return; - } - // try to convert it to the correct type. 
- if (!field->getPrimitiveType()->build(res.second, logger())) { - return; - } - // add it as primitive content. - parent->createChildDocumentPrimitive(res.second, fieldName); - } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentChildHandler{handlerData}; - } -}; - -class TypesystemHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Create the typesystem instance - Rooted typesystem = - project()->createTypesystem(args["name"].asString()); - typesystem->setLocation(location()); - - // Push the typesystem onto the scope, set the POST_HEAD flag to true - scope().push(typesystem); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemHandler{handlerData}; - } -}; - -class TypesystemEnumHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the current typesystem and create the enum node - Rooted typesystem = scope().selectOrThrow(); - Rooted enumType = - typesystem->createEnumType(args["name"].asString()); - enumType->setLocation(location()); - - scope().push(enumType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumHandler{handlerData}; - } -}; - -class TypesystemEnumEntryHandler : public Handler { -public: - using Handler::Handler; - - std::string entry; - - void start(Variant::mapType &args) override {} - - void end() override - { - Rooted enumType = scope().selectOrThrow(); - enumType->addEntry(entry, logger()); - } - - void data(const std::string &data, int field) override - { - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - 
entry.append(data); - } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumEntryHandler{handlerData}; - } -}; - -class TypesystemStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the arguments used for creating this type - const std::string &name = args["name"].asString(); - const std::string &parent = args["parent"].asString(); - - // Fetch the current typesystem and create the struct node - Rooted typesystem = scope().selectOrThrow(); - Rooted structType = typesystem->createStructType(name); - structType->setLocation(location()); - - // Try to resolve the parent type and set it as parent structure - if (!parent.empty()) { - scope().resolve( - parent, structType, logger(), - [](Handle parent, Handle structType, - Logger &logger) { - if (parent != nullptr) { - structType.cast()->setParentStructure( - parent.cast(), logger); - } - }); - } - scope().push(structType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructHandler{handlerData}; - } -}; - -class TypesystemStructFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &defaultValue = args["default"]; - const bool optional = - !(defaultValue.isObject() && defaultValue.asObject() == nullptr); - - Rooted structType = scope().selectOrThrow(); - Rooted attribute = - structType->createAttribute(name, defaultValue, optional, logger()); - attribute->setLocation(location()); - - // Try to resolve the type and default value - if (optional) { - scope().resolveTypeWithValue( - type, attribute, attribute->getDefaultValue(), logger(), - [](Handle type, Handle 
attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } else { - scope().resolveType( - type, attribute, logger(), - [](Handle type, Handle attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructFieldHandler{handlerData}; - } -}; - -class TypesystemConstantHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &value = args["value"]; - - Rooted typesystem = scope().selectOrThrow(); - Rooted constant = typesystem->createConstant(name, value); - constant->setLocation(location()); - - // Try to resolve the type - scope().resolveTypeWithValue( - type, constant, constant->getValue(), logger(), - [](Handle type, Handle constant, Logger &logger) { - if (type != nullptr) { - constant.cast()->setType(type.cast(), - logger); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemConstantHandler{handlerData}; - } -}; - -/* - * Domain Handlers - */ - -class DomainHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted domain = - project()->createDomain(args["name"].asString()); - domain->setLocation(location()); - - scope().push(domain); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainHandler{handlerData}; - } -}; - -class DomainStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - 
scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted structuredClass = domain->createStructuredClass( - args["name"].asString(), args["cardinality"].asCardinality(), - nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); - structuredClass->setLocation(location()); - - const std::string &isa = args["isa"].asString(); - if (!isa.empty()) { - scope().resolve( - isa, structuredClass, logger(), - [](Handle superclass, Handle structuredClass, - Logger &logger) { - if (superclass != nullptr) { - structuredClass.cast()->setSuperclass( - superclass.cast(), logger); - } - }); - } - - scope().push(structuredClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainStructHandler{handlerData}; - } -}; - -class DomainAnnotationHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted annotationClass = - domain->createAnnotationClass(args["name"].asString()); - annotationClass->setLocation(location()); - - scope().push(annotationClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAnnotationHandler{handlerData}; - } -}; - -class DomainAttributesHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Fetch the current typesystem and create the struct node - Rooted parent = scope().selectOrThrow(); - - Rooted attrDesc = parent->getAttributesDescriptor(); - attrDesc->setLocation(location()); - - scope().push(attrDesc); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAttributesHandler{handlerData}; - } -}; - -class DomainFieldHandler : public Handler { -public: - 
using Handler::Handler; - - void start(Variant::mapType &args) override - { - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createFieldDescriptor( - type, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldHandler{handlerData}; - } -}; - -class DomainFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - scope().resolve( - name, parent, logger(), - [](Handle field, Handle parent, Logger &logger) { - if (field != nullptr) { - parent.cast()->addFieldDescriptor( - field.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldRefHandler{handlerData}; - } -}; - -class DomainPrimitiveHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createPrimitiveFieldDescriptor( - nullptr, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - const std::string &type = args["type"].asString(); - scope().resolve( - type, field, logger(), - [](Handle type, Handle field, Logger &logger) { - if (type != nullptr) { - field.cast()->setPrimitiveType( - type.cast()); - } - }); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainPrimitiveHandler{handlerData}; - } -}; - -class DomainChildHandler : 
public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted field = - scope().selectOrThrow(); - - const std::string &ref = args["ref"].asString(); - scope().resolve( - ref, field, logger(), - [](Handle child, Handle field, Logger &logger) { - if (child != nullptr) { - field.cast()->addChild( - child.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainChildHandler{handlerData}; - } -}; - -class DomainParent : public Node { -public: - DomainParent(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder("DomainParent").parent(&Node); -} - -class DomainParentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted strct = - scope().selectOrThrow(); - - Rooted parent{new DomainParent( - strct->getManager(), args["name"].asString(), strct)}; - parent->setLocation(location()); - scope().push(parent); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentHandler{handlerData}; - } -}; - -class DomainParentFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - const std::string &name = args["name"].asString(); - const bool optional = args["optional"].asBool(); - Rooted strct = - parentNameNode->getParent().cast(); - - // resolve the parent, create the declared field and add the declared - // StructuredClass as child to it. 
- scope().resolve( - parentNameNode->getName(), strct, logger(), - [type, name, optional](Handle parent, Handle strct, - Logger &logger) { - if (parent != nullptr) { - Rooted field = - parent.cast()->createFieldDescriptor( - type, name, optional); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldHandler{handlerData}; - } -}; - -class DomainParentFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - Rooted strct = - parentNameNode->getParent().cast(); - auto loc = location(); - - // resolve the parent, get the referenced field and add the declared - // StructuredClass as child to it. - scope().resolve(parentNameNode->getName(), strct, logger(), - [name, loc](Handle parent, - Handle strct, - Logger &logger) { - if (parent != nullptr) { - auto res = parent.cast()->resolve( - &RttiTypes::FieldDescriptor, name); - if (res.size() != 1) { - logger.error( - std::string("Could not find referenced field ") + name, - loc); - return; - } - Rooted field = - res[0].node.cast(); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldRefHandler{handlerData}; - } -}; - -/* - * Import and Include Handler - */ - -class ImportIncludeHandler : public Handler { -public: - using Handler::Handler; - - bool srcInArgs = false; - std::string rel; - std::string type; - std::string src; - - void start(Variant::mapType &args) override - { - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); - } - - void data(const std::string &data, int field) override - { - if (srcInArgs) { - logger().error("\"src\" attribute has already 
been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); - } -}; - -class ImportHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } - } - - void end() override - { - // Fetch the last node and check whether an import is valid at this - // position - Rooted leaf = scope().getLeaf(); - if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { - logger().error( - "Import not supported here, must be inside a document, domain " - "or typesystem command.", - location()); - return; - } - Rooted leafRootNode = leaf.cast(); - - // Perform the actual import, register the imported node within the leaf - // node - Rooted imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); - if (imported != nullptr) { - leafRootNode->reference(imported); - } - } - - static Handler *create(const HandlerData &handlerData) - { - return new ImportHandler{handlerData}; - } -}; - -class IncludeHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - } - - void end() override - { - context().include(src, type, rel, {&RttiTypes::Node}); - } - - static Handler *create(const HandlerData &handlerData) - { - return new IncludeHandler{handlerData}; - } -}; - -namespace ParserStates { -/* Document states */ -static const ParserState Document = - ParserStateBuilder() - .parent(&None) - .createdNodeType(&RttiTypes::Document) - .elementHandler(DocumentHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = - 
ParserStateBuilder() - .parents({&Document, &DocumentChild}) - .createdNodeTypes({&RttiTypes::StructureNode, - &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() - .parents({&None, &Document}) - .createdNodeType(&RttiTypes::Domain) - .elementHandler(DomainHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::StructuredClass) - .elementHandler(DomainStructHandler::create) - .arguments({Argument::String("name"), - Argument::Cardinality("cardinality", Cardinality::any()), - Argument::Bool("isRoot", false), - Argument::Bool("transparent", false), - Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::AnnotationClass) - .elementHandler(DomainAnnotationHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(DomainAttributesHandler::create) - .arguments({}); - -static const ParserState DomainAttribute = - ParserStateBuilder() - .parent(&DomainAttributes) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = - ParserStateBuilder() - 
.parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainPrimitiveHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("optional", false), - Argument::String("type")}); - -static const ParserState DomainStructChild = - ParserStateBuilder() - .parent(&DomainField) - .elementHandler(DomainChildHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = - ParserStateBuilder() - .parent(&DomainStruct) - .createdNodeType(&RttiTypes::DomainParent) - .elementHandler(DomainParentHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStructParentField = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = - ParserStateBuilder() - .parents({&None, &Domain}) - .createdNodeType(&RttiTypes::Typesystem) - .elementHandler(TypesystemHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState TypesystemEnum = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::EnumType) - 
.elementHandler(TypesystemEnumHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = - ParserStateBuilder() - .parent(&TypesystemEnum) - .elementHandler(TypesystemEnumEntryHandler::create) - .arguments({}); - -static const ParserState TypesystemStruct = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(TypesystemStructHandler::create) - .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = - ParserStateBuilder() - .parent(&TypesystemStruct) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::Constant) - .elementHandler(TypesystemConstantHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = - ParserStateBuilder() - .parents({&Document, &Typesystem, &Domain}) - .elementHandler(ImportHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const ParserState Include = - ParserStateBuilder() - .parent(&All) - .elementHandler(IncludeHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const std::multimap XmlStates{ - {"document", &Document}, - {"*", &DocumentChild}, - {"domain", &Domain}, - {"struct", &DomainStruct}, - {"annotation", &DomainAnnotation}, - {"attributes", &DomainAttributes}, - {"attribute", &DomainAttribute}, - {"field", &DomainField}, - {"fieldRef", &DomainFieldRef}, - {"primitive", &DomainStructPrimitive}, - {"child", 
&DomainStructChild}, - {"parent", &DomainStructParent}, - {"field", &DomainStructParentField}, - {"fieldRef", &DomainStructParentFieldRef}, - {"typesystem", &Typesystem}, - {"enum", &TypesystemEnum}, - {"entry", &TypesystemEnumEntry}, - {"struct", &TypesystemStruct}, - {"field", &TypesystemStructField}, - {"constant", &TypesystemConstant}, - {"import", &Import}, - {"include", &Include}}; -} - -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { - /** - * Containing the depth of the current XML file - */ - size_t depth; - - /** - * Reference at the ParserStack instance. - */ - ParserStack *stack; - - /** - * Reference at the CharReader instance. - */ - CharReader *reader; - - /** - * Constructor of the XMLUserData struct. - * - * @param stack is a pointer at the ParserStack instance. - * @param reader is a pointer at the CharReader instance. - */ - XMLUserData(ParserStack *stack, CharReader *reader) - : depth(0), stack(stack), reader(reader) - { - } -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: - /** - * Internal pointer to the XML_Parser instance. - */ - XML_Parser parser; - -public: - /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS - * from the expat library. Throws a parser exception if the XML parser - * cannot be initialized. - * - * @param encoding is the protocol-defined encoding passed to expat (or - * nullptr if expat should determine the encoding by itself). - */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) - { - parser = XML_ParserCreate(encoding); - if (!parser) { - throw LoggableException{ - "Internal error: Could not create expat XML parser!"}; - } - } - - /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. 
- */ - ~ScopedExpatXmlParser() - { - if (parser) { - XML_ParserFree(parser); - parser = nullptr; - } - } - - /** - * Returns the XML_Parser pointer. - */ - XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ - // Fetch the parser stack and the associated user data - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - // Fetch the current location in the XML file - size_t offs = XML_GetCurrentByteIndex(p); - - // Build the source location and update the default location of the - // current - // logger instance - SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; - stack->getContext().getLogger().setDefaultLocation(loc); - return loc; -} - -enum class XMLAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; - -static std::map reconstructXMLAttributeOffsets( - CharReader &reader, SourceLocation location) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - size_t offs = location.getStart(); - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... 
(and we - // can assume the XML is valid as it was already read by expat) - XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { - return res; - } - - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XMLAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::SEARCH_ATTR; - } - break; - case XMLAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XMLAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XMLAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XMLAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XMLAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - attrName.str(std::string{}); - state = XMLAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. 
Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XMLAttributeState::IN_ATTR_NAME; - } - } - break; - case XMLAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, start anew - state = XMLAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, - const XML_Char **attrs) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - SourceLocation loc = syncLoggerPosition(parser); - - // Read the argument locations -- this is only a stupid and slow hack, - // but it is necessary, as expat doesn't give use the byte offset of the - // arguments. - std::map offs = - reconstructXMLAttributeOffsets(*userData->reader, loc); - - // Assemble the arguments - Variant::mapType args; - - const XML_Char **attr = attrs; - while (*attr) { - // Convert the C string to a std::string - const std::string key{*(attr++)}; - - // Search the location of the key - SourceLocation keyLoc; - auto it = offs.find(key); - if (it != offs.end()) { - keyLoc = it->second; - } - - // Parse the string, pass the location of the key - std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), - keyLoc.getStart()); - args.emplace(key, value.second); - } - - // Call the start function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->start(std::string(name), args, loc); - } - - // Increment the current depth - userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - syncLoggerPosition(parser); - - // Decrement the current depth - userData->depth--; - - // Call the end function - std::string 
nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->end(); - } -} - -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - } -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ - // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; - - // Create the parser stack instance, if we're starting on a non-empty scope, - // try to deduce the parser state - ParserStack stack(ctx, ParserStates::XmlStates); - if (!ctx.getScope().isEmpty()) { - if (!stack.deduceState()) { - return; - } - } - - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); - XML_SetUserData(&p, &data); - XML_UseParserAsHandlerArg(&p); - - // Set the callback functions - XML_SetStartElementHandler(&p, xmlStartElementHandler); - XML_SetEndElementHandler(&p, xmlEndElementHandler); - XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - - // Feed data into expat while there is data to process - constexpr size_t BUFFER_SIZE = 64 * 1024; - while (true) { - // Fetch a buffer from expat for the input data - char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); - if (!buf) { - throw LoggableException{ - "Internal error: XML parser out of memory!"}; - } - - // Read into the buffer - size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - - // Parse the data and handle any XML error - if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { - // Fetch the xml parser byte offset - size_t offs = XML_GetCurrentByteIndex(&p); - - // Throw a corresponding exception - XML_Error code = XML_GetErrorCode(&p); - std::string msg = 
std::string{XML_ErrorString(code)}; - throw LoggableException{"XML: " + msg, - SourceLocation{ctx.getSourceId(), offs}}; - } - - // Abort once there are no more bytes in the stream - if (bytesRead == 0) { - break; - } - } -} -} - diff --git a/src/formats/osdmx/OsdmxParser.hpp b/src/formats/osdmx/OsdmxParser.hpp deleted file mode 100644 index c8b6302..0000000 --- a/src/formats/osdmx/OsdmxParser.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file XmlParser.hpp - * - * Contains the parser responsible for reading Ousía XML Documents (extension - * oxd) and Ousía XML Modules (extension oxm). - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_XML_PARSER_HPP_ -#define _OUSIA_XML_PARSER_HPP_ - -#include - -namespace ousia { - -/** - * The XmlParser class implements parsing the various types of Ousía XML - * documents using the expat stream XML parser. - */ -class XmlParser : public Parser { -protected: - /** - * Parses the given input stream as XML file and returns the parsed - * top-level node. - * - * @param reader is the CharReader from which the input should be read. - * @param ctx is a reference to the ParserContext instance that should be - * used. 
- */ - void doParse(CharReader &reader, ParserContext &ctx) override; -}; - -} - -#endif /* _OUSIA_XML_PARSER_HPP_ */ - diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. + + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: + /** + * OsdmStreamParser instance. + */ + OsdmStreamParser parser; + + /** + * Instance of the ParserStateStack. 
+ */ + ParserStateStack stack; + +public: + OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsdmParserImplementation parser(reader, ctx); + parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. 
+ */ +class OsdmParser : public Parser { +protected: + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp new file mode 100644 index 0000000..6a55f12 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -0,0 +1,640 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include + +#include "OsdmStreamParser.hpp" + +namespace ousia { + +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public Tokenizer { +public: + /** + * Id of the backslash token. + */ + TokenTypeId Backslash; + + /** + * Id of the line comment token. + */ + TokenTypeId LineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId BlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId BlockCommentEnd; + + /** + * Id of the field start token. + */ + TokenTypeId FieldStart; + + /** + * Id of the field end token. + */ + TokenTypeId FieldEnd; + + /** + * Registers the plain format tokens in the internal tokenizer. 
+ */ + PlainFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + } +}; + +static const PlainFormatTokens Tokens; + +/** + * Class used internally to collect data issued via "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ + std::vector buf; + + /** + * Start location of the character data. + */ + SourceOffset start; + + /** + * End location of the character data. + */ + SourceOffset end; + +public: + /** + * Default constructor, initializes start and end with zeros. + */ + DataHandler() : start(0), end(0) {} + + /** + * Returns true if the internal buffer is empty. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ + bool isEmpty() { return buf.empty(); } + + /** + * Appends a single character to the internal buffer. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ + void append(char c, SourceOffset charStart, SourceOffset charEnd) + { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; + } + + /** + * Appends a string to the internal buffer. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) + { + if (isEmpty()) { + start = stringStart; + } + std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); + end = stringEnd; + } + + /** + * Converts the internal buffer to a variant with attached location + * information. 
+ * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ + Variant toVariant(SourceId sourceId) + { + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); + return res; + } +}; + +OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) + : reader(reader), logger(logger), tokenizer(Tokens) +{ + // Place an intial command representing the complete file on the stack + commands.push(Command{"", Variant::mapType{}, true, true, true}); +} + +Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +{ + bool first = true; + bool hasCharSiceNSSep = false; + std::vector identifier; + size_t end = reader.getPeekOffset(); + char c, c2; + while (reader.peek(c)) { + // Abort if this character is not a valid identifer character + if ((first && Utils::isIdentifierStartCharacter(c)) || + (!first && Utils::isIdentifierCharacter(c))) { + identifier.push_back(c); + } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + Utils::isIdentifierStartCharacter(c2)) { + identifier.push_back(c); + } else { + if (c == ':' && allowNSSep) { + logger.error( + "Expected character before and after namespace separator " + "\":\"", + reader); + } + reader.resetPeek(); + break; + } + + // This is no longer the first character + first = false; + + // Advance the hasCharSiceNSSep flag + hasCharSiceNSSep = allowNSSep && (c != ':'); + + end = reader.getPeekOffset(); + reader.consumePeek(); + } + + // Return the identifier at its location + Variant res = + Variant::fromString(std::string(identifier.data(), identifier.size())); + res.setLocation({reader.getSourceId(), start, end}); + return res; +} + +OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +{ + // Expect a '{' after the command + reader.consumeWhitespace(); + if 
(!reader.expect('{')) { + logger.error("Expected \"{\" after \\begin", reader); + return State::NONE; + } + + // Parse the name of the command that should be opened + Variant commandName = parseIdentifier(reader.getOffset(), true); + if (commandName.asString().empty()) { + logger.error("Expected identifier", commandName); + return State::ERROR; + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + SourceOffset start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Parse the arguments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), true); + + return State::COMMAND; +} + +static bool checkStillInField(const OsdmStreamParser::Command &cmd, + const Variant &endName, Logger &logger) +{ + if (cmd.inField && !cmd.inRangeField) { + logger.error(std::string("\\end in open field of command \"") + + cmd.name.asString() + std::string("\""), + endName); + logger.note(std::string("Open command started here:"), cmd.name); + return true; + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +{ + // Expect a '{' after the command + if (!reader.expect('{')) { + logger.error("Expected \"{\" after \\end", reader); + return State::NONE; + } + + // Fetch the name of the command that should be ended here + Variant name = parseIdentifier(reader.getOffset(), true); + + // Make sure the given command name is not empty + if (name.asString().empty()) { + logger.error("Expected identifier", name); + return State::ERROR; + } + + // Make sure the command name is terminated with a '}' + 
if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Unroll the command stack up to the last range command + while (!commands.top().hasRange) { + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + commands.pop(); + } + + // Make sure we're not in an open field of this command + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + + // Special error message if the top-level command is reached + if (commands.size() == 1) { + logger.error(std::string("Cannot end command \"") + name.asString() + + std::string("\" here, no command open"), + name); + return State::ERROR; + } + + // Inform the about command mismatches + const Command &cmd = commands.top(); + if (commands.top().name.asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + + cmd.name.asString() + + std::string("\", but open command is \"") + + name.asString() + std::string("\""), + name); + logger.note("Last command was opened here:", cmd.name); + return State::ERROR; + } + + // Set the location to the location of the command that was ended, then end + // the current command + location = name.getLocation(); + commands.pop(); + return cmd.inRangeField ? 
State::FIELD_END : State::NONE; +} + +Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +{ + // Parse the arguments using the universal VariantReader + Variant commandArguments; + if (reader.expect('[')) { + auto res = VariantReader::parseObject(reader, logger, ']'); + commandArguments = res.second; + } else { + commandArguments = Variant::mapType{}; + } + + // Insert the parsed name, make sure "name" was not specified in the + // arguments + if (commandArgName.isString()) { + auto res = + commandArguments.asMap().emplace("name", std::move(commandArgName)); + if (!res.second) { + logger.error("Name argument specified multiple times", + SourceLocation{}, MessageMode::NO_CONTEXT); + logger.note("First occurance is here: ", commandArgName); + logger.note("Second occurance is here: ", res.first->second); + } + } + return commandArguments; +} + +void OsdmStreamParser::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) +{ + // Store the location on the stack + location = commandName.getLocation(); + + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!commands.top().inField) { + commands.pop(); + } + commands.push(Command{std::move(commandName), std::move(commandArguments), + hasRange, false, false}); +} + +OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +{ + // Parse the commandName as a first identifier + Variant commandName = parseIdentifier(start, true); + if (commandName.asString().empty()) { + logger.error("Empty command name", reader); + return State::NONE; + } + + // Handle the special "begin" and "end" commands + const auto commandNameComponents = + Utils::split(commandName.asString(), ':'); + const bool isBegin = commandNameComponents[0] == "begin"; + const bool isEnd = commandNameComponents[0] == "end"; + if (isBegin || isEnd) { + if (commandNameComponents.size() > 1) { + logger.error( + "Special commands 
\"\\begin\" and \"\\end\" may not contain a " + "namespace separator \":\"", + commandName); + } + if (isBegin) { + return parseBeginCommand(); + } else if (isEnd) { + return parseEndCommand(); + } + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + // Parse the arugments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); + + return State::COMMAND; +} + +void OsdmStreamParser::parseBlockComment() +{ + Token token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == Tokens.BlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == Tokens.BlockCommentStart) { + depth++; + } + } + + // Issue an error if the file ends while we are in a block comment + logger.error("File ended while being in a block comment", reader); +} + +void OsdmStreamParser::parseLineComment() +{ + char c; + while (reader.read(c)) { + if (c == '\n') { + return; + } + } +} + +bool OsdmStreamParser::checkIssueData(DataHandler &handler) +{ + if (!handler.isEmpty()) { + data = handler.toVariant(reader.getSourceId()); + location = data.getLocation(); + reader.resetPeek(); + return true; + } + return false; +} + +bool OsdmStreamParser::checkIssueFieldStart() +{ + // Fetch the current command, and check whether we're currently inside a + // field of this command + Command &cmd = commands.top(); + if (!cmd.inField) { + // If this is a range command, we're now implicitly inside the field of + // this command -- we'll have to issue a field start command! 
+ if (cmd.hasRange) { + cmd.inField = true; + cmd.inRangeField = true; + reader.resetPeek(); + return true; + } + + // This was not a range command, so obviously we're now inside within + // a field of some command -- so unroll the commands stack until a + // command with open field is reached + while (!commands.top().inField) { + commands.pop(); + } + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parse() +{ + // Handler for incomming data + DataHandler handler; + + // Read tokens until the outer loop should be left + Token token; + while (tokenizer.peek(reader, token)) { + const TokenTypeId type = token.type; + + // Special handling for Backslash and Text + if (type == Tokens.Backslash) { + // Before appending anything to the output data or starting a new + // command, check whether FIELD_START has to be issued, as the + // current command is a command with range + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Check whether a command starts now, without advancing the peek + // cursor + char c; + if (!reader.fetchPeek(c)) { + logger.error("Trailing backslash at the end of the file.", + token); + return State::END; + } + + // Try to parse a command + if (Utils::isIdentifierStartCharacter(c)) { + // Make sure to issue any data before it is to late + if (checkIssueData(handler)) { + return State::DATA; + } + + // Parse the actual command + State res = parseCommand(token.location.getStart()); + switch (res) { + case State::ERROR: + throw LoggableException( + "Last error was irrecoverable, ending parsing " + "process"); + case State::NONE: + continue; + default: + return res; + } + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + reader.peek(c); // Peek the previously fetched character + handler.append(c, token.location.getStart(), + reader.getPeekOffset()); 
+ reader.consumePeek(); + continue; + } else if (type == TextToken) { + // Check whether FIELD_START has to be issued before appending text + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Append the text to the data handler + handler.append(token.content, token.location.getStart(), + token.location.getEnd()); + + reader.consumePeek(); + continue; + } + + // A non-text token was reached, make sure all pending data commands + // have been issued + if (checkIssueData(handler)) { + return State::DATA; + } + + // We will handle the token now, consume the peeked characters + reader.consumePeek(); + + // Update the location to the current token location + location = token.location; + + if (token.type == Tokens.LineComment) { + parseLineComment(); + } else if (token.type == Tokens.BlockCommentStart) { + parseBlockComment(); + } else if (token.type == Tokens.FieldStart) { + Command &cmd = commands.top(); + if (!cmd.inField) { + cmd.inField = true; + return State::FIELD_START; + } + logger.error( + "Got field start token \"{\", but no command for which to " + "start the field. Did you mean \"\\{\"?", + token); + } else if (token.type == Tokens.FieldEnd) { + // Try to end an open field of the current command -- if the current + // command is not inside an open field, end this command and try to + // close the next one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + return State::FIELD_END; + } + commands.pop(); + } else { + break; + } + } + logger.error( + "Got field end token \"}\", but there is no field to end. 
Did " + "you mean \"\\}\"?", + token); + } else { + logger.error("Unexpected token \"" + token.content + "\"", token); + } + } + + // Issue available data + if (checkIssueData(handler)) { + return State::DATA; + } + + // Make sure all open commands and fields have been ended at the end of the + // stream + while (commands.size() > 1) { + Command &cmd = commands.top(); + if (cmd.inField || cmd.hasRange) { + logger.error("Reached end of stream, but command \"" + + cmd.name.asString() + "\" has not been ended", + cmd.name); + } + commands.pop(); + } + + location = SourceLocation{reader.getSourceId(), reader.getOffset()}; + return State::END; +} + +const Variant &OsdmStreamParser::getCommandName() +{ + return commands.top().name; +} + +const Variant &OsdmStreamParser::getCommandArguments() +{ + return commands.top().arguments; +} +} + diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp new file mode 100644 index 0000000..84674c0 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -0,0 +1,350 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsdmStreamParser.hpp + * + * Provides classes for low-level classes for reading the TeX-esque osdm + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ +#define _OUSIA_OSDM_STREAM_PARSER_HPP_ + +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incommind file is + * syntactically valid and tries to recorver from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsdmStreamParser { +public: + /** + * Enum used to indicate which state the OsdmStreamParser class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. The reader indicates which + * annotation ends. 
+ */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END, + + /** + * Returned from internal functions if nothing should be done. + */ + NONE, + + /** + * Returned from internal function to indicate irrecoverable errors. + */ + ERROR + }; + + /** + * Entry used for the command stack. + */ + struct Command { + /** + * Name and location of the current command. + */ + Variant name; + + /** + * Arguments that were passed to the command. + */ + Variant arguments; + + /** + * Set to true if this is a command with clear begin and end. + */ + bool hasRange; + + /** + * Set to true if we are currently inside a field of this command. + */ + bool inField; + + /** + * Set to true if we are currently in the range field of the command + * (implies inField being set to true). + */ + bool inRangeField; + + /** + * Default constructor. + */ + Command() : hasRange(false), inField(false), inRangeField(false) {} + + /** + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. + * @param inField is set to true if we currently are inside a field + * of this command. + * @param inRangeField is set to true if we currently inside the outer + * field of the command. 
+ */ + Command(Variant name, Variant arguments, bool hasRange, bool inField, + bool inRangeField) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange), + inField(inField), + inRangeField(inRangeField) + { + } + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + Tokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack commands; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). + */ + Variant data; + + /** + * Contains the location of the last token. + */ + SourceLocation location; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internall to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + */ + State parseEndCommand(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Parses the command arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. 
+ * + * @param start is the start byte offset of the command (including the + * backslash) + * @return true if a command was actuall parsed, false otherwise. + */ + State parseCommand(size_t start); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseLineComment(); + + /** + * Checks whether there is any data pending to be issued, if yes, issues it. + * + * @param handler is the data handler that contains the data that may be + * returned to the user. + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(DataHandler &handler); + + /** + * Called before any data is appended to the internal data handler. Checks + * whether a new field should be started or implicitly ended. + * + * @return true if FIELD_START should be returned by the parse function. + */ + bool checkIssueFieldStart(); + +public: + /** + * Constructor of the OsdmStreamParser class. Attaches the new + * OsdmStreamParser to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsdmStreamParser(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. 
+ */ + const Variant &getData() { return data; } + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing name and location of the + * parsed command. + */ + const Variant &getCommandName(); + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing arguments given to the + * command. + */ + const Variant &getCommandArguments(); + + /** + * Returns a reference at the char reader. + * + * @return the last internal token location. + */ + SourceLocation &getLocation() { return location; } +}; +} + +#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp new file mode 100644 index 0000000..c46d9de --- /dev/null +++ b/src/formats/osxml/OsxmlParser.cpp @@ -0,0 +1,1435 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "XmlParser.hpp" + +namespace ousia { + +/* HeadNode Helper class */ + +namespace { +class HeadNode : public Node { +public: + using Node::Node; +}; +} + +namespace RttiTypes { +static Rtti HeadNode = RttiBuilder("HeadNode"); +} + +/* Element Handler Classes */ + +class DocumentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted document = + project()->createDocument(args["name"].asString()); + document->setLocation(location()); + scope().push(document); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentHandler{handlerData}; + } +}; + +class DocumentField : public Node { +public: + DocumentField(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DocumentField = + RttiBuilder("DocumentField").parent(&Node); +} + +class DocumentChildHandler : public Handler { +public: + using Handler::Handler; + + void preamble(Handle parentNode, std::string &fieldName, + DocumentEntity *&parent, bool &inField) + { + // check if the parent in the structure tree was an explicit field + // reference. + inField = parentNode->isa(&RttiTypes::DocumentField); + if (inField) { + fieldName = parentNode->getName(); + parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); + } else { + // if it wasn't an explicit reference, we use the default field. + fieldName = DEFAULT_FIELD_NAME; + } + // reference the parent entity explicitly. 
+ parent = nullptr; + if (parentNode->isa(&RttiTypes::StructuredEntity)) { + parent = static_cast( + parentNode.cast().get()); + } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { + parent = static_cast( + parentNode.cast().get()); + } + } + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::Document, &RttiTypes::StructuredEntity, + &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // try to find a FieldDescriptor for the given tag if we are not in a + // field already. + // TODO: Consider fields of transparent classes + if (!inField && parent != nullptr && + parent->getDescriptor()->hasField(name())) { + Rooted field{new DocumentField( + parentNode->getManager(), fieldName, parentNode)}; + field->setLocation(location()); + scope().push(field); + return; + } + + // Otherwise create a new StructuredEntity + // TODO: Consider Anchors and AnnotationEntities + Rooted strct = scope().resolve( + Utils::split(name(), ':'), logger()); + if (strct == nullptr) { + // if we could not resolve the name, throw an exception. + throw LoggableException( + std::string("\"") + name() + "\" could not be resolved.", + location()); + } + + std::string name; + auto it = args.find("name"); + if (it != args.end()) { + name = it->second.asString(); + args.erase(it); + } + + Rooted entity; + if (parentNode->isa(&RttiTypes::Document)) { + entity = parentNode.cast()->createRootStructuredEntity( + strct, args, name); + } else { + // calculate a path if transparent entities are needed in between. 
+ auto path = parent->getDescriptor()->pathTo(strct); + if (path.empty()) { + throw LoggableException( + std::string("An instance of \"") + strct->getName() + + "\" is not allowed as child of an instance of \"" + + parent->getDescriptor()->getName() + "\"", + location()); + } + + // create all transparent entities until the last field. + for (size_t p = 1; p < path.size() - 1; p = p + 2) { + parent = static_cast( + parent->createChildStructuredEntity( + path[p].cast(), + Variant::mapType{}, path[p - 1]->getName(), + "").get()); + } + entity = parent->createChildStructuredEntity(strct, args, fieldName, + name); + } + entity->setLocation(location()); + scope().push(entity); + } + + void end() override { scope().pop(); } + + void data(const std::string &data, int fieldIdx) override + { + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // retrieve the correct FieldDescriptor. + // TODO: Consider fields of transparent classes + Rooted desc = parent->getDescriptor(); + Rooted field = desc->getFieldDescriptor(fieldName); + if (field == nullptr) { + logger().error( + std::string("Can't handle data because no field with name \"") + + fieldName + "\" exists in descriptor\"" + desc->getName() + + "\".", + location()); + return; + } + if (!field->isPrimitive()) { + logger().error(std::string("Can't handle data because field \"") + + fieldName + "\" of descriptor \"" + + desc->getName() + "\" is not primitive!", + location()); + return; + } + + // try to parse the content. + auto res = VariantReader::parseGenericString( + data, logger(), location().getSourceId(), location().getStart()); + if (!res.first) { + return; + } + // try to convert it to the correct type. 
+ if (!field->getPrimitiveType()->build(res.second, logger())) { + return; + } + // add it as primitive content. + parent->createChildDocumentPrimitive(res.second, fieldName); + } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentChildHandler{handlerData}; + } +}; + +class TypesystemHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Create the typesystem instance + Rooted typesystem = + project()->createTypesystem(args["name"].asString()); + typesystem->setLocation(location()); + + // Push the typesystem onto the scope, reset the POST_HEAD flag to false + scope().push(typesystem); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemHandler{handlerData}; + } +}; + +class TypesystemEnumHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the current typesystem and create the enum node + Rooted typesystem = scope().selectOrThrow(); + Rooted enumType = + typesystem->createEnumType(args["name"].asString()); + enumType->setLocation(location()); + + scope().push(enumType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumHandler{handlerData}; + } +}; + +class TypesystemEnumEntryHandler : public Handler { +public: + using Handler::Handler; + + std::string entry; + + void start(Variant::mapType &args) override {} + + void end() override + { + Rooted enumType = scope().selectOrThrow(); + enumType->addEntry(entry, logger()); + } + + void data(const std::string &data, int field) override + { + if (field != 0) { + // TODO: This should be stored in the HandlerData + logger().error("Enum entry only has one field."); + return; + } + 
entry.append(data); + } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumEntryHandler{handlerData}; + } +}; + +class TypesystemStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the arguments used for creating this type + const std::string &name = args["name"].asString(); + const std::string &parent = args["parent"].asString(); + + // Fetch the current typesystem and create the struct node + Rooted typesystem = scope().selectOrThrow(); + Rooted structType = typesystem->createStructType(name); + structType->setLocation(location()); + + // Try to resolve the parent type and set it as parent structure + if (!parent.empty()) { + scope().resolve( + parent, structType, logger(), + [](Handle parent, Handle structType, + Logger &logger) { + if (parent != nullptr) { + structType.cast()->setParentStructure( + parent.cast(), logger); + } + }); + } + scope().push(structType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructHandler{handlerData}; + } +}; + +class TypesystemStructFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &defaultValue = args["default"]; + const bool optional = + !(defaultValue.isObject() && defaultValue.asObject() == nullptr); + + Rooted structType = scope().selectOrThrow(); + Rooted attribute = + structType->createAttribute(name, defaultValue, optional, logger()); + attribute->setLocation(location()); + + // Try to resolve the type and default value + if (optional) { + scope().resolveTypeWithValue( + type, attribute, attribute->getDefaultValue(), logger(), + [](Handle type, Handle 
attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } else { + scope().resolveType( + type, attribute, logger(), + [](Handle type, Handle attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructFieldHandler{handlerData}; + } +}; + +class TypesystemConstantHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &value = args["value"]; + + Rooted typesystem = scope().selectOrThrow(); + Rooted constant = typesystem->createConstant(name, value); + constant->setLocation(location()); + + // Try to resolve the type + scope().resolveTypeWithValue( + type, constant, constant->getValue(), logger(), + [](Handle type, Handle constant, Logger &logger) { + if (type != nullptr) { + constant.cast()->setType(type.cast(), + logger); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemConstantHandler{handlerData}; + } +}; + +/* + * Domain Handlers + */ + +class DomainHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted domain = + project()->createDomain(args["name"].asString()); + domain->setLocation(location()); + + scope().push(domain); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainHandler{handlerData}; + } +}; + +class DomainStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + 
+ scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted structuredClass = domain->createStructuredClass( + args["name"].asString(), args["cardinality"].asCardinality(), + nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); + structuredClass->setLocation(location()); + + const std::string &isa = args["isa"].asString(); + if (!isa.empty()) { + scope().resolve( + isa, structuredClass, logger(), + [](Handle superclass, Handle structuredClass, + Logger &logger) { + if (superclass != nullptr) { + structuredClass.cast()->setSuperclass( + superclass.cast(), logger); + } + }); + } + + scope().push(structuredClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainStructHandler{handlerData}; + } +}; + +class DomainAnnotationHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted annotationClass = + domain->createAnnotationClass(args["name"].asString()); + annotationClass->setLocation(location()); + + scope().push(annotationClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAnnotationHandler{handlerData}; + } +}; + +class DomainAttributesHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Fetch the parent from the scope and push its attributes descriptor + Rooted parent = scope().selectOrThrow(); + + Rooted attrDesc = parent->getAttributesDescriptor(); + attrDesc->setLocation(location()); + + scope().push(attrDesc); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAttributesHandler{handlerData}; + } +}; + +class DomainFieldHandler : public Handler { +public: + 
using Handler::Handler; + + void start(Variant::mapType &args) override + { + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createFieldDescriptor( + type, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldHandler{handlerData}; + } +}; + +class DomainFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + scope().resolve( + name, parent, logger(), + [](Handle field, Handle parent, Logger &logger) { + if (field != nullptr) { + parent.cast()->addFieldDescriptor( + field.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldRefHandler{handlerData}; + } +}; + +class DomainPrimitiveHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createPrimitiveFieldDescriptor( + nullptr, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + const std::string &type = args["type"].asString(); + scope().resolve( + type, field, logger(), + [](Handle type, Handle field, Logger &logger) { + if (type != nullptr) { + field.cast()->setPrimitiveType( + type.cast()); + } + }); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainPrimitiveHandler{handlerData}; + } +}; + +class DomainChildHandler : 
public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted field = + scope().selectOrThrow(); + + const std::string &ref = args["ref"].asString(); + scope().resolve( + ref, field, logger(), + [](Handle child, Handle field, Logger &logger) { + if (child != nullptr) { + field.cast()->addChild( + child.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainChildHandler{handlerData}; + } +}; + +class DomainParent : public Node { +public: + DomainParent(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DomainParent = + RttiBuilder("DomainParent").parent(&Node); +} + +class DomainParentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted strct = + scope().selectOrThrow(); + + Rooted parent{new DomainParent( + strct->getManager(), args["name"].asString(), strct)}; + parent->setLocation(location()); + scope().push(parent); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentHandler{handlerData}; + } +}; + +class DomainParentFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + const std::string &name = args["name"].asString(); + const bool optional = args["optional"].asBool(); + Rooted strct = + parentNameNode->getParent().cast(); + + // resolve the parent, create the declared field and add the declared + // StructuredClass as child to it. 
+ scope().resolve( + parentNameNode->getName(), strct, logger(), + [type, name, optional](Handle parent, Handle strct, + Logger &logger) { + if (parent != nullptr) { + Rooted field = + parent.cast()->createFieldDescriptor( + type, name, optional); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldHandler{handlerData}; + } +}; + +class DomainParentFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + Rooted strct = + parentNameNode->getParent().cast(); + auto loc = location(); + + // resolve the parent, get the referenced field and add the declared + // StructuredClass as child to it. + scope().resolve(parentNameNode->getName(), strct, logger(), + [name, loc](Handle parent, + Handle strct, + Logger &logger) { + if (parent != nullptr) { + auto res = parent.cast()->resolve( + &RttiTypes::FieldDescriptor, name); + if (res.size() != 1) { + logger.error( + std::string("Could not find referenced field ") + name, + loc); + return; + } + Rooted field = + res[0].node.cast(); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldRefHandler{handlerData}; + } +}; + +/* + * Import and Include Handler + */ + +class ImportIncludeHandler : public Handler { +public: + using Handler::Handler; + + bool srcInArgs = false; + std::string rel; + std::string type; + std::string src; + + void start(Variant::mapType &args) override + { + rel = args["rel"].asString(); + type = args["type"].asString(); + src = args["src"].asString(); + srcInArgs = !src.empty(); + } + + void data(const std::string &data, int field) override + { + if (srcInArgs) { + logger().error("\"src\" attribute has already 
been set"); + return; + } + if (field != 0) { + logger().error("Command has only one field."); + return; + } + src.append(data); + } +}; + +class ImportHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + + // Make sure imports are still possible + if (scope().getFlag(ParserFlag::POST_HEAD)) { + logger().error("Imports must be listed before other commands.", + location()); + return; + } + } + + void end() override + { + // Fetch the last node and check whether an import is valid at this + // position + Rooted leaf = scope().getLeaf(); + if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { + logger().error( + "Import not supported here, must be inside a document, domain " + "or typesystem command.", + location()); + return; + } + Rooted leafRootNode = leaf.cast(); + + // Perform the actual import, register the imported node within the leaf + // node + Rooted imported = + context().import(src, type, rel, leafRootNode->getReferenceTypes()); + if (imported != nullptr) { + leafRootNode->reference(imported); + } + } + + static Handler *create(const HandlerData &handlerData) + { + return new ImportHandler{handlerData}; + } +}; + +class IncludeHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + } + + void end() override + { + context().include(src, type, rel, {&RttiTypes::Node}); + } + + static Handler *create(const HandlerData &handlerData) + { + return new IncludeHandler{handlerData}; + } +}; + +namespace ParserStates { +/* Document states */ +static const ParserState Document = + ParserStateBuilder() + .parent(&None) + .createdNodeType(&RttiTypes::Document) + .elementHandler(DocumentHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState DocumentChild = + 
ParserStateBuilder() + .parents({&Document, &DocumentChild}) + .createdNodeTypes({&RttiTypes::StructureNode, + &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}) + .elementHandler(DocumentChildHandler::create); + +/* Domain states */ +static const ParserState Domain = ParserStateBuilder() + .parents({&None, &Document}) + .createdNodeType(&RttiTypes::Domain) + .elementHandler(DomainHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStruct = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::StructuredClass) + .elementHandler(DomainStructHandler::create) + .arguments({Argument::String("name"), + Argument::Cardinality("cardinality", Cardinality::any()), + Argument::Bool("isRoot", false), + Argument::Bool("transparent", false), + Argument::String("isa", "")}); + +static const ParserState DomainAnnotation = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::AnnotationClass) + .elementHandler(DomainAnnotationHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainAttributes = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(DomainAttributesHandler::create) + .arguments({}); + +static const ParserState DomainAttribute = + ParserStateBuilder() + .parent(&DomainAttributes) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState DomainField = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainFieldRef = + ParserStateBuilder() + 
.parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +static const ParserState DomainStructPrimitive = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainPrimitiveHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("optional", false), + Argument::String("type")}); + +static const ParserState DomainStructChild = + ParserStateBuilder() + .parent(&DomainField) + .elementHandler(DomainChildHandler::create) + .arguments({Argument::String("ref")}); + +static const ParserState DomainStructParent = + ParserStateBuilder() + .parent(&DomainStruct) + .createdNodeType(&RttiTypes::DomainParent) + .elementHandler(DomainParentHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStructParentField = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainStructParentFieldRef = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +/* Typesystem states */ +static const ParserState Typesystem = + ParserStateBuilder() + .parents({&None, &Domain}) + .createdNodeType(&RttiTypes::Typesystem) + .elementHandler(TypesystemHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState TypesystemEnum = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::EnumType) + 
.elementHandler(TypesystemEnumHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState TypesystemEnumEntry = + ParserStateBuilder() + .parent(&TypesystemEnum) + .elementHandler(TypesystemEnumEntryHandler::create) + .arguments({}); + +static const ParserState TypesystemStruct = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(TypesystemStructHandler::create) + .arguments({Argument::String("name"), Argument::String("parent", "")}); + +static const ParserState TypesystemStructField = + ParserStateBuilder() + .parent(&TypesystemStruct) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState TypesystemConstant = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::Constant) + .elementHandler(TypesystemConstantHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("value")}); + +/* Special states for import and include */ +static const ParserState Import = + ParserStateBuilder() + .parents({&Document, &Typesystem, &Domain}) + .elementHandler(ImportHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const ParserState Include = + ParserStateBuilder() + .parent(&All) + .elementHandler(IncludeHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const std::multimap XmlStates{ + {"document", &Document}, + {"*", &DocumentChild}, + {"domain", &Domain}, + {"struct", &DomainStruct}, + {"annotation", &DomainAnnotation}, + {"attributes", &DomainAttributes}, + {"attribute", &DomainAttribute}, + {"field", &DomainField}, + {"fieldRef", &DomainFieldRef}, + {"primitive", &DomainStructPrimitive}, + {"child", 
&DomainStructChild}, + {"parent", &DomainStructParent}, + {"field", &DomainStructParentField}, + {"fieldRef", &DomainStructParentFieldRef}, + {"typesystem", &Typesystem}, + {"enum", &TypesystemEnum}, + {"entry", &TypesystemEnumEntry}, + {"struct", &TypesystemStruct}, + {"field", &TypesystemStructField}, + {"constant", &TypesystemConstant}, + {"import", &Import}, + {"include", &Include}}; +} + +/** + * Structure containing the private data that is being passed to the + * XML-Handlers. + */ +struct XMLUserData { + /** + * Contains the current element nesting depth within the XML file + */ + size_t depth; + + /** + * Pointer at the ParserStack instance. + */ + ParserStack *stack; + + /** + * Pointer at the CharReader instance. + */ + CharReader *reader; + + /** + * Constructor of the XMLUserData struct. + * + * @param stack is a pointer at the ParserStack instance. + * @param reader is a pointer at the CharReader instance. + */ + XMLUserData(ParserStack *stack, CharReader *reader) + : depth(0), stack(stack), reader(reader) + { + } +}; + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class ScopedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreate + * from the expat library. Throws a LoggableException if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. 
+ */ + ~ScopedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/* Adapter Expat -> ParserStack */ + +static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the parser stack and the associated user data + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + // Fetch the current location in the XML file + size_t offs = XML_GetCurrentByteIndex(p); + + // Build the source location and update the default location of the + // current + // logger instance + SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; + stack->getContext().getLogger().setDefaultLocation(loc); + return loc; +} + +enum class XMLAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +static std::map reconstructXMLAttributeOffsets( + CharReader &reader, SourceLocation location) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + size_t offs = location.getStart(); + if (!location.isValid() || offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... 
(and we + // can assume the XML is valid as it was already read by expat) + XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XMLAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::SEARCH_ATTR; + } + break; + case XMLAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XMLAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XMLAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XMLAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XMLAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + attrName.str(std::string{}); + state = XMLAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. 
Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XMLAttributeState::IN_ATTR_NAME; + } + } + break; + case XMLAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, start anew + state = XMLAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +static void xmlStartElementHandler(void *p, const XML_Char *name, + const XML_Char **attrs) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + SourceLocation loc = syncLoggerPosition(parser); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map offs = + reconstructXMLAttributeOffsets(*userData->reader, loc); + + // Assemble the arguments + Variant::mapType args; + + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = offs.find(key); + if (it != offs.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + args.emplace(key, value.second); + } + + // Call the start function + std::string nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->start(std::string(name), args, loc); + } + + // Increment the current depth + userData->depth++; +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + syncLoggerPosition(parser); + + // Decrement the current depth + userData->depth--; + + // Call the end function + std::string 
nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->end(); + } +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + } +} + +/* Class XmlParser */ + +void XmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Create the parser stack instance, if we're starting on a non-empty scope, + // try to deduce the parser state + ParserStack stack(ctx, ParserStates::XmlStates); + if (!ctx.getScope().isEmpty()) { + if (!stack.deduceState()) { + return; + } + } + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, &data); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw LoggableException{ + "Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + // Fetch the xml parser byte offset + size_t offs = XML_GetCurrentByteIndex(&p); + + // Throw a corresponding exception + XML_Error code = XML_GetErrorCode(&p); + std::string msg = 
std::string{XML_ErrorString(code)}; + throw LoggableException{"XML: " + msg, + SourceLocation{ctx.getSourceId(), offs}}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} +} + diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp new file mode 100644 index 0000000..c8b6302 --- /dev/null +++ b/src/formats/osxml/OsxmlParser.hpp @@ -0,0 +1,55 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file XmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_XML_PARSER_HPP_ +#define _OUSIA_XML_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * The XmlParser class implements parsing the various types of Ousía XML + * documents using the expat stream XML parser. + */ +class XmlParser : public Parser { +protected: + /** + * Parses the given input stream as XML file and returns the parsed + * top-level node. + * + * @param reader is the CharReader from which the input should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. 
+ */ + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_XML_PARSER_HPP_ */ + -- cgit v1.2.3