summaryrefslogtreecommitdiff
path: root/src/formats/osml
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats/osml')
-rw-r--r--src/formats/osml/OsmlParser.cpp57
-rw-r--r--src/formats/osml/OsmlParser.hpp48
-rw-r--r--src/formats/osml/OsmlStreamParser.cpp640
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp350
4 files changed, 1095 insertions, 0 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
new file mode 100644
index 0000000..4973639
--- /dev/null
+++ b/src/formats/osml/OsmlParser.cpp
@@ -0,0 +1,57 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/generic/ParserStateCallbacks.hpp>
+#include <core/parser/generic/ParserStateStack.hpp>
+
+#include "OsdmParser.hpp"
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+namespace {
+
+/**
+ * The OsdmParserImplementation class contains the actual implementation of the
+ * parsing process and is created in the "doParse" function of the OsdmParser.
+ * It holds the low-level stream parser together with the parser state stack.
+ *
+ * NOTE(review): the class is unfinished in this revision -- see the note on
+ * the constructor below.
+ */
+class OsdmParserImplementation : public ParserStateCallbacks {
+private:
+ /**
+ * OsdmStreamParser instance.
+ */
+ OsdmStreamParser parser;
+
+ /**
+ * Instance of the ParserStateStack.
+ */
+ ParserStateStack stack;
+
+public:
+ /**
+ * NOTE(review): the following line is not a valid constructor declaration.
+ * It carries a return type and a name ("OsdmParserImplementation parser"),
+ * its parameters have no types, it has no body, and it passes the template
+ * name std::multimap as if it were a value. In addition, the
+ * OsdmStreamParser constructor expects a (CharReader &, Logger &) pair
+ * while only one argument is given here, and doParse() calls a parse()
+ * member which this class does not declare. Presumably work in progress --
+ * needs to be completed before this file can compile.
+ */
+ OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap)
+};
+}
+
+/**
+ * Entry point of the Parser interface: builds an implementation object for the
+ * given reader/context pair and runs its parse() function.
+ *
+ * @param reader is the character source handed to the implementation.
+ * @param ctx is the parser context handed to the implementation.
+ */
+void OsdmParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+    // The temporary lives until the end of the full expression, i.e. exactly
+    // for the duration of the parse() call.
+    OsdmParserImplementation(reader, ctx).parse();
+}
+
+}
diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp
new file mode 100644
index 0000000..37505b4
--- /dev/null
+++ b/src/formats/osml/OsmlParser.hpp
@@ -0,0 +1,48 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsmlParser.hpp
+ *
+ * Contains the parser of the osdm format, the standard plain-text format used
+ * by Ousía for documents.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_PARSER_HPP_
+#define _OUSIA_OSDM_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * OsdmParser is a small wrapper implementing the Parser interface. The actual
+ * parsing is performed with the OsdmStreamParser in conjunction with the
+ * ParserStateStack.
+ */
+class OsdmParser : public Parser {
+protected:
+ /**
+ * Overrides the parsing hook of the Parser base class. The definition
+ * forwards both arguments to an internal implementation object.
+ *
+ * @param reader is the CharReader the input is read from.
+ * @param ctx is the ParserContext passed on to the implementation.
+ */
+ void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSDM_PARSER_HPP_ */
+
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
new file mode 100644
index 0000000..6a55f12
--- /dev/null
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -0,0 +1,640 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/VariantReader.hpp>
+
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+/**
+ * Tokenizer that comes preloaded with the fixed tokens of the plain-text
+ * format. The ids returned by the registration are kept as public members so
+ * that token types can be compared against them.
+ */
+class PlainFormatTokens : public Tokenizer {
+public:
+    /** Id of the escape token "\". */
+    TokenTypeId Backslash;
+
+    /** Id of the token starting a line comment ("%"). */
+    TokenTypeId LineComment;
+
+    /** Id of the token opening a block comment ("%{"). */
+    TokenTypeId BlockCommentStart;
+
+    /** Id of the token closing a block comment ("}%"). */
+    TokenTypeId BlockCommentEnd;
+
+    /** Id of the token opening a field ("{"). */
+    TokenTypeId FieldStart;
+
+    /** Id of the token closing a field ("}"). */
+    TokenTypeId FieldEnd;
+
+    /**
+     * Registers all tokens with the Tokenizer base class. The members are
+     * initialised in declaration order, so the tokens are registered in the
+     * order in which they are listed here.
+     */
+    PlainFormatTokens()
+        : Backslash(registerToken("\\")),
+          LineComment(registerToken("%")),
+          BlockCommentStart(registerToken("%{")),
+          BlockCommentEnd(registerToken("}%")),
+          FieldStart(registerToken("{")),
+          FieldEnd(registerToken("}"))
+    {
+    }
+};
+
+/**
+ * File-local token set; used both as the source of the token ids and as the
+ * initial value of the tokenizer of each OsdmStreamParser.
+ */
+static const PlainFormatTokens Tokens;
+
+/**
+ * Accumulates the characters belonging to one "DATA" event together with the
+ * source range they were read from.
+ */
+class DataHandler {
+private:
+    /** Characters collected so far. */
+    std::vector<char> buf;
+
+    /** Source offset at which the collected data begins. */
+    SourceOffset start;
+
+    /** Source offset at which the collected data ends. */
+    SourceOffset end;
+
+public:
+    /**
+     * Creates an empty handler; both offsets start out as zero.
+     */
+    DataHandler() : start(0), end(0) {}
+
+    /**
+     * Tells whether anything has been collected yet.
+     *
+     * @return true if the internal buffer holds no characters, false
+     * otherwise.
+     */
+    bool isEmpty() { return buf.size() == 0; }
+
+    /**
+     * Adds one character to the buffer.
+     *
+     * @param c is the character to add.
+     * @param charStart is the offset at which the character starts.
+     * @param charEnd is the offset at which the character ends.
+     */
+    void append(char c, SourceOffset charStart, SourceOffset charEnd)
+    {
+        // While nothing has been stored, the incoming start offset becomes
+        // the start of the whole range
+        if (buf.empty()) {
+            start = charStart;
+        }
+        end = charEnd;
+        buf.push_back(c);
+    }
+
+    /**
+     * Adds a whole string to the buffer.
+     *
+     * @param s is the string to add.
+     * @param stringStart is the offset at which the string starts.
+     * @param stringEnd is the offset at which the string ends.
+     */
+    void append(const std::string &s, SourceOffset stringStart,
+                SourceOffset stringEnd)
+    {
+        if (buf.empty()) {
+            start = stringStart;
+        }
+        end = stringEnd;
+        buf.insert(buf.end(), s.begin(), s.end());
+    }
+
+    /**
+     * Builds a string variant from the collected characters and attaches the
+     * collected source range to it.
+     *
+     * @param sourceId is the source id used for the location of the variant.
+     * @return a string Variant located at {sourceId, start, end}.
+     */
+    Variant toVariant(SourceId sourceId)
+    {
+        Variant result = Variant::fromString(std::string(buf.begin(), buf.end()));
+        result.setLocation({sourceId, start, end});
+        return result;
+    }
+};
+
+/**
+ * Attaches a new stream parser to the given reader and logger. The tokenizer
+ * starts out as a copy of the file-local plain format token set.
+ *
+ * @param reader is the CharReader the input is read from.
+ * @param logger is the Logger all messages are sent to.
+ */
+OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger)
+    : reader(reader), logger(logger), tokenizer(Tokens), fieldIdx(0)
+{
+    // Place an initial command representing the complete file on the stack.
+    // It is created as a range command with an open range field, so the loops
+    // that unwind the stack until a command with "inField" or "hasRange" is
+    // found always stop at this entry at the latest.
+    commands.push(Command{"", Variant::mapType{}, true, true, true});
+}
+
+/**
+ * Reads an identifier from the current position of the reader.
+ *
+ * @param start is the offset used as start of the location of the result.
+ * @param allowNSSep states whether ":" may occur between two parts of the
+ * identifier.
+ * @return a string Variant holding the identifier (possibly empty), located at
+ * the range from start to the end of the last accepted character.
+ */
+Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+{
+    std::string name;
+    bool atFirstChar = true;
+    bool sepAllowedHere = false;
+    size_t end = reader.getPeekOffset();
+    char cur, next;
+    while (reader.peek(cur)) {
+        // A character is accepted if it is a regular identifier character for
+        // its position ...
+        const bool isNameChar = atFirstChar
+                                    ? Utils::isIdentifierStartCharacter(cur)
+                                    : Utils::isIdentifierCharacter(cur);
+
+        // ... or a namespace separator that follows an identifier character
+        // and is itself followed by a valid identifier start character
+        const bool isSeparator = !isNameChar && cur == ':' && sepAllowedHere &&
+                                 reader.fetchPeek(next) &&
+                                 Utils::isIdentifierStartCharacter(next);
+
+        // Anything else ends the identifier
+        if (!isNameChar && !isSeparator) {
+            if (cur == ':' && allowNSSep) {
+                logger.error(
+                    "Expected character before and after namespace separator "
+                    "\":\"",
+                    reader);
+            }
+            reader.resetPeek();
+            break;
+        }
+
+        name.push_back(cur);
+        atFirstChar = false;
+
+        // A separator may only follow a non-separator character
+        sepAllowedHere = allowNSSep && (cur != ':');
+
+        // Remember where the accepted character ends and consume it
+        end = reader.getPeekOffset();
+        reader.consumePeek();
+    }
+
+    // Attach the location to the identifier and hand it back
+    Variant res = Variant::fromString(std::move(name));
+    res.setLocation({reader.getSourceId(), start, end});
+    return res;
+}
+
+/**
+ * Handles the remainder of a "\begin" command, i.e. "{name#argname}[args]".
+ *
+ * @return State::COMMAND if the range command was pushed onto the command
+ * stack, State::NONE if the opening "{" is missing and State::ERROR if the
+ * command name or the closing "}" is missing.
+ */
+OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
+{
+    // The command name has to be wrapped in curly braces; whitespace in front
+    // of the opening brace is skipped
+    reader.consumeWhitespace();
+    if (!reader.expect('{')) {
+        logger.error("Expected \"{\" after \\begin", reader);
+        return State::NONE;
+    }
+
+    // Read the name of the command that is being opened
+    Variant name = parseIdentifier(reader.getOffset(), true);
+    if (name.asString().empty()) {
+        logger.error("Expected identifier", name);
+        return State::ERROR;
+    }
+
+    // An optional "#" introduces the value of the "name" argument
+    Variant argName;
+    const SourceOffset argStart = reader.getOffset();
+    if (reader.expect('#')) {
+        argName = parseIdentifier(argStart);
+        if (argName.asString().empty()) {
+            logger.error("Expected identifier after \"#\"", argName);
+        }
+    }
+
+    // The brace has to be closed again
+    if (!reader.expect('}')) {
+        logger.error("Expected \"}\"", reader);
+        return State::ERROR;
+    }
+
+    // Read the argument list and register the command as range command
+    Variant args = parseCommandArguments(std::move(argName));
+    pushCommand(std::move(name), std::move(args), true);
+    return State::COMMAND;
+}
+
+/**
+ * Reports an error if the given command is inside a field which is not its
+ * range field while an "\end" is encountered.
+ *
+ * @param cmd is the command stack entry to check.
+ * @param endName is the name given to "\end"; used as location of the error.
+ * @param logger is the Logger the messages are written to.
+ * @return true if an error was logged, false otherwise.
+ */
+static bool checkStillInField(const OsdmStreamParser::Command &cmd,
+                              const Variant &endName, Logger &logger)
+{
+    const bool inPlainField = cmd.inField && !cmd.inRangeField;
+    if (!inPlainField) {
+        return false;
+    }
+
+    logger.error(std::string("\\end in open field of command \"") +
+                     cmd.name.asString() + std::string("\""),
+                 endName);
+    logger.note(std::string("Open command started here:"), cmd.name);
+    return true;
+}
+
+/**
+ * Handles the remainder of an "\end" command, i.e. "{name}", and closes the
+ * matching range command on the command stack.
+ *
+ * @return State::FIELD_END if the ended command was inside its range field,
+ * State::NONE if the opening "{" is missing or the ended command had not
+ * entered its range field, and State::ERROR for a missing name, a missing "}",
+ * an "\end" inside an open non-range field, an "\end" without open command or
+ * a name that does not match the open range command.
+ */
+OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
+{
+    // Expect a '{' after the command
+    if (!reader.expect('{')) {
+        logger.error("Expected \"{\" after \\end", reader);
+        return State::NONE;
+    }
+
+    // Fetch the name of the command that should be ended here
+    Variant name = parseIdentifier(reader.getOffset(), true);
+
+    // Make sure the given command name is not empty
+    if (name.asString().empty()) {
+        logger.error("Expected identifier", name);
+        return State::ERROR;
+    }
+
+    // Make sure the command name is terminated with a '}'
+    if (!reader.expect('}')) {
+        logger.error("Expected \"}\"", reader);
+        return State::ERROR;
+    }
+
+    // Unroll the command stack up to the last range command. The initial
+    // command is pushed as a range command, so this loop always terminates.
+    while (!commands.top().hasRange) {
+        if (checkStillInField(commands.top(), name, logger)) {
+            return State::ERROR;
+        }
+        commands.pop();
+    }
+
+    // Make sure we're not in an open field of this command
+    if (checkStillInField(commands.top(), name, logger)) {
+        return State::ERROR;
+    }
+
+    // Special error message if the top-level command is reached
+    if (commands.size() == 1) {
+        logger.error(std::string("Cannot end command \"") + name.asString() +
+                         std::string("\" here, no command open"),
+                     name);
+        return State::ERROR;
+    }
+
+    // Inform the user about command mismatches: "name" is the command the
+    // input tries to end, "cmd" is the range command which is actually open
+    const Command &cmd = commands.top();
+    if (cmd.name.asString() != name.asString()) {
+        logger.error(std::string("Trying to end command \"") +
+                         name.asString() +
+                         std::string("\", but open command is \"") +
+                         cmd.name.asString() + std::string("\""),
+                     name);
+        logger.note("Last command was opened here:", cmd.name);
+        return State::ERROR;
+    }
+
+    // Copy the flag before popping -- "cmd" refers to the top element of the
+    // stack and must not be accessed once that element has been removed
+    const bool wasInRangeField = cmd.inRangeField;
+
+    // Set the location to the location of the command that was ended, then end
+    // the current command
+    location = name.getLocation();
+    commands.pop();
+    return wasInRangeField ? State::FIELD_END : State::NONE;
+}
+
+/**
+ * Reads the optional "[...]" argument list of a command and merges the name
+ * given via "#" into it.
+ *
+ * @param commandArgName is the name given via "#". It is only inserted as
+ * "name" argument if it is a string variant.
+ * @return the argument variant: the object read from the "[...]" list, or an
+ * empty map if no list was given.
+ */
+Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
+{
+    // Parse the arguments using the universal VariantReader
+    Variant commandArguments;
+    if (reader.expect('[')) {
+        auto res = VariantReader::parseObject(reader, logger, ']');
+        commandArguments = res.second;
+    } else {
+        commandArguments = Variant::mapType{};
+    }
+
+    // Insert the parsed name, make sure "name" was not specified in the
+    // arguments
+    if (commandArgName.isString()) {
+        auto &arguments = commandArguments.asMap();
+        auto existing = arguments.find("name");
+        if (existing == arguments.end()) {
+            arguments.emplace("name", std::move(commandArgName));
+        } else {
+            // Look the key up before moving: a failed emplace may already
+            // have moved from commandArgName, which is still needed for the
+            // first note below
+            logger.error("Name argument specified multiple times",
+                         SourceLocation{}, MessageMode::NO_CONTEXT);
+            logger.note("First occurance is here: ", commandArgName);
+            logger.note("Second occurance is here: ", existing->second);
+        }
+    }
+    return commandArguments;
+}
+
+/**
+ * Registers a freshly parsed command on the command stack.
+ *
+ * @param commandName is the name (and location) of the command.
+ * @param commandArguments are the arguments given to the command.
+ * @param hasRange is true if the command was opened with "\begin".
+ */
+void OsdmStreamParser::pushCommand(Variant commandName,
+                                   Variant commandArguments, bool hasRange)
+{
+    // The command name determines the current location
+    location = commandName.getLocation();
+
+    // Commands which are not inside one of their fields cannot be the parent
+    // of the new command, so drop them first
+    for (; !commands.top().inField; commands.pop()) {
+    }
+
+    // The new command starts outside of any field
+    commands.emplace(std::move(commandName), std::move(commandArguments),
+                     hasRange, false, false);
+}
+
+/**
+ * Parses a command whose backslash has already been read.
+ *
+ * @param start is the offset used as start of the location of the command
+ * name.
+ * @return State::NONE if the command name is empty, the result of the
+ * "\begin"/"\end" handling for these two special commands and State::COMMAND
+ * for any other command.
+ */
+OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
+{
+    // The command starts with its (possibly namespaced) name
+    Variant name = parseIdentifier(start, true);
+    if (name.asString().empty()) {
+        logger.error("Empty command name", reader);
+        return State::NONE;
+    }
+
+    // "begin" and "end" are special and get their own treatment
+    const auto parts = Utils::split(name.asString(), ':');
+    const bool isBegin = parts[0] == "begin";
+    const bool isEnd = parts[0] == "end";
+    if (isBegin || isEnd) {
+        if (parts.size() > 1) {
+            logger.error(
+                "Special commands \"\\begin\" and \"\\end\" may not contain a "
+                "namespace separator \":\"",
+                name);
+        }
+        return isBegin ? parseBeginCommand() : parseEndCommand();
+    }
+
+    // An optional "#" introduces the value of the "name" argument
+    Variant argName;
+    start = reader.getOffset();
+    if (reader.expect('#')) {
+        argName = parseIdentifier(start);
+        if (argName.asString().empty()) {
+            logger.error("Expected identifier after \"#\"", argName);
+        }
+    }
+
+    // Read the argument list and register the command as non-range command
+    Variant args = parseCommandArguments(std::move(argName));
+    pushCommand(std::move(name), std::move(args), false);
+    return State::COMMAND;
+}
+
+/**
+ * Skips a block comment whose start token has already been consumed. Block
+ * comments may be nested; an error is logged if the input ends before the
+ * outermost comment is closed.
+ */
+void OsdmStreamParser::parseBlockComment()
+{
+    Token token;
+    for (size_t depth = 1; tokenizer.read(reader, token);) {
+        // A closing token leaves one nesting level, the last one ends the
+        // comment
+        if (token.type == Tokens.BlockCommentEnd && --depth == 0) {
+            return;
+        }
+
+        // An opening token enters a further nesting level
+        if (token.type == Tokens.BlockCommentStart) {
+            ++depth;
+        }
+    }
+
+    // The tokenizer ran out of input inside of the comment
+    logger.error("File ended while being in a block comment", reader);
+}
+
+/**
+ * Skips a line comment: reads characters up to and including the next line
+ * break or until the reader has no more characters.
+ */
+void OsdmStreamParser::parseLineComment()
+{
+    char c;
+    do {
+        if (!reader.read(c)) {
+            return;
+        }
+    } while (c != '\n');
+}
+
+/**
+ * Publishes the data collected in the given handler, if there is any.
+ *
+ * @param handler is the handler holding the collected characters.
+ * @return true if data was stored in "data"/"location" and State::DATA should
+ * be returned by the caller, false if the handler was empty.
+ */
+bool OsdmStreamParser::checkIssueData(DataHandler &handler)
+{
+    // Nothing collected, nothing to issue
+    if (handler.isEmpty()) {
+        return false;
+    }
+
+    data = handler.toVariant(reader.getSourceId());
+    location = data.getLocation();
+
+    // Drop whatever has been peeked so the next call starts at the read cursor
+    reader.resetPeek();
+    return true;
+}
+
+/**
+ * Called before data or a new command is handled. Opens the range field of the
+ * current command if necessary, or unwinds the command stack down to the next
+ * command with an open field.
+ *
+ * @return true if the range field of the current command has just been opened
+ * and State::FIELD_START should be returned by the caller, false otherwise.
+ */
+bool OsdmStreamParser::checkIssueFieldStart()
+{
+    Command &current = commands.top();
+
+    // Already inside a field of the current command -- nothing to do
+    if (current.inField) {
+        return false;
+    }
+
+    // A range command implicitly enters its range field now, which has to be
+    // announced to the caller
+    if (current.hasRange) {
+        current.inField = true;
+        current.inRangeField = true;
+        reader.resetPeek();
+        return true;
+    }
+
+    // A command without range and without open field has ended; remove such
+    // commands until one with an open field is on top of the stack
+    while (!commands.top().inField) {
+        commands.pop();
+    }
+    return false;
+}
+
+/**
+ * Continues reading from the stream until the next event has been found.
+ *
+ * @return State::FIELD_START, State::FIELD_END, State::DATA or State::COMMAND
+ * for the corresponding event, State::END once the input is exhausted (or ends
+ * with a trailing backslash).
+ * @throws LoggableException if parsing a command reported State::ERROR.
+ */
+OsdmStreamParser::State OsdmStreamParser::parse()
+{
+    // Handler for incoming data
+    DataHandler handler;
+
+    // Read tokens until the outer loop should be left
+    Token token;
+    while (tokenizer.peek(reader, token)) {
+        const TokenTypeId type = token.type;
+
+        // Special handling for Backslash and Text
+        if (type == Tokens.Backslash) {
+            // Before appending anything to the output data or starting a new
+            // command, check whether FIELD_START has to be issued, as the
+            // current command is a command with range
+            if (checkIssueFieldStart()) {
+                location = token.location;
+                return State::FIELD_START;
+            }
+
+            // Check whether a command starts now, without advancing the peek
+            // cursor
+            char c;
+            if (!reader.fetchPeek(c)) {
+                // Data collected before the backslash must not be lost, so
+                // hand it out first. checkIssueData resets the peek cursor,
+                // hence the backslash is found again by the next call, which
+                // then reports the error.
+                if (checkIssueData(handler)) {
+                    return State::DATA;
+                }
+                logger.error("Trailing backslash at the end of the file.",
+                             token);
+                return State::END;
+            }
+
+            // Try to parse a command
+            if (Utils::isIdentifierStartCharacter(c)) {
+                // Make sure to issue any data before it is too late
+                if (checkIssueData(handler)) {
+                    return State::DATA;
+                }
+
+                // Parse the actual command
+                State res = parseCommand(token.location.getStart());
+                switch (res) {
+                    case State::ERROR:
+                        throw LoggableException(
+                            "Last error was irrecoverable, ending parsing "
+                            "process");
+                    case State::NONE:
+                        continue;
+                    default:
+                        return res;
+                }
+            }
+
+            // This was not a special character, just append the given character
+            // to the data buffer, use the escape character start as start
+            // location and the peek offset as end location
+            reader.peek(c);  // Peek the previously fetched character
+            handler.append(c, token.location.getStart(),
+                           reader.getPeekOffset());
+            reader.consumePeek();
+            continue;
+        } else if (type == TextToken) {
+            // Check whether FIELD_START has to be issued before appending text
+            if (checkIssueFieldStart()) {
+                location = token.location;
+                return State::FIELD_START;
+            }
+
+            // Append the text to the data handler
+            handler.append(token.content, token.location.getStart(),
+                           token.location.getEnd());
+
+            reader.consumePeek();
+            continue;
+        }
+
+        // A non-text token was reached, make sure all pending data commands
+        // have been issued
+        if (checkIssueData(handler)) {
+            return State::DATA;
+        }
+
+        // We will handle the token now, consume the peeked characters
+        reader.consumePeek();
+
+        // Update the location to the current token location
+        location = token.location;
+
+        if (token.type == Tokens.LineComment) {
+            parseLineComment();
+        } else if (token.type == Tokens.BlockCommentStart) {
+            parseBlockComment();
+        } else if (token.type == Tokens.FieldStart) {
+            Command &cmd = commands.top();
+            if (!cmd.inField) {
+                cmd.inField = true;
+                return State::FIELD_START;
+            }
+            logger.error(
+                "Got field start token \"{\", but no command for which to "
+                "start the field. Did you mean \"\\{\"?",
+                token);
+        } else if (token.type == Tokens.FieldEnd) {
+            // Try to end an open field of the current command -- if the current
+            // command is not inside an open field, end this command and try to
+            // close the next one
+            for (int i = 0; i < 2 && commands.size() > 1; i++) {
+                Command &cmd = commands.top();
+                if (!cmd.inRangeField) {
+                    if (cmd.inField) {
+                        cmd.inField = false;
+                        return State::FIELD_END;
+                    }
+                    commands.pop();
+                } else {
+                    break;
+                }
+            }
+            logger.error(
+                "Got field end token \"}\", but there is no field to end. Did "
+                "you mean \"\\}\"?",
+                token);
+        } else {
+            logger.error("Unexpected token \"" + token.content + "\"", token);
+        }
+    }
+
+    // Issue available data
+    if (checkIssueData(handler)) {
+        return State::DATA;
+    }
+
+    // Make sure all open commands and fields have been ended at the end of the
+    // stream
+    while (commands.size() > 1) {
+        Command &cmd = commands.top();
+        if (cmd.inField || cmd.hasRange) {
+            logger.error("Reached end of stream, but command \"" +
+                             cmd.name.asString() + "\" has not been ended",
+                         cmd.name);
+        }
+        commands.pop();
+    }
+
+    location = SourceLocation{reader.getSourceId(), reader.getOffset()};
+    return State::END;
+}
+
+/**
+ * Gives access to the name of the command on top of the command stack.
+ *
+ * @return a reference at the name variant of the topmost command.
+ */
+const Variant &OsdmStreamParser::getCommandName()
+{
+    const Command &current = commands.top();
+    return current.name;
+}
+
+/**
+ * Gives access to the arguments of the command on top of the command stack.
+ *
+ * @return a reference at the argument variant of the topmost command.
+ */
+const Variant &OsdmStreamParser::getCommandArguments()
+{
+    const Command &current = commands.top();
+    return current.arguments;
+}
+}
+
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
new file mode 100644
index 0000000..84674c0
--- /dev/null
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -0,0 +1,350 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsmlStreamParser.hpp
+ *
+ * Provides low-level classes for reading the TeX-esque osdm format. The class
+ * provided here does not build any model objects and does not implement the
+ * Parser interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_
+#define _OUSIA_OSDM_STREAM_PARSER_HPP_
+
+#include <stack>
+
+#include <core/common/Variant.hpp>
+#include <core/parser/utils/Tokenizer.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class Logger;
+class DataHandler;
+
+/**
+ * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm
+ * format. The parser is constructed around a "parse" function, which reads data
+ * from the underlying CharReader until a new state is reached and indicates
+ * this state in a return value. The calling code then has to pull corresponding
+ * data from the stream reader. The reader makes sure the incoming file is
+ * syntactically valid and tries to recover from most errors. If an error is
+ * irrecoverable (this is the case for errors with wrong nesting of commands or
+ * fields, as this would lead to too many consecutive errors) a
+ * LoggableException is thrown.
+ */
+class OsdmStreamParser {
+public:
+ /**
+ * Enum used to indicate which state the OsdmStreamParser class is in
+ * after calling the "parse" function. NONE and ERROR are only used between
+ * the internal functions.
+ */
+ enum class State {
+ /**
+ * State returned if a fully featured command has been read. A command
+ * consists of the command name and its arguments (which optionally
+ * includes the name).
+ */
+ COMMAND,
+
+ /**
+ * State returned if data is given. The reader must decide which field
+ * or command this should be routed to. Trailing or leading whitespace
+ * has been removed. Only called if the data is non-empty.
+ */
+ DATA,
+
+ /**
+ * A user-defined entity has been found. The entity sequence is stored
+ * in the command name.
+ *
+ * NOTE(review): not produced by the implementation in this revision.
+ */
+ ENTITY,
+
+ /**
+ * State returned if an annotation was started. An annotation consists
+ * of the command name and its arguments (which optionally include the
+ * name).
+ *
+ * NOTE(review): not produced by the implementation in this revision.
+ */
+ ANNOTATION_START,
+
+ /**
+ * State returned if an annotation ends. The reader indicates which
+ * annotation ends.
+ *
+ * NOTE(review): not produced by the implementation in this revision.
+ */
+ ANNOTATION_END,
+
+ /**
+ * State returned if a new field started. The reader assures that the
+ * current field ends before a new field is started and that the field
+ * is not started if data has been given outside of a field. The
+ * field number is set to the current field index.
+ *
+ * NOTE(review): the implementation in this revision never assigns the
+ * field index when a field is started -- confirm whether this is still
+ * to be done.
+ */
+ FIELD_START,
+
+ /**
+ * State returned if the current field ends. The reader assures that a
+ * field was actually open.
+ */
+ FIELD_END,
+
+ /**
+ * The end of the stream has been reached.
+ */
+ END,
+
+ /**
+ * Returned from internal functions if nothing should be done.
+ */
+ NONE,
+
+ /**
+ * Returned from internal function to indicate irrecoverable errors.
+ * The "parse" function turns this state into a LoggableException.
+ */
+ ERROR
+ };
+
+ /**
+ * Entry used for the command stack. Describes one command that has been
+ * read, together with the flags tracking which of its fields is open.
+ */
+ struct Command {
+ /**
+ * Name and location of the current command.
+ */
+ Variant name;
+
+ /**
+ * Arguments that were passed to the command.
+ */
+ Variant arguments;
+
+ /**
+ * Set to true if this is a command with clear begin and end, i.e. a
+ * command opened with "\begin". The initial command representing the
+ * whole file also has this flag set.
+ */
+ bool hasRange;
+
+ /**
+ * Set to true if we are currently inside a field of this command.
+ */
+ bool inField;
+
+ /**
+ * Set to true if we are currently in the range field of the command
+ * (implies inField being set to true).
+ */
+ bool inRangeField;
+
+ /**
+ * Default constructor. Sets all flags to false and default-constructs
+ * name and arguments.
+ */
+ Command() : hasRange(false), inField(false), inRangeField(false) {}
+
+ /**
+ * Constructor of the Command class.
+ *
+ * @param name is a string variant with name and location of the
+ * command.
+ * @param arguments is a map variant with the arguments given to the
+ * command.
+ * @param hasRange should be set to true if this is a command with
+ * explicit range.
+ * @param inField is set to true if we currently are inside a field
+ * of this command.
+ * @param inRangeField is set to true if we are currently inside the
+ * outer field of the command.
+ */
+ Command(Variant name, Variant arguments, bool hasRange, bool inField,
+ bool inRangeField)
+ : name(std::move(name)),
+ arguments(std::move(arguments)),
+ hasRange(hasRange),
+ inField(inField),
+ inRangeField(inRangeField)
+ {
+ }
+ };
+
+private:
+ /**
+ * Reference to the CharReader instance from which the incoming bytes are
+ * read.
+ */
+ CharReader &reader;
+
+ /**
+ * Reference at the logger instance to which all error messages are sent.
+ */
+ Logger &logger;
+
+ /**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ Tokenizer tokenizer;
+
+ /**
+ * Stack containing the current commands. The bottom entry is a command
+ * representing the complete file, which is pushed by the constructor.
+ */
+ std::stack<Command> commands;
+
+ /**
+ * Variant containing the data that has been read (always is a string,
+ * contains the exact location of the data in the source file).
+ */
+ Variant data;
+
+ /**
+ * Contains the location of the last token.
+ */
+ SourceLocation location;
+
+ /**
+ * Contains the field index of the current command.
+ *
+ * NOTE(review): not read by any member function visible in this change.
+ */
+ size_t fieldIdx;
+
+ /**
+ * Function used internally to parse an identifier.
+ *
+ * @param start is the start byte offset of the identifier (including the
+ * backslash).
+ * @param allowNSSep should be set to true if the namespace separator is
+ * allowed in the identifier name. Issues error if the namespace separator
+ * is placed incorrectly.
+ * @return a string variant containing the identifier (which may be empty)
+ * and its location.
+ */
+ Variant parseIdentifier(size_t start, bool allowNSSep = false);
+
+ /**
+ * Function used internally to handle the special "\begin" command.
+ *
+ * @return State::COMMAND on success, State::NONE or State::ERROR
+ * otherwise.
+ */
+ State parseBeginCommand();
+
+ /**
+ * Function used internally to handle the special "\end" command.
+ *
+ * @return State::FIELD_END or State::NONE on success, State::NONE or
+ * State::ERROR otherwise.
+ */
+ State parseEndCommand();
+
+ /**
+ * Pushes the parsed command onto the command stack.
+ *
+ * @param commandName is the name and location of the command.
+ * @param commandArguments are the arguments given to the command.
+ * @param hasRange is true if the command has an explicit range.
+ */
+ void pushCommand(Variant commandName, Variant commandArguments,
+ bool hasRange);
+
+ /**
+ * Parses the command arguments.
+ *
+ * @param commandArgName is the name given via "#"; it is inserted as
+ * "name" argument if it is a string.
+ * @return a variant containing the arguments of the command.
+ */
+ Variant parseCommandArguments(Variant commandArgName);
+
+ /**
+ * Function used internally to parse a command.
+ *
+ * @param start is the start byte offset of the command (including the
+ * backslash)
+ * @return State::COMMAND if a regular command was parsed, the state
+ * returned by the "\begin" or "\end" handling for these special commands,
+ * or State::NONE if the command name was empty.
+ */
+ State parseCommand(size_t start);
+
+ /**
+ * Function used internally to parse a block comment.
+ */
+ void parseBlockComment();
+
+ /**
+ * Function used internally to parse a line comment.
+ */
+ void parseLineComment();
+
+ /**
+ * Checks whether there is any data pending to be issued, if yes, issues it.
+ *
+ * @param handler is the data handler that contains the data that may be
+ * returned to the user.
+ * @return true if there was any data and DATA should be returned by the
+ * parse function, false otherwise.
+ */
+ bool checkIssueData(DataHandler &handler);
+
+ /**
+ * Called before any data is appended to the internal data handler. Checks
+ * whether a new field should be started or implicitly ended.
+ *
+ * @return true if FIELD_START should be returned by the parse function.
+ */
+ bool checkIssueFieldStart();
+
+public:
+ /**
+ * Constructor of the OsdmStreamParser class. Attaches the new
+ * OsdmStreamParser to the given CharReader and Logger instances.
+ *
+ * @param reader is the reader instance from which incoming characters
+ * should be read.
+ * @param logger is the logger instance to which errors should be written.
+ */
+ OsdmStreamParser(CharReader &reader, Logger &logger);
+
+ /**
+ * Continues parsing. Returns one of the states defined in the State enum.
+ * Callers should stop once the State::END state is reached. Use the getter
+ * functions to get more information about the current state, such as the
+ * command name or the data.
+ *
+ * @return the new state the parser has reached.
+ */
+ State parse();
+
+ /**
+ * Returns a reference at the internally stored data. Only valid if
+ * State::DATA was returned by the "parse" function.
+ *
+ * @return a reference at a variant containing the data parsed by the
+ * "parse" function.
+ */
+ const Variant &getData() { return data; }
+
+ /**
+ * Returns a reference at the internally stored command name. Only valid if
+ * State::COMMAND was returned by the "parse" function.
+ *
+ * @return a reference at a variant containing name and location of the
+ * parsed command.
+ */
+ const Variant &getCommandName();
+
+ /**
+ * Returns a reference at the internally stored command arguments. Only
+ * valid if State::COMMAND was returned by the "parse" function.
+ *
+ * @return a reference at a variant containing arguments given to the
+ * command.
+ */
+ const Variant &getCommandArguments();
+
+ /**
+ * Returns a reference at the location of the last token.
+ *
+ * @return the last internal token location.
+ */
+ SourceLocation &getLocation() { return location; }
+};
+}
+
+#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */
+