summaryrefslogtreecommitdiff
path: root/src/formats/osml
diff options
context:
space:
mode:
authorBenjamin Paassen <bpaassen@techfak.uni-bielefeld.de>2015-02-15 21:56:04 +0100
committerBenjamin Paassen <bpaassen@techfak.uni-bielefeld.de>2015-02-15 21:56:04 +0100
commitd2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch)
tree8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/formats/osml
parent40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff)
parent4f2872d9968aec93bebff90d1238347c8a364949 (diff)
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/formats/osml')
-rw-r--r--src/formats/osml/OsmlParser.cpp57
-rw-r--r--src/formats/osml/OsmlParser.hpp48
-rw-r--r--src/formats/osml/OsmlStreamParser.cpp754
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp385
4 files changed, 1244 insertions, 0 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
new file mode 100644
index 0000000..4973639
--- /dev/null
+++ b/src/formats/osml/OsmlParser.cpp
@@ -0,0 +1,57 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/generic/ParserStateCallbacks.hpp>
+#include <core/parser/generic/ParserStateStack.hpp>
+
+#include "OsdmParser.hpp"
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+namespace {
+
+/**
+ * The OsdmParserImplementation class contains the actual implementation of the
+ * parsing process and is created in the "doParse" function of the OsdmParser.
+ *
+ * NOTE(review): this class looks unfinished in this change. The constructor
+ * line below is not a valid C++ constructor definition (it is prefixed with
+ * the class name like a return type, its parameters have no types, it passes
+ * the bare template name std::multimap as an argument and it has no body).
+ * No parse() member is declared here either, although OsdmParser::doParse()
+ * calls one -- unless it is inherited from ParserStateCallbacks; confirm.
+ * The stream parser header added by the same change declares
+ * OsmlStreamParser (constructed from a CharReader and a Logger), not
+ * OsdmStreamParser; verify the intended names before completing this class.
+ */
+class OsdmParserImplementation : public ParserStateCallbacks {
+private:
+ /**
+ * OsdmStreamParser instance.
+ */
+ OsdmStreamParser parser;
+
+ /**
+ * Instance of the ParserStateStack.
+ */
+ ParserStateStack stack;
+
+public:
+ // NOTE(review): malformed constructor, see the class comment above.
+ OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap)
+};
+}
+
+/**
+ * Implementation of the doParse hook: builds a fresh
+ * OsdmParserImplementation for the given reader and context and runs it.
+ *
+ * @param reader is the character source handed to the implementation.
+ * @param ctx is the parser context handed to the implementation.
+ */
+void OsdmParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+    OsdmParserImplementation impl(reader, ctx);
+    impl.parse();
+}
+
+}
diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp
new file mode 100644
index 0000000..37505b4
--- /dev/null
+++ b/src/formats/osml/OsmlParser.hpp
@@ -0,0 +1,48 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsmlParser.hpp
+ *
+ * Contains the parser of the osdm format, the standard plain-text format used
+ * by Ousía for documents.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_PARSER_HPP_
+#define _OUSIA_OSDM_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * OsdmParser is a small wrapper implementing the Parser interface. The actual
+ * parsing is performed with the OsdmStreamParser in conjunction with the
+ * ParserStateStack.
+ */
+class OsdmParser : public Parser {
+protected:
+ /**
+ * Overrides the doParse function of the Parser base class. The
+ * implementation creates an OsdmParserImplementation from the two
+ * arguments and calls its "parse" function.
+ *
+ * @param reader is the CharReader passed on to the implementation.
+ * @param ctx is the ParserContext passed on to the implementation.
+ */
+ void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSDM_PARSER_HPP_ */
+
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
new file mode 100644
index 0000000..0174fa4
--- /dev/null
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -0,0 +1,754 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/VariantReader.hpp>
+
+#include "OsmlStreamParser.hpp"
+
+namespace ousia {
+
+/**
+ * Tokenizer that comes pre-loaded with the fixed token set of the plain text
+ * format. Each public member holds the id handed out by registerToken() for
+ * one token string.
+ */
+class PlainFormatTokens : public Tokenizer {
+public:
+    /** Id of the backslash token (a single backslash character). */
+    TokenTypeId Backslash;
+
+    /** Id of the line comment token ("%"). */
+    TokenTypeId LineComment;
+
+    /** Id of the block comment start token ("%{"). */
+    TokenTypeId BlockCommentStart;
+
+    /** Id of the block comment end token ("}%"). */
+    TokenTypeId BlockCommentEnd;
+
+    /** Id of the field start token ("{"). */
+    TokenTypeId FieldStart;
+
+    /** Id of the field end token ("}"). */
+    TokenTypeId FieldEnd;
+
+    /** Id of the default field start token ("{!"). */
+    TokenTypeId DefaultFieldStart;
+
+    /** Id of the annotation start token ("<" followed by a backslash). */
+    TokenTypeId AnnotationStart;
+
+    /** Id of the annotation end token (a backslash followed by ">"). */
+    TokenTypeId AnnotationEnd;
+
+    /**
+     * Registers all plain format tokens in the underlying tokenizer. Members
+     * are initialized in declaration order, so the tokens are registered in
+     * the order in which they are listed here.
+     */
+    PlainFormatTokens()
+        : Backslash(registerToken("\\")),
+          LineComment(registerToken("%")),
+          BlockCommentStart(registerToken("%{")),
+          BlockCommentEnd(registerToken("}%")),
+          FieldStart(registerToken("{")),
+          FieldEnd(registerToken("}")),
+          DefaultFieldStart(registerToken("{!")),
+          AnnotationStart(registerToken("<\\")),
+          AnnotationEnd(registerToken("\\>"))
+    {
+    }
+};
+
+/**
+ * Single shared instance of the token set; the token ids above are read from
+ * it by the parser functions in this file.
+ */
+static const PlainFormatTokens Tokens;
+
+/**
+ * Class used internally to collect data issued via "DATA" event. Characters
+ * are accumulated in a buffer together with the source range they span.
+ */
+class DataHandler {
+private:
+    /**
+     * Internal character buffer.
+     */
+    std::vector<char> buf;
+
+    /**
+     * Start location of the character data.
+     */
+    SourceOffset start;
+
+    /**
+     * End location of the character data.
+     */
+    SourceOffset end;
+
+public:
+    /**
+     * Default constructor, initializes start and end with zeros.
+     */
+    DataHandler() : start(0), end(0) {}
+
+    /**
+     * Returns true if the internal buffer is empty.
+     *
+     * @return true if no characters were added to the internal buffer, false
+     * otherwise.
+     */
+    bool isEmpty() const { return buf.empty(); }
+
+    /**
+     * Appends a single character to the internal buffer. The start location
+     * is only taken over while the buffer is still empty, the end location is
+     * always updated.
+     *
+     * @param c is the character that should be added to the internal buffer.
+     * @param charStart is the start position of the character.
+     * @param charEnd is the end position of the character.
+     */
+    void append(char c, SourceOffset charStart, SourceOffset charEnd)
+    {
+        if (isEmpty()) {
+            start = charStart;
+        }
+        buf.push_back(c);
+        end = charEnd;
+    }
+
+    /**
+     * Appends a string to the internal buffer. The start location is only
+     * taken over while the buffer is still empty, the end location is always
+     * updated.
+     *
+     * @param s is the string that should be added to the internal buffer.
+     * @param stringStart is the start position of the string.
+     * @param stringEnd is the end position of the string.
+     */
+    void append(const std::string &s, SourceOffset stringStart,
+                SourceOffset stringEnd)
+    {
+        if (isEmpty()) {
+            start = stringStart;
+        }
+        // Range insert keeps this independent of <algorithm>/<iterator>
+        buf.insert(buf.end(), s.begin(), s.end());
+        end = stringEnd;
+    }
+
+    /**
+     * Converts the internal buffer to a variant with attached location
+     * information. The buffer itself is left untouched.
+     *
+     * @param sourceId is the source id which is needed for building the
+     * location information.
+     * @return a Variant with the internal buffer content as string and
+     * the correct start and end location.
+     */
+    Variant toVariant(SourceId sourceId) const
+    {
+        Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
+        res.setLocation({sourceId, start, end});
+        return res;
+    }
+};
+
+/**
+ * Attaches the parser to the given reader and logger, copies the shared token
+ * set into its own tokenizer and seeds the command stack.
+ *
+ * @param reader is the reader instance from which characters are read.
+ * @param logger is the logger instance to which errors are written.
+ */
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
+    : reader(reader), logger(logger), tokenizer(Tokens), fieldIdx(0)
+{
+    // Place an initial command representing the complete file on the stack.
+    // It is marked as ranged and as being inside its (range) field, so the
+    // stack unrolling loops elsewhere always stop at this entry.
+    commands.push(Command{"", Variant::mapType{}, true, true, true, false});
+}
+
+/**
+ * Reads an identifier from the reader, one peeked character at a time, and
+ * returns it as a string variant spanning [start, end of last accepted char].
+ *
+ * @param start is the offset stored as start of the returned location.
+ * @param allowNSSep enables ":" as separator inside the identifier; a
+ * separator is only accepted after at least one other character and if an
+ * identifier start character follows it.
+ * @return the (possibly empty) identifier with attached location.
+ */
+Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+{
+    std::string name;
+    bool atFirstChar = true;
+    bool charSinceSeparator = false;
+    size_t end = reader.getPeekOffset();
+    char current;
+    char lookahead;
+    while (reader.peek(current)) {
+        // Regular identifier character at this position?
+        const bool isNameChar =
+            atFirstChar ? Utils::isIdentifierStartCharacter(current)
+                        : Utils::isIdentifierCharacter(current);
+
+        // Otherwise: a correctly placed namespace separator?
+        const bool isSeparator =
+            !isNameChar && current == ':' && charSinceSeparator &&
+            reader.fetchPeek(lookahead) &&
+            Utils::isIdentifierStartCharacter(lookahead);
+
+        // Neither of the two -- the identifier ends in front of this character
+        if (!isNameChar && !isSeparator) {
+            if (current == ':' && allowNSSep) {
+                logger.error(
+                    "Expected character before and after namespace separator "
+                    "\":\"",
+                    reader);
+            }
+            reader.resetPeek();
+            break;
+        }
+
+        // Accept the character and update the bookkeeping flags
+        name.push_back(current);
+        atFirstChar = false;
+        charSinceSeparator = allowNSSep && (current != ':');
+
+        end = reader.getPeekOffset();
+        reader.consumePeek();
+    }
+
+    // Wrap the collected characters and their location in a variant
+    Variant res = Variant::fromString(std::move(name));
+    res.setLocation({reader.getSourceId(), start, end});
+    return res;
+}
+
+/**
+ * Handles the remainder of a "\begin" command: reads "{name}" with an
+ * optional "#argName" in front of the closing brace, then the arguments, and
+ * pushes the result as ranged command.
+ *
+ * @return State::COMMAND on success, State::NONE if no "{" follows and
+ * State::ERROR if the name is empty or the closing "}" is missing.
+ */
+OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
+{
+    // Whitespace is allowed between "\begin" and the opening brace
+    reader.consumeWhitespace();
+    if (!reader.expect('{')) {
+        logger.error("Expected \"{\" after \\begin", reader);
+        return State::NONE;
+    }
+
+    // Name of the command that is being opened
+    Variant commandName = parseIdentifier(reader.getOffset(), true);
+    if (commandName.asString().empty()) {
+        logger.error("Expected identifier", commandName);
+        return State::ERROR;
+    }
+
+    // An optional '#' introduces the value of the "name" argument
+    Variant commandArgName;
+    const SourceOffset argNameStart = reader.getOffset();
+    if (reader.expect('#')) {
+        commandArgName = parseIdentifier(argNameStart);
+        if (commandArgName.asString().empty()) {
+            logger.error("Expected identifier after \"#\"", commandArgName);
+        }
+    }
+
+    // The brace opened above has to be closed
+    if (!reader.expect('}')) {
+        logger.error("Expected \"}\"", reader);
+        return State::ERROR;
+    }
+
+    // Read the arguments and register the command as ranged command
+    Variant arguments = parseCommandArguments(std::move(commandArgName));
+    pushCommand(std::move(commandName), std::move(arguments), true);
+    return State::COMMAND;
+}
+
+/**
+ * Reports an error if the given command is inside a field that is not its
+ * range field, i.e. a field an "\end" is not allowed to close.
+ *
+ * @param cmd is the command stack entry to inspect.
+ * @param endName is the name given to "\end", used as error location.
+ * @param logger receives the error and the accompanying note.
+ * @return true if an error was logged, false otherwise.
+ */
+static bool checkStillInField(const OsmlStreamParser::Command &cmd,
+                              const Variant &endName, Logger &logger)
+{
+    const bool inPlainField = cmd.inField && !cmd.inRangeField;
+    if (!inPlainField) {
+        return false;
+    }
+
+    std::string message("\\end in open field of command \"");
+    message += cmd.name.asString();
+    message += std::string("\"");
+    logger.error(message, endName);
+    logger.note(std::string("Open command started here:"), cmd.name);
+    return true;
+}
+
+/**
+ * Handles the remainder of an "\end" command: reads "{name}", unrolls the
+ * command stack up to the innermost ranged command and checks that this is
+ * the command named in the "\end".
+ *
+ * @return State::FIELD_END if the ended command was inside its range field,
+ * State::NONE if it was not or if no "{" follows the "\end", and
+ * State::ERROR for an empty name, a missing "}", an "\end" inside an open
+ * non-range field, an "\end" without open command or a name mismatch.
+ */
+OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
+{
+    // Expect a '{' after the command
+    if (!reader.expect('{')) {
+        logger.error("Expected \"{\" after \\end", reader);
+        return State::NONE;
+    }
+
+    // Fetch the name of the command that should be ended here
+    Variant name = parseIdentifier(reader.getOffset(), true);
+
+    // Make sure the given command name is not empty
+    if (name.asString().empty()) {
+        logger.error("Expected identifier", name);
+        return State::ERROR;
+    }
+
+    // Make sure the command name is terminated with a '}'
+    if (!reader.expect('}')) {
+        logger.error("Expected \"}\"", reader);
+        return State::ERROR;
+    }
+
+    // Unroll the command stack up to the last range command. The entry
+    // pushed by the constructor has hasRange set, so the loop stops there.
+    while (!commands.top().hasRange) {
+        if (checkStillInField(commands.top(), name, logger)) {
+            return State::ERROR;
+        }
+        commands.pop();
+    }
+
+    // Make sure we're not in an open field of this command
+    if (checkStillInField(commands.top(), name, logger)) {
+        return State::ERROR;
+    }
+
+    // Special error message if the top-level command is reached
+    if (commands.size() == 1) {
+        logger.error(std::string("Cannot end command \"") + name.asString() +
+                         std::string("\" here, no command open"),
+                     name);
+        return State::ERROR;
+    }
+
+    // Inform the user about command mismatches: "name" is what the "\end"
+    // asked for, "cmd" is the ranged command that is actually open
+    const Command &cmd = commands.top();
+    if (cmd.name.asString() != name.asString()) {
+        logger.error(std::string("Trying to end command \"") +
+                         name.asString() +
+                         std::string("\", but open command is \"") +
+                         cmd.name.asString() + std::string("\""),
+                     name);
+        logger.note("Last command was opened here:", cmd.name);
+        return State::ERROR;
+    }
+
+    // Set the location to the location of the command that was ended
+    location = name.getLocation();
+
+    // "cmd" refers to the stack element that is destroyed by pop(), so the
+    // flag has to be copied out before the command is removed
+    const bool endsRangeField = cmd.inRangeField;
+    commands.pop();
+    return endsRangeField ? State::FIELD_END : State::NONE;
+}
+
+/**
+ * Parses an optional "[...]" argument list following a command and merges
+ * the name given via the "#" syntax into it.
+ *
+ * @param commandArgName is the name given via "#"; it is only inserted (as
+ * argument "name") if it is a string variant.
+ * @return a map variant with the arguments; empty if no "[" follows.
+ */
+Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
+{
+    // Parse the arguments using the universal VariantReader
+    Variant commandArguments;
+    if (reader.expect('[')) {
+        auto res = VariantReader::parseObject(reader, logger, ']');
+        commandArguments = res.second;
+    } else {
+        commandArguments = Variant::mapType{};
+    }
+
+    // Insert the parsed name, make sure "name" was not specified in the
+    // arguments. The lookup is done before the insertion: moving the variant
+    // into emplace() and reading it afterwards for the diagnostics would use
+    // a moved-from object whenever the key already exists.
+    if (commandArgName.isString()) {
+        Variant::mapType &args = commandArguments.asMap();
+        auto existing = args.find("name");
+        if (existing != args.end()) {
+            logger.error("Name argument specified multiple times",
+                         SourceLocation{}, MessageMode::NO_CONTEXT);
+            logger.note("First occurance is here: ", commandArgName);
+            logger.note("Second occurance is here: ", existing->second);
+        } else {
+            args.emplace("name", std::move(commandArgName));
+        }
+    }
+    return commandArguments;
+}
+
+/**
+ * Makes the given command the current one: remembers its location, drops
+ * all stack entries that are not inside a field and pushes a new entry with
+ * all field flags cleared.
+ *
+ * @param commandName is the name (with location) of the command.
+ * @param commandArguments are the arguments given to the command.
+ * @param hasRange is stored as the hasRange flag of the new entry.
+ */
+void OsmlStreamParser::pushCommand(Variant commandName,
+                                   Variant commandArguments, bool hasRange)
+{
+    // The location has to be read before the name is moved away below
+    location = commandName.getLocation();
+
+    // Commands without an open field cannot be the parent of the new one
+    for (;;) {
+        if (commands.top().inField) {
+            break;
+        }
+        commands.pop();
+    }
+
+    commands.emplace(std::move(commandName), std::move(commandArguments),
+                     hasRange, false, false, false);
+}
+
+/**
+ * Parses a command starting at the current reader position. Dispatches the
+ * special names "begin" and "end", otherwise reads an optional "#name" plus
+ * arguments and pushes the command. A directly following ">" turns a regular
+ * command into an annotation end.
+ *
+ * @param start is the start offset used for the location of the command
+ * name.
+ * @param isAnnotation if true, a successfully parsed command is reported as
+ * State::ANNOTATION_START instead of State::COMMAND.
+ * @return State::COMMAND, State::ANNOTATION_START or State::ANNOTATION_END
+ * for parsed commands, the result of the begin/end handlers for the special
+ * commands and State::NONE if the command name is empty.
+ */
+OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
+                                                       bool isAnnotation)
+{
+    // Parse the commandName as a first identifier
+    Variant commandName = parseIdentifier(start, true);
+    if (commandName.asString().empty()) {
+        logger.error("Empty command name", reader);
+        return State::NONE;
+    }
+
+    // Handle the special "begin" and "end" commands
+    const auto commandNameComponents =
+        Utils::split(commandName.asString(), ':');
+    const bool isBegin = commandNameComponents[0] == "begin";
+    const bool isEnd = commandNameComponents[0] == "end";
+
+    // Parse the begin or end command
+    State res = State::COMMAND;
+    if (isBegin || isEnd) {
+        if (commandNameComponents.size() > 1) {
+            logger.error(
+                "Special commands \"\\begin\" and \"\\end\" may not contain a "
+                "namespace separator \":\"",
+                commandName);
+        }
+        if (isBegin) {
+            res = parseBeginCommand();
+        } else if (isEnd) {
+            res = parseEndCommand();
+        }
+    } else {
+        // Check whether the next character is a '#', indicating the start of
+        // the value for the "name" argument
+        Variant commandArgName;
+        start = reader.getOffset();
+        if (reader.expect('#')) {
+            commandArgName = parseIdentifier(start);
+            if (commandArgName.asString().empty()) {
+                logger.error("Expected identifier after \"#\"", commandArgName);
+            }
+        }
+
+        // Parse the arguments
+        Variant commandArguments =
+            parseCommandArguments(std::move(commandArgName));
+
+        // Push the command onto the command stack
+        pushCommand(std::move(commandName), std::move(commandArguments), false);
+    }
+
+    // Check whether a ">" character is the next character that is to be read.
+    // In that case the current command could be an annotation end command!
+    char c;
+    if (reader.fetch(c) && c == '>') {
+        // Ignore the character after a begin or end command
+        if (isBegin || isEnd) {
+            logger.warning(
+                "Ignoring annotation end character \">\" after special "
+                "commands \"begin\" or \"end\". Write \"\\>\" to end a "
+                "\"begin\"/\"end\" enclosed annotation.",
+                reader);
+            return res;
+        }
+
+        // If this should be an annotation, ignore the character
+        if (isAnnotation) {
+            logger.warning(
+                "Ignoring annotation end character \">\" after annotation "
+                "start command. Write \"\\>\" to end the annotation.",
+                reader);
+        } else {
+            // Make sure no arguments apart from the "name" argument are given
+            // to an annotation end
+            Variant::mapType &map = commands.top().arguments.asMap();
+            if (!map.empty()) {
+                if (map.count("name") == 0 || map.size() > 1U) {
+                    // Pass the reader like every other diagnostic in this
+                    // function so that the message carries a location
+                    logger.error(
+                        "An annotation end command may not have any arguments "
+                        "other than \"name\"",
+                        reader);
+                    return res;
+                }
+            }
+
+            // If we got here, this is a valid ANNOTATION_END command; consume
+            // the ">" that was only fetched above and issue the event
+            reader.peek(c);
+            reader.consumePeek();
+            return State::ANNOTATION_END;
+        }
+    }
+
+    // If we're starting an annotation, return the command as annotation start
+    // instead of command
+    if (isAnnotation && res == State::COMMAND) {
+        return State::ANNOTATION_START;
+    }
+    return res;
+}
+
+/**
+ * Skips a block comment whose start token has already been consumed. Nested
+ * block comments are supported by counting start and end tokens. Logs an
+ * error if the input ends before the comment is closed.
+ */
+void OsmlStreamParser::parseBlockComment()
+{
+    Token token;
+    size_t nesting = 1;
+    while (tokenizer.read(reader, token)) {
+        if (token.type == Tokens.BlockCommentStart) {
+            ++nesting;
+        } else if (token.type == Tokens.BlockCommentEnd && --nesting == 0) {
+            // The outermost comment has been closed
+            return;
+        }
+    }
+
+    // Issue an error if the file ends while we are in a block comment
+    logger.error("File ended while being in a block comment", reader);
+}
+
+/**
+ * Skips a line comment: reads and discards characters up to and including
+ * the next line feed, or up to the end of the input.
+ */
+void OsmlStreamParser::parseLineComment()
+{
+    char current;
+    while (reader.read(current) && current != '\n') {
+        // Discard everything in front of the line break
+    }
+}
+
+/**
+ * Publishes pending character data, if there is any: stores it in "data",
+ * copies its location to "location" and resets the peek cursor of the
+ * reader.
+ *
+ * @param handler is the data handler that may contain pending data.
+ * @return true if data was stored and DATA should be returned by "parse",
+ * false if the handler was empty.
+ */
+bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+{
+    if (handler.isEmpty()) {
+        return false;
+    }
+
+    data = handler.toVariant(reader.getSourceId());
+    location = data.getLocation();
+    reader.resetPeek();
+    return true;
+}
+
+/**
+ * Called before data or a new command is handled. If the current command is
+ * a ranged command that is not yet inside a field, its range field is opened
+ * implicitly. If it is a non-ranged command outside of a field, the command
+ * stack is unrolled to the next command with an open field.
+ *
+ * @return true if FIELD_START should be returned by the parse function.
+ */
+bool OsmlStreamParser::checkIssueFieldStart()
+{
+    Command &current = commands.top();
+
+    // Nothing to do while we are inside a field of the current command
+    if (current.inField) {
+        return false;
+    }
+
+    // A ranged command implicitly enters its range field now
+    if (current.hasRange) {
+        current.inField = true;
+        current.inRangeField = true;
+        reader.resetPeek();
+        return true;
+    }
+
+    // Not a ranged command: the content belongs to an enclosing command, so
+    // drop entries until one with an open field is on top. The top entry is
+    // known to be outside of a field here, so at least one entry is removed.
+    do {
+        commands.pop();
+    } while (!commands.top().inField);
+    return false;
+}
+
+/**
+ * Tries to close the field of the current command. If the current command
+ * has no open field it is removed and the attempt is repeated once for the
+ * command below it. Range fields are never closed here, and the bottom stack
+ * entry is never touched.
+ *
+ * @return true if a field was closed, false if there was no field to close.
+ */
+bool OsmlStreamParser::closeField()
+{
+    for (int attempt = 0; attempt < 2 && commands.size() > 1; attempt++) {
+        Command &top = commands.top();
+
+        // A range field can only be ended by "\end"
+        if (top.inRangeField) {
+            return false;
+        }
+
+        // No open field: drop this command and look at the next one
+        if (!top.inField) {
+            commands.pop();
+            continue;
+        }
+
+        // Close the field; a command is finished together with its default
+        // field and therefore leaves the stack as well
+        top.inField = false;
+        if (top.inDefaultField) {
+            commands.pop();
+        }
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Reads tokens until the next event can be reported and returns the
+ * corresponding state. Text and escaped characters are collected in a local
+ * DataHandler; whenever a command or a non-text token is peeked while data is
+ * pending, the data is published first and State::DATA is returned (the peek
+ * cursor is reset in that case, so presumably the same token is seen again by
+ * the next call -- NOTE(review): confirm against the CharReader semantics).
+ *
+ * @return the state that was reached; State::END once the input is exhausted
+ * or a trailing backslash is found at the end of the input.
+ * @throws LoggableException if parseCommand reports State::ERROR.
+ */
+OsmlStreamParser::State OsmlStreamParser::parse()
+{
+ // Handler for incoming data
+ DataHandler handler;
+
+ // Read tokens until the outer loop should be left
+ Token token;
+ while (tokenizer.peek(reader, token)) {
+ const TokenTypeId type = token.type;
+
+ // Special handling for Backslash and Text
+ if (type == Tokens.Backslash || type == Tokens.AnnotationStart) {
+ // Before appending anything to the output data or starting a new
+ // command, check whether FIELD_START has to be issued, as the
+ // current command is a command with range
+ if (checkIssueFieldStart()) {
+ location = token.location;
+ return State::FIELD_START;
+ }
+
+ // Check whether a command starts now, without advancing the peek
+ // cursor
+ char c;
+ if (!reader.fetchPeek(c)) {
+ logger.error("Trailing backslash at the end of the file.",
+ token);
+ return State::END;
+ }
+
+ // Try to parse a command
+ if (Utils::isIdentifierStartCharacter(c)) {
+ // Make sure to issue any data before it is too late
+ if (checkIssueData(handler)) {
+ return State::DATA;
+ }
+
+ // Parse the actual command
+ State res = parseCommand(token.location.getStart(),
+ type == Tokens.AnnotationStart);
+ switch (res) {
+ case State::ERROR:
+ throw LoggableException(
+ "Last error was irrecoverable, ending parsing "
+ "process");
+ case State::NONE:
+ // Nothing to report, continue with the next token
+ continue;
+ default:
+ return res;
+ }
+ }
+
+ // This was not a special character, just append the given character
+ // to the data buffer, use the escape character start as start
+ // location and the peek offset as end location
+ reader.peek(c); // Peek the previously fetched character
+
+ // If this was an annotation start token, add the parsed < to the
+ // output
+ if (type == Tokens.AnnotationStart) {
+ handler.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
+ }
+
+ handler.append(c, token.location.getStart(),
+ reader.getPeekOffset());
+ reader.consumePeek();
+ continue;
+ } else if (type == TextToken) {
+ // Check whether FIELD_START has to be issued before appending text
+ if (checkIssueFieldStart()) {
+ location = token.location;
+ return State::FIELD_START;
+ }
+
+ // Append the text to the data handler
+ handler.append(token.content, token.location.getStart(),
+ token.location.getEnd());
+
+ reader.consumePeek();
+ continue;
+ }
+
+ // A non-text token was reached, make sure all pending data commands
+ // have been issued
+ if (checkIssueData(handler)) {
+ return State::DATA;
+ }
+
+ // We will handle the token now, consume the peeked characters
+ reader.consumePeek();
+
+ // Update the location to the current token location
+ location = token.location;
+
+ if (token.type == Tokens.LineComment) {
+ parseLineComment();
+ } else if (token.type == Tokens.BlockCommentStart) {
+ parseBlockComment();
+ } else if (token.type == Tokens.FieldStart) {
+ // A field can only be opened if the current command is not already
+ // inside one of its fields
+ Command &cmd = commands.top();
+ if (!cmd.inField) {
+ cmd.inField = true;
+ return State::FIELD_START;
+ }
+ logger.error(
+ "Got field start token \"{\", but no command for which to "
+ "start the field. Write \"\\{\" to insert this sequence as "
+ "text.",
+ token);
+ } else if (token.type == Tokens.FieldEnd) {
+ if (closeField()) {
+ return State::FIELD_END;
+ }
+ logger.error(
+ "Got field end token \"}\", but there is no field to end. "
+ "Write \"\\}\" to insert this sequence as text.",
+ token);
+ } else if (token.type == Tokens.DefaultFieldStart) {
+ // Try to start a default field the first time the token is reached
+ Command &topCmd = commands.top();
+ if (!topCmd.inField) {
+ topCmd.inField = true;
+ topCmd.inDefaultField = true;
+ return State::FIELD_START;
+ }
+ logger.error(
+ "Got default field start token \"{!\", but no command for "
+ "which to start the field. Write \"\\{!\" to insert this "
+ "sequence as text",
+ token);
+ } else if (token.type == Tokens.AnnotationEnd) {
+ // We got a single annotation end token "\>" -- push a command with
+ // an empty name located at the token and issue the ANNOTATION_END
+ // event
+ Variant annotationName = Variant::fromString("");
+ annotationName.setLocation(token.location);
+ pushCommand(annotationName, Variant::mapType{}, false);
+ return State::ANNOTATION_END;
+ } else {
+ logger.error("Unexpected token \"" + token.content + "\"", token);
+ }
+ }
+
+ // Issue available data
+ if (checkIssueData(handler)) {
+ return State::DATA;
+ }
+
+ // Make sure all open commands and fields have been ended at the end of the
+ // stream; the bottom entry representing the whole file stays on the stack
+ while (commands.size() > 1) {
+ Command &cmd = commands.top();
+ if (cmd.inField || cmd.hasRange) {
+ logger.error("Reached end of stream, but command \"" +
+ cmd.name.asString() + "\" has not been ended",
+ cmd.name);
+ }
+ commands.pop();
+ }
+
+ location = SourceLocation{reader.getSourceId(), reader.getOffset()};
+ return State::END;
+}
+
+/**
+ * Returns the name (with location) of the command on top of the command
+ * stack.
+ */
+const Variant &OsmlStreamParser::getCommandName() const
+{
+    const Command &current = commands.top();
+    return current.name;
+}
+
+/**
+ * Returns the arguments of the command on top of the command stack.
+ */
+const Variant &OsmlStreamParser::getCommandArguments() const
+{
+    const Command &current = commands.top();
+    return current.arguments;
+}
+
+/**
+ * Returns true if the command on top of the command stack is inside its
+ * range field or inside a field that was marked as default field.
+ */
+bool OsmlStreamParser::inDefaultField() const
+{
+    const Command &current = commands.top();
+    if (current.inRangeField) {
+        return true;
+    }
+    return current.inDefaultField;
+}
+}
+
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
new file mode 100644
index 0000000..dc3034c
--- /dev/null
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -0,0 +1,385 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsmlStreamParser.hpp
+ *
+ * Provides a low-level class for reading the TeX-esque osml format. The class
+ * provided here does not build any model objects and does not implement the
+ * Parser interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
+#define _OUSIA_OSML_STREAM_PARSER_HPP_
+
+#include <stack>
+
+#include <core/common/Variant.hpp>
+#include <core/parser/utils/Tokenizer.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class Logger;
+class DataHandler;
+
+/**
+ * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
+ * format. The parser is constructed around a "parse" function, which reads data
+ * from the underlying CharReader until a new state is reached and indicates
+ * this state in a return value. The calling code then has to pull corresponding
+ * data from the stream reader. The reader makes sure the incoming file is
+ * syntactically valid and tries to recover from most errors. If an error is
+ * irrecoverable (this is the case for errors with wrong nesting of commands or
+ * fields, as this would lead to too many consecutive errors) a
+ * LoggableException is thrown.
+ */
+class OsmlStreamParser {
+public:
+ /**
+ * Enum used to indicate which state the OsmlStreamParser class is in
+ * after calling the "parse" function.
+ */
+ enum class State {
+ /**
+ * State returned if a fully featured command has been read. A command
+ * consists of the command name and its arguments (which optionally
+ * includes the name).
+ */
+ COMMAND,
+
+ /**
+ * State returned if data is given. The reader must decide which field
+ * or command this should be routed to. Trailing or leading whitespace
+ * has been removed. Only called if the data is non-empty.
+ */
+ DATA,
+
+ /**
+ * A user-defined entity has been found. The entity sequence is stored
+ * in the command name.
+ *
+ * NOTE(review): no code in OsmlStreamParser.cpp of this change returns
+ * this state.
+ */
+ ENTITY,
+
+ /**
+ * State returned if an annotation was started. An annotation consists
+ * of the command name and its arguments (which optionally include the
+ * name).
+ */
+ ANNOTATION_START,
+
+ /**
+ * State returned if an annotation ends. The reader indicates which
+ * annotation ends.
+ */
+ ANNOTATION_END,
+
+ /**
+ * State returned if a new field started. The reader assures that the
+ * current field ends before a new field is started and that the field
+ * is not started if data has been given outside of a field. The
+ * field number is set to the current field index.
+ */
+ FIELD_START,
+
+ /**
+ * State returned if the current field ends. The reader assures that a
+ * field was actually open.
+ */
+ FIELD_END,
+
+ /**
+ * The end of the stream has been reached.
+ */
+ END,
+
+ /**
+ * Returned from internal functions if nothing should be done.
+ */
+ NONE,
+
+ /**
+ * Returned from internal function to indicate irrecoverable errors.
+ */
+ ERROR
+ };
+
+ /**
+ * Entry used for the command stack.
+ */
+ struct Command {
+ /**
+ * Name and location of the current command.
+ */
+ Variant name;
+
+ /**
+ * Arguments that were passed to the command.
+ */
+ Variant arguments;
+
+ /**
+ * Set to true if this is a command with clear begin and end.
+ */
+ bool hasRange : 1;
+
+ /**
+ * Set to true if we are currently inside a field of this command.
+ */
+ bool inField : 1;
+
+ /**
+ * Set to true if we are currently in the range field of the command
+ * (implies inField being set to true).
+ */
+ bool inRangeField : 1;
+
+ /**
+ * Set to true if we are currently in a field that has been especially
+ * marked as default field (using the "{!" syntax).
+ */
+ bool inDefaultField : 1;
+
+ /**
+ * Default constructor. All flags are set to false (inDefaultField is
+ * value-initialized).
+ */
+ Command()
+ : hasRange(false),
+ inField(false),
+ inRangeField(false),
+ inDefaultField()
+ {
+ }
+
+ /**
+ * Constructor of the Command class.
+ *
+ * @param name is a string variant with name and location of the
+ * command.
+ * @param arguments is a map variant with the arguments given to the
+ * command.
+ * @param hasRange should be set to true if this is a command with
+ * explicit range.
+ * @param inField is set to true if we currently are inside a field
+ * of this command.
+ * @param inRangeField is set to true if we currently are inside the
+ * outer field of a ranged command.
+ * @param inDefaultField is set to true if we currently are in a
+ * specially marked default field.
+ */
+ Command(Variant name, Variant arguments, bool hasRange,
+ bool inField, bool inRangeField, bool inDefaultField)
+ : name(std::move(name)),
+ arguments(std::move(arguments)),
+ hasRange(hasRange),
+ inField(inField),
+ inRangeField(inRangeField),
+ inDefaultField(inDefaultField)
+ {
+ }
+ };
+
+private:
+ /**
+ * Reference to the CharReader instance from which the incoming bytes are
+ * read.
+ */
+ CharReader &reader;
+
+ /**
+ * Reference to the logger instance to which all error messages are sent.
+ */
+ Logger &logger;
+
+ /**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ Tokenizer tokenizer;
+
+ /**
+ * Stack containing the current commands.
+ */
+ std::stack<Command> commands;
+
+ /**
+ * Variant containing the data that has been read (always is a string,
+ * contains the exact location of the data in the source file).
+ */
+ Variant data;
+
+ /**
+ * Contains the location of the last token.
+ */
+ SourceLocation location;
+
+ /**
+ * Contains the field index of the current command.
+ *
+ * NOTE(review): no code in OsmlStreamParser.cpp of this change reads this
+ * value.
+ */
+ size_t fieldIdx;
+
+ /**
+ * Function used internally to parse an identifier.
+ *
+ * @param start is the start byte offset of the identifier (including the
+ * backslash).
+ * @param allowNSSep should be set to true if the namespace separator is
+ * allowed in the identifier name. Issues error if the namespace separator
+ * is placed incorrectly.
+ * @return a string variant with the identifier and its location; the
+ * string is empty if no identifier character was found.
+ */
+ Variant parseIdentifier(size_t start, bool allowNSSep = false);
+
+ /**
+ * Function used internally to handle the special "\begin" command.
+ */
+ State parseBeginCommand();
+
+ /**
+ * Function used internally to handle the special "\end" command.
+ */
+ State parseEndCommand();
+
+ /**
+ * Pushes the parsed command onto the command stack.
+ */
+ void pushCommand(Variant commandName, Variant commandArguments,
+ bool hasRange);
+
+ /**
+ * Parses the command arguments.
+ *
+ * @param commandArgName is the name given with the "#" syntax; if it is a
+ * string it is inserted into the arguments as "name".
+ * @return a variant containing the arguments.
+ */
+ Variant parseCommandArguments(Variant commandArgName);
+
+ /**
+ * Function used internally to parse a command.
+ *
+ * @param start is the start byte offset of the command (including the
+ * backslash)
+ * @param isAnnotation if true, the command is not returned as command, but
+ * as annotation start.
+ * @return the state that should be reported for the command, or
+ * State::NONE / State::ERROR if no command could be parsed.
+ */
+ State parseCommand(size_t start, bool isAnnotation);
+
+ /**
+ * Function used internally to parse a block comment.
+ */
+ void parseBlockComment();
+
+ /**
+ * Function used internally to parse a line comment.
+ */
+ void parseLineComment();
+
+ /**
+ * Checks whether there is any data pending to be issued, if yes, issues it.
+ *
+ * @param handler is the data handler that contains the data that may be
+ * returned to the user.
+ * @return true if there was any data and DATA should be returned by the
+ * parse function, false otherwise.
+ */
+ bool checkIssueData(DataHandler &handler);
+
+ /**
+ * Called before any data is appended to the internal data handler. Checks
+ * whether a new field should be started or implicitly ended.
+ *
+ * @return true if FIELD_START should be returned by the parse function.
+ */
+ bool checkIssueFieldStart();
+
+ /**
+ * Closes a currently open field. Note that the command will be removed from
+ * the internal command stack if the field that is being closed is a
+ * field marked as default field.
+ *
+ * @return true if the field could be closed, false if there was no field
+ * to close.
+ */
+ bool closeField();
+
+public:
+ /**
+ * Constructor of the OsmlStreamParser class. Attaches the new
+ * OsmlStreamParser to the given CharReader and Logger instances.
+ *
+ * @param reader is the reader instance from which incoming characters
+ * should be read.
+ * @param logger is the logger instance to which errors should be written.
+ */
+ OsmlStreamParser(CharReader &reader, Logger &logger);
+
+ /**
+ * Continues parsing. Returns one of the states defined in the State enum.
+ * Callers should stop once the State::END state is reached. Use the getter
+ * functions to get more information about the current state, such as the
+ * command name or the data or the current field index.
+ *
+ * @return the new state the parser has reached.
+ */
+ State parse();
+
+ /**
+ * Returns a reference to the internally stored data. Only valid if
+ * State::DATA was returned by the "parse" function.
+ *
+ * @return a reference to a variant containing the data parsed by the
+ * "parse" function.
+ */
+ const Variant &getData() const { return data; }
+
+ /**
+ * Returns a reference to the internally stored command name. Only valid if
+ * State::COMMAND was returned by the "parse" function.
+ *
+ * @return a reference to a variant containing name and location of the
+ * parsed command.
+ */
+ const Variant &getCommandName() const;
+
+ /**
+ * Returns a reference to the internally stored command arguments. Only
+ * valid if State::COMMAND was returned by the "parse" function.
+ *
+ * @return a reference to a variant containing arguments given to the
+ * command.
+ */
+ const Variant &getCommandArguments() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been especially marked as "default" field (using the "{!"
+ * syntax).
+ */
+ bool inDefaultField() const;
+
+ /**
+ * Returns a reference to the location that was stored last by the parser.
+ *
+ * @return the last internal token location.
+ */
+ const SourceLocation &getLocation() const { return location; }
+};
+}
+
+#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */
+