diff options
Diffstat (limited to 'src/formats/osml')
-rw-r--r-- | src/formats/osml/OsmlParser.cpp | 57 | ||||
-rw-r--r-- | src/formats/osml/OsmlParser.hpp | 48 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 640 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 350 |
4 files changed, 1095 insertions, 0 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/generic/ParserStateCallbacks.hpp> +#include <core/parser/generic/ParserStateStack.hpp> + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. + + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: + /** + * OsdmStreamParser instance. + */ + OsdmStreamParser parser; + + /** + * Instance of the ParserStateStack. 
+ */ + ParserStateStack stack; + +public: + /** + * NOTE(review): the declaration below is not valid C++ as written (it looks + * like an unfinished constructor). It passes a single argument to the + * "parser" member, while the OsdmStreamParser constructor is declared with + * a CharReader and a Logger, and this class declares no parse() member + * although OsdmParser::doParse() calls parser.parse() -- TODO complete + * before this file is added to the build. + */ + OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +/** + * Creates an OsdmParserImplementation for the given reader and context and + * calls its parse() function. + */ +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsdmParserImplementation parser(reader, ctx); + parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. 
+ */ +class OsdmParser : public Parser { +protected: + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp new file mode 100644 index 0000000..6a55f12 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -0,0 +1,640 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> +#include <core/common/VariantReader.hpp> + +#include "OsdmStreamParser.hpp" + +namespace ousia { + +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public Tokenizer { +public: + /** + * Id of the backslash token. + */ + TokenTypeId Backslash; + + /** + * Id of the line comment token. + */ + TokenTypeId LineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId BlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId BlockCommentEnd; + + /** + * Id of the field start token. + */ + TokenTypeId FieldStart; + + /** + * Id of the field end token. + */ + TokenTypeId FieldEnd; + + /** + * Registers the plain format tokens in the internal tokenizer. 
+ */ + PlainFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + } +}; + +/** + * Shared instance of the plain format tokens. Used to initialise the tokenizer + * of the OsdmStreamParser and to compare the type ids of the tokens it reads. + */ +static const PlainFormatTokens Tokens; + +/** + * Class used internally to collect data issued via "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ + std::vector<char> buf; + + /** + * Start location of the character data. + */ + SourceOffset start; + + /** + * End location of the character data. + */ + SourceOffset end; + +public: + /** + * Default constructor, initializes start and end with zeros. + */ + DataHandler() : start(0), end(0) {} + + /** + * Returns true if the internal buffer is empty. Does not modify the + * handler. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ + bool isEmpty() const { return buf.empty(); } + + /** + * Appends a single character to the internal buffer. The start location is + * only updated if the buffer was empty, the end location is always updated. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ + void append(char c, SourceOffset charStart, SourceOffset charEnd) + { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; + } + + /** + * Appends a string to the internal buffer. The start location is only + * updated if the buffer was empty, the end location is always updated. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) + { + if (isEmpty()) { + start = stringStart; + } + buf.insert(buf.end(), s.begin(), s.end()); + end = stringEnd; + } + + /** + * Converts the internal buffer to a variant with attached location + * information. 
+ * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ + Variant toVariant(SourceId sourceId) + { + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); + return res; + } +}; + +OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) + : reader(reader), logger(logger), tokenizer(Tokens) +{ + // Place an intial command representing the complete file on the stack + commands.push(Command{"", Variant::mapType{}, true, true, true}); +} + +Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +{ + bool first = true; + bool hasCharSiceNSSep = false; + std::vector<char> identifier; + size_t end = reader.getPeekOffset(); + char c, c2; + while (reader.peek(c)) { + // Abort if this character is not a valid identifer character + if ((first && Utils::isIdentifierStartCharacter(c)) || + (!first && Utils::isIdentifierCharacter(c))) { + identifier.push_back(c); + } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + Utils::isIdentifierStartCharacter(c2)) { + identifier.push_back(c); + } else { + if (c == ':' && allowNSSep) { + logger.error( + "Expected character before and after namespace separator " + "\":\"", + reader); + } + reader.resetPeek(); + break; + } + + // This is no longer the first character + first = false; + + // Advance the hasCharSiceNSSep flag + hasCharSiceNSSep = allowNSSep && (c != ':'); + + end = reader.getPeekOffset(); + reader.consumePeek(); + } + + // Return the identifier at its location + Variant res = + Variant::fromString(std::string(identifier.data(), identifier.size())); + res.setLocation({reader.getSourceId(), start, end}); + return res; +} + +OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +{ + // Expect a '{' after the command + reader.consumeWhitespace(); + if 
(!reader.expect('{')) { + logger.error("Expected \"{\" after \\begin", reader); + return State::NONE; + } + + // Parse the name of the command that should be opened + Variant commandName = parseIdentifier(reader.getOffset(), true); + if (commandName.asString().empty()) { + logger.error("Expected identifier", commandName); + return State::ERROR; + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + SourceOffset start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Parse the arguments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), true); + + return State::COMMAND; +} + +static bool checkStillInField(const OsdmStreamParser::Command &cmd, + const Variant &endName, Logger &logger) +{ + if (cmd.inField && !cmd.inRangeField) { + logger.error(std::string("\\end in open field of command \"") + + cmd.name.asString() + std::string("\""), + endName); + logger.note(std::string("Open command started here:"), cmd.name); + return true; + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +{ + // Expect a '{' after the command + if (!reader.expect('{')) { + logger.error("Expected \"{\" after \\end", reader); + return State::NONE; + } + + // Fetch the name of the command that should be ended here + Variant name = parseIdentifier(reader.getOffset(), true); + + // Make sure the given command name is not empty + if (name.asString().empty()) { + logger.error("Expected identifier", name); + return State::ERROR; + } + + // Make sure the command name is terminated with a '}' + 
if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Unroll the command stack up to the last range command + while (!commands.top().hasRange) { + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + commands.pop(); + } + + // Make sure we're not in an open field of this command + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + + // Special error message if the top-level command is reached + if (commands.size() == 1) { + logger.error(std::string("Cannot end command \"") + name.asString() + + std::string("\" here, no command open"), + name); + return State::ERROR; + } + + // Inform the about command mismatches + const Command &cmd = commands.top(); + if (commands.top().name.asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + + cmd.name.asString() + + std::string("\", but open command is \"") + + name.asString() + std::string("\""), + name); + logger.note("Last command was opened here:", cmd.name); + return State::ERROR; + } + + // Set the location to the location of the command that was ended, then end + // the current command + location = name.getLocation(); + commands.pop(); + return cmd.inRangeField ? 
State::FIELD_END : State::NONE; +} + +Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +{ + // Parse the arguments using the universal VariantReader + Variant commandArguments; + if (reader.expect('[')) { + auto res = VariantReader::parseObject(reader, logger, ']'); + commandArguments = res.second; + } else { + commandArguments = Variant::mapType{}; + } + + // Insert the parsed name, make sure "name" was not specified in the + // arguments + if (commandArgName.isString()) { + auto res = + commandArguments.asMap().emplace("name", std::move(commandArgName)); + if (!res.second) { + logger.error("Name argument specified multiple times", + SourceLocation{}, MessageMode::NO_CONTEXT); + logger.note("First occurance is here: ", commandArgName); + logger.note("Second occurance is here: ", res.first->second); + } + } + return commandArguments; +} + +void OsdmStreamParser::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) +{ + // Store the location on the stack + location = commandName.getLocation(); + + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!commands.top().inField) { + commands.pop(); + } + commands.push(Command{std::move(commandName), std::move(commandArguments), + hasRange, false, false}); +} + +OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +{ + // Parse the commandName as a first identifier + Variant commandName = parseIdentifier(start, true); + if (commandName.asString().empty()) { + logger.error("Empty command name", reader); + return State::NONE; + } + + // Handle the special "begin" and "end" commands + const auto commandNameComponents = + Utils::split(commandName.asString(), ':'); + const bool isBegin = commandNameComponents[0] == "begin"; + const bool isEnd = commandNameComponents[0] == "end"; + if (isBegin || isEnd) { + if (commandNameComponents.size() > 1) { + logger.error( + "Special commands 
\"\\begin\" and \"\\end\" may not contain a " + "namespace separator \":\"", + commandName); + } + if (isBegin) { + return parseBeginCommand(); + } else if (isEnd) { + return parseEndCommand(); + } + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + // Parse the arugments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); + + return State::COMMAND; +} + +void OsdmStreamParser::parseBlockComment() +{ + Token token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == Tokens.BlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == Tokens.BlockCommentStart) { + depth++; + } + } + + // Issue an error if the file ends while we are in a block comment + logger.error("File ended while being in a block comment", reader); +} + +void OsdmStreamParser::parseLineComment() +{ + char c; + while (reader.read(c)) { + if (c == '\n') { + return; + } + } +} + +bool OsdmStreamParser::checkIssueData(DataHandler &handler) +{ + if (!handler.isEmpty()) { + data = handler.toVariant(reader.getSourceId()); + location = data.getLocation(); + reader.resetPeek(); + return true; + } + return false; +} + +bool OsdmStreamParser::checkIssueFieldStart() +{ + // Fetch the current command, and check whether we're currently inside a + // field of this command + Command &cmd = commands.top(); + if (!cmd.inField) { + // If this is a range command, we're now implicitly inside the field of + // this command -- we'll have to issue a field start command! 
+ if (cmd.hasRange) { + cmd.inField = true; + cmd.inRangeField = true; + reader.resetPeek(); + return true; + } + + // This was not a range command, so obviously we're now inside + // a field of some command -- so unroll the commands stack until a + // command with open field is reached + while (!commands.top().inField) { + commands.pop(); + } + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parse() +{ + // Handler for incoming data + DataHandler handler; + + // Read tokens until the outer loop should be left + Token token; + while (tokenizer.peek(reader, token)) { + const TokenTypeId type = token.type; + + // Special handling for Backslash and Text + if (type == Tokens.Backslash) { + // Before appending anything to the output data or starting a new + // command, check whether FIELD_START has to be issued, as the + // current command is a command with range + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Check whether a command starts now, without advancing the peek + // cursor + char c; + if (!reader.fetchPeek(c)) { + logger.error("Trailing backslash at the end of the file.", + token); + return State::END; + } + + // Try to parse a command + if (Utils::isIdentifierStartCharacter(c)) { + // Make sure to issue any data before it is too late + if (checkIssueData(handler)) { + return State::DATA; + } + + // Parse the actual command + State res = parseCommand(token.location.getStart()); + switch (res) { + case State::ERROR: + throw LoggableException( + "Last error was irrecoverable, ending parsing " + "process"); + case State::NONE: + continue; + default: + return res; + } + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + reader.peek(c); // Peek the previously fetched character + handler.append(c, token.location.getStart(), + reader.getPeekOffset()); 
+ reader.consumePeek(); + continue; + } else if (type == TextToken) { + // Check whether FIELD_START has to be issued before appending text + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Append the text to the data handler + handler.append(token.content, token.location.getStart(), + token.location.getEnd()); + + reader.consumePeek(); + continue; + } + + // A non-text token was reached, make sure all pending data commands + // have been issued + if (checkIssueData(handler)) { + return State::DATA; + } + + // We will handle the token now, consume the peeked characters + reader.consumePeek(); + + // Update the location to the current token location + location = token.location; + + if (token.type == Tokens.LineComment) { + parseLineComment(); + } else if (token.type == Tokens.BlockCommentStart) { + parseBlockComment(); + } else if (token.type == Tokens.FieldStart) { + Command &cmd = commands.top(); + if (!cmd.inField) { + cmd.inField = true; + return State::FIELD_START; + } + logger.error( + "Got field start token \"{\", but no command for which to " + "start the field. Did you mean \"\\{\"?", + token); + } else if (token.type == Tokens.FieldEnd) { + // Try to end an open field of the current command -- if the current + // command is not inside an open field, end this command and try to + // close the next one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + return State::FIELD_END; + } + commands.pop(); + } else { + break; + } + } + logger.error( + "Got field end token \"}\", but there is no field to end. 
Did " + "you mean \"\\}\"?", + token); + } else { + logger.error("Unexpected token \"" + token.content + "\"", token); + } + } + + // Issue available data + if (checkIssueData(handler)) { + return State::DATA; + } + + // Make sure all open commands and fields have been ended at the end of the + // stream + while (commands.size() > 1) { + Command &cmd = commands.top(); + if (cmd.inField || cmd.hasRange) { + logger.error("Reached end of stream, but command \"" + + cmd.name.asString() + "\" has not been ended", + cmd.name); + } + commands.pop(); + } + + location = SourceLocation{reader.getSourceId(), reader.getOffset()}; + return State::END; +} + +const Variant &OsdmStreamParser::getCommandName() +{ + return commands.top().name; +} + +const Variant &OsdmStreamParser::getCommandArguments() +{ + return commands.top().arguments; +} +} + diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp new file mode 100644 index 0000000..84674c0 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -0,0 +1,350 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmStreamParser.hpp + * + * Provides low-level classes for reading the TeX-esque osdm + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ +#define _OUSIA_OSDM_STREAM_PARSER_HPP_ + +#include <stack> + +#include <core/common/Variant.hpp> +#include <core/parser/utils/Tokenizer.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incoming file is + * syntactically valid and tries to recover from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsdmStreamParser { +public: + /** + * Enum used to indicate which state the OsdmStreamParser class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. 
The reader indicates which + * annotation ends. + */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END, + + /** + * Returned from internal functions if nothing should be done. + */ + NONE, + + /** + * Returned from internal function to indicate irrecoverable errors. + */ + ERROR + }; + + /** + * Entry used for the command stack. + */ + struct Command { + /** + * Name and location of the current command. + */ + Variant name; + + /** + * Arguments that were passed to the command. + */ + Variant arguments; + + /** + * Set to true if this is a command with clear begin and end. + */ + bool hasRange; + + /** + * Set to true if we are currently inside a field of this command. + */ + bool inField; + + /** + * Set to true if we are currently in the range field of the command + * (implies inField being set to true). + */ + bool inRangeField; + + /** + * Default constructor. + */ + Command() : hasRange(false), inField(false), inRangeField(false) {} + + /** + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. + * @param inField is set to true if we currently are inside a field + * of this command. + * @param inRangeField is set to true if we are currently inside the outer + * field of the command. 
+ */ + Command(Variant name, Variant arguments, bool hasRange, bool inField, + bool inRangeField) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange), + inField(inField), + inRangeField(inRangeField) + { + } + }; + +private: + /** + * Reference to the CharReader instance from which the incoming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + Tokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack<Command> commands; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). + */ + Variant data; + + /** + * Contains the location of the last token. + */ + SourceLocation location; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internally to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + * @return a string variant containing the identifier (may be empty) with + * its location attached. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + */ + State parseEndCommand(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Parses the command arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. 
+ * + * @param start is the start byte offset of the command (including the + * backslash) + * @return State::COMMAND if a regular command was parsed, State::NONE if + * the command name was empty, or the state returned by the handlers of the + * special "\begin" and "\end" commands. + */ + State parseCommand(size_t start); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a line comment. Reads characters up to + * and including the next line break. + */ + void parseLineComment(); + + /** + * Checks whether there is any data pending to be issued, if yes, issues it. + * + * @param handler is the data handler that contains the data that may be + * returned to the user. + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(DataHandler &handler); + + /** + * Called before any data is appended to the internal data handler. Checks + * whether a new field should be started or implicitly ended. + * + * @return true if FIELD_START should be returned by the parse function. + */ + bool checkIssueFieldStart(); + +public: + /** + * Constructor of the OsdmStreamParser class. Attaches the new + * OsdmStreamParser to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incoming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsdmStreamParser(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. Throws a + * LoggableException if an irrecoverable error is encountered while parsing + * a command. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. 
+ */ + const Variant &getData() { return data; } + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing name and location of the + * parsed command. + */ + const Variant &getCommandName(); + + /** + * Returns a reference at the internally stored command arguments. Only + * valid if State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing arguments given to the + * command. + */ + const Variant &getCommandArguments(); + + /** + * Returns a reference at the location of the last token. + * + * @return the last internal token location. + */ + SourceLocation &getLocation() { return location; } +}; +} + +#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ + |