/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /** * @file OsmlStreamParser.hpp * * Provides low-level classes for reading the TeX-esque osml * format. The class provided here does not build any model objects and does not * implement the Parser interface. * * NOTE(review): in this copy of the file the targets of the three include * directives and the template argument of the "commands" stack member are * missing; they presumably got lost during extraction and must be restored * from the upstream file before this header can compile. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ #include #include #include namespace ousia { // Forward declarations class CharReader; class Logger; class DataHandler; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding * data from the stream reader. The reader makes sure the incoming file is * syntactically valid and tries to recover from most errors. If an error is * irrecoverable (this is the case for errors with wrong nesting of commands or * fields, as this would lead to too many consecutive errors) a * LoggableException is thrown. */ class OsmlStreamParser { public: /** * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. 
*/ enum class State { /** * State returned if a fully featured command has been read. A command * consists of the command name and its arguments (which optionally * include the name). */ COMMAND, /** * State returned if data is given. The reader must decide which field * or command this should be routed to. Trailing or leading whitespace * has been removed. Only returned if the data is non-empty. */ DATA, /** * A user-defined entity has been found. The entity sequence is stored * in the command name. */ ENTITY, /** * State returned if an annotation was started. An annotation consists * of the command name and its arguments (which optionally include the * name). */ ANNOTATION_START, /** * State returned if an annotation ends. The reader indicates which * annotation ends. */ ANNOTATION_END, /** * State returned if a new field started. The reader assures that the * current field ends before a new field is started and that the field * is not started if data has been given outside of a field. The * field number is set to the current field index. */ FIELD_START, /** * State returned if the current field ends. The reader assures that a * field was actually open. */ FIELD_END, /** * The end of the stream has been reached. */ END, /** * Returned from internal functions if nothing should be done. */ NONE, /** * Returned from internal functions to indicate irrecoverable errors. */ ERROR }; /** * Entry used for the command stack. */ struct Command { /** * Name and location of the current command. */ Variant name; /** * Arguments that were passed to the command. */ Variant arguments; /** * Set to true if this is a command with clear begin and end. */ bool hasRange; /** * Set to true if we are currently inside a field of this command. */ bool inField; /** * Set to true if we are currently in the range field of the command * (implies inField being set to true). 
*/ bool inRangeField; /** * Set to true if we are currently in a field that has been especially * marked as default field (using the "|" syntax). */ bool inDefaultField; /** * Default constructor. Sets all four flags to false (inDefaultField is * value-initialized, which yields false for a bool); name and arguments * are default-constructed. */ Command() : hasRange(false), inField(false), inRangeField(false), inDefaultField() { } /** * Constructor of the Command class. * * @param name is a string variant with name and location of the * command. * @param arguments is a map variant with the arguments given to the * command. * @param hasRange should be set to true if this is a command with * explicit range. * @param inField is set to true if we currently are inside a field * of this command. * @param inRangeField is set to true if we currently are inside the * outer field of a ranged command. * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ Command(Variant name, Variant arguments, bool hasRange, bool inField, bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), inField(inField), inRangeField(inRangeField), inDefaultField(inDefaultField) { } }; private: /** * Reference to the CharReader instance from which the incoming bytes are * read. */ CharReader &reader; /** * Reference to the logger instance to which all error messages are sent. */ Logger &logger; /** * Tokenizer instance used to read individual tokens from the text. */ Tokenizer tokenizer; /** * Stack containing the current commands. */ std::stack commands; /** * Variant containing the data that has been read (always is a string, * contains the exact location of the data in the source file). */ Variant data; /** * Contains the location of the last token. */ SourceLocation location; /** * Contains the field index of the current command. */ size_t fieldIdx; /** * Function used internally to parse an identifier. 
* * @param start is the start byte offset of the identifier (including the * backslash). 
* @param allowNSSep should be set to true if the namespace separator is * allowed in the identifier name. Issues an error if the namespace separator * is placed incorrectly. */ Variant parseIdentifier(size_t start, bool allowNSSep = false); /** * Function used internally to handle the special "\begin" command. */ State parseBeginCommand(); /** * Function used internally to handle the special "\end" command. */ State parseEndCommand(); /** * Pushes the parsed command onto the command stack. */ void pushCommand(Variant commandName, Variant commandArguments, bool hasRange); /** * Parses the command arguments. */ Variant parseCommandArguments(Variant commandArgName); /** * Function used internally to parse a command. * * @param start is the start byte offset of the command (including the * backslash) * @param isAnnotation if true, the command is not returned as command, but * as annotation start. * @return the State resulting from parsing the command. NOTE(review): which * State values can occur is not visible from this header; confirm against * the implementation. */ State parseCommand(size_t start, bool isAnnotation); /** * Function used internally to parse a block comment. */ void parseBlockComment(); /** * Function used internally to parse a line comment. */ void parseLineComment(); /** * Checks whether there is any data pending to be issued, if yes, issues it. * * @param handler is the data handler that contains the data that may be * returned to the user. * @return true if there was any data and DATA should be returned by the * parse function, false otherwise. */ bool checkIssueData(DataHandler &handler); /** * Called before any data is appended to the internal data handler. Checks * whether a new field should be started or implicitly ended. * * @return true if FIELD_START should be returned by the parse function. */ bool checkIssueFieldStart(); /** * Closes a currently open field. Note that the command will be removed from * the internal command stack if the field that is being closed is a * field marked as default field. 
* * @return true if the field could be closed, false if there was no field * to close. */ bool closeField(); public: /** * Constructor of the OsmlStreamParser class. Attaches the new * OsmlStreamParser to the given CharReader and Logger instances. * * @param reader is the reader instance from which incoming characters * should be read. * @param logger is the logger instance to which errors should be written. */ OsmlStreamParser(CharReader &reader, Logger &logger); /** * Continues parsing. Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter * functions to get more information about the current state, such as the * command name or the data or the current field index. * * @return the new state the parser has reached. */ State parse(); /** * Returns a reference to the internally stored data. Only valid if * State::DATA was returned by the "parse" function. * * @return a reference to a variant containing the data parsed by the * "parse" function. */ const Variant &getData() const { return data; } /** * Returns a reference to the internally stored command name. Only valid if * State::COMMAND was returned by the "parse" function. * * @return a reference to a variant containing name and location of the * parsed command. */ const Variant &getCommandName() const; /** * Returns a reference to the internally stored command arguments. Only * valid if State::COMMAND was returned by the "parse" function. * * @return a reference to a variant containing arguments given to the * command. */ const Variant &getCommandArguments() const; /** * Returns true if the current field is the "default" field. This is true if * the parser either is in the outer range of a range command or inside a * field that has been especially marked as "default" field (using the "|" * syntax). */ bool inDefaultField() const; /** * Returns a reference to the stored location of the last token. * * @return the last internal token location. 
*/ const SourceLocation &getLocation() const { return location; } }; } #endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */