diff options
Diffstat (limited to 'src/formats/osml/OsmlStreamParser.hpp')
-rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 331 |
1 files changed, 81 insertions, 250 deletions
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..b7e64f7 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,68 +29,53 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <cstdint> +#include <memory> -#include <core/common/Variant.hpp> -#include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/stack/Callbacks.hpp> namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; +class TokenizedData; +class Variant; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is + * data from the stream reader. The reader makes sure the incoming stream is * syntactically valid and tries to recorver from most errors. If an error is * irrecoverable (this is the case for errors with wrong nesting of commands or * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. + * LoggableException is thrown. In short, the OsmlStreamParser can be described + * as a SAX parser for OSML. */ -class OsmlStreamParser { +class OsmlStreamParser: public parser_stack::ParserCallbacks { public: /** * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ - enum class State { + enum class State : uint8_t { /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). + * State returned if the start of a command has been read.
Use the + * getCommandName(), getCommandArguments() and inRangeCommand() + * functions to retrieve more information about the command that was + * just started. */ - COMMAND, + COMMAND_START = 0, /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - - /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. - */ - ENTITY, - - /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). - */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. + * State returned if a range command or range annotation has just ended. + * This state is not returned for non-range commands (as the actual end + * of a command is context dependent). */ - ANNOTATION_END, + RANGE_END = 1, /** * State returned if a new field started. The reader assures that the @@ -98,223 +83,46 @@ public: * is not started if data has been given outside of a field. The * field number is set to the current field index. */ - FIELD_START, + FIELD_START = 2, /** * State returned if the current field ends. The reader assures that a * field was actually open. */ - FIELD_END, + FIELD_END = 3, /** - * The end of the stream has been reached. + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). */ - END, + ANNOTATION_START = 4, /** - * Returned from internal functions if nothing should be done. + * State returned if an annotation ends. The reader indicates which + * annotation ends. */ - NONE, + ANNOTATION_END = 5, /** - * Returned from internal function to indicate irrecoverable errors. + * State returned if data is given.
The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; + DATA = 6, /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). - */ - bool inRangeField : 1; - - /** - * Set to true if we are currently in a field that has been especially - * marked as default field (using the "|") syntax. - */ - bool inDefaultField : 1; - - /** - * Default constructor. - */ - Command() - : hasRange(false), - inField(false), - inRangeField(false), - inDefaultField() - { - } - - /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently are inside the - * outer field of a ranged command. - * @param inDefaultField is set to true if we currently are in a - * specially marked default field. 
- */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField, bool inDefaultField) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField), - inDefaultField(inDefaultField) - { - } + * The end of the stream has been reached. + */ + END = 7 }; private: /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. + * Pointer at the class containing the internal implementation (according + * to the PIMPL idiom). */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - Tokenizer tokenizer; - - /** - * Stack containing the current commands. - */ - std::stack<Command> commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). - */ - Variant data; - - /** - * Contains the location of the last token. - */ - SourceLocation location; - - /** - * Contains the field index of the current command. - */ - size_t fieldIdx; - - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. 
- */ - void pushCommand(Variant commandName, Variant commandArguments, - bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. - * - * @param start is the start byte offset of the command (including the - * backslash) - * @param isAnnotation if true, the command is not returned as command, but - * as annotation start. - * @return true if a command was actuall parsed, false otherwise. - */ - State parseCommand(size_t start, bool isAnnotation); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @param handler is the data handler that contains the data that may be - * returned to the user. - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(DataHandler &handler); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - - /** - * Closes a currently open field. Note that the command will be removed from - * the internal command stack if the field that is being closed is a - * field marked as default field. - * - * @return true if the field could be closed, false if there was no field - * to close. - */ - bool closeField(); + std::unique_ptr<OsmlStreamParserImpl> impl; public: /** @@ -328,6 +136,12 @@ public: OsmlStreamParser(CharReader &reader, Logger &logger); /** + * Destructor of the OsmlStreamParser, needed to destroy the incomplete + * OsmlStreamParserImpl. + */ + ~OsmlStreamParser(); + + /** * Continues parsing. 
Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter * functions to get more information about the current state, such as the @@ -338,17 +152,9 @@ public: State parse(); /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. - * - * @return a reference at a variant containing the data parsed by the - * "parse" function. - */ - const Variant &getData() const { return data; } - - /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing name and location of the * parsed command. @@ -357,7 +163,8 @@ public: /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing arguments given to the * command. @@ -365,19 +172,43 @@ public: const Variant &getCommandArguments() const; /** - * Returns true if the current field is the "default" field. This is true if - * the parser either is in the outer range of a range command or inside a - * field that has been especially marked as "default" field (using the "|" - * syntax). + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. */ - bool inDefaultField() const; + const TokenizedData &getData() const; + + /** + * Returns the location of the current token. 
+ */ + const SourceLocation &getLocation() const; /** - * Returns a reference at the char reader. + * Returns true if the currently started command is a range command, only + * valid if State::COMMAND_START or State::ANNOTATION_START was returned by + * the "parse" function. * - * @return the last internal token location. + * @return true if the command that was started is a range command, false + * otherwise. */ - const SourceLocation &getLocation() const { return location; } + bool inRangeCommand() const; + + /** + * Returns true if the current field is the "default" field. This is true if + * the parser either is in the outer range of a range command or inside a + * field that has been especially marked as "default" field (using the "{!" + * syntax). Only valid if State::FIELD_START was returned by the "parse" + * function. + * + * @return true if the current field was marked as default field (using the + * "{!" syntax). + */ + bool inDefaultField() const; + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId token) override; }; } |