diff options
Diffstat (limited to 'src/formats/osdm/OsdmStreamParser.hpp')
-rw-r--r-- | src/formats/osdm/OsdmStreamParser.hpp | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osdm/OsdmStreamParser.hpp new file mode 100644 index 0000000..48d8fb7 --- /dev/null +++ b/src/formats/osdm/OsdmStreamParser.hpp @@ -0,0 +1,351 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmStreamParser.hpp + * + * Provides classes for low-level classes for reading the TeX-esque osdm + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ +#define _OUSIA_OSDM_STREAM_PARSER_HPP_ + +#include <stack> + +#include <core/common/Variant.hpp> + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incommind file is + * syntactically valid and tries to recorver from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsdmStreamParser { +public: + /** + * Enum used to indicate which state the OsdmStreamParser class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. The reader indicates which + * annotation ends. + */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END, + + /** + * Returned from internal functions if nothing should be done. + */ + NONE, + + /** + * Returned from internal function to indicate irrecoverable errors. + */ + ERROR + }; + + /** + * Entry used for the command stack. + */ + struct Command { + /** + * Name and location of the current command. + */ + Variant name; + + /** + * Arguments that were passed to the command. + */ + Variant arguments; + + /** + * Set to true if this is a command with clear begin and end. + */ + bool hasRange; + + /** + * Set to true if we are currently inside a field of this command. + */ + bool inField; + + /** + * Set to true if we are currently in the range field of the command + * (implies inField being set to true). + */ + bool inRangeField; + + /** + * Default constructor. + */ + Command() : hasRange(false), inField(false), inRangeField(false) {} + + /** + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. + * @param inField is set to true if we currently are inside a field + * of this command. + * @param inRangeField is set to true if we currently inside the outer + * field of the command. + */ + Command(Variant name, Variant arguments, bool hasRange, bool inField, + bool inRangeField) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange), + inField(inField), + inRangeField(inRangeField) + { + } + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + DynamicTokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack<Command> commands; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). + */ + Variant data; + + /** + * Contains the location of the last token. + */ + SourceLocation location; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internall to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + */ + State parseEndCommand(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Parses the command arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. + * + * @param start is the start byte offset of the command (including the + * backslash) + * @return true if a command was actuall parsed, false otherwise. + */ + State parseCommand(size_t start); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseLineComment(); + + /** + * Checks whether there is any data pending to be issued, if yes, issues it. + * + * @param handler is the data handler that contains the data that may be + * returned to the user. + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(DataHandler &handler); + + /** + * Called before any data is appended to the internal data handler. Checks + * whether a new field should be started or implicitly ended. + * + * @return true if FIELD_START should be returned by the parse function. + */ + bool checkIssueFieldStart(); + +public: + /** + * Constructor of the OsdmStreamParser class. Attaches the new + * OsdmStreamParser to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsdmStreamParser(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. + */ + const Variant &getData() { return data; } + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing name and location of the + * parsed command. + */ + const Variant &getCommandName(); + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing arguments given to the + * command. + */ + const Variant &getCommandArguments(); + + /** + * Returns a reference at the char reader. + * + * @return the last internal token location. + */ + SourceLocation &getLocation() { return location; } +}; +} + +#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ + |