summaryrefslogtreecommitdiff
path: root/src/formats/osml/OsmlStreamParser.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats/osml/OsmlStreamParser.hpp')
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp331
1 files changed, 81 insertions, 250 deletions
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..b7e64f7 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,68 +29,53 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <cstdint>
+#include <memory>
-#include <core/common/Variant.hpp>
-#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/stack/Callbacks.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
+class TokenizedData;
+class Variant;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
* format. The parser is constructed around a "parse" function, which reads data
* from the underlying CharReader until a new state is reached and indicates
* this state in a return value. The calling code then has to pull corresponding
- * data from the stream reader. The reader makes sure the incommind file is
+ * data from the stream reader. The reader makes sure the incoming stream is
* syntactically valid and tries to recover from most errors. If an error is
* irrecoverable (this is the case for errors with wrong nesting of commands or
* fields, as this would lead to too many consecutive errors) a
- * LoggableException is thrown.
+ * LoggableException is thrown. In short, the OsmlStreamParser can be described
+ * as a SAX parser for OSML.
*/
-class OsmlStreamParser {
+class OsmlStreamParser: public parser_stack::ParserCallbacks {
public:
/**
* Enum used to indicate which state the OsmlStreamParser class is in
* after calling the "parse" function.
*/
- enum class State {
+ enum class State : uint8_t {
/**
- * State returned if a fully featured command has been read. A command
- * consists of the command name and its arguments (which optionally
- * includes the name).
+ * State returned if the start of a command has been read. Use the
+ * getCommandName(), getCommandArguments() and inRangeCommand()
+ * functions to retrieve more information about the command that was
+ * just started.
*/
- COMMAND,
+ COMMAND_START = 0,
/**
- * State returned if data is given. The reader must decide which field
- * or command this should be routed to. Trailing or leading whitespace
- * has been removed. Only called if the data is non-empty.
- */
- DATA,
-
- /**
- * A user-defined entity has been found. The entity sequence is stored
- * in the command name.
- */
- ENTITY,
-
- /**
- * State returned if an annotation was started. An annotation consists
- * of the command name and its arguments (which optionally include the
- * name).
- */
- ANNOTATION_START,
-
- /**
- * State returned if an annotation ends. The reader indicates which
- * annotation ends.
+ * State returned if a range command or range annotation has just ended.
+ * This state is not returned for non-range commands (as the actual end
+ * of a command is context dependent).
*/
- ANNOTATION_END,
+ RANGE_END = 1,
/**
* State returned if a new field started. The reader assures that the
@@ -98,223 +83,46 @@ public:
* is not started if data has been given outside of a field. The
* field number is set to the current field index.
*/
- FIELD_START,
+ FIELD_START = 2,
/**
* State returned if the current field ends. The reader assures that a
* field was actually open.
*/
- FIELD_END,
+ FIELD_END = 3,
/**
- * The end of the stream has been reached.
+ * State returned if an annotation was started. An annotation consists
+ * of the command name and its arguments (which optionally include the
+ * name).
*/
- END,
+ ANNOTATION_START = 4,
/**
- * Returned from internal functions if nothing should be done.
+ * State returned if an annotation ends. The reader indicates which
+ * annotation ends.
*/
- NONE,
+ ANNOTATION_END = 5,
/**
- * Returned from internal function to indicate irrecoverable errors.
+ * State returned if data is given. The reader must decide which field
+ * or command this should be routed to. Trailing or leading whitespace
+ * has been removed. Only called if the data is non-empty.
*/
- ERROR
- };
-
- /**
- * Entry used for the command stack.
- */
- struct Command {
- /**
- * Name and location of the current command.
- */
- Variant name;
-
- /**
- * Arguments that were passed to the command.
- */
- Variant arguments;
+ DATA = 6,
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
- */
- bool inRangeField : 1;
-
- /**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
- */
- bool inDefaultField : 1;
-
- /**
- * Default constructor.
- */
- Command()
- : hasRange(false),
- inField(false),
- inRangeField(false),
- inDefaultField()
- {
- }
-
- /**
- * Constructor of the Command class.
- *
- * @param name is a string variant with name and location of the
- * command.
- * @param arguments is a map variant with the arguments given to the
- * command.
- * @param hasRange should be set to true if this is a command with
- * explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
- * @param inDefaultField is set to true if we currently are in a
- * specially marked default field.
- */
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
- : name(std::move(name)),
- arguments(std::move(arguments)),
- hasRange(hasRange),
- inField(inField),
- inRangeField(inRangeField),
- inDefaultField(inDefaultField)
- {
- }
+ * The end of the stream has been reached.
+ */
+ END = 7
};
private:
/**
- * Reference to the CharReader instance from which the incomming bytes are
- * read.
- */
- CharReader &reader;
-
- /**
- * Reference at the logger instance to which all error messages are sent.
+ * Pointer at the class containing the internal implementation (according
+ * to the PIMPL idiom).
*/
- Logger &logger;
-
- /**
- * Tokenizer instance used to read individual tokens from the text.
- */
- Tokenizer tokenizer;
-
- /**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
- */
- Variant data;
-
- /**
- * Contains the location of the last token.
- */
- SourceLocation location;
-
- /**
- * Contains the field index of the current command.
- */
- size_t fieldIdx;
-
- /**
- * Function used internall to parse an identifier.
- *
- * @param start is the start byte offset of the identifier (including the
- * backslash).
- * @param allowNSSep should be set to true if the namespace separator is
- * allowed in the identifier name. Issues error if the namespace separator
- * is placed incorrectly.
- */
- Variant parseIdentifier(size_t start, bool allowNSSep = false);
-
- /**
- * Function used internally to handle the special "\begin" command.
- */
- State parseBeginCommand();
-
- /**
- * Function used internally to handle the special "\end" command.
- */
- State parseEndCommand();
-
- /**
- * Pushes the parsed command onto the command stack.
- */
- void pushCommand(Variant commandName, Variant commandArguments,
- bool hasRange);
-
- /**
- * Parses the command arguments.
- */
- Variant parseCommandArguments(Variant commandArgName);
-
- /**
- * Function used internally to parse a command.
- *
- * @param start is the start byte offset of the command (including the
- * backslash)
- * @param isAnnotation if true, the command is not returned as command, but
- * as annotation start.
- * @return true if a command was actuall parsed, false otherwise.
- */
- State parseCommand(size_t start, bool isAnnotation);
-
- /**
- * Function used internally to parse a block comment.
- */
- void parseBlockComment();
-
- /**
- * Function used internally to parse a generic comment.
- */
- void parseLineComment();
-
- /**
- * Checks whether there is any data pending to be issued, if yes, issues it.
- *
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
- * @return true if there was any data and DATA should be returned by the
- * parse function, false otherwise.
- */
- bool checkIssueData(DataHandler &handler);
-
- /**
- * Called before any data is appended to the internal data handler. Checks
- * whether a new field should be started or implicitly ended.
- *
- * @return true if FIELD_START should be returned by the parse function.
- */
- bool checkIssueFieldStart();
-
- /**
- * Closes a currently open field. Note that the command will be removed from
- * the internal command stack if the field that is being closed is a
- * field marked as default field.
- *
- * @return true if the field could be closed, false if there was no field
- * to close.
- */
- bool closeField();
+ std::unique_ptr<OsmlStreamParserImpl> impl;
public:
/**
@@ -328,6 +136,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed to destroy the incomplete
+ * OsmlStreamParserImpl.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -338,17 +152,9 @@ public:
State parse();
/**
- * Returns a reference at the internally stored data. Only valid if
- * State::DATA was returned by the "parse" function.
- *
- * @return a reference at a variant containing the data parsed by the
- * "parse" function.
- */
- const Variant &getData() const { return data; }
-
- /**
* Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing name and location of the
* parsed command.
@@ -357,7 +163,8 @@ public:
/**
* Returns a reference at the internally stored command arguments. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing arguments given to the
* command.
@@ -365,19 +172,43 @@ public:
const Variant &getCommandArguments() const;
/**
- * Returns true if the current field is the "default" field. This is true if
- * the parser either is in the outer range of a range command or inside a
- * field that has been especially marked as "default" field (using the "|"
- * syntax).
+ * Returns a reference at the internally stored data. Only valid if
+ * State::DATA was returned by the "parse" function.
+ *
+ * @return a reference at a TokenizedData instance containing the data parsed
+ * by the "parse" function.
*/
- bool inDefaultField() const;
+ const TokenizedData &getData() const;
+
+ /**
+ * Returns the location of the current token.
+ */
+ const SourceLocation &getLocation() const;
/**
- * Returns a reference at the char reader.
+ * Returns true if the currently started command is a range command, only
+ * valid if State::COMMAND_START or State::ANNOTATION_START was returned by
+ * the "parse" function.
*
- * @return the last internal token location.
+ * @return true if the command that was started is a range command, false
+ * otherwise.
*/
- const SourceLocation &getLocation() const { return location; }
+ bool inRangeCommand() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been especially marked as "default" field (using the "{!"
+ * syntax). Only valid if State::FIELD_START was returned by the "parse"
+ * function.
+ *
+ * @return true if the current field was marked as default field (using the
+ * "{!" syntax).
+ */
+ bool inDefaultField() const;
+
+ TokenId registerToken(const std::string &token) override;
+ void unregisterToken(TokenId token) override;
};
}