summaryrefslogtreecommitdiff
path: root/src/formats
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats')
-rw-r--r--src/formats/osml/OsmlParser.cpp30
-rw-r--r--src/formats/osml/OsmlStreamParser.cpp800
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp331
-rw-r--r--src/formats/osxml/OsxmlEventParser.cpp138
-rw-r--r--src/formats/osxml/OsxmlEventParser.hpp48
-rw-r--r--src/formats/osxml/OsxmlParser.cpp30
6 files changed, 671 insertions, 706 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
index 16e7aa4..d169393 100644
--- a/src/formats/osml/OsmlParser.cpp
+++ b/src/formats/osml/OsmlParser.cpp
@@ -73,7 +73,7 @@ public:
: logger(ctx.getLogger()),
ctx(ctx),
parser(reader, logger),
- stack(ctx, GenericParserStates)
+ stack(parser, ctx, GenericParserStates)
{
}
@@ -88,7 +88,7 @@ public:
OsmlStreamParser::State state = parser.parse();
logger.setDefaultLocation(parser.getLocation());
switch (state) {
- case OsmlStreamParser::State::COMMAND: {
+ case OsmlStreamParser::State::COMMAND_START: {
// Implicitly create a "document" element if the first
// command is not any other top-level command
if (needsDocument) {
@@ -96,23 +96,23 @@ public:
parser.getCommandName().asString();
if (cmd != "typesystem" && cmd != "document" &&
cmd != "ontology") {
- stack.command("document", Variant::mapType{});
+ stack.commandStart("document", Variant::mapType{},
+ false);
}
needsDocument = false;
}
- stack.command(parser.getCommandName(),
- parser.getCommandArguments().asMap());
+ stack.commandStart(parser.getCommandName(),
+ parser.getCommandArguments().asMap(),
+ parser.inRangeCommand());
break;
}
- case OsmlStreamParser::State::DATA:
- stack.data(parser.getData());
- break;
- case OsmlStreamParser::State::ENTITY:
- // TODO
+ case OsmlStreamParser::State::RANGE_END:
+ stack.rangeEnd();
break;
case OsmlStreamParser::State::ANNOTATION_START:
stack.annotationStart(parser.getCommandName(),
- parser.getCommandArguments().asMap());
+ parser.getCommandArguments().asMap(),
+ parser.inRangeCommand());
break;
case OsmlStreamParser::State::ANNOTATION_END: {
Variant elementName = Variant::fromString(std::string{});
@@ -130,11 +130,9 @@ public:
case OsmlStreamParser::State::FIELD_END:
stack.fieldEnd();
break;
- case OsmlStreamParser::State::NONE:
- case OsmlStreamParser::State::ERROR:
- // Internally used in OsmlStreamParser, these states should
- // never occur. Just contiunue.
- continue;
+ case OsmlStreamParser::State::DATA:
+ stack.data(parser.getData());
+ break;
case OsmlStreamParser::State::END:
return;
}
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..64a489d 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -16,179 +16,421 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <cassert>
+#include <stack>
+#include <vector>
+
#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
+#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
+#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
+
#include "OsmlStreamParser.hpp"
namespace ousia {
+namespace {
/**
- * Plain format default tokenizer.
+ * Osml format default tokenizer. Registers the primary tokens in its
+ * constructor. A single, static instance of this class is created as
+ * "OsmlTokens", which is copied to the Tokenizer instance of
+ * OsmlStreamParserImpl.
*/
-class PlainFormatTokens : public Tokenizer {
+class OsmlFormatTokens : public Tokenizer {
public:
+ TokenId Backslash;
+ TokenId LineComment;
+ TokenId BlockCommentStart;
+ TokenId BlockCommentEnd;
+ TokenId FieldStart;
+ TokenId FieldEnd;
+ TokenId DefaultFieldStart;
+ TokenId AnnotationStart;
+ TokenId AnnotationEnd;
+
/**
- * Id of the backslash token.
+ * Registers the Osml format tokens in the internal tokenizer.
*/
- TokenId Backslash;
+ OsmlFormatTokens()
+ {
+ Backslash = registerToken("\\");
+ LineComment = registerToken("%");
+ BlockCommentStart = registerToken("%{");
+ BlockCommentEnd = registerToken("}%");
+ FieldStart = registerToken("{");
+ FieldEnd = registerToken("}");
+ DefaultFieldStart = registerToken("{!");
+ AnnotationStart = registerToken("<\\");
+ AnnotationEnd = registerToken("\\>");
+ }
+};
+
+/**
+ * Instance of OsmlFormatTokens used to initialize the internal tokenizer
+ * instance of OsmlStreamParserImpl.
+ */
+static const OsmlFormatTokens OsmlTokens;
+/**
+ * Structure representing a field.
+ */
+struct Field {
/**
- * Id of the line comment token.
+ * Specifies whether this field was marked as default field.
*/
- TokenId LineComment;
+ bool defaultField;
/**
- * Id of the block comment start token.
+ * Location at which the field was started.
*/
- TokenId BlockCommentStart;
+ SourceLocation location;
/**
- * Id of the block comment end token.
+ * Constructor of the Field structure, initializes all member variables with
+ * the given values.
+ *
+ * @param defaultField is a flag specifying whether this field is a default
+ * field.
+ * @param location specifies the location at which the field was started.
*/
- TokenId BlockCommentEnd;
+ Field(bool defaultField = false,
+ const SourceLocation &location = SourceLocation{})
+ : defaultField(defaultField), location(location)
+ {
+ }
+};
+/**
+ * Entry used for the command stack.
+ */
+class Command {
+private:
/**
- * Id of the field start token.
+ * Name and location of the current command.
*/
- TokenId FieldStart;
+ Variant name;
/**
- * Id of the field end token.
+ * Arguments that were passed to the command.
*/
- TokenId FieldEnd;
+ Variant arguments;
/**
- * Id of the default field start token.
+ * Vector used as stack for holding the number of opening/closing braces
+ * and the corresponding "isDefaultField" flag.
*/
- TokenId DefaultFieldStart;
+ std::vector<Field> fields;
/**
- * Id of the annotation start token.
+ * Set to true if this is a command with clear begin and end.
*/
- TokenId AnnotationStart;
+ bool hasRange;
+public:
/**
- * Id of the annotation end token.
+ * Default constructor, marks this command as normal, non-range command.
*/
- TokenId AnnotationEnd;
+ Command() : hasRange(false) {}
/**
- * Registers the plain format tokens in the internal tokenizer.
+ * Constructor of the Command class.
+ *
+ * @param name is a string variant with name and location of the
+ * command.
+ * @param arguments is a map variant with the arguments given to the
+ * command.
+ * @param hasRange should be set to true if this is a command with
+ * explicit range.
*/
- PlainFormatTokens()
+ Command(Variant name, Variant arguments, bool hasRange)
+ : name(std::move(name)),
+ arguments(std::move(arguments)),
+ hasRange(hasRange)
{
- Backslash = registerToken("\\");
- LineComment = registerToken("%");
- BlockCommentStart = registerToken("%{");
- BlockCommentEnd = registerToken("}%");
- FieldStart = registerToken("{");
- FieldEnd = registerToken("}");
- DefaultFieldStart = registerToken("{!");
- AnnotationStart = registerToken("<\\");
- AnnotationEnd = registerToken("\\>");
}
-};
-static const PlainFormatTokens OsmlTokens;
+ /**
+ * Returns a reference at the variant representing name and location of the
+ * command.
+ *
+ * @return a variant containing name and location of the command.
+ */
+ const Variant &getName() const { return name; }
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
/**
- * Internal character buffer.
+ * Returns a reference at the variant containing name, value and location of
+ * the arguments.
+ *
+ * @return the arguments stored for the command.
*/
- std::vector<char> buf;
+ const Variant &getArguments() const { return arguments; }
/**
- * Start location of the character data.
+ * Returns a reference at the internal field list. This list should be used
+ * for printing error messages when fields are still open although the outer
+ * range field closes.
+ *
+ * @return a const reference at the internal field vector.
*/
- SourceOffset start;
+ const std::vector<Field> &getFields() const { return fields; }
/**
- * End location of the character data.
+ * Returns true if this command is currently in a default field.
+ *
+ * @return true if the current field on the field stack was explicitly
+ * marked as default field. If the field stack is empty, true is returned
+ * if this is a range command.
*/
- SourceOffset end;
+ bool inDefaultField() const
+ {
+ return (!fields.empty() && fields.back().defaultField) ||
+ (fields.empty() && hasRange);
+ }
-public:
/**
- * Default constructor, initializes start and end with zeros.
+ * Returns true if this command currently is in any field.
+ *
+ * @return true if a field is on the stack or this is a range command.
+ * Range commands always are in a field.
*/
- DataHandler() : start(0), end(0) {}
+ bool inField() const { return !fields.empty() || hasRange; }
/**
- * Returns true if the internal buffer is empty.
+ * Returns true if this command currently is in a range field.
*
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
+ * @return true if the command has a range and no fields are on the field
+ * stack.
*/
- bool isEmpty() { return buf.empty(); }
+ bool inRangeField() const { return fields.empty() && hasRange; }
/**
- * Appends a single character to the internal buffer.
+ * Returns true if this command currently is in a non-range field.
*
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
+ * @return true if the command is in a field, but the field is not the
+ * implicit field constructed by the range of the command.
*/
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
+ bool inNonRangeField() const { return !fields.empty(); }
+
+ /**
+ * Pushes another field onto the field stack of this command.
+ *
+ * @param defaultField if true, explicitly marks this field as default
+ * field.
+ * @param location is the source location at which the field was started.
+ * Used for error messages in which the user is notified about an error with
+ * too few closing fields.
+ */
+ void pushField(bool defaultField = false,
+ const SourceLocation &location = SourceLocation{})
{
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
+ fields.emplace_back(defaultField, location);
}
/**
- * Appends a string to the internal buffer.
+ * Removes another field from the field stack of this command, returns true
+ * if the operation was successful.
*
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
+ * @return true if there was a field to pop on the stack, false otherwise.
*/
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
+ bool popField()
{
- if (isEmpty()) {
- start = stringStart;
+ if (!fields.empty()) {
+ fields.pop_back();
+ return true;
}
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
+ return false;
}
+};
+}
+
+/* Class OsmlStreamParserImpl */
+
+/**
+ * Internal implementation of OsmlStreamParser.
+ */
+class OsmlStreamParserImpl {
+public:
+ /**
+ * State enum compatible with OsmlStreamParser::State but extended by two more
+ * entries (RECOVERABLE_ERROR and IRRECOVERABLE_ERROR).
+ */
+ enum class State : uint8_t {
+ COMMAND_START = 0,
+ RANGE_END = 1,
+ FIELD_START = 2,
+ FIELD_END = 3,
+ ANNOTATION_START = 4,
+ ANNOTATION_END = 5,
+ DATA = 6,
+ END = 7,
+ RECOVERABLE_ERROR = 8,
+ IRRECOVERABLE_ERROR = 9
+ };
+
+private:
+ /**
+ * Reference to the CharReader instance from which the incoming bytes are
+ * read.
+ */
+ CharReader &reader;
/**
- * Converts the internal buffer to a variant with attached location
- * information.
+ * Reference at the logger instance to which all error messages are sent.
+ */
+ Logger &logger;
+
+ /**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ Tokenizer tokenizer;
+
+ /**
+ * Stack containing the current commands.
+ */
+ std::stack<Command> commands;
+
+ /**
+ * Variant containing the tokenized data that was returned from the
+ * tokenizer as data.
+ */
+ TokenizedData data;
+
+ /**
+ * Variable containing the current location of the parser.
+ */
+ SourceLocation location;
+
+ /**
+ * Function used internally to parse an identifier.
*
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
+ * @param start is the start byte offset of the identifier (including the
+ * backslash).
+ * @param allowNSSep should be set to true if the namespace separator is
+ * allowed in the identifier name. Issues error if the namespace separator
+ * is placed incorrectly.
*/
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
+ Variant parseIdentifier(size_t start, bool allowNSSep = false);
+
+ /**
+ * Function used internally to handle the special "\begin" command.
+ *
+ * @return an internal State specifying whether an error occurred (return
+ * values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
+ * command was actually started (return value State::COMMAND_START).
+ */
+ State parseBeginCommand();
+
+ /**
+ * Function used internally to handle the special "\end" command.
+ *
+ * @return an internal State specifying whether an error occurred (return
+ * values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
+ * command was actually ended (return value State::RANGE_END).
+ */
+ State parseEndCommand();
+
+ /**
+ * Parses the command arguments. Handles errors if the name of the command
+ * was given using the hash notation and as a name field.
+ *
+ * @param commandArgName is the name argument that was given using the hash
+ * notation.
+ * @return a map variant containing the arguments.
+ */
+ Variant parseCommandArguments(Variant commandArgName);
+
+ /**
+ * Function used internally to parse a command.
+ *
+ * @param start is the start byte offset of the command (including the
+ * backslash)
+ * @param isAnnotation if true, the command is not returned as command, but
+ * as annotation start.
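+ * @return an internal State describing the outcome, e.g.
+ * State::COMMAND_START (or State::ANNOTATION_START if isAnnotation is set)
+ * if a command was actually parsed, or an error state otherwise.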
+ */
+ State parseCommand(size_t start, bool isAnnotation);
+
+ /**
+ * Function used internally to parse a block comment.
+ */
+ void parseBlockComment();
+
+ /**
+ * Function used internally to parse a line comment.
+ */
+ void parseLineComment();
+
+ /**
+ * Pushes the parsed command onto the command stack.
+ */
+ void pushCommand(Variant commandName, Variant commandArguments,
+ bool hasRange);
+
+ /**
+ * Checks whether there is any data pending to be issued, if yes, resets the
+ * currently peeked characters and returns true.
+ *
+ * @return true if there was any data and DATA should be returned by the
+ * parse function, false otherwise.
+ */
+ bool checkIssueData();
+
+ /**
+ * Returns a reference at the current command at the top of the command
+ * stack.
+ *
+ * @return a reference at the top command in the command stack.
+ */
+ Command &cmd() { return commands.top(); }
+
+ /**
+ * Returns a reference at the current command at the top of the command
+ * stack.
+ *
+ * @return a reference at the top command in the command stack.
+ */
+ const Command &cmd() const { return commands.top(); }
+
+public:
+ /**
+ * Constructor of the OsmlStreamParserImpl class. Attaches the new
+ * OsmlStreamParserImpl to the given CharReader and Logger instances.
+ *
+ * @param reader is the reader instance from which incoming characters
+ * should be read.
+ * @param logger is the logger instance to which errors should be written.
+ */
+ OsmlStreamParserImpl(CharReader &reader, Logger &logger);
+
+ State parse();
+
+ TokenId registerToken(const std::string &token);
+ void unregisterToken(TokenId id);
+
+ const TokenizedData &getData() const { return data; }
+ const Variant &getCommandName() const { return cmd().getName(); }
+ const Variant &getCommandArguments() const { return cmd().getArguments(); }
+ const SourceLocation &getLocation() const { return location; }
+ bool inRangeCommand() const { return cmd().inRangeField(); };
+ bool inDefaultField() const { return cmd().inDefaultField(); }
};
-OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
+/* Class OsmlStreamParserImpl */
+
+OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger)
: reader(reader), logger(logger), tokenizer(OsmlTokens)
{
- // Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true, false});
+ commands.emplace("", Variant::mapType{}, true);
}
-Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
- bool hasCharSiceNSSep = false;
+ bool hasCharSinceNSSep = false;
std::vector<char> identifier;
size_t end = reader.getPeekOffset();
char c, c2;
@@ -197,7 +439,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+ } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
identifier.push_back(c);
} else {
@@ -214,8 +456,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
// This is no longer the first character
first = false;
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
+ // Advance the hasCharSinceNSSep flag
+ hasCharSinceNSSep = allowNSSep && (c != ':');
end = reader.getPeekOffset();
reader.consumePeek();
@@ -228,20 +470,20 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
return res;
}
-OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand()
{
// Expect a '{' after the command
reader.consumeWhitespace();
if (!reader.expect('{')) {
logger.error("Expected \"{\" after \\begin", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Parse the name of the command that should be opened
Variant commandName = parseIdentifier(reader.getOffset(), true);
if (commandName.asString().empty()) {
logger.error("Expected identifier", commandName);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Check whether the next character is a '#', indicating the start of the
@@ -257,7 +499,7 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
if (!reader.expect('}')) {
logger.error("Expected \"}\"", reader);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Parse the arguments
@@ -266,28 +508,15 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
// Push the command onto the command stack
pushCommand(std::move(commandName), std::move(commandArguments), true);
- return State::COMMAND;
-}
-
-static bool checkStillInField(const OsmlStreamParser::Command &cmd,
- const Variant &endName, Logger &logger)
-{
- if (cmd.inField && !cmd.inRangeField) {
- logger.error(std::string("\\end in open field of command \"") +
- cmd.name.asString() + std::string("\""),
- endName);
- logger.note(std::string("Open command started here:"), cmd.name);
- return true;
- }
- return false;
+ return State::COMMAND_START;
}
-OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand()
{
// Expect a '{' after the command
if (!reader.expect('{')) {
logger.error("Expected \"{\" after \\end", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Fetch the name of the command that should be ended here
@@ -296,56 +525,58 @@ OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
// Make sure the given command name is not empty
if (name.asString().empty()) {
logger.error("Expected identifier", name);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Make sure the command name is terminated with a '}'
if (!reader.expect('}')) {
logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Unroll the command stack up to the last range command
- while (!commands.top().hasRange) {
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
+ }
+
+ // Unroll the command stack up to the last range command, make sure we do
+ // not intersect with any open field
+ while (!cmd().inRangeField()) {
+ if (cmd().inField()) {
+ logger.error(std::string("\\end in open field of command \"") +
+ cmd().getName().asString() + std::string("\""),
+ name);
+ const std::vector<Field> &fields = cmd().getFields();
+ for (const Field &field : fields) {
+ logger.note(std::string("Still open field started here: "),
+ field.location);
+ }
+ return State::IRRECOVERABLE_ERROR;
}
commands.pop();
}
- // Make sure we're not in an open field of this command
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
-
// Special error message if the top-level command is reached
if (commands.size() == 1) {
logger.error(std::string("Cannot end command \"") + name.asString() +
std::string("\" here, no command open"),
name);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
- // Inform the about command mismatches
- const Command &cmd = commands.top();
- if (commands.top().name.asString() != name.asString()) {
- logger.error(std::string("Trying to end command \"") +
- cmd.name.asString() +
+ // Inform the user about command mismatches before popping the current
+ // command from the stack
+ if (getCommandName().asString() != name.asString()) {
+ logger.error(std::string("Trying to end command \"") + name.asString() +
std::string("\", but open command is \"") +
- name.asString() + std::string("\""),
+ getCommandName().asString() + std::string("\""),
name);
- logger.note("Last command was opened here:", cmd.name);
- return State::ERROR;
+ logger.note("Open command started here:", getCommandName());
+ return State::IRRECOVERABLE_ERROR;
}
- // Set the location to the location of the command that was ended, then end
- // the current command
+ // End the current command
location = name.getLocation();
commands.pop();
- return cmd.inRangeField ? State::FIELD_END : State::NONE;
+ return State::RANGE_END;
}
-Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
+Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName)
{
// Parse the arguments using the universal VariantReader
Variant commandArguments;
@@ -371,29 +602,14 @@ Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
return commandArguments;
}
-void OsmlStreamParser::pushCommand(Variant commandName,
- Variant commandArguments, bool hasRange)
-{
- // Store the location on the stack
- location = commandName.getLocation();
-
- // Place the command on the command stack, remove the last commands if we're
- // not currently inside a field of these commands
- while (!commands.top().inField) {
- commands.pop();
- }
- commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false, false});
-}
-
-OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
- bool isAnnotation)
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand(
+ size_t start, bool isAnnotation)
{
// Parse the commandName as a first identifier
Variant commandName = parseIdentifier(start, true);
if (commandName.asString().empty()) {
logger.error("Empty command name", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Handle the special "begin" and "end" commands
@@ -403,7 +619,7 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
const bool isEnd = commandNameComponents[0] == "end";
// Parse the begin or end command
- State res = State::COMMAND;
+ State res = State::COMMAND_START;
if (isBegin || isEnd) {
if (commandNameComponents.size() > 1) {
logger.error(
@@ -459,12 +675,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
} else {
// Make sure no arguments apart from the "name" argument are given
// to an annotation end
- Variant::mapType &map = commands.top().arguments.asMap();
+ const Variant::mapType &map = getCommandArguments().asMap();
if (!map.empty()) {
if (map.count("name") == 0 || map.size() > 1U) {
logger.error(
"An annotation end command may not have any arguments "
- "other than \"name\"");
+ "other than \"name\"",
+ reader);
return res;
}
}
@@ -478,17 +695,21 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
// If we're starting an annotation, return the command as annotation start
// instead of command
- if (isAnnotation && res == State::COMMAND) {
+ if (isAnnotation && res == State::COMMAND_START) {
return State::ANNOTATION_START;
}
return res;
}
-void OsmlStreamParser::parseBlockComment()
+void OsmlStreamParserImpl::parseBlockComment()
{
Token token;
+ TokenizedData commentData;
size_t depth = 1;
- while (tokenizer.read(reader, token)) {
+ while (tokenizer.read(reader, token, commentData)) {
+ // Throw the comment data away
+ commentData.clear();
+
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
@@ -504,7 +725,7 @@ void OsmlStreamParser::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void OsmlStreamParser::parseLineComment()
+void OsmlStreamParserImpl::parseLineComment()
{
char c;
while (reader.read(c)) {
@@ -514,86 +735,46 @@ void OsmlStreamParser::parseLineComment()
}
}
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+void OsmlStreamParserImpl::pushCommand(Variant commandName,
+ Variant commandArguments, bool hasRange)
{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
- location = data.getLocation();
- reader.resetPeek();
- return true;
- }
- return false;
-}
-
-bool OsmlStreamParser::checkIssueFieldStart()
-{
- // Fetch the current command, and check whether we're currently inside a
- // field of this command
- Command &cmd = commands.top();
- if (!cmd.inField) {
- // If this is a range command, we're now implicitly inside the field of
- // this command -- we'll have to issue a field start command!
- if (cmd.hasRange) {
- cmd.inField = true;
- cmd.inRangeField = true;
- reader.resetPeek();
- return true;
- }
+ // Store the location of the command
+ location = commandName.getLocation();
- // This was not a range command, so obviously we're now inside within
- // a field of some command -- so unroll the commands stack until a
- // command with open field is reached
- while (!commands.top().inField) {
- commands.pop();
- }
+ // Place the command on the command stack, remove the last commands if we're
+ // not currently inside a field of these commands
+ while (!cmd().inField()) {
+ commands.pop();
}
- return false;
+
+ // Push the new command onto the command stack
+ commands.emplace(std::move(commandName), std::move(commandArguments),
+ hasRange);
}
-bool OsmlStreamParser::closeField()
+bool OsmlStreamParserImpl::checkIssueData()
{
- // Try to end an open field of the current command -- if the current command
- // is not inside an open field, end this command and try to close the next
- // one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- if (cmd.inDefaultField) {
- commands.pop();
- }
- return true;
- }
- commands.pop();
- } else {
- return false;
- }
+ if (!data.empty()) {
+ location = data.getLocation();
+ reader.resetPeek();
+ return true;
}
return false;
}
-OsmlStreamParser::State OsmlStreamParser::parse()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parse()
{
- // Handler for incomming data
- DataHandler handler;
+ // Reset the data handler
+ data.clear();
// Read tokens until the outer loop should be left
Token token;
- while (tokenizer.peek(reader, token)) {
+ while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
if (type == OsmlTokens.Backslash ||
type == OsmlTokens.AnnotationStart) {
- // Before appending anything to the output data or starting a new
- // command, check whether FIELD_START has to be issued, as the
- // current command is a command with range
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
// Check whether a command starts now, without advancing the peek
// cursor
char c;
@@ -606,7 +787,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -614,11 +795,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
State res = parseCommand(token.location.getStart(),
type == OsmlTokens.AnnotationStart);
switch (res) {
- case State::ERROR:
+ case State::IRRECOVERABLE_ERROR:
throw LoggableException(
"Last error was irrecoverable, ending parsing "
"process");
- case State::NONE:
+ case State::RECOVERABLE_ERROR:
continue;
default:
return res;
@@ -632,78 +813,64 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// If this was an annotation start token, add the parsed < to the
// output
+ SourceOffset charStart = token.location.getStart();
+ SourceOffset charEnd = reader.getPeekOffset();
if (type == OsmlTokens.AnnotationStart) {
- handler.append('<', token.location.getStart(),
- token.location.getStart() + 1);
+ data.append('<', charStart, charStart + 1);
+ charStart = charStart + 1;
}
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
+ // Append the character to the output data, mark it as protected
+ data.append(c, charStart, charEnd, true);
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
- // Check whether FIELD_START has to be issued before appending text
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
reader.consumePeek();
continue;
+ } else if (type == OsmlTokens.LineComment) {
+ reader.consumePeek();
+ parseLineComment();
+ continue;
+ } else if (type == OsmlTokens.BlockCommentStart) {
+ reader.consumePeek();
+ parseBlockComment();
+ continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
// We will handle the token now, consume the peeked characters
reader.consumePeek();
- // Update the location to the current token location
+ // Synchronize the location with the current token location
location = token.location;
- if (token.id == OsmlTokens.LineComment) {
- parseLineComment();
- } else if (token.id == OsmlTokens.BlockCommentStart) {
- parseBlockComment();
- } else if (token.id == OsmlTokens.FieldStart) {
- Command &cmd = commands.top();
- if (!cmd.inField) {
- cmd.inField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got field start token \"{\", but no command for which to "
- "start the field. Write \"\\{\" to insert this sequence as "
- "text.",
- token);
+ if (token.id == OsmlTokens.FieldStart) {
+ cmd().pushField(false, token.location);
+ return State::FIELD_START;
} else if (token.id == OsmlTokens.FieldEnd) {
- if (closeField()) {
+ // Remove all commands from the list that currently are not in any
+ // field
+ while (!cmd().inField()) {
+ commands.pop();
+ }
+
+ // If the remaining command is not in a range field, remove this
+ // command
+ if (cmd().inNonRangeField()) {
+ cmd().popField();
return State::FIELD_END;
}
logger.error(
- "Got field end token \"}\", but there is no field to end. "
- "Write \"\\}\" to insert this sequence as text.",
+ "Got field end token \"}\", but there is no field to end.",
token);
} else if (token.id == OsmlTokens.DefaultFieldStart) {
- // Try to start a default field the first time the token is reached
- Command &topCmd = commands.top();
- if (!topCmd.inField) {
- topCmd.inField = true;
- topCmd.inDefaultField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got default field start token \"{!\", but no command for "
- "which to start the field. Write \"\\{!\" to insert this "
- "sequence as text",
- token);
+ cmd().pushField(true, token.location);
+ return State::FIELD_START;
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event
@@ -717,38 +884,103 @@ OsmlStreamParser::State OsmlStreamParser::parse()
}
// Issue available data
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
// Make sure all open commands and fields have been ended at the end of the
// stream
- while (commands.size() > 1) {
- Command &cmd = commands.top();
- if (cmd.inField || cmd.hasRange) {
- logger.error("Reached end of stream, but command \"" +
- cmd.name.asString() + "\" has not been ended",
- cmd.name);
+ while (true) {
+ bool topLevelCommand = commands.size() == 1U;
+ if (cmd().inField()) {
+ // If the stream ended with an open range field, issue information
+ // about the range field
+ if (cmd().inRangeField() && !topLevelCommand) {
+ // Inform about the still open command itself
+ logger.error("Reached end of stream, but command \"" +
+ getCommandName().asString() +
+ "\" has not been ended",
+ getCommandName());
+ } else {
+ // Issue information about still open fields
+ const std::vector<Field> &fields = cmd().getFields();
+ if (!fields.empty()) {
+ logger.error(
+ std::string(
+ "Reached end of stream, but field is still open."),
+ fields.back().location);
+ }
+ }
+ }
+ if (!topLevelCommand) {
+ commands.pop();
+ } else {
+ break;
}
- commands.pop();
}
location = SourceLocation{reader.getSourceId(), reader.getOffset()};
return State::END;
}
+TokenId OsmlStreamParserImpl::registerToken(const std::string &token)
+{
+ return tokenizer.registerToken(token, false);
+}
+
+/**
+ * Unregisters the token with the given id from the internal tokenizer.
+ *
+ * @param id is the id of the token that should be unregistered.
+ */
+void OsmlStreamParserImpl::unregisterToken(TokenId id)
+{
+ // Perform the call outside of assert(): if NDEBUG is defined the assert
+ // argument is not evaluated and the token would never be unregistered
+ const bool unregistered = tokenizer.unregisterToken(id);
+ assert(unregistered);
+ (void)unregistered;  // Avoid an unused variable warning in NDEBUG builds
+}
+
+/* Class OsmlStreamParser */
+
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
+ : impl(new OsmlStreamParserImpl(reader, logger))
+{
+}
+
+OsmlStreamParser::~OsmlStreamParser()
+{
+ // Stub needed because OsmlStreamParserImpl is incomplete in header
+}
+
+OsmlStreamParser::State OsmlStreamParser::parse()
+{
+ return static_cast<State>(impl->parse());
+}
+
+const TokenizedData &OsmlStreamParser::getData() const
+{
+ return impl->getData();
+}
+
const Variant &OsmlStreamParser::getCommandName() const
{
- return commands.top().name;
+ return impl->getCommandName();
}
const Variant &OsmlStreamParser::getCommandArguments() const
{
- return commands.top().arguments;
+ return impl->getCommandArguments();
+}
+
+const SourceLocation &OsmlStreamParser::getLocation() const
+{
+ return impl->getLocation();
+}
+
+bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); }
+
+bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); }
+
+TokenId OsmlStreamParser::registerToken(const std::string &token)
+{
+ return impl->registerToken(token);
}
-bool OsmlStreamParser::inDefaultField() const
+void OsmlStreamParser::unregisterToken(TokenId id)
{
- return commands.top().inRangeField || commands.top().inDefaultField;
+ impl->unregisterToken(id);
}
}
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..b7e64f7 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,68 +29,53 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <cstdint>
+#include <memory>
-#include <core/common/Variant.hpp>
-#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/stack/Callbacks.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
+class TokenizedData;
+class Variant;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
* format. The parser is constructed around a "parse" function, which reads data
* from the underlying CharReader until a new state is reached and indicates
* this state in a return value. The calling code then has to pull corresponding
- * data from the stream reader. The reader makes sure the incommind file is
+ * data from the stream reader. The reader makes sure the incoming stream is
* syntactically valid and tries to recorver from most errors. If an error is
* irrecoverable (this is the case for errors with wrong nesting of commands or
* fields, as this would lead to too many consecutive errors) a
- * LoggableException is thrown.
+ * LoggableException is thrown. In short, the OsmlStreamParser can be described
+ * as a SAX parser for OSML.
*/
-class OsmlStreamParser {
+class OsmlStreamParser: public parser_stack::ParserCallbacks {
public:
/**
* Enum used to indicate which state the OsmlStreamParser class is in
* after calling the "parse" function.
*/
- enum class State {
+ enum class State : uint8_t {
/**
- * State returned if a fully featured command has been read. A command
- * consists of the command name and its arguments (which optionally
- * includes the name).
+ * State returned if the start of a command has been read. Use the
+ * getCommandName(), getCommandArguments() and inRangeCommand()
+ * functions to retrieve more information about the command that was
+ * just started.
*/
- COMMAND,
+ COMMAND_START = 0,
/**
- * State returned if data is given. The reader must decide which field
- * or command this should be routed to. Trailing or leading whitespace
- * has been removed. Only called if the data is non-empty.
- */
- DATA,
-
- /**
- * A user-defined entity has been found. The entity sequence is stored
- * in the command name.
- */
- ENTITY,
-
- /**
- * State returned if an annotation was started. An annotation consists
- * of the command name and its arguments (which optionally include the
- * name).
- */
- ANNOTATION_START,
-
- /**
- * State returned if an annotation ends. The reader indicates which
- * annotation ends.
+ * State returned if a range command or range annotation has just ended.
+ * This state is not returned for non-range commands (as the actual end
+ * of a command is context dependent).
*/
- ANNOTATION_END,
+ RANGE_END = 1,
/**
* State returned if a new field started. The reader assures that the
@@ -98,223 +83,46 @@ public:
* is not started if data has been given outside of a field. The
* field number is set to the current field index.
*/
- FIELD_START,
+ FIELD_START = 2,
/**
* State returned if the current field ends. The reader assures that a
* field was actually open.
*/
- FIELD_END,
+ FIELD_END = 3,
/**
- * The end of the stream has been reached.
+ * State returned if an annotation was started. An annotation consists
+ * of the command name and its arguments (which optionally include the
+ * name).
*/
- END,
+ ANNOTATION_START = 4,
/**
- * Returned from internal functions if nothing should be done.
+ * State returned if an annotation ends. The reader indicates which
+ * annotation ends.
*/
- NONE,
+ ANNOTATION_END = 5,
/**
- * Returned from internal function to indicate irrecoverable errors.
+ * State returned if data is given. The reader must decide which field
+ * or command this should be routed to. Trailing or leading whitespace
+ * has been removed. Only called if the data is non-empty.
*/
- ERROR
- };
-
- /**
- * Entry used for the command stack.
- */
- struct Command {
- /**
- * Name and location of the current command.
- */
- Variant name;
-
- /**
- * Arguments that were passed to the command.
- */
- Variant arguments;
+ DATA = 6,
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
- */
- bool inRangeField : 1;
-
- /**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
- */
- bool inDefaultField : 1;
-
- /**
- * Default constructor.
- */
- Command()
- : hasRange(false),
- inField(false),
- inRangeField(false),
- inDefaultField()
- {
- }
-
- /**
- * Constructor of the Command class.
- *
- * @param name is a string variant with name and location of the
- * command.
- * @param arguments is a map variant with the arguments given to the
- * command.
- * @param hasRange should be set to true if this is a command with
- * explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
- * @param inDefaultField is set to true if we currently are in a
- * specially marked default field.
- */
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
- : name(std::move(name)),
- arguments(std::move(arguments)),
- hasRange(hasRange),
- inField(inField),
- inRangeField(inRangeField),
- inDefaultField(inDefaultField)
- {
- }
+ * The end of the stream has been reached.
+ */
+ END = 7
};
private:
/**
- * Reference to the CharReader instance from which the incomming bytes are
- * read.
- */
- CharReader &reader;
-
- /**
- * Reference at the logger instance to which all error messages are sent.
+ * Pointer at the class containing the internal implementation (according
+ * to the PIMPL idiom).
*/
- Logger &logger;
-
- /**
- * Tokenizer instance used to read individual tokens from the text.
- */
- Tokenizer tokenizer;
-
- /**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
- */
- Variant data;
-
- /**
- * Contains the location of the last token.
- */
- SourceLocation location;
-
- /**
- * Contains the field index of the current command.
- */
- size_t fieldIdx;
-
- /**
- * Function used internall to parse an identifier.
- *
- * @param start is the start byte offset of the identifier (including the
- * backslash).
- * @param allowNSSep should be set to true if the namespace separator is
- * allowed in the identifier name. Issues error if the namespace separator
- * is placed incorrectly.
- */
- Variant parseIdentifier(size_t start, bool allowNSSep = false);
-
- /**
- * Function used internally to handle the special "\begin" command.
- */
- State parseBeginCommand();
-
- /**
- * Function used internally to handle the special "\end" command.
- */
- State parseEndCommand();
-
- /**
- * Pushes the parsed command onto the command stack.
- */
- void pushCommand(Variant commandName, Variant commandArguments,
- bool hasRange);
-
- /**
- * Parses the command arguments.
- */
- Variant parseCommandArguments(Variant commandArgName);
-
- /**
- * Function used internally to parse a command.
- *
- * @param start is the start byte offset of the command (including the
- * backslash)
- * @param isAnnotation if true, the command is not returned as command, but
- * as annotation start.
- * @return true if a command was actuall parsed, false otherwise.
- */
- State parseCommand(size_t start, bool isAnnotation);
-
- /**
- * Function used internally to parse a block comment.
- */
- void parseBlockComment();
-
- /**
- * Function used internally to parse a generic comment.
- */
- void parseLineComment();
-
- /**
- * Checks whether there is any data pending to be issued, if yes, issues it.
- *
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
- * @return true if there was any data and DATA should be returned by the
- * parse function, false otherwise.
- */
- bool checkIssueData(DataHandler &handler);
-
- /**
- * Called before any data is appended to the internal data handler. Checks
- * whether a new field should be started or implicitly ended.
- *
- * @return true if FIELD_START should be returned by the parse function.
- */
- bool checkIssueFieldStart();
-
- /**
- * Closes a currently open field. Note that the command will be removed from
- * the internal command stack if the field that is being closed is a
- * field marked as default field.
- *
- * @return true if the field could be closed, false if there was no field
- * to close.
- */
- bool closeField();
+ std::unique_ptr<OsmlStreamParserImpl> impl;
public:
/**
@@ -328,6 +136,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed to destroy the incomplete
+ * OsmlStreamParserImpl.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -338,17 +152,9 @@ public:
State parse();
/**
- * Returns a reference at the internally stored data. Only valid if
- * State::DATA was returned by the "parse" function.
- *
- * @return a reference at a variant containing the data parsed by the
- * "parse" function.
- */
- const Variant &getData() const { return data; }
-
- /**
* Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing name and location of the
* parsed command.
@@ -357,7 +163,8 @@ public:
/**
* Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing arguments given to the
* command.
@@ -365,19 +172,43 @@ public:
const Variant &getCommandArguments() const;
/**
- * Returns true if the current field is the "default" field. This is true if
- * the parser either is in the outer range of a range command or inside a
- * field that has been especially marked as "default" field (using the "|"
- * syntax).
+ * Returns a reference at the internally stored data. Only valid if
+ * State::DATA was returned by the "parse" function.
+ *
+ * @return a reference at the TokenizedData instance containing the data
+ * parsed by the "parse" function.
*/
- bool inDefaultField() const;
+ const TokenizedData &getData() const;
+
+ /**
+ * Returns the location of the current token.
+ */
+ const SourceLocation &getLocation() const;
/**
- * Returns a reference at the char reader.
+ * Returns true if the currently started command is a range command, only
+ * valid if State::COMMAND_START or State::ANNOTATION_START was returned by
+ * the "parse" function.
*
- * @return the last internal token location.
+ * @return true if the command that was started is a range command, false
+ * otherwise.
*/
- const SourceLocation &getLocation() const { return location; }
+ bool inRangeCommand() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been especially marked as "default" field (using the "{!"
+ * syntax). Only valid if State::FIELD_START was returned by the "parse"
+ * function.
+ *
+ * @return true if the current field was marked as default field (using the
+ * "{!" syntax).
+ */
+ bool inDefaultField() const;
+
+ TokenId registerToken(const std::string &token) override;
+ void unregisterToken(TokenId token) override;
};
}
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..79a8dbe 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,7 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -40,6 +40,11 @@ namespace ousia {
class OsxmlEventParserData {
public:
/**
+ * Current character data buffer.
+ */
+ TokenizedData data;
+
+ /**
* Contains the current depth of the parsing process.
*/
ssize_t depth;
@@ -52,35 +57,13 @@ public:
ssize_t annotationEndTagDepth;
/**
- * Current character data buffer.
- */
- std::vector<char> textBuf;
-
- /**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
- * Current character data start.
- */
- size_t textStart;
-
- /**
- * Current character data end.
- */
- size_t textEnd;
-
- /**
- * Default constructor.
+ * Constructor taking the sourceId of the file from which the XML is being
+ * parsed.
+ *
+ * @param sourceId is the source id of the XML file from which the data is
+ * currently being parsed.
*/
- OsxmlEventParserData();
+ OsxmlEventParserData(SourceId sourceId);
/**
* Increments the depth.
@@ -103,14 +86,6 @@ public:
* @return true if character data is available.
*/
bool hasText();
-
- /**
- * Returns a Variant containing the character data and its location.
- *
- * @return a string variant containing the text data and the character
- * location.
- */
- Variant getText(SourceId sourceId);
};
/* Class GuardedExpatXmlParser */
@@ -168,7 +143,7 @@ public:
static const std::string TOP_LEVEL_TAG{"ousia"};
/**
- * Prefix used to indicate the start of an annoation (note the trailing colon)
+ * Prefix used to indicate the start of an annotation (note the trailing colon).
*/
static const std::string ANNOTATION_START_PREFIX{"a:start:"};
@@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Read the argument locations -- this is only a stupid and slow hack,
@@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// Just issue a "commandStart" event in any other case
Variant nameVar = Variant::fromString(nameStr);
nameVar.setLocation(nameLoc);
- parser->getEvents().command(nameVar, args);
+ parser->getEvents().commandStart(nameVar, args);
}
}
@@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Abort if the special ousia tag ends here
@@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
return;
}
- // Issue the "fieldEnd" event
- parser->getEvents().fieldEnd();
+ // Issue the "rangeEnd" event
+ parser->getEvents().rangeEnd();
}
static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
@@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
// Synchronize the logger position
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
- // Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
- OsxmlEventParserData &data = parser->getData();
- std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
- }
+ // Append the data to the buffer
+ parser->getData().data.append(std::string(s, ulen), loc.getStart());
}
/* Class OsxmlEvents */
@@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
-OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId)
+ : data(sourceId), depth(0), annotationEndTagDepth(-1)
{
}
@@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag()
return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth);
}
-bool OsxmlEventParserData::hasText() { return !textBuf.empty(); }
-
-Variant OsxmlEventParserData::getText(SourceId sourceId)
-{
- // Create a variant containing the string data and the location
- Variant var =
- Variant::fromString(std::string{textBuf.data(), textBuf.size()});
- var.setLocation({sourceId, textStart, textEnd});
-
- // Reset the text buffers
- textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
- textStart = 0;
- textEnd = 0;
-
- // Return the variant
- return var;
-}
+bool OsxmlEventParserData::hasText() { return !data.empty(); }
/* Class OsxmlEventParser */
@@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
- data(new OsxmlEventParserData())
+ data(new OsxmlEventParserData(reader.getSourceId()))
{
}
@@ -532,16 +460,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..4c5a485 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -61,7 +59,8 @@ public:
* @param args is a map containing the arguments that were given to the
* command.
*/
- virtual void command(const Variant &name, const Variant::mapType &args) = 0;
+ virtual void commandStart(const Variant &name,
+ const Variant::mapType &args) = 0;
/**
* Called whenever an annotation starts. Note that this implicitly always
@@ -90,24 +89,17 @@ public:
const Variant &elementName) = 0;
/**
- * Called whenever the default field which was implicitly started by
- * commandStart or annotationStart ends. Note that this does not end the
- * range of an annotation, but the default field of the annotation. To
- * signal the end of the annotation this, the annotationEnd method will be
- * invoked.
+ * Called whenever the command or annotation tags end.
*/
- virtual void fieldEnd() = 0;
+ virtual void rangeEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a TokenizedData instance containing the string data that
+ * was found in the XML file.
*/
- virtual void data(const Variant &data) = 0;
+ virtual void data(const TokenizedData &data) = 0;
};
/**
@@ -135,11 +127,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +158,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
@@ -207,7 +179,9 @@ public:
OsxmlEvents &getEvents() const;
/**
- * Returns a reference at the internal data.
+ * Used internally to fetch a reference at the internal data.
+ *
+ * @return a reference at the internal OsxmlEventParserData structure.
*/
OsxmlEventParserData &getData() const;
};
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
index c216855..10cc77a 100644
--- a/src/formats/osxml/OsxmlParser.cpp
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -16,6 +16,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <core/common/Variant.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/parser/stack/Callbacks.hpp>
#include <core/parser/stack/GenericParserStates.hpp>
#include <core/parser/stack/Stack.hpp>
#include <core/parser/ParserContext.hpp>
@@ -30,7 +33,7 @@ using namespace parser_stack;
/**
* Class containing the actual OsxmlParser implementation.
*/
-class OsxmlParserImplementation : public OsxmlEvents {
+class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks {
private:
/**
* Actual xml parser -- converts the xml stream into a set of events.
@@ -54,7 +57,7 @@ public:
*/
OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)
: parser(reader, *this, ctx.getLogger()),
- stack(ctx, GenericParserStates)
+ stack(*this, ctx, GenericParserStates)
{
}
@@ -63,17 +66,16 @@ public:
*/
void parse() { parser.parse(); }
- void command(const Variant &name, const Variant::mapType &args) override
+ void commandStart(const Variant &name,
+ const Variant::mapType &args) override
{
- stack.command(name, args);
- stack.fieldStart(true);
+ stack.commandStart(name, args, true);
}
void annotationStart(const Variant &name,
const Variant::mapType &args) override
{
- stack.annotationStart(name, args);
- stack.fieldStart(true);
+ stack.annotationStart(name, args, true);
}
void annotationEnd(const Variant &className,
@@ -82,9 +84,19 @@ public:
stack.annotationEnd(className, elementName);
}
- void fieldEnd() override { stack.fieldEnd(); }
+ void rangeEnd() override { stack.rangeEnd(); }
- void data(const Variant &data) override { stack.data(data); }
+ void data(const TokenizedData &data) override { stack.data(data); }
+
+ TokenId registerToken(const std::string &token) override
+ {
+ return Tokens::Empty;
+ }
+
+ void unregisterToken(TokenId id) override
+ {
+ // Do nothing here
+ }
};
/* Class OsxmlParser */