diff options
Diffstat (limited to 'src/formats')
-rw-r--r-- | src/formats/osml/OsmlParser.cpp | 30 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 800 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 331 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 138 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 48 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 30 |
6 files changed, 671 insertions, 706 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp index 16e7aa4..d169393 100644 --- a/src/formats/osml/OsmlParser.cpp +++ b/src/formats/osml/OsmlParser.cpp @@ -73,7 +73,7 @@ public: : logger(ctx.getLogger()), ctx(ctx), parser(reader, logger), - stack(ctx, GenericParserStates) + stack(parser, ctx, GenericParserStates) { } @@ -88,7 +88,7 @@ public: OsmlStreamParser::State state = parser.parse(); logger.setDefaultLocation(parser.getLocation()); switch (state) { - case OsmlStreamParser::State::COMMAND: { + case OsmlStreamParser::State::COMMAND_START: { // Implicitly create a "document" element if the first // command is not any other top-level command if (needsDocument) { @@ -96,23 +96,23 @@ public: parser.getCommandName().asString(); if (cmd != "typesystem" && cmd != "document" && cmd != "ontology") { - stack.command("document", Variant::mapType{}); + stack.commandStart("document", Variant::mapType{}, + false); } needsDocument = false; } - stack.command(parser.getCommandName(), - parser.getCommandArguments().asMap()); + stack.commandStart(parser.getCommandName(), + parser.getCommandArguments().asMap(), + parser.inRangeCommand()); break; } - case OsmlStreamParser::State::DATA: - stack.data(parser.getData()); - break; - case OsmlStreamParser::State::ENTITY: - // TODO + case OsmlStreamParser::State::RANGE_END: + stack.rangeEnd(); break; case OsmlStreamParser::State::ANNOTATION_START: stack.annotationStart(parser.getCommandName(), - parser.getCommandArguments().asMap()); + parser.getCommandArguments().asMap(), + parser.inRangeCommand()); break; case OsmlStreamParser::State::ANNOTATION_END: { Variant elementName = Variant::fromString(std::string{}); @@ -130,11 +130,9 @@ public: case OsmlStreamParser::State::FIELD_END: stack.fieldEnd(); break; - case OsmlStreamParser::State::NONE: - case OsmlStreamParser::State::ERROR: - // Internally used in OsmlStreamParser, these states should - // never occur. Just contiunue. - continue; + case OsmlStreamParser::State::DATA: + stack.data(parser.getData()); + break; case OsmlStreamParser::State::END: return; } diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..64a489d 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -16,179 +16,421 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <cassert> +#include <stack> +#include <vector> + #include <core/common/CharReader.hpp> #include <core/common/Logger.hpp> #include <core/common/Utils.hpp> +#include <core/common/Variant.hpp> #include <core/common/VariantReader.hpp> +#include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp> + #include "OsmlStreamParser.hpp" namespace ousia { +namespace { /** - * Plain format default tokenizer. + * Osml format default tokenizer. Registers the primary tokens in its + * constructor. A single, static instance of this class is created as + * "OsmlTokens", which is copied to the Tokenizer instance of + * OsmlStreamParserImpl. */ -class PlainFormatTokens : public Tokenizer { +class OsmlFormatTokens : public Tokenizer { public: + TokenId Backslash; + TokenId LineComment; + TokenId BlockCommentStart; + TokenId BlockCommentEnd; + TokenId FieldStart; + TokenId FieldEnd; + TokenId DefaultFieldStart; + TokenId AnnotationStart; + TokenId AnnotationEnd; + /** - * Id of the backslash token. + * Registers the plain format tokens in the internal tokenizer. */ - TokenId Backslash; + OsmlFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + DefaultFieldStart = registerToken("{!"); + AnnotationStart = registerToken("<\\"); + AnnotationEnd = registerToken("\\>"); + } +}; + +/** + * Instance of OsmlFormatTokens used to initialize the internal tokenizer + * instance of OsmlStreamParserImpl. + */ +static const OsmlFormatTokens OsmlTokens; +/** + * Structure representing a field. + */ +struct Field { /** - * Id of the line comment token. + * Specifies whether this field was marked as default field. */ - TokenId LineComment; + bool defaultField; /** - * Id of the block comment start token. + * Location at which the field was started. */ - TokenId BlockCommentStart; + SourceLocation location; /** - * Id of the block comment end token. + * Constructor of the Field structure, initializes all member variables with + * the given values. + * + * @param defaultField is a flag specifying whether this field is a default + * field. + * @param location specifies the location at which the field was started. */ - TokenId BlockCommentEnd; + Field(bool defaultField = false, + const SourceLocation &location = SourceLocation{}) + : defaultField(defaultField), location(location) + { + } +}; +/** + * Entry used for the command stack. + */ +class Command { +private: /** - * Id of the field start token. + * Name and location of the current command. */ - TokenId FieldStart; + Variant name; /** - * Id of the field end token. + * Arguments that were passed to the command. */ - TokenId FieldEnd; + Variant arguments; /** - * Id of the default field start token. + * Vector used as stack for holding the number of opening/closing braces + * and the corresponding "isDefaultField" flag. */ - TokenId DefaultFieldStart; + std::vector<Field> fields; /** - * Id of the annotation start token. + * Set to true if this is a command with clear begin and end. */ - TokenId AnnotationStart; + bool hasRange; +public: /** - * Id of the annotation end token. + * Default constructor, marks this command as normal, non-range command. */ - TokenId AnnotationEnd; + Command() : hasRange(false) {} /** - * Registers the plain format tokens in the internal tokenizer. + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. */ - PlainFormatTokens() + Command(Variant name, Variant arguments, bool hasRange) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange) { - Backslash = registerToken("\\"); - LineComment = registerToken("%"); - BlockCommentStart = registerToken("%{"); - BlockCommentEnd = registerToken("}%"); - FieldStart = registerToken("{"); - FieldEnd = registerToken("}"); - DefaultFieldStart = registerToken("{!"); - AnnotationStart = registerToken("<\\"); - AnnotationEnd = registerToken("\\>"); } -}; -static const PlainFormatTokens OsmlTokens; + /** + * Returns a reference at the variant representing name and location of the + * command. + * + * @return a variant containing name and location of the command. + */ + const Variant &getName() const { return name; } -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: /** - * Internal character buffer. + * Returns a reference at the variant containing name, value and location of + * the arguments. + * + * @return the arguments stored for the command. */ - std::vector<char> buf; + const Variant &getArguments() const { return arguments; } /** - * Start location of the character data. + * Returns a reference at the internal field list. This list should be used + * for printing error messages when fields are still open although the outer + * range field closes. + * + * @return a const reference at the internal field vector. */ - SourceOffset start; + const std::vector<Field> &getFields() const { return fields; } /** - * End location of the character data. + * Returns true if this command is currently in a default field. + * + * @return true if the current field on the field stack was explicitly + * marked as default field. If the field stack is empty, true is returned + * if this is a range command. */ - SourceOffset end; + bool inDefaultField() const + { + return (!fields.empty() && fields.back().defaultField) || + (fields.empty() && hasRange); + } -public: /** - * Default constructor, initializes start and end with zeros. + * Returns true if this command currently is in any field. + * + * @return true if a field is on the stack or this is a range commands. + * Range commands always are in a field. */ - DataHandler() : start(0), end(0) {} + bool inField() const { return !fields.empty() || hasRange; } /** - * Returns true if the internal buffer is empty. + * Returns true if this command currently is in a range field. * - * @return true if no characters were added to the internal buffer, false - * otherwise. + * @return true if the command has a range and no other ranges are on the + * stack. */ - bool isEmpty() { return buf.empty(); } + bool inRangeField() const { return fields.empty() && hasRange; } /** - * Appends a single character to the internal buffer. + * Returns true if this command currently is in a non-range field. * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. + * @return true if the command is in a field, but the field is not the field + * constructed by the "range" */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) + bool inNonRangeField() const { return !fields.empty(); } + + /** + * Pushes another field onto the field stack of this command. + * + * @param defaultField if true, explicitly marks this field as default + * field. + * @param location is the source location at which the field was started. + * Used for error messages in which the user is notified about an error with + * too few closing fields. + */ + void pushField(bool defaultField = false, + const SourceLocation &location = SourceLocation{}) { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; + fields.emplace_back(defaultField, location); } /** - * Appends a string to the internal buffer. + * Removes another field from the field stack of this command, returns true + * if the operation was successful. * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. + * @return true if there was a field to pop on the stack, false otherwise. */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) + bool popField() { - if (isEmpty()) { - start = stringStart; + if (!fields.empty()) { + fields.pop_back(); + return true; } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; + return false; } +}; +} + +/* Class OsmlStreamParserImpl */ + +/** + * Internal implementation of OsmlStreamParser. + */ +class OsmlStreamParserImpl { +public: + /** + * State enum compatible with OsmlStreamParserState but extended by two more + * entries (END and NONE). + */ + enum class State : uint8_t { + COMMAND_START = 0, + RANGE_END = 1, + FIELD_START = 2, + FIELD_END = 3, + ANNOTATION_START = 4, + ANNOTATION_END = 5, + DATA = 6, + END = 7, + RECOVERABLE_ERROR = 8, + IRRECOVERABLE_ERROR = 9 + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; /** - * Converts the internal buffer to a variant with attached location - * information. + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + Tokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack<Command> commands; + + /** + * Variant containing the tokenized data that was returned from the + * tokenizer as data. + */ + TokenizedData data; + + /** + * Variable containing the current location of the parser. + */ + SourceLocation location; + + /** + * Function used internally to parse an identifier. * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + * + * @return an internal State specifying whether an error occured (return + * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a + * command was actually started (return value State::COMMAND_START). + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + * + * @return an internal State specifying whether an error occured (return + * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a + * command was actually ended (return value State::RANGE_END). + */ + State parseEndCommand(); + + /** + * Parses the command arguments. Handles errors if the name of the command + * was given using the hash notation and as a name field. + * + * @param commandArgName is the name argument that was given using the hash + * notation. + * @return a map variant containing the arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. + * + * @param start is the start byte offset of the command (including the + * backslash) + * @param isAnnotation if true, the command is not returned as command, but + * as annotation start. + * @return true if a command was actuall parsed, false otherwise. + */ + State parseCommand(size_t start, bool isAnnotation); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseLineComment(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Checks whether there is any data pending to be issued, if yes, resets the + * currently peeked characters and returns true. + * + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(); + + /** + * Returns a reference at the current command at the top of the command + * stack. + * + * @return a reference at the top command in the command stack. + */ + Command &cmd() { return commands.top(); } + + /** + * Returns a reference at the current command at the top of the command + * stack. + * + * @return a reference at the top command in the command stack. + */ + const Command &cmd() const { return commands.top(); } + +public: + /** + * Constructor of the OsmlStreamParserImpl class. Attaches the new + * OsmlStreamParserImpl to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsmlStreamParserImpl(CharReader &reader, Logger &logger); + + State parse(); + + TokenId registerToken(const std::string &token); + void unregisterToken(TokenId id); + + const TokenizedData &getData() const { return data; } + const Variant &getCommandName() const { return cmd().getName(); } + const Variant &getCommandArguments() const { return cmd().getArguments(); } + const SourceLocation &getLocation() const { return location; } + bool inRangeCommand() const { return cmd().inRangeField(); }; + bool inDefaultField() const { return cmd().inDefaultField(); } }; -OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) +/* Class OsmlStreamParserImpl */ + +OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(OsmlTokens) { - // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true, false}); + commands.emplace("", Variant::mapType{}, true); } -Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; - bool hasCharSiceNSSep = false; + bool hasCharSinceNSSep = false; std::vector<char> identifier; size_t end = reader.getPeekOffset(); char c, c2; @@ -197,7 +439,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) if ((first && Utils::isIdentifierStartCharacter(c)) || (!first && Utils::isIdentifierCharacter(c))) { identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { identifier.push_back(c); } else { @@ -214,8 +456,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) // This is no longer the first character first = false; - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); + // Advance the hasCharSinceNSSep flag + hasCharSinceNSSep = allowNSSep && (c != ':'); end = reader.getPeekOffset(); reader.consumePeek(); @@ -228,20 +470,20 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) return res; } -OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); if (!reader.expect('{')) { logger.error("Expected \"{\" after \\begin", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Parse the name of the command that should be opened Variant commandName = parseIdentifier(reader.getOffset(), true); if (commandName.asString().empty()) { logger.error("Expected identifier", commandName); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Check whether the next character is a '#', indicating the start of the @@ -257,7 +499,7 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Parse the arguments @@ -266,28 +508,15 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() // Push the command onto the command stack pushCommand(std::move(commandName), std::move(commandArguments), true); - return State::COMMAND; -} - -static bool checkStillInField(const OsmlStreamParser::Command &cmd, - const Variant &endName, Logger &logger) -{ - if (cmd.inField && !cmd.inRangeField) { - logger.error(std::string("\\end in open field of command \"") + - cmd.name.asString() + std::string("\""), - endName); - logger.note(std::string("Open command started here:"), cmd.name); - return true; - } - return false; + return State::COMMAND_START; } -OsmlStreamParser::State OsmlStreamParser::parseEndCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { logger.error("Expected \"{\" after \\end", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Fetch the name of the command that should be ended here @@ -296,56 +525,58 @@ OsmlStreamParser::State OsmlStreamParser::parseEndCommand() // Make sure the given command name is not empty if (name.asString().empty()) { logger.error("Expected identifier", name); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } // Make sure the command name is terminated with a '}' if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Unroll the command stack up to the last range command - while (!commands.top().hasRange) { - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; + return State::IRRECOVERABLE_ERROR; + } + + // Unroll the command stack up to the last range command, make sure we do + // not intersect with any open field + while (!cmd().inRangeField()) { + if (cmd().inField()) { + logger.error(std::string("\\end in open field of command \"") + + cmd().getName().asString() + std::string("\""), + name); + const std::vector<Field> &fields = cmd().getFields(); + for (const Field &field : fields) { + logger.note(std::string("Still open field started here: "), + field.location); + } + return State::IRRECOVERABLE_ERROR; } commands.pop(); } - // Make sure we're not in an open field of this command - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - // Special error message if the top-level command is reached if (commands.size() == 1) { logger.error(std::string("Cannot end command \"") + name.asString() + std::string("\" here, no command open"), name); - return State::ERROR; + return State::IRRECOVERABLE_ERROR; } - // Inform the about command mismatches - const Command &cmd = commands.top(); - if (commands.top().name.asString() != name.asString()) { - logger.error(std::string("Trying to end command \"") + - cmd.name.asString() + + // Inform the user about command mismatches, copy the current command + // descriptor before popping it from the stack + if (getCommandName().asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + name.asString() + std::string("\", but open command is \"") + - name.asString() + std::string("\""), + getCommandName().asString() + std::string("\""), name); - logger.note("Last command was opened here:", cmd.name); - return State::ERROR; + logger.note("Open command started here:", getCommandName()); + return State::IRRECOVERABLE_ERROR; } - // Set the location to the location of the command that was ended, then end - // the current command + // End the current command location = name.getLocation(); commands.pop(); - return cmd.inRangeField ? State::FIELD_END : State::NONE; + return State::RANGE_END; } -Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; @@ -371,29 +602,14 @@ Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) return commandArguments; } -void OsmlStreamParser::pushCommand(Variant commandName, - Variant commandArguments, bool hasRange) -{ - // Store the location on the stack - location = commandName.getLocation(); - - // Place the command on the command stack, remove the last commands if we're - // not currently inside a field of these commands - while (!commands.top().inField) { - commands.pop(); - } - commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false, false}); -} - -OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, - bool isAnnotation) +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand( + size_t start, bool isAnnotation) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); if (commandName.asString().empty()) { logger.error("Empty command name", reader); - return State::NONE; + return State::RECOVERABLE_ERROR; } // Handle the special "begin" and "end" commands @@ -403,7 +619,7 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, const bool isEnd = commandNameComponents[0] == "end"; // Parse the begin or end command - State res = State::COMMAND; + State res = State::COMMAND_START; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( @@ -459,12 +675,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, } else { // Make sure no arguments apart from the "name" argument are given // to an annotation end - Variant::mapType &map = commands.top().arguments.asMap(); + const Variant::mapType &map = getCommandArguments().asMap(); if (!map.empty()) { if (map.count("name") == 0 || map.size() > 1U) { logger.error( "An annotation end command may not have any arguments " - "other than \"name\""); + "other than \"name\"", + reader); return res; } } @@ -478,17 +695,21 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, // If we're starting an annotation, return the command as annotation start // instead of command - if (isAnnotation && res == State::COMMAND) { + if (isAnnotation && res == State::COMMAND_START) { return State::ANNOTATION_START; } return res; } -void OsmlStreamParser::parseBlockComment() +void OsmlStreamParserImpl::parseBlockComment() { Token token; + TokenizedData commentData; size_t depth = 1; - while (tokenizer.read(reader, token)) { + while (tokenizer.read(reader, token, commentData)) { + // Throw the comment data away + commentData.clear(); + if (token.id == OsmlTokens.BlockCommentEnd) { depth--; if (depth == 0) { @@ -504,7 +725,7 @@ void OsmlStreamParser::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void OsmlStreamParser::parseLineComment() +void OsmlStreamParserImpl::parseLineComment() { char c; while (reader.read(c)) { @@ -514,86 +735,46 @@ void OsmlStreamParser::parseLineComment() } } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +void OsmlStreamParserImpl::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) { - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); - location = data.getLocation(); - reader.resetPeek(); - return true; - } - return false; -} - -bool OsmlStreamParser::checkIssueFieldStart() -{ - // Fetch the current command, and check whether we're currently inside a - // field of this command - Command &cmd = commands.top(); - if (!cmd.inField) { - // If this is a range command, we're now implicitly inside the field of - // this command -- we'll have to issue a field start command! - if (cmd.hasRange) { - cmd.inField = true; - cmd.inRangeField = true; - reader.resetPeek(); - return true; - } + // Store the location of the command + location = commandName.getLocation(); - // This was not a range command, so obviously we're now inside within - // a field of some command -- so unroll the commands stack until a - // command with open field is reached - while (!commands.top().inField) { - commands.pop(); - } + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!cmd().inField()) { + commands.pop(); } - return false; + + // Push the new command onto the command stack + commands.emplace(std::move(commandName), std::move(commandArguments), + hasRange); } -bool OsmlStreamParser::closeField() +bool OsmlStreamParserImpl::checkIssueData() { - // Try to end an open field of the current command -- if the current command - // is not inside an open field, end this command and try to close the next - // one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - if (cmd.inDefaultField) { - commands.pop(); - } - return true; - } - commands.pop(); - } else { - return false; - } + if (!data.empty()) { + location = data.getLocation(); + reader.resetPeek(); + return true; } return false; } -OsmlStreamParser::State OsmlStreamParser::parse() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parse() { - // Handler for incomming data - DataHandler handler; + // Reset the data handler + data.clear(); // Read tokens until the outer loop should be left Token token; - while (tokenizer.peek(reader, token)) { + while (tokenizer.peek(reader, token, data)) { const TokenId type = token.id; // Special handling for Backslash and Text if (type == OsmlTokens.Backslash || type == OsmlTokens.AnnotationStart) { - // Before appending anything to the output data or starting a new - // command, check whether FIELD_START has to be issued, as the - // current command is a command with range - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - // Check whether a command starts now, without advancing the peek // cursor char c; @@ -606,7 +787,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Try to parse a command if (Utils::isIdentifierStartCharacter(c)) { // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -614,11 +795,11 @@ OsmlStreamParser::State OsmlStreamParser::parse() State res = parseCommand(token.location.getStart(), type == OsmlTokens.AnnotationStart); switch (res) { - case State::ERROR: + case State::IRRECOVERABLE_ERROR: throw LoggableException( "Last error was irrecoverable, ending parsing " "process"); - case State::NONE: + case State::RECOVERABLE_ERROR: continue; default: return res; @@ -632,78 +813,64 @@ OsmlStreamParser::State OsmlStreamParser::parse() // If this was an annotation start token, add the parsed < to the // output + SourceOffset charStart = token.location.getStart(); + SourceOffset charEnd = reader.getPeekOffset(); if (type == OsmlTokens.AnnotationStart) { - handler.append('<', token.location.getStart(), - token.location.getStart() + 1); + data.append('<', charStart, charStart + 1); + charStart = charStart + 1; } - handler.append(c, token.location.getStart(), - reader.getPeekOffset()); + // Append the character to the output data, mark it as protected + data.append(c, charStart, charEnd, true); reader.consumePeek(); continue; } else if (type == Tokens::Data) { - // Check whether FIELD_START has to be issued before appending text - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - reader.consumePeek(); continue; + } else if (type == OsmlTokens.LineComment) { + reader.consumePeek(); + parseLineComment(); + continue; + } else if (type == OsmlTokens.BlockCommentStart) { + reader.consumePeek(); + parseBlockComment(); + continue; } // A non-text token was reached, make sure all pending data commands // have been issued - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } // We will handle the token now, consume the peeked characters reader.consumePeek(); - // Update the location to the current token location + // Synchronize the location with the current token location location = token.location; - if (token.id == OsmlTokens.LineComment) { - parseLineComment(); - } else if (token.id == OsmlTokens.BlockCommentStart) { - parseBlockComment(); - } else if (token.id == OsmlTokens.FieldStart) { - Command &cmd = commands.top(); - if (!cmd.inField) { - cmd.inField = true; - return State::FIELD_START; - } - logger.error( - "Got field start token \"{\", but no command for which to " - "start the field. Write \"\\{\" to insert this sequence as " - "text.", - token); + if (token.id == OsmlTokens.FieldStart) { + cmd().pushField(false, token.location); + return State::FIELD_START; } else if (token.id == OsmlTokens.FieldEnd) { - if (closeField()) { + // Remove all commands from the list that currently are not in any + // field + while (!cmd().inField()) { + commands.pop(); + } + + // If the remaining command is not in a range field, remove this + // command + if (cmd().inNonRangeField()) { + cmd().popField(); return State::FIELD_END; } logger.error( - "Got field end token \"}\", but there is no field to end. " - "Write \"\\}\" to insert this sequence as text.", + "Got field end token \"}\", but there is no field to end.", token); } else if (token.id == OsmlTokens.DefaultFieldStart) { - // Try to start a default field the first time the token is reached - Command &topCmd = commands.top(); - if (!topCmd.inField) { - topCmd.inField = true; - topCmd.inDefaultField = true; - return State::FIELD_START; - } - logger.error( - "Got default field start token \"{!\", but no command for " - "which to start the field. Write \"\\{!\" to insert this " - "sequence as text", - token); + cmd().pushField(true, token.location); + return State::FIELD_START; } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -717,38 +884,103 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Issue available data - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } // Make sure all open commands and fields have been ended at the end of the // stream - while (commands.size() > 1) { - Command &cmd = commands.top(); - if (cmd.inField || cmd.hasRange) { - logger.error("Reached end of stream, but command \"" + - cmd.name.asString() + "\" has not been ended", - cmd.name); + while (true) { + bool topLevelCommand = commands.size() == 1U; + if (cmd().inField()) { + // If the stream ended with an open range field, issue information + // about the range field + if (cmd().inRangeField() && !topLevelCommand) { + // Inform about the still open command itself + logger.error("Reached end of stream, but command \"" + + getCommandName().asString() + + "\" has not been ended", + getCommandName()); + } else { + // Issue information about still open fields + const std::vector<Field> &fields = cmd().getFields(); + if (!fields.empty()) { + logger.error( + std::string( + "Reached end of stream, but field is still open."), + fields.back().location); + } + } + } + if (!topLevelCommand) { + commands.pop(); + } else { + break; } - commands.pop(); } location = SourceLocation{reader.getSourceId(), reader.getOffset()}; return State::END; } +TokenId OsmlStreamParserImpl::registerToken(const std::string &token) +{ + return tokenizer.registerToken(token, false); +} + +void OsmlStreamParserImpl::unregisterToken(TokenId id) +{ + assert(tokenizer.unregisterToken(id)); +} + +/* Class OsmlStreamParser */ + +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) + : impl(new OsmlStreamParserImpl(reader, logger)) +{ +} + +OsmlStreamParser::~OsmlStreamParser() +{ + // Stub needed because OsmlStreamParserImpl is incomplete in header +} + +OsmlStreamParser::State OsmlStreamParser::parse() +{ + return static_cast<State>(impl->parse()); +} + +const TokenizedData &OsmlStreamParser::getData() const +{ + return impl->getData(); +} + const Variant &OsmlStreamParser::getCommandName() const { - return commands.top().name; + return impl->getCommandName(); } const Variant &OsmlStreamParser::getCommandArguments() const { - return commands.top().arguments; + return impl->getCommandArguments(); +} + +const SourceLocation &OsmlStreamParser::getLocation() const +{ + return impl->getLocation(); +} + +bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); } + +bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); } + +TokenId OsmlStreamParser::registerToken(const std::string &token) +{ + return impl->registerToken(token); } -bool OsmlStreamParser::inDefaultField() const +void OsmlStreamParser::unregisterToken(TokenId id) { - return commands.top().inRangeField || commands.top().inDefaultField; + impl->unregisterToken(id); } } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..b7e64f7 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,68 +29,53 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <cstdint> +#include <memory> -#include <core/common/Variant.hpp> -#include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/stack/Callbacks.hpp> namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; +class TokenizedData; +class Variant; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is + * data from the stream reader. The reader makes sure the incomming stream is * syntactically valid and tries to recorver from most errors. If an error is * irrecoverable (this is the case for errors with wrong nesting of commands or * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. + * LoggableException is thrown. In short, the OsmlStreamParser can be described + * as a SAX parser for OSML. */ -class OsmlStreamParser { +class OsmlStreamParser: public parser_stack::ParserCallbacks { public: /** * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ - enum class State { + enum class State : uint8_t { /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). + * State returned if the start of a command has been read. Use the + * getCommandName(), getCommandArguments() and inRangeCommand() + * functions the retrieve more information about the command that was + * just started. */ - COMMAND, + COMMAND_START = 0, /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - - /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. - */ - ENTITY, - - /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). - */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. + * State returned if a range command or range annotation has just ended. + * This state is not returned for non-range commands (as the actual end + * of a command is context dependent). */ - ANNOTATION_END, + RANGE_END = 1, /** * State returned if a new field started. The reader assures that the @@ -98,223 +83,46 @@ public: * is not started if data has been given outside of a field. The * field number is set to the current field index. */ - FIELD_START, + FIELD_START = 2, /** * State returned if the current field ends. The reader assures that a * field was actually open. */ - FIELD_END, + FIELD_END = 3, /** - * The end of the stream has been reached. + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). */ - END, + ANNOTATION_START = 4, /** - * Returned from internal functions if nothing should be done. + * State returned if an annotation ends. The reader indicates which + * annotation ends. */ - NONE, + ANNOTATION_END = 5, /** - * Returned from internal function to indicate irrecoverable errors. + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; + DATA = 6, /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). - */ - bool inRangeField : 1; - - /** - * Set to true if we are currently in a field that has been especially - * marked as default field (using the "|") syntax. - */ - bool inDefaultField : 1; - - /** - * Default constructor. - */ - Command() - : hasRange(false), - inField(false), - inRangeField(false), - inDefaultField() - { - } - - /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently are inside the - * outer field of a ranged command. - * @param inDefaultField is set to true if we currently are in a - * specially marked default field. - */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField, bool inDefaultField) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField), - inDefaultField(inDefaultField) - { - } + * The end of the stream has been reached. + */ + END = 7 }; private: /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. + * Pointer at the class containing the internal implementation (according + * to the PIMPL idiom). */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - Tokenizer tokenizer; - - /** - * Stack containing the current commands. - */ - std::stack<Command> commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). - */ - Variant data; - - /** - * Contains the location of the last token. - */ - SourceLocation location; - - /** - * Contains the field index of the current command. - */ - size_t fieldIdx; - - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. - */ - void pushCommand(Variant commandName, Variant commandArguments, - bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. - * - * @param start is the start byte offset of the command (including the - * backslash) - * @param isAnnotation if true, the command is not returned as command, but - * as annotation start. - * @return true if a command was actuall parsed, false otherwise. - */ - State parseCommand(size_t start, bool isAnnotation); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @param handler is the data handler that contains the data that may be - * returned to the user. - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(DataHandler &handler); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - - /** - * Closes a currently open field. Note that the command will be removed from - * the internal command stack if the field that is being closed is a - * field marked as default field. - * - * @return true if the field could be closed, false if there was no field - * to close. - */ - bool closeField(); + std::unique_ptr<OsmlStreamParserImpl> impl; public: /** @@ -328,6 +136,12 @@ public: OsmlStreamParser(CharReader &reader, Logger &logger); /** + * Destructor of the OsmlStreamParser, needed to destroy the incomplete + * OsmlStreamParserImpl. + */ + ~OsmlStreamParser(); + + /** * Continues parsing. Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter * functions to get more information about the current state, such as the @@ -338,17 +152,9 @@ public: State parse(); /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. - * - * @return a reference at a variant containing the data parsed by the - * "parse" function. - */ - const Variant &getData() const { return data; } - - /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing name and location of the * parsed command. @@ -357,7 +163,8 @@ public: /** * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. + * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END + * was returned by the "parse" function. * * @return a reference at a variant containing arguments given to the * command. @@ -365,19 +172,43 @@ public: const Variant &getCommandArguments() const; /** - * Returns true if the current field is the "default" field. This is true if - * the parser either is in the outer range of a range command or inside a - * field that has been especially marked as "default" field (using the "|" - * syntax). + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. */ - bool inDefaultField() const; + const TokenizedData &getData() const; + + /** + * Returns the location of the current token. + */ + const SourceLocation &getLocation() const; /** - * Returns a reference at the char reader. + * Returns true if the currently started command is a range command, only + * valid if State::COMMAND_START or State::ANNOTATION_START was returned by + * the "parse" function. * - * @return the last internal token location. + * @return true if the command is started is a range command, false + * otherwise. */ - const SourceLocation &getLocation() const { return location; } + bool inRangeCommand() const; + + /** + * Returns true if the current field is the "default" field. This is true if + * the parser either is in the outer range of a range command or inside a + * field that has been especially marked as "default" field (using the "{!" + * syntax). Only valid if State::FIELD_START was returned by the "parse" + * function. + * + * @return true if the current field was marked as default field (using the + * "{!" syntax). + */ + bool inDefaultField() const; + + TokenId registerToken(const std::string &token) override; + void unregisterToken(TokenId token) override; }; } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..79a8dbe 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,7 @@ #include <core/common/Variant.hpp> #include <core/common/VariantReader.hpp> #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -40,6 +40,11 @@ namespace ousia { class OsxmlEventParserData { public: /** + * Current character data buffer. + */ + TokenizedData data; + + /** * Contains the current depth of the parsing process. */ ssize_t depth; @@ -52,35 +57,13 @@ public: ssize_t annotationEndTagDepth; /** - * Current character data buffer. - */ - std::vector<char> textBuf; - - /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector<char> whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - - /** - * Current character data start. - */ - size_t textStart; - - /** - * Current character data end. - */ - size_t textEnd; - - /** - * Default constructor. + * Constructor taking the sourceId of the file from which the XML is being + * parsed. + * + * @param sourceId is the source if of the XML file from which the data is + * currently being parsed. */ - OsxmlEventParserData(); + OsxmlEventParserData(SourceId sourceId); /** * Increments the depth. @@ -103,14 +86,6 @@ public: * @return true if character data is available. */ bool hasText(); - - /** - * Returns a Variant containing the character data and its location. - * - * @return a string variant containing the text data and the character - * location. - */ - Variant getText(SourceId sourceId); }; /* Class GuardedExpatXmlParser */ @@ -168,7 +143,7 @@ public: static const std::string TOP_LEVEL_TAG{"ousia"}; /** - * Prefix used to indicate the start of an annoation (note the trailing colon) + * Prefix used to indicate the start of an annoation (note the trailing colon). */ static const std::string ANNOTATION_START_PREFIX{"a:start:"}; @@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Read the argument locations -- this is only a stupid and slow hack, @@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); - parser->getEvents().command(nameVar, args); + parser->getEvents().commandStart(nameVar, args); } } @@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Abort if the special ousia tag ends here @@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) return; } - // Issue the "fieldEnd" event - parser->getEvents().fieldEnd(); + // Issue the "rangeEnd" event + parser->getEvents().rangeEnd(); } static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) @@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) // Synchronize the logger position SourceLocation loc = xmlSyncLoggerPosition(p, ulen); - // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); - OsxmlEventParserData &data = parser->getData(); - std::vector<char> &textBuf = data.textBuf; - std::vector<char> &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } - } + // Append the data to the buffer + parser->getData().data.append(std::string(s, ulen), loc.getStart()); } /* Class OsxmlEvents */ @@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ -OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) +OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) + : data(sourceId), depth(0), annotationEndTagDepth(-1) { } @@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag() return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } -bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } - -Variant OsxmlEventParserData::getText(SourceId sourceId) -{ - // Create a variant containing the string data and the location - Variant var = - Variant::fromString(std::string{textBuf.data(), textBuf.size()}); - var.setLocation({sourceId, textStart, textEnd}); - - // Reset the text buffers - textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; - textStart = 0; - textEnd = 0; - - // Return the variant - return var; -} +bool OsxmlEventParserData::hasText() { return !data.empty(); } /* Class OsxmlEventParser */ @@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), - data(new OsxmlEventParserData()) + data(new OsxmlEventParserData(reader.getSourceId())) { } @@ -532,16 +460,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..4c5a485 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include <memory> #include <string> -#include <core/common/Whitespace.hpp> - namespace ousia { // Forward declarations @@ -61,7 +59,8 @@ public: * @param args is a map containing the arguments that were given to the * command. */ - virtual void command(const Variant &name, const Variant::mapType &args) = 0; + virtual void commandStart(const Variant &name, + const Variant::mapType &args) = 0; /** * Called whenever an annotation starts. Note that this implicitly always @@ -90,24 +89,17 @@ public: const Variant &elementName) = 0; /** - * Called whenever the default field which was implicitly started by - * commandStart or annotationStart ends. Note that this does not end the - * range of an annotation, but the default field of the annotation. To - * signal the end of the annotation this, the annotationEnd method will be - * invoked. + * Called whenever the command or annotation tags end. */ - virtual void fieldEnd() = 0; + virtual void rangeEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. * - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a TokenizedData instance containing the string data that + * was found in the XML file. */ - virtual void data(const Variant &data) = 0; + virtual void data(const TokenizedData &data) = 0; }; /** @@ -135,11 +127,6 @@ private: Logger &logger; /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - - /** * Data to be used by the internal functions. */ std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +158,6 @@ public: void parse(); /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - - /** * Returns the internal CharReader reference. * * @return the CharReader reference. @@ -207,7 +179,9 @@ public: OsxmlEvents &getEvents() const; /** - * Returns a reference at the internal data. + * Used internally to fetch a reference at the internal data. + * + * @return a reference at the internal OsxmlEventParserData structure. */ OsxmlEventParserData &getData() const; }; diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c216855..10cc77a 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,6 +16,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <core/common/Variant.hpp> +#include <core/common/CharReader.hpp> +#include <core/parser/stack/Callbacks.hpp> #include <core/parser/stack/GenericParserStates.hpp> #include <core/parser/stack/Stack.hpp> #include <core/parser/ParserContext.hpp> @@ -30,7 +33,7 @@ using namespace parser_stack; /** * Class containing the actual OsxmlParser implementation. */ -class OsxmlParserImplementation : public OsxmlEvents { +class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks { private: /** * Actual xml parser -- converts the xml stream into a set of events. @@ -54,7 +57,7 @@ public: */ OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) : parser(reader, *this, ctx.getLogger()), - stack(ctx, GenericParserStates) + stack(*this, ctx, GenericParserStates) { } @@ -63,17 +66,16 @@ public: */ void parse() { parser.parse(); } - void command(const Variant &name, const Variant::mapType &args) override + void commandStart(const Variant &name, + const Variant::mapType &args) override { - stack.command(name, args); - stack.fieldStart(true); + stack.commandStart(name, args, true); } void annotationStart(const Variant &name, const Variant::mapType &args) override { - stack.annotationStart(name, args); - stack.fieldStart(true); + stack.annotationStart(name, args, true); } void annotationEnd(const Variant &className, @@ -82,9 +84,19 @@ public: stack.annotationEnd(className, elementName); } - void fieldEnd() override { stack.fieldEnd(); } + void rangeEnd() override { stack.rangeEnd(); } - void data(const Variant &data) override { stack.data(data); } + void data(const TokenizedData &data) override { stack.data(data); } + + TokenId registerToken(const std::string &token) override + { + return Tokens::Empty; + } + + void unregisterToken(TokenId id) override + { + // Do nothing here + } }; /* Class OsxmlParser */ |