diff options
Diffstat (limited to 'src/formats')
| -rw-r--r-- | src/formats/osml/OsmlParser.cpp | 30 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 800 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 331 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 138 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 48 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 30 | 
6 files changed, 671 insertions, 706 deletions
| diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp index 16e7aa4..d169393 100644 --- a/src/formats/osml/OsmlParser.cpp +++ b/src/formats/osml/OsmlParser.cpp @@ -73,7 +73,7 @@ public:  	    : logger(ctx.getLogger()),  	      ctx(ctx),  	      parser(reader, logger), -	      stack(ctx, GenericParserStates) +	      stack(parser, ctx, GenericParserStates)  	{  	} @@ -88,7 +88,7 @@ public:  			OsmlStreamParser::State state = parser.parse();  			logger.setDefaultLocation(parser.getLocation());  			switch (state) { -				case OsmlStreamParser::State::COMMAND: { +				case OsmlStreamParser::State::COMMAND_START: {  					// Implicitly create a "document" element if the first  					// command is not any other top-level command  					if (needsDocument) { @@ -96,23 +96,23 @@ public:  						    parser.getCommandName().asString();  						if (cmd != "typesystem" && cmd != "document" &&  						    cmd != "ontology") { -							stack.command("document", Variant::mapType{}); +							stack.commandStart("document", Variant::mapType{}, +							                   false);  						}  						needsDocument = false;  					} -					stack.command(parser.getCommandName(), -					              parser.getCommandArguments().asMap()); +					stack.commandStart(parser.getCommandName(), +					                   parser.getCommandArguments().asMap(), +					                   parser.inRangeCommand());  					break;  				} -				case OsmlStreamParser::State::DATA: -					stack.data(parser.getData()); -					break; -				case OsmlStreamParser::State::ENTITY: -					// TODO +				case OsmlStreamParser::State::RANGE_END: +					stack.rangeEnd();  					break;  				case OsmlStreamParser::State::ANNOTATION_START:  					stack.annotationStart(parser.getCommandName(), -					                      parser.getCommandArguments().asMap()); +					                      parser.getCommandArguments().asMap(), +					                      parser.inRangeCommand());  					break;  				case OsmlStreamParser::State::ANNOTATION_END: {  					Variant elementName = Variant::fromString(std::string{}); @@ -130,11 +130,9 @@ public:  				case OsmlStreamParser::State::FIELD_END:  					stack.fieldEnd();  					break; -				case OsmlStreamParser::State::NONE: -				case OsmlStreamParser::State::ERROR: -					// Internally used in OsmlStreamParser, these states should -					// never occur. Just contiunue. -					continue; +				case OsmlStreamParser::State::DATA: +					stack.data(parser.getData()); +					break;  				case OsmlStreamParser::State::END:  					return;  			} diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..64a489d 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -16,179 +16,421 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ +#include <cassert> +#include <stack> +#include <vector> +  #include <core/common/CharReader.hpp>  #include <core/common/Logger.hpp>  #include <core/common/Utils.hpp> +#include <core/common/Variant.hpp>  #include <core/common/VariantReader.hpp> +#include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp> +  #include "OsmlStreamParser.hpp"  namespace ousia { +namespace {  /** - * Plain format default tokenizer. + * Osml format default tokenizer. Registers the primary tokens in its + * constructor. A single, static instance of this class is created as + * "OsmlTokens", which is copied to the Tokenizer instance of + * OsmlStreamParserImpl.   */ -class PlainFormatTokens : public Tokenizer { +class OsmlFormatTokens : public Tokenizer {  public: +	TokenId Backslash; +	TokenId LineComment; +	TokenId BlockCommentStart; +	TokenId BlockCommentEnd; +	TokenId FieldStart; +	TokenId FieldEnd; +	TokenId DefaultFieldStart; +	TokenId AnnotationStart; +	TokenId AnnotationEnd; +  	/** -	 * Id of the backslash token. +	 * Registers the plain format tokens in the internal tokenizer.  	 */ -	TokenId Backslash; +	OsmlFormatTokens() +	{ +		Backslash = registerToken("\\"); +		LineComment = registerToken("%"); +		BlockCommentStart = registerToken("%{"); +		BlockCommentEnd = registerToken("}%"); +		FieldStart = registerToken("{"); +		FieldEnd = registerToken("}"); +		DefaultFieldStart = registerToken("{!"); +		AnnotationStart = registerToken("<\\"); +		AnnotationEnd = registerToken("\\>"); +	} +}; + +/** + * Instance of OsmlFormatTokens used to initialize the internal tokenizer + * instance of OsmlStreamParserImpl. + */ +static const OsmlFormatTokens OsmlTokens; +/** + * Structure representing a field. + */ +struct Field {  	/** -	 * Id of the line comment token. +	 * Specifies whether this field was marked as default field.  	 */ -	TokenId LineComment; +	bool defaultField;  	/** -	 * Id of the block comment start token. +	 * Location at which the field was started.  	 */ -	TokenId BlockCommentStart; +	SourceLocation location;  	/** -	 * Id of the block comment end token. +	 * Constructor of the Field structure, initializes all member variables with +	 * the given values. +	 * +	 * @param defaultField is a flag specifying whether this field is a default +	 * field. +	 * @param location specifies the location at which the field was started.  	 */ -	TokenId BlockCommentEnd; +	Field(bool defaultField = false, +	      const SourceLocation &location = SourceLocation{}) +	    : defaultField(defaultField), location(location) +	{ +	} +}; +/** + * Entry used for the command stack. + */ +class Command { +private:  	/** -	 * Id of the field start token. +	 * Name and location of the current command.  	 */ -	TokenId FieldStart; +	Variant name;  	/** -	 * Id of the field end token. +	 * Arguments that were passed to the command.  	 */ -	TokenId FieldEnd; +	Variant arguments;  	/** -	 * Id of the default field start token. +	 * Vector used as stack for holding the number of opening/closing braces +	 * and the corresponding "isDefaultField" flag.  	 */ -	TokenId DefaultFieldStart; +	std::vector<Field> fields;  	/** -	 * Id of the annotation start token. +	 * Set to true if this is a command with clear begin and end.  	 */ -	TokenId AnnotationStart; +	bool hasRange; +public:  	/** -	 * Id of the annotation end token. +	 * Default constructor, marks this command as normal, non-range command.  	 */ -	TokenId AnnotationEnd; +	Command() : hasRange(false) {}  	/** -	 * Registers the plain format tokens in the internal tokenizer. +	 * Constructor of the Command class. +	 * +	 * @param name is a string variant with name and location of the +	 * command. +	 * @param arguments is a map variant with the arguments given to the +	 * command. +	 * @param hasRange should be set to true if this is a command with +	 * explicit range.  	 */ -	PlainFormatTokens() +	Command(Variant name, Variant arguments, bool hasRange) +	    : name(std::move(name)), +	      arguments(std::move(arguments)), +	      hasRange(hasRange)  	{ -		Backslash = registerToken("\\"); -		LineComment = registerToken("%"); -		BlockCommentStart = registerToken("%{"); -		BlockCommentEnd = registerToken("}%"); -		FieldStart = registerToken("{"); -		FieldEnd = registerToken("}"); -		DefaultFieldStart = registerToken("{!"); -		AnnotationStart = registerToken("<\\"); -		AnnotationEnd = registerToken("\\>");  	} -}; -static const PlainFormatTokens OsmlTokens; +	/** +	 * Returns a reference at the variant representing name and location of the +	 * command. +	 * +	 * @return a variant containing name and location of the command. +	 */ +	const Variant &getName() const { return name; } -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private:  	/** -	 * Internal character buffer. +	 * Returns a reference at the variant containing name, value and location of +	 * the arguments. +	 * +	 * @return the arguments stored for the command.  	 */ -	std::vector<char> buf; +	const Variant &getArguments() const { return arguments; }  	/** -	 * Start location of the character data. +	 * Returns a reference at the internal field list. This list should be used +	 * for printing error messages when fields are still open although the outer +	 * range field closes. +	 * +	 * @return a const reference at the internal field vector.  	 */ -	SourceOffset start; +	const std::vector<Field> &getFields() const { return fields; }  	/** -	 * End location of the character data. +	 * Returns true if this command is currently in a default field. +	 * +	 * @return true if the current field on the field stack was explicitly +	 * marked as default field. If the field stack is empty, true is returned +	 * if this is a range command.  	 */ -	SourceOffset end; +	bool inDefaultField() const +	{ +		return (!fields.empty() && fields.back().defaultField) || +		       (fields.empty() && hasRange); +	} -public:  	/** -	 * Default constructor, initializes start and end with zeros. +	 * Returns true if this command currently is in any field. +	 * +	 * @return true if a field is on the stack or this is a range commands. +	 * Range commands always are in a field.  	 */ -	DataHandler() : start(0), end(0) {} +	bool inField() const { return !fields.empty() || hasRange; }  	/** -	 * Returns true if the internal buffer is empty. +	 * Returns true if this command currently is in a range field.  	 * -	 * @return true if no characters were added to the internal buffer, false -	 * otherwise. +	 * @return true if the command has a range and no other ranges are on the +	 * stack.  	 */ -	bool isEmpty() { return buf.empty(); } +	bool inRangeField() const { return fields.empty() && hasRange; }  	/** -	 * Appends a single character to the internal buffer. +	 * Returns true if this command currently is in a non-range field.  	 * -	 * @param c is the character that should be added to the internal buffer. -	 * @param charStart is the start position of the character. -	 * @param charEnd is the end position of the character. +	 * @return true if the command is in a field, but the field is not the field +	 * constructed by the "range"  	 */ -	void append(char c, SourceOffset charStart, SourceOffset charEnd) +	bool inNonRangeField() const { return !fields.empty(); } + +	/** +	 * Pushes another field onto the field stack of this command. +	 * +	 * @param defaultField if true, explicitly marks this field as default +	 * field. +	 * @param location is the source location at which the field was started. +	 * Used for error messages in which the user is notified about an error with +	 * too few closing fields. +	 */ +	void pushField(bool defaultField = false, +	               const SourceLocation &location = SourceLocation{})  	{ -		if (isEmpty()) { -			start = charStart; -		} -		buf.push_back(c); -		end = charEnd; +		fields.emplace_back(defaultField, location);  	}  	/** -	 * Appends a string to the internal buffer. +	 * Removes another field from the field stack of this command, returns true +	 * if the operation was successful.  	 * -	 * @param s is the string that should be added to the internal buffer. -	 * @param stringStart is the start position of the string. -	 * @param stringEnd is the end position of the string. +	 * @return true if there was a field to pop on the stack, false otherwise.  	 */ -	void append(const std::string &s, SourceOffset stringStart, -	            SourceOffset stringEnd) +	bool popField()  	{ -		if (isEmpty()) { -			start = stringStart; +		if (!fields.empty()) { +			fields.pop_back(); +			return true;  		} -		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); -		end = stringEnd; +		return false;  	} +}; +} + +/* Class OsmlStreamParserImpl */ + +/** + * Internal implementation of OsmlStreamParser. + */ +class OsmlStreamParserImpl { +public: +	/** +	 * State enum compatible with OsmlStreamParserState but extended by two more +	 * entries (END and NONE). +	 */ +	enum class State : uint8_t { +		COMMAND_START = 0, +		RANGE_END = 1, +		FIELD_START = 2, +		FIELD_END = 3, +		ANNOTATION_START = 4, +		ANNOTATION_END = 5, +		DATA = 6, +		END = 7, +		RECOVERABLE_ERROR = 8, +		IRRECOVERABLE_ERROR = 9 +	}; + +private: +	/** +	 * Reference to the CharReader instance from which the incomming bytes are +	 * read. +	 */ +	CharReader &reader;  	/** -	 * Converts the internal buffer to a variant with attached location -	 * information. +	 * Reference at the logger instance to which all error messages are sent. +	 */ +	Logger &logger; + +	/** +	 * Tokenizer instance used to read individual tokens from the text. +	 */ +	Tokenizer tokenizer; + +	/** +	 * Stack containing the current commands. +	 */ +	std::stack<Command> commands; + +	/** +	 * Variant containing the tokenized data that was returned from the +	 * tokenizer as data. +	 */ +	TokenizedData data; + +	/** +	 * Variable containing the current location of the parser. +	 */ +	SourceLocation location; + +	/** +	 * Function used internally to parse an identifier.  	 * -	 * @param sourceId is the source id which is needed for building the -	 * location information. -	 * @return a Variant with the internal buffer content as string and -	 * the correct start and end location. +	 * @param start is the start byte offset of the identifier (including the +	 * backslash). +	 * @param allowNSSep should be set to true if the namespace separator is +	 * allowed in the identifier name. Issues error if the namespace separator +	 * is placed incorrectly.  	 */ -	Variant toVariant(SourceId sourceId) -	{ -		Variant res = Variant::fromString(std::string(buf.data(), buf.size())); -		res.setLocation({sourceId, start, end}); -		return res; -	} +	Variant parseIdentifier(size_t start, bool allowNSSep = false); + +	/** +	 * Function used internally to handle the special "\begin" command. +	 * +	 * @return an internal State specifying whether an error occured (return +	 * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a +	 * command was actually started (return value State::COMMAND_START). +	 */ +	State parseBeginCommand(); + +	/** +	 * Function used internally to handle the special "\end" command. +	 * +	 * @return an internal State specifying whether an error occured (return +	 * values State::REOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a +	 * command was actually ended (return value State::RANGE_END). +	 */ +	State parseEndCommand(); + +	/** +	 * Parses the command arguments. Handles errors if the name of the command +	 * was given using the hash notation and as a name field. +	 * +	 * @param commandArgName is the name argument that was given using the hash +	 * notation. +	 * @return a map variant containing the arguments. +	 */ +	Variant parseCommandArguments(Variant commandArgName); + +	/** +	 * Function used internally to parse a command. +	 * +	 * @param start is the start byte offset of the command (including the +	 * backslash) +	 * @param isAnnotation if true, the command is not returned as command, but +	 * as annotation start. +	 * @return true if a command was actuall parsed, false otherwise. +	 */ +	State parseCommand(size_t start, bool isAnnotation); + +	/** +	 * Function used internally to parse a block comment. +	 */ +	void parseBlockComment(); + +	/** +	 * Function used internally to parse a generic comment. +	 */ +	void parseLineComment(); + +	/** +	 * Pushes the parsed command onto the command stack. +	 */ +	void pushCommand(Variant commandName, Variant commandArguments, +	                 bool hasRange); + +	/** +	 * Checks whether there is any data pending to be issued, if yes, resets the +	 * currently peeked characters and returns true. +	 * +	 * @return true if there was any data and DATA should be returned by the +	 * parse function, false otherwise. +	 */ +	bool checkIssueData(); + +	/** +	 * Returns a reference at the current command at the top of the command +	 * stack. +	 * +	 * @return a reference at the top command in the command stack. +	 */ +	Command &cmd() { return commands.top(); } + +	/** +	 * Returns a reference at the current command at the top of the command +	 * stack. +	 * +	 * @return a reference at the top command in the command stack. +	 */ +	const Command &cmd() const { return commands.top(); } + +public: +	/** +	 * Constructor of the OsmlStreamParserImpl class. Attaches the new +	 * OsmlStreamParserImpl to the given CharReader and Logger instances. +	 * +	 * @param reader is the reader instance from which incomming characters +	 * should be read. +	 * @param logger is the logger instance to which errors should be written. +	 */ +	OsmlStreamParserImpl(CharReader &reader, Logger &logger); + +	State parse(); + +	TokenId registerToken(const std::string &token); +	void unregisterToken(TokenId id); + +	const TokenizedData &getData() const { return data; } +	const Variant &getCommandName() const { return cmd().getName(); } +	const Variant &getCommandArguments() const { return cmd().getArguments(); } +	const SourceLocation &getLocation() const { return location; } +	bool inRangeCommand() const { return cmd().inRangeField(); }; +	bool inDefaultField() const { return cmd().inDefaultField(); }  }; -OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) +/* Class OsmlStreamParserImpl */ + +OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger)      : reader(reader), logger(logger), tokenizer(OsmlTokens)  { -	// Place an intial command representing the complete file on the stack -	commands.push(Command{"", Variant::mapType{}, true, true, true, false}); +	commands.emplace("", Variant::mapType{}, true);  } -Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep)  {  	bool first = true; -	bool hasCharSiceNSSep = false; +	bool hasCharSinceNSSep = false;  	std::vector<char> identifier;  	size_t end = reader.getPeekOffset();  	char c, c2; @@ -197,7 +439,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  		if ((first && Utils::isIdentifierStartCharacter(c)) ||  		    (!first && Utils::isIdentifierCharacter(c))) {  			identifier.push_back(c); -		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && +		} else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&  		           Utils::isIdentifierStartCharacter(c2)) {  			identifier.push_back(c);  		} else { @@ -214,8 +456,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  		// This is no longer the first character  		first = false; -		// Advance the hasCharSiceNSSep flag -		hasCharSiceNSSep = allowNSSep && (c != ':'); +		// Advance the hasCharSinceNSSep flag +		hasCharSinceNSSep = allowNSSep && (c != ':');  		end = reader.getPeekOffset();  		reader.consumePeek(); @@ -228,20 +470,20 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  	return res;  } -OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand()  {  	// Expect a '{' after the command  	reader.consumeWhitespace();  	if (!reader.expect('{')) {  		logger.error("Expected \"{\" after \\begin", reader); -		return State::NONE; +		return State::RECOVERABLE_ERROR;  	}  	// Parse the name of the command that should be opened  	Variant commandName = parseIdentifier(reader.getOffset(), true);  	if (commandName.asString().empty()) {  		logger.error("Expected identifier", commandName); -		return State::ERROR; +		return State::IRRECOVERABLE_ERROR;  	}  	// Check whether the next character is a '#', indicating the start of the @@ -257,7 +499,7 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()  	if (!reader.expect('}')) {  		logger.error("Expected \"}\"", reader); -		return State::ERROR; +		return State::IRRECOVERABLE_ERROR;  	}  	// Parse the arguments @@ -266,28 +508,15 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()  	// Push the command onto the command stack  	pushCommand(std::move(commandName), std::move(commandArguments), true); -	return State::COMMAND; -} - -static bool checkStillInField(const OsmlStreamParser::Command &cmd, -                              const Variant &endName, Logger &logger) -{ -	if (cmd.inField && !cmd.inRangeField) { -		logger.error(std::string("\\end in open field of command \"") + -		                 cmd.name.asString() + std::string("\""), -		             endName); -		logger.note(std::string("Open command started here:"), cmd.name); -		return true; -	} -	return false; +	return State::COMMAND_START;  } -OsmlStreamParser::State OsmlStreamParser::parseEndCommand() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand()  {  	// Expect a '{' after the command  	if (!reader.expect('{')) {  		logger.error("Expected \"{\" after \\end", reader); -		return State::NONE; +		return State::RECOVERABLE_ERROR;  	}  	// Fetch the name of the command that should be ended here @@ -296,56 +525,58 @@ OsmlStreamParser::State OsmlStreamParser::parseEndCommand()  	// Make sure the given command name is not empty  	if (name.asString().empty()) {  		logger.error("Expected identifier", name); -		return State::ERROR; +		return State::IRRECOVERABLE_ERROR;  	}  	// Make sure the command name is terminated with a '}'  	if (!reader.expect('}')) {  		logger.error("Expected \"}\"", reader); -		return State::ERROR; -	} - -	// Unroll the command stack up to the last range command -	while (!commands.top().hasRange) { -		if (checkStillInField(commands.top(), name, logger)) { -			return State::ERROR; +		return State::IRRECOVERABLE_ERROR; +	} + +	// Unroll the command stack up to the last range command, make sure we do +	// not intersect with any open field +	while (!cmd().inRangeField()) { +		if (cmd().inField()) { +			logger.error(std::string("\\end in open field of command \"") + +			                 cmd().getName().asString() + std::string("\""), +			             name); +			const std::vector<Field> &fields = cmd().getFields(); +			for (const Field &field : fields) { +				logger.note(std::string("Still open field started here: "), +				            field.location); +			} +			return State::IRRECOVERABLE_ERROR;  		}  		commands.pop();  	} -	// Make sure we're not in an open field of this command -	if (checkStillInField(commands.top(), name, logger)) { -		return State::ERROR; -	} -  	// Special error message if the top-level command is reached  	if (commands.size() == 1) {  		logger.error(std::string("Cannot end command \"") + name.asString() +  		                 std::string("\" here, no command open"),  		             name); -		return State::ERROR; +		return State::IRRECOVERABLE_ERROR;  	} -	// Inform the about command mismatches -	const Command &cmd = commands.top(); -	if (commands.top().name.asString() != name.asString()) { -		logger.error(std::string("Trying to end command \"") + -		                 cmd.name.asString() + +	// Inform the user about command mismatches, copy the current command +	// descriptor before popping it from the stack +	if (getCommandName().asString() != name.asString()) { +		logger.error(std::string("Trying to end command \"") + name.asString() +  		                 std::string("\", but open command is \"") + -		                 name.asString() + std::string("\""), +		                 getCommandName().asString() + std::string("\""),  		             name); -		logger.note("Last command was opened here:", cmd.name); -		return State::ERROR; +		logger.note("Open command started here:", getCommandName()); +		return State::IRRECOVERABLE_ERROR;  	} -	// Set the location to the location of the command that was ended, then end -	// the current command +	// End the current command  	location = name.getLocation();  	commands.pop(); -	return cmd.inRangeField ? State::FIELD_END : State::NONE; +	return State::RANGE_END;  } -Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName)  {  	// Parse the arguments using the universal VariantReader  	Variant commandArguments; @@ -371,29 +602,14 @@ Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)  	return commandArguments;  } -void OsmlStreamParser::pushCommand(Variant commandName, -                                   Variant commandArguments, bool hasRange) -{ -	// Store the location on the stack -	location = commandName.getLocation(); - -	// Place the command on the command stack, remove the last commands if we're -	// not currently inside a field of these commands -	while (!commands.top().inField) { -		commands.pop(); -	} -	commands.push(Command{std::move(commandName), std::move(commandArguments), -	                      hasRange, false, false, false}); -} - -OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, -                                                       bool isAnnotation) +OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand( +    size_t start, bool isAnnotation)  {  	// Parse the commandName as a first identifier  	Variant commandName = parseIdentifier(start, true);  	if (commandName.asString().empty()) {  		logger.error("Empty command name", reader); -		return State::NONE; +		return State::RECOVERABLE_ERROR;  	}  	// Handle the special "begin" and "end" commands @@ -403,7 +619,7 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,  	const bool isEnd = commandNameComponents[0] == "end";  	// Parse the begin or end command -	State res = State::COMMAND; +	State res = State::COMMAND_START;  	if (isBegin || isEnd) {  		if (commandNameComponents.size() > 1) {  			logger.error( @@ -459,12 +675,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,  		} else {  			// Make sure no arguments apart from the "name" argument are given  			// to an annotation end -			Variant::mapType &map = commands.top().arguments.asMap(); +			const Variant::mapType &map = getCommandArguments().asMap();  			if (!map.empty()) {  				if (map.count("name") == 0 || map.size() > 1U) {  					logger.error(  					    "An annotation end command may not have any arguments " -					    "other than \"name\""); +					    "other than \"name\"", +					    reader);  					return res;  				}  			} @@ -478,17 +695,21 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,  	// If we're starting an annotation, return the command as annotation start  	// instead of command -	if (isAnnotation && res == State::COMMAND) { +	if (isAnnotation && res == State::COMMAND_START) {  		return State::ANNOTATION_START;  	}  	return res;  } -void OsmlStreamParser::parseBlockComment() +void OsmlStreamParserImpl::parseBlockComment()  {  	Token token; +	TokenizedData commentData;  	size_t depth = 1; -	while (tokenizer.read(reader, token)) { +	while (tokenizer.read(reader, token, commentData)) { +		// Throw the comment data away +		commentData.clear(); +  		if (token.id == OsmlTokens.BlockCommentEnd) {  			depth--;  			if (depth == 0) { @@ -504,7 +725,7 @@ void OsmlStreamParser::parseBlockComment()  	logger.error("File ended while being in a block comment", reader);  } -void OsmlStreamParser::parseLineComment() +void OsmlStreamParserImpl::parseLineComment()  {  	char c;  	while (reader.read(c)) { @@ -514,86 +735,46 @@ void OsmlStreamParser::parseLineComment()  	}  } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +void OsmlStreamParserImpl::pushCommand(Variant commandName, +                                       Variant commandArguments, bool hasRange)  { -	if (!handler.isEmpty()) { -		data = handler.toVariant(reader.getSourceId()); -		location = data.getLocation(); -		reader.resetPeek(); -		return true; -	} -	return false; -} - -bool OsmlStreamParser::checkIssueFieldStart() -{ -	// Fetch the current command, and check whether we're currently inside a -	// field of this command -	Command &cmd = commands.top(); -	if (!cmd.inField) { -		// If this is a range command, we're now implicitly inside the field of -		// this command -- we'll have to issue a field start command! -		if (cmd.hasRange) { -			cmd.inField = true; -			cmd.inRangeField = true; -			reader.resetPeek(); -			return true; -		} +	// Store the location of the command +	location = commandName.getLocation(); -		// This was not a range command, so obviously we're now inside within -		// a field of some command -- so unroll the commands stack until a -		// command with open field is reached -		while (!commands.top().inField) { -			commands.pop(); -		} +	// Place the command on the command stack, remove the last commands if we're +	// not currently inside a field of these commands +	while (!cmd().inField()) { +		commands.pop();  	} -	return false; + +	// Push the new command onto the command stack +	commands.emplace(std::move(commandName), std::move(commandArguments), +	                 hasRange);  } -bool OsmlStreamParser::closeField() +bool OsmlStreamParserImpl::checkIssueData()  { -	// Try to end an open field of the current command -- if the current command -	// is not inside an open field, end this command and try to close the next -	// one -	for (int i = 0; i < 2 && commands.size() > 1; i++) { -		Command &cmd = commands.top(); -		if (!cmd.inRangeField) { -			if (cmd.inField) { -				cmd.inField = false; -				if (cmd.inDefaultField) { -					commands.pop(); -				} -				return true; -			} -			commands.pop(); -		} else { -			return false; -		} +	if (!data.empty()) { +		location = data.getLocation(); +		reader.resetPeek(); +		return true;  	}  	return false;  } -OsmlStreamParser::State OsmlStreamParser::parse() +OsmlStreamParserImpl::State OsmlStreamParserImpl::parse()  { -	// Handler for incomming data -	DataHandler handler; +	// Reset the data handler +	data.clear();  	// Read tokens until the outer loop should be left  	Token token; -	while (tokenizer.peek(reader, token)) { +	while (tokenizer.peek(reader, token, data)) {  		const TokenId type = token.id;  		// Special handling for Backslash and Text  		if (type == OsmlTokens.Backslash ||  		    type == OsmlTokens.AnnotationStart) { -			// Before appending anything to the output data or starting a new -			// command, check whether FIELD_START has to be issued, as the -			// current command is a command with range -			if (checkIssueFieldStart()) { -				location = token.location; -				return State::FIELD_START; -			} -  			// Check whether a command starts now, without advancing the peek  			// cursor  			char c; @@ -606,7 +787,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()  			// Try to parse a command  			if (Utils::isIdentifierStartCharacter(c)) {  				// Make sure to issue any data before it is to late -				if (checkIssueData(handler)) { +				if (checkIssueData()) {  					return State::DATA;  				} @@ -614,11 +795,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()  				State res = parseCommand(token.location.getStart(),  				                         type == OsmlTokens.AnnotationStart);  				switch (res) { -					case State::ERROR: +					case State::IRRECOVERABLE_ERROR:  						throw LoggableException(  						    "Last error was irrecoverable, ending parsing "  						    "process"); -					case State::NONE: +					case State::RECOVERABLE_ERROR:  						continue;  					default:  						return res; @@ -632,78 +813,64 @@ OsmlStreamParser::State OsmlStreamParser::parse()  			// If this was an annotation start token, add the parsed < to the  			// output +			SourceOffset charStart = token.location.getStart(); +			SourceOffset charEnd = reader.getPeekOffset();  			if (type == OsmlTokens.AnnotationStart) { -				handler.append('<', token.location.getStart(), -				               token.location.getStart() + 1); +				data.append('<', charStart, charStart + 1); +				charStart = charStart + 1;  			} -			handler.append(c, token.location.getStart(), -			               reader.getPeekOffset()); +			// Append the character to the output data, mark it as protected +			data.append(c, charStart, charEnd, true);  			reader.consumePeek();  			continue;  		} else if (type == Tokens::Data) { -			// Check whether FIELD_START has to be issued before appending text -			if (checkIssueFieldStart()) { -				location = token.location; -				return State::FIELD_START; -			} - -			// Append the text to the data handler -			handler.append(token.content, token.location.getStart(), -			               token.location.getEnd()); -  			reader.consumePeek();  			continue; +		} else if (type == OsmlTokens.LineComment) { +			reader.consumePeek(); +			parseLineComment(); +			continue; +		} else if (type == OsmlTokens.BlockCommentStart) { +			reader.consumePeek(); +			parseBlockComment(); +			continue;  		}  		// A non-text token was reached, make sure all pending data commands  		// have been issued -		if (checkIssueData(handler)) { +		if (checkIssueData()) {  			return State::DATA;  		}  		// We will handle the token now, consume the peeked characters  		reader.consumePeek(); -		// Update the location to the current token location +		// Synchronize the location with the current token location  		location = token.location; -		if (token.id == OsmlTokens.LineComment) { -			parseLineComment(); -		} else if (token.id == OsmlTokens.BlockCommentStart) { -			parseBlockComment(); -		} else if (token.id == OsmlTokens.FieldStart) { -			Command &cmd = commands.top(); -			if (!cmd.inField) { -				cmd.inField = true; -				return State::FIELD_START; -			} -			logger.error( -			    "Got field start token \"{\", but no command for which to " -			    "start the field. Write \"\\{\" to insert this sequence as " -			    "text.", -			    token); +		if (token.id == OsmlTokens.FieldStart) { +			cmd().pushField(false, token.location); +			return State::FIELD_START;  		} else if (token.id == OsmlTokens.FieldEnd) { -			if (closeField()) { +			// Remove all commands from the list that currently are not in any +			// field +			while (!cmd().inField()) { +				commands.pop(); +			} + +			// If the remaining command is not in a range field, remove this +			// command +			if (cmd().inNonRangeField()) { +				cmd().popField();  				return State::FIELD_END;  			}  			logger.error( -			    "Got field end token \"}\", but there is no field to end. " -			    "Write \"\\}\" to insert this sequence as text.", +			    "Got field end token \"}\", but there is no field to end.",  			    token);  		} else if (token.id == OsmlTokens.DefaultFieldStart) { -			// Try to start a default field the first time the token is reached -			Command &topCmd = commands.top(); -			if (!topCmd.inField) { -				topCmd.inField = true; -				topCmd.inDefaultField = true; -				return State::FIELD_START; -			} -			logger.error( -			    "Got default field start token \"{!\", but no command for " -			    "which to start the field. Write \"\\{!\" to insert this " -			    "sequence as text", -			    token); +			cmd().pushField(true, token.location); +			return State::FIELD_START;  		} else if (token.id == OsmlTokens.AnnotationEnd) {  			// We got a single annotation end token "\>" -- simply issue the  			// ANNOTATION_END event @@ -717,38 +884,103 @@ OsmlStreamParser::State OsmlStreamParser::parse()  	}  	// Issue available data -	if (checkIssueData(handler)) { +	if (checkIssueData()) {  		return State::DATA;  	}  	// Make sure all open commands and fields have been ended at the end of the  	// stream -	while (commands.size() > 1) { -		Command &cmd = commands.top(); -		if (cmd.inField || cmd.hasRange) { -			logger.error("Reached end of stream, but command \"" + -			                 cmd.name.asString() + "\" has not been ended", -			             cmd.name); +	while (true) { +		bool topLevelCommand = commands.size() == 1U; +		if (cmd().inField()) { +			// If the stream ended with an open range field, issue information +			// about the range field +			if (cmd().inRangeField() && !topLevelCommand) { +				// Inform about the still open command itself +				logger.error("Reached end of stream, but command \"" + +				                 getCommandName().asString() + +				                 "\" has not been ended", +				             getCommandName()); +			} else { +				// Issue information about still open fields +				const std::vector<Field> &fields = cmd().getFields(); +				if (!fields.empty()) { +					logger.error( +					    std::string( +					        "Reached end of stream, but field is still open."), +					    fields.back().location); +				} +			} +		} +		if (!topLevelCommand) { +			commands.pop(); +		} else { +			break;  		} -		commands.pop();  	}  	location = SourceLocation{reader.getSourceId(), reader.getOffset()};  	return State::END;  } +TokenId OsmlStreamParserImpl::registerToken(const std::string &token) +{ +	return tokenizer.registerToken(token, false); +} + +void OsmlStreamParserImpl::unregisterToken(TokenId id) +{ +	assert(tokenizer.unregisterToken(id)); +} + +/* Class OsmlStreamParser */ + +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) +    : impl(new OsmlStreamParserImpl(reader, logger)) +{ +} + +OsmlStreamParser::~OsmlStreamParser() +{ +	// Stub needed because OsmlStreamParserImpl is incomplete in header +} + +OsmlStreamParser::State OsmlStreamParser::parse() +{ +	return static_cast<State>(impl->parse()); +} + +const TokenizedData &OsmlStreamParser::getData() const +{ +	return impl->getData(); +} +  const Variant &OsmlStreamParser::getCommandName() const  { -	return commands.top().name; +	return impl->getCommandName();  }  const Variant &OsmlStreamParser::getCommandArguments() const  { -	return commands.top().arguments; +	return impl->getCommandArguments(); +} + +const SourceLocation &OsmlStreamParser::getLocation() const +{ +	return impl->getLocation(); +} + +bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); } + +bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); } + +TokenId OsmlStreamParser::registerToken(const std::string &token) +{ +	return impl->registerToken(token);  } -bool OsmlStreamParser::inDefaultField() const +void OsmlStreamParser::unregisterToken(TokenId id)  { -	return commands.top().inRangeField || commands.top().inDefaultField; +	impl->unregisterToken(id);  }  } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..b7e64f7 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,68 +29,53 @@  #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_  #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <cstdint> +#include <memory> -#include <core/common/Variant.hpp> -#include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/stack/Callbacks.hpp>  namespace ousia {  // Forward declarations  class CharReader;  class Logger; -class DataHandler; +class OsmlStreamParserImpl; +class TokenizedData; +class Variant;  /**   * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml   * format. The parser is constructed around a "parse" function, which reads data   * from the underlying CharReader until a new state is reached and indicates   * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is + * data from the stream reader. The reader makes sure the incomming stream is   * syntactically valid and tries to recorver from most errors. If an error is   * irrecoverable (this is the case for errors with wrong nesting of commands or   * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. + * LoggableException is thrown. In short, the OsmlStreamParser can be described + * as a SAX parser for OSML.   */ -class OsmlStreamParser { +class OsmlStreamParser: public parser_stack::ParserCallbacks {  public:  	/**  	 * Enum used to indicate which state the OsmlStreamParser class is in  	 * after calling the "parse" function.  	 */ -	enum class State { +	enum class State : uint8_t {  		/** -	     * State returned if a fully featured command has been read. A command -	     * consists of the command name and its arguments (which optionally -	     * includes the name). +	     * State returned if the start of a command has been read. Use the +	     * getCommandName(), getCommandArguments() and inRangeCommand() +	     * functions the retrieve more information about the command that was +	     * just started.  	     */ -		COMMAND, +		COMMAND_START = 0,  		/** -	     * State returned if data is given. The reader must decide which field -	     * or command this should be routed to. Trailing or leading whitespace -	     * has been removed. Only called if the data is non-empty. -	     */ -		DATA, - -		/** -	     * A user-defined entity has been found. The entity sequence is stored -	     * in the command name. -	     */ -		ENTITY, - -		/** -	     * State returned if an annotation was started. An annotation consists -	     * of the command name and its arguments (which optionally include the -	     * name). -	     */ -		ANNOTATION_START, - -		/** -	     * State returned if an annotation ends. The reader indicates which -	     * annotation ends. +	     * State returned if a range command or range annotation has just ended. +	     * This state is not returned for non-range commands (as the actual end +	     * of a command is context dependent).  	     */ -		ANNOTATION_END, +		RANGE_END = 1,  		/**  	     * State returned if a new field started. The reader assures that the @@ -98,223 +83,46 @@ public:  	     * is not started if data has been given outside of a field. The  	     * field number is set to the current field index.  	     */ -		FIELD_START, +		FIELD_START = 2,  		/**  	     * State returned if the current field ends. The reader assures that a  	     * field was actually open.  	     */ -		FIELD_END, +		FIELD_END = 3,  		/** -	     * The end of the stream has been reached. +	     * State returned if an annotation was started. An annotation consists +	     * of the command name and its arguments (which optionally include the +	     * name).  	     */ -		END, +		ANNOTATION_START = 4,  		/** -	     * Returned from internal functions if nothing should be done. +	     * State returned if an annotation ends. The reader indicates which +	     * annotation ends.  	     */ -		NONE, +		ANNOTATION_END = 5,  		/** -	     * Returned from internal function to indicate irrecoverable errors. +	     * State returned if data is given. The reader must decide which field +	     * or command this should be routed to. Trailing or leading whitespace +	     * has been removed. Only called if the data is non-empty.  	     */ -		ERROR -	}; - -	/** -	 * Entry used for the command stack. -	 */ -	struct Command { -		/** -		 * Name and location of the current command. -		 */ -		Variant name; - -		/** -		 * Arguments that were passed to the command. -		 */ -		Variant arguments; +		DATA = 6,  		/** -		 * Set to true if this is a command with clear begin and end. -		 */ -		bool hasRange : 1; - -		/** -		 * Set to true if we are currently inside a field of this command. -		 */ -		bool inField : 1; - -		/** -		 * Set to true if we are currently in the range field of the command -		 * (implies inField being set to true). -		 */ -		bool inRangeField : 1; - -		/** -		 * Set to true if we are currently in a field that has been especially -		 * marked as default field (using the "|") syntax. -		 */ -		bool inDefaultField : 1; - -		/** -		 * Default constructor. -		 */ -		Command() -		    : hasRange(false), -		      inField(false), -		      inRangeField(false), -		      inDefaultField() -		{ -		} - -		/** -		 * Constructor of the Command class. -		 * -		 * @param name is a string variant with name and location of the -		 * command. -		 * @param arguments is a map variant with the arguments given to the -		 * command. -		 * @param hasRange should be set to true if this is a command with -		 * explicit range. -		 * @param inField is set to true if we currently are inside a field -		 * of this command. -		 * @param inRangeField is set to true if we currently are inside the -		 * outer field of a ranged command. -		 * @param inDefaultField is set to true if we currently are in a -		 * specially marked default field. -		 */ -		Command(Variant name, Variant arguments, bool hasRange, -		        bool inField, bool inRangeField, bool inDefaultField) -		    : name(std::move(name)), -		      arguments(std::move(arguments)), -		      hasRange(hasRange), -		      inField(inField), -		      inRangeField(inRangeField), -		      inDefaultField(inDefaultField) -		{ -		} +	     * The end of the stream has been reached. +	     */ +		END = 7  	};  private:  	/** -	 * Reference to the CharReader instance from which the incomming bytes are -	 * read. -	 */ -	CharReader &reader; - -	/** -	 * Reference at the logger instance to which all error messages are sent. +	 * Pointer at the class containing the internal implementation (according +	 * to the PIMPL idiom).  	 */ -	Logger &logger; - -	/** -	 * Tokenizer instance used to read individual tokens from the text. -	 */ -	Tokenizer tokenizer; - -	/** -	 * Stack containing the current commands. -	 */ -	std::stack<Command> commands; - -	/** -	 * Variant containing the data that has been read (always is a string, -	 * contains the exact location of the data in the source file). -	 */ -	Variant data; - -	/** -	 * Contains the location of the last token. -	 */ -	SourceLocation location; - -	/** -	 * Contains the field index of the current command. -	 */ -	size_t fieldIdx; - -	/** -	 * Function used internall to parse an identifier. -	 * -	 * @param start is the start byte offset of the identifier (including the -	 * backslash). -	 * @param allowNSSep should be set to true if the namespace separator is -	 * allowed in the identifier name. Issues error if the namespace separator -	 * is placed incorrectly. -	 */ -	Variant parseIdentifier(size_t start, bool allowNSSep = false); - -	/** -	 * Function used internally to handle the special "\begin" command. -	 */ -	State parseBeginCommand(); - -	/** -	 * Function used internally to handle the special "\end" command. -	 */ -	State parseEndCommand(); - -	/** -	 * Pushes the parsed command onto the command stack. -	 */ -	void pushCommand(Variant commandName, Variant commandArguments, -	                 bool hasRange); - -	/** -	 * Parses the command arguments. -	 */ -	Variant parseCommandArguments(Variant commandArgName); - -	/** -	 * Function used internally to parse a command. -	 * -	 * @param start is the start byte offset of the command (including the -	 * backslash) -	 * @param isAnnotation if true, the command is not returned as command, but -	 * as annotation start. -	 * @return true if a command was actuall parsed, false otherwise. -	 */ -	State parseCommand(size_t start, bool isAnnotation); - -	/** -	 * Function used internally to parse a block comment. -	 */ -	void parseBlockComment(); - -	/** -	 * Function used internally to parse a generic comment. -	 */ -	void parseLineComment(); - -	/** -	 * Checks whether there is any data pending to be issued, if yes, issues it. -	 * -	 * @param handler is the data handler that contains the data that may be -	 * returned to the user. -	 * @return true if there was any data and DATA should be returned by the -	 * parse function, false otherwise. -	 */ -	bool checkIssueData(DataHandler &handler); - -	/** -	 * Called before any data is appended to the internal data handler. Checks -	 * whether a new field should be started or implicitly ended. -	 * -	 * @return true if FIELD_START should be returned by the parse function. -	 */ -	bool checkIssueFieldStart(); - -	/** -	 * Closes a currently open field. Note that the command will be removed from -	 * the internal command stack if the field that is being closed is a -	 * field marked as default field. -	 * -	 * @return true if the field could be closed, false if there was no field -	 * to close. -	 */ -	bool closeField(); +	std::unique_ptr<OsmlStreamParserImpl> impl;  public:  	/** @@ -328,6 +136,12 @@ public:  	OsmlStreamParser(CharReader &reader, Logger &logger);  	/** +	 * Destructor of the OsmlStreamParser, needed to destroy the incomplete +	 * OsmlStreamParserImpl. +	 */ +	~OsmlStreamParser(); + +	/**  	 * Continues parsing. Returns one of the states defined in the State enum.  	 * Callers should stop once the State::END state is reached. Use the getter  	 * functions to get more information about the current state, such as the @@ -338,17 +152,9 @@ public:  	State parse();  	/** -	 * Returns a reference at the internally stored data. Only valid if -	 * State::DATA was returned by the "parse" function. -	 * -	 * @return a reference at a variant containing the data parsed by the -	 * "parse" function. -	 */ -	const Variant &getData() const { return data; } - -	/**  	 * Returns a reference at the internally stored command name. Only valid if -	 * State::COMMAND was returned by the "parse" function. +	 * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END +	 * was returned by the "parse" function.  	 *  	 * @return a reference at a variant containing name and location of the  	 * parsed command. @@ -357,7 +163,8 @@ public:  	/**  	 * Returns a reference at the internally stored command name. Only valid if -	 * State::COMMAND was returned by the "parse" function. +	 * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END +	 * was returned by the "parse" function.  	 *  	 * @return a reference at a variant containing arguments given to the  	 * command. @@ -365,19 +172,43 @@ public:  	const Variant &getCommandArguments() const;  	/** -	 * Returns true if the current field is the "default" field. This is true if -	 * the parser either is in the outer range of a range command or inside a -	 * field that has been especially marked as "default" field (using the "|" -	 * syntax). +	 * Returns a reference at the internally stored data. Only valid if +	 * State::DATA was returned by the "parse" function. +	 * +	 * @return a reference at a variant containing the data parsed by the +	 * "parse" function.  	 */ -	bool inDefaultField() const; +	const TokenizedData &getData() const; + +	/** +	 * Returns the location of the current token. +	 */ +	const SourceLocation &getLocation() const;  	/** -	 * Returns a reference at the char reader. +	 * Returns true if the currently started command is a range command, only +	 * valid if State::COMMAND_START or State::ANNOTATION_START was returned by +	 * the "parse" function.  	 * -	 * @return the last internal token location. +	 * @return true if the command is started is a range command, false +	 * otherwise.  	 */ -	const SourceLocation &getLocation() const { return location; } +	bool inRangeCommand() const; + +	/** +	 * Returns true if the current field is the "default" field. This is true if +	 * the parser either is in the outer range of a range command or inside a +	 * field that has been especially marked as "default" field (using the "{!" +	 * syntax). Only valid if State::FIELD_START was returned by the "parse" +	 * function. +	 * +	 * @return true if the current field was marked as default field (using the +	 * "{!" syntax). +	 */ +	bool inDefaultField() const; + +	TokenId registerToken(const std::string &token) override; +	void unregisterToken(TokenId token) override;  };  } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..79a8dbe 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,7 @@  #include <core/common/Variant.hpp>  #include <core/common/VariantReader.hpp>  #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include <core/parser/utils/TokenizedData.hpp>  #include "OsxmlAttributeLocator.hpp"  #include "OsxmlEventParser.hpp" @@ -40,6 +40,11 @@ namespace ousia {  class OsxmlEventParserData {  public:  	/** +	 * Current character data buffer. +	 */ +	TokenizedData data; + +	/**  	 * Contains the current depth of the parsing process.  	 */  	ssize_t depth; @@ -52,35 +57,13 @@ public:  	ssize_t annotationEndTagDepth;  	/** -	 * Current character data buffer. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Current whitespace buffer (for the trimming whitspace mode) -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Flag indicating whether a whitespace character was present (for the -	 * collapsing whitespace mode). -	 */ -	bool hasWhitespace; - -	/** -	 * Current character data start. -	 */ -	size_t textStart; - -	/** -	 * Current character data end. -	 */ -	size_t textEnd; - -	/** -	 * Default constructor. +	 * Constructor taking the sourceId of the file from which the XML is being +	 * parsed. +	 * +	 * @param sourceId is the source if of the XML file from which the data is +	 * currently being parsed.  	 */ -	OsxmlEventParserData(); +	OsxmlEventParserData(SourceId sourceId);  	/**  	 * Increments the depth. @@ -103,14 +86,6 @@ public:  	 * @return true if character data is available.  	 */  	bool hasText(); - -	/** -	 * Returns a Variant containing the character data and its location. -	 * -	 * @return a string variant containing the text data and the character -	 * location. -	 */ -	Variant getText(SourceId sourceId);  };  /* Class GuardedExpatXmlParser */ @@ -168,7 +143,7 @@ public:  static const std::string TOP_LEVEL_TAG{"ousia"};  /** - * Prefix used to indicate the start of an annoation (note the trailing colon) + * Prefix used to indicate the start of an annoation (note the trailing colon).   */  static const std::string ANNOTATION_START_PREFIX{"a:start:"}; @@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,  	// If there is any text data in the buffer, issue that first  	if (parser->getData().hasText()) { -		parser->getEvents().data( -		    parser->getData().getText(parser->getReader().getSourceId())); +		TokenizedData &data = parser->getData().data; +		parser->getEvents().data(data); +		data.clear();  	}  	// Read the argument locations -- this is only a stupid and slow hack, @@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,  		// Just issue a "commandStart" event in any other case  		Variant nameVar = Variant::fromString(nameStr);  		nameVar.setLocation(nameLoc); -		parser->getEvents().command(nameVar, args); +		parser->getEvents().commandStart(nameVar, args);  	}  } @@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)  	// If there is any text data in the buffer, issue that first  	if (parser->getData().hasText()) { -		parser->getEvents().data( -		    parser->getData().getText(parser->getReader().getSourceId())); +		TokenizedData &data = parser->getData().data; +		parser->getEvents().data(data); +		data.clear();  	}  	// Abort if the special ousia tag ends here @@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)  		return;  	} -	// Issue the "fieldEnd" event -	parser->getEvents().fieldEnd(); +	// Issue the "rangeEnd" event +	parser->getEvents().rangeEnd();  }  static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) @@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)  	// Synchronize the logger position  	SourceLocation loc = xmlSyncLoggerPosition(p, ulen); -	// Fetch some variables for convenience -	const WhitespaceMode mode = parser->getWhitespaceMode(); -	OsxmlEventParserData &data = parser->getData(); -	std::vector<char> &textBuf = data.textBuf; -	std::vector<char> &whitespaceBuf = data.whitespaceBuf; -	bool &hasWhitespace = data.hasWhitespace; -	size_t &textStart = data.textStart; -	size_t &textEnd = data.textEnd; - -	size_t pos = loc.getStart(); -	for (size_t i = 0; i < ulen; i++, pos++) { -		switch (mode) { -			case WhitespaceMode::PRESERVE: -				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd); -				break; -			case WhitespaceMode::TRIM: -				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                  textStart, textEnd, -				                                  whitespaceBuf); -				break; -			case WhitespaceMode::COLLAPSE: -				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd, -				                                    hasWhitespace); -				break; -		} -	} +	// Append the data to the buffer +	parser->getData().data.append(std::string(s, ulen), loc.getStart());  }  /* Class OsxmlEvents */ @@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {}  /* Class OsxmlEventParser */ -OsxmlEventParserData::OsxmlEventParserData() -    : depth(0), -      annotationEndTagDepth(-1), -      hasWhitespace(false), -      textStart(0), -      textEnd(0) +OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) +    : data(sourceId), depth(0), annotationEndTagDepth(-1)  {  } @@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag()  	return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth);  } -bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } - -Variant OsxmlEventParserData::getText(SourceId sourceId) -{ -	// Create a variant containing the string data and the location -	Variant var = -	    Variant::fromString(std::string{textBuf.data(), textBuf.size()}); -	var.setLocation({sourceId, textStart, textEnd}); - -	// Reset the text buffers -	textBuf.clear(); -	whitespaceBuf.clear(); -	hasWhitespace = false; -	textStart = 0; -	textEnd = 0; - -	// Return the variant -	return var; -} +bool OsxmlEventParserData::hasText() { return !data.empty(); }  /* Class OsxmlEventParser */ @@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,      : reader(reader),        events(events),        logger(logger), -      whitespaceMode(WhitespaceMode::COLLAPSE), -      data(new OsxmlEventParserData()) +      data(new OsxmlEventParserData(reader.getSourceId()))  {  } @@ -532,16 +460,6 @@ void OsxmlEventParser::parse()  	}  } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ -	this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ -	return whitespaceMode; -} -  CharReader &OsxmlEventParser::getReader() const { return reader; }  Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..4c5a485 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@  #include <memory>  #include <string> -#include <core/common/Whitespace.hpp> -  namespace ousia {  // Forward declarations @@ -61,7 +59,8 @@ public:  	 * @param args is a map containing the arguments that were given to the  	 * command.  	 */ -	virtual void command(const Variant &name, const Variant::mapType &args) = 0; +	virtual void commandStart(const Variant &name, +	                          const Variant::mapType &args) = 0;  	/**  	 * Called whenever an annotation starts. Note that this implicitly always @@ -90,24 +89,17 @@ public:  	                           const Variant &elementName) = 0;  	/** -	 * Called whenever the default field which was implicitly started by -	 * commandStart or annotationStart ends. Note that this does not end the -	 * range of an annotation, but the default field of the annotation. To -	 * signal the end of the annotation this, the annotationEnd method will be -	 * invoked. +	 * Called whenever the command or annotation tags end.  	 */ -	virtual void fieldEnd() = 0; +	virtual void rangeEnd() = 0;  	/** -	 * Called whenever data is found. Whitespace data is handled as specified -	 * and the data has been parsed to the specified variant type. This function -	 * is not called if the parsing failed, the parser prints an error message -	 * instead. +	 * Called whenever string data is found.  	 * -	 * @param data is the already parsed data that should be passed to the -	 * handler. +	 * @param data is a TokenizedData instance containing the string data that +	 * was found in the XML file.  	 */ -	virtual void data(const Variant &data) = 0; +	virtual void data(const TokenizedData &data) = 0;  };  /** @@ -135,11 +127,6 @@ private:  	Logger &logger;  	/** -	 * Current whitespace mode. -	 */ -	WhitespaceMode whitespaceMode; - -	/**  	 * Data to be used by the internal functions.  	 */  	std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +158,6 @@ public:  	void parse();  	/** -	 * Sets the whitespace handling mode. -	 * -	 * @param whitespaceMode defines how whitespace in the data should be -	 * handled. -	 */ -	void setWhitespaceMode(WhitespaceMode whitespaceMode); - -	/** -	 * Returns the current whitespace handling mode. -	 * -	 * @return the currently set whitespace handling mode. -	 */ -	WhitespaceMode getWhitespaceMode() const; - -	/**  	 * Returns the internal CharReader reference.  	 *  	 * @return the CharReader reference. @@ -207,7 +179,9 @@ public:  	OsxmlEvents &getEvents() const;  	/** -	 * Returns a reference at the internal data. +	 * Used internally to fetch a reference at the internal data. +	 * +	 * @return a reference at the internal OsxmlEventParserData structure.  	 */  	OsxmlEventParserData &getData() const;  }; diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c216855..10cc77a 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,6 +16,9 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ +#include <core/common/Variant.hpp> +#include <core/common/CharReader.hpp> +#include <core/parser/stack/Callbacks.hpp>  #include <core/parser/stack/GenericParserStates.hpp>  #include <core/parser/stack/Stack.hpp>  #include <core/parser/ParserContext.hpp> @@ -30,7 +33,7 @@ using namespace parser_stack;  /**   * Class containing the actual OsxmlParser implementation.   */ -class OsxmlParserImplementation : public OsxmlEvents { +class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks {  private:  	/**  	 * Actual xml parser -- converts the xml stream into a set of events. @@ -54,7 +57,7 @@ public:  	 */  	OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)  	    : parser(reader, *this, ctx.getLogger()), -	      stack(ctx, GenericParserStates) +	      stack(*this, ctx, GenericParserStates)  	{  	} @@ -63,17 +66,16 @@ public:  	 */  	void parse() { parser.parse(); } -	void command(const Variant &name, const Variant::mapType &args) override +	void commandStart(const Variant &name, +	                  const Variant::mapType &args) override  	{ -		stack.command(name, args); -		stack.fieldStart(true); +		stack.commandStart(name, args, true);  	}  	void annotationStart(const Variant &name,  	                     const Variant::mapType &args) override  	{ -		stack.annotationStart(name, args); -		stack.fieldStart(true); +		stack.annotationStart(name, args, true);  	}  	void annotationEnd(const Variant &className, @@ -82,9 +84,19 @@ public:  		stack.annotationEnd(className, elementName);  	} -	void fieldEnd() override { stack.fieldEnd(); } +	void rangeEnd() override { stack.rangeEnd(); } -	void data(const Variant &data) override { stack.data(data); } +	void data(const TokenizedData &data) override { stack.data(data); } + +	TokenId registerToken(const std::string &token) override +	{ +		return Tokens::Empty; +	} + +	void unregisterToken(TokenId id) override +	{ +		// Do nothing here +	}  };  /* Class OsxmlParser */ | 
