diff options
Diffstat (limited to 'src/formats/osml')
| -rw-r--r-- | src/formats/osml/OsmlParser.cpp | 57 | ||||
| -rw-r--r-- | src/formats/osml/OsmlParser.hpp | 48 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 754 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 385 | 
4 files changed, 1244 insertions, 0 deletions
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/generic/ParserStateCallbacks.hpp> +#include <core/parser/generic/ParserStateStack.hpp> + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. +  + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: +	/** +	 * OsdmStreamParser instance. +	 */ +	OsdmStreamParser parser; + +	/** +	 * Instance of the ParserStateStack. 
+	 */ +	ParserStateStack stack; + +public: +	OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ +	OsdmParserImplementation parser(reader, ctx); +	parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. 
+ */ +class OsdmParser : public Parser { +protected: +	void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp new file mode 100644 index 0000000..0174fa4 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -0,0 +1,754 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> +#include <core/common/VariantReader.hpp> + +#include "OsmlStreamParser.hpp" + +namespace ousia { + +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public Tokenizer { +public: +	/** +	 * Id of the backslash token. +	 */ +	TokenTypeId Backslash; + +	/** +	 * Id of the line comment token. +	 */ +	TokenTypeId LineComment; + +	/** +	 * Id of the block comment start token. +	 */ +	TokenTypeId BlockCommentStart; + +	/** +	 * Id of the block comment end token. +	 */ +	TokenTypeId BlockCommentEnd; + +	/** +	 * Id of the field start token. +	 */ +	TokenTypeId FieldStart; + +	/** +	 * Id of the field end token. +	 */ +	TokenTypeId FieldEnd; + +	/** +	 * Id of the default field start token. 
+	 */ +	TokenTypeId DefaultFieldStart; + +	/** +	 * Id of the annotation start token. +	 */ +	TokenTypeId AnnotationStart; + +	/** +	 * Id of the annotation end token. +	 */ +	TokenTypeId AnnotationEnd; + +	/** +	 * Registers the plain format tokens in the internal tokenizer. +	 */ +	PlainFormatTokens() +	{ +		Backslash = registerToken("\\"); +		LineComment = registerToken("%"); +		BlockCommentStart = registerToken("%{"); +		BlockCommentEnd = registerToken("}%"); +		FieldStart = registerToken("{"); +		FieldEnd = registerToken("}"); +		DefaultFieldStart = registerToken("{!"); +		AnnotationStart = registerToken("<\\"); +		AnnotationEnd = registerToken("\\>"); +	} +}; + +static const PlainFormatTokens Tokens; + +/** + * Class used internally to collect data issued via "DATA" event. + */ +class DataHandler { +private: +	/** +	 * Internal character buffer. +	 */ +	std::vector<char> buf; + +	/** +	 * Start location of the character data. +	 */ +	SourceOffset start; + +	/** +	 * End location of the character data. +	 */ +	SourceOffset end; + +public: +	/** +	 * Default constructor, initializes start and end with zeros. +	 */ +	DataHandler() : start(0), end(0) {} + +	/** +	 * Returns true if the internal buffer is empty. +	 * +	 * @return true if no characters were added to the internal buffer, false +	 * otherwise. +	 */ +	bool isEmpty() { return buf.empty(); } + +	/** +	 * Appends a single character to the internal buffer. +	 * +	 * @param c is the character that should be added to the internal buffer. +	 * @param charStart is the start position of the character. +	 * @param charEnd is the end position of the character. +	 */ +	void append(char c, SourceOffset charStart, SourceOffset charEnd) +	{ +		if (isEmpty()) { +			start = charStart; +		} +		buf.push_back(c); +		end = charEnd; +	} + +	/** +	 * Appends a string to the internal buffer. +	 * +	 * @param s is the string that should be added to the internal buffer. 
+	 * @param stringStart is the start position of the string. +	 * @param stringEnd is the end position of the string. +	 */ +	void append(const std::string &s, SourceOffset stringStart, +	            SourceOffset stringEnd) +	{ +		if (isEmpty()) { +			start = stringStart; +		} +		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); +		end = stringEnd; +	} + +	/** +	 * Converts the internal buffer to a variant with attached location +	 * information. +	 * +	 * @param sourceId is the source id which is needed for building the +	 * location information. +	 * @return a Variant with the internal buffer content as string and +	 * the correct start and end location. +	 */ +	Variant toVariant(SourceId sourceId) +	{ +		Variant res = Variant::fromString(std::string(buf.data(), buf.size())); +		res.setLocation({sourceId, start, end}); +		return res; +	} +}; + +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) +    : reader(reader), logger(logger), tokenizer(Tokens), fieldIdx(0) +{ +	// Place an initial command representing the complete file on the stack +	commands.push(Command{"", Variant::mapType{}, true, true, true, false}); +} + +Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) +{ +	bool first = true; +	bool hasCharSiceNSSep = false; +	std::vector<char> identifier; +	size_t end = reader.getPeekOffset(); +	char c, c2; +	while (reader.peek(c)) { +		// Abort if this character is not a valid identifier character +		if ((first && Utils::isIdentifierStartCharacter(c)) || +		    (!first && Utils::isIdentifierCharacter(c))) { +			identifier.push_back(c); +		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && +		           Utils::isIdentifierStartCharacter(c2)) { +			identifier.push_back(c); +		} else { +			if (c == ':' && allowNSSep) { +				logger.error( +				    "Expected character before and after namespace separator " +				    "\":\"", +				    reader); +			} +			reader.resetPeek(); +			break; +		} + +		// This is no 
longer the first character +		first = false; + +		// Advance the hasCharSiceNSSep flag +		hasCharSiceNSSep = allowNSSep && (c != ':'); + +		end = reader.getPeekOffset(); +		reader.consumePeek(); +	} + +	// Return the identifier at its location +	Variant res = +	    Variant::fromString(std::string(identifier.data(), identifier.size())); +	res.setLocation({reader.getSourceId(), start, end}); +	return res; +} + +OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() +{ +	// Expect a '{' after the command +	reader.consumeWhitespace(); +	if (!reader.expect('{')) { +		logger.error("Expected \"{\" after \\begin", reader); +		return State::NONE; +	} + +	// Parse the name of the command that should be opened +	Variant commandName = parseIdentifier(reader.getOffset(), true); +	if (commandName.asString().empty()) { +		logger.error("Expected identifier", commandName); +		return State::ERROR; +	} + +	// Check whether the next character is a '#', indicating the start of the +	// command name +	Variant commandArgName; +	SourceOffset start = reader.getOffset(); +	if (reader.expect('#')) { +		commandArgName = parseIdentifier(start); +		if (commandArgName.asString().empty()) { +			logger.error("Expected identifier after \"#\"", commandArgName); +		} +	} + +	if (!reader.expect('}')) { +		logger.error("Expected \"}\"", reader); +		return State::ERROR; +	} + +	// Parse the arguments +	Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + +	// Push the command onto the command stack +	pushCommand(std::move(commandName), std::move(commandArguments), true); + +	return State::COMMAND; +} + +static bool checkStillInField(const OsmlStreamParser::Command &cmd, +                              const Variant &endName, Logger &logger) +{ +	if (cmd.inField && !cmd.inRangeField) { +		logger.error(std::string("\\end in open field of command \"") + +		                 cmd.name.asString() + std::string("\""), +		             endName); +		logger.note(std::string("Open 
command started here:"), cmd.name); +		return true; +	} +	return false; +} + +OsmlStreamParser::State OsmlStreamParser::parseEndCommand() +{ +	// Expect a '{' after the command +	if (!reader.expect('{')) { +		logger.error("Expected \"{\" after \\end", reader); +		return State::NONE; +	} + +	// Fetch the name of the command that should be ended here +	Variant name = parseIdentifier(reader.getOffset(), true); + +	// Make sure the given command name is not empty +	if (name.asString().empty()) { +		logger.error("Expected identifier", name); +		return State::ERROR; +	} + +	// Make sure the command name is terminated with a '}' +	if (!reader.expect('}')) { +		logger.error("Expected \"}\"", reader); +		return State::ERROR; +	} + +	// Unroll the command stack up to the last range command +	while (!commands.top().hasRange) { +		if (checkStillInField(commands.top(), name, logger)) { +			return State::ERROR; +		} +		commands.pop(); +	} + +	// Make sure we're not in an open field of this command +	if (checkStillInField(commands.top(), name, logger)) { +		return State::ERROR; +	} + +	// Special error message if the top-level command is reached +	if (commands.size() == 1) { +		logger.error(std::string("Cannot end command \"") + name.asString() + +		                 std::string("\" here, no command open"), +		             name); +		return State::ERROR; +	} + +	// Inform the user about command mismatches +	const Command &cmd = commands.top(); +	if (commands.top().name.asString() != name.asString()) { +		logger.error(std::string("Trying to end command \"") + +		                 name.asString() + +		                 std::string("\", but open command is \"") + +		                 cmd.name.asString() + std::string("\""), +		             name); +		logger.note("Last command was opened here:", cmd.name); +		return State::ERROR; +	} + +	// Read the flag before pop(): "cmd" dangles once the stack top is removed +	const bool wasInRangeField = cmd.inRangeField; +	location = name.getLocation(); +	commands.pop(); +	
return wasInRangeField ? State::FIELD_END : State::NONE; +} + +Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) +{ +	// Parse the arguments using the universal VariantReader +	Variant commandArguments; +	if (reader.expect('[')) { +		auto res = VariantReader::parseObject(reader, logger, ']'); +		commandArguments = res.second; +	} else { +		commandArguments = Variant::mapType{}; +	} + +	// Insert the parsed name, make sure "name" was not specified in the +	// arguments +	if (commandArgName.isString()) { +		auto res = +		    commandArguments.asMap().emplace("name", std::move(commandArgName)); +		if (!res.second) { +			logger.error("Name argument specified multiple times", +			             SourceLocation{}, MessageMode::NO_CONTEXT); +			logger.note("First occurance is here: ", commandArgName); +			logger.note("Second occurance is here: ", res.first->second); +		} +	} +	return commandArguments; +} + +void OsmlStreamParser::pushCommand(Variant commandName, +                                   Variant commandArguments, bool hasRange) +{ +	// Store the location on the stack +	location = commandName.getLocation(); + +	// Place the command on the command stack, remove the last commands if we're +	// not currently inside a field of these commands +	while (!commands.top().inField) { +		commands.pop(); +	} +	commands.push(Command{std::move(commandName), std::move(commandArguments), +	                      hasRange, false, false, false}); +} + +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, +                                                       bool isAnnotation) +{ +	// Parse the commandName as a first identifier +	Variant commandName = parseIdentifier(start, true); +	if (commandName.asString().empty()) { +		logger.error("Empty command name", reader); +		return State::NONE; +	} + +	// Handle the special "begin" and "end" commands +	const auto commandNameComponents = +	    Utils::split(commandName.asString(), ':'); +	const bool 
isBegin = commandNameComponents[0] == "begin"; +	const bool isEnd = commandNameComponents[0] == "end"; + +	// Parse the begin or end command +	State res = State::COMMAND; +	if (isBegin || isEnd) { +		if (commandNameComponents.size() > 1) { +			logger.error( +			    "Special commands \"\\begin\" and \"\\end\" may not contain a " +			    "namespace separator \":\"", +			    commandName); +		} +		if (isBegin) { +			res = parseBeginCommand(); +		} else if (isEnd) { +			res = parseEndCommand(); +		} +	} else { +		// Check whether the next character is a '#', indicating the start of +		// the command name +		Variant commandArgName; +		start = reader.getOffset(); +		if (reader.expect('#')) { +			commandArgName = parseIdentifier(start); +			if (commandArgName.asString().empty()) { +				logger.error("Expected identifier after \"#\"", commandArgName); +			} +		} + +		// Parse the arugments +		Variant commandArguments = +		    parseCommandArguments(std::move(commandArgName)); + +		// Push the command onto the command stack +		pushCommand(std::move(commandName), std::move(commandArguments), false); +	} + +	// Check whether a ">" character is the next character that is to be read. +	// In that case the current command could be an annotation end command! +	char c; +	if (reader.fetch(c) && c == '>') { +		// Ignore the character after a begin or end command +		if (isBegin || isEnd) { +			logger.warning( +			    "Ignoring annotation end character \">\" after special " +			    "commands \"begin\" or \"end\". Write \"\\>\" to end a " +			    "\"begin\"/\"end\" enclosed annotation.", +			    reader); +			return res; +		} + +		// If this should be an annoation, ignore the character +		if (isAnnotation) { +			logger.warning( +			    "Ignoring annotation end character \">\" after annotation " +			    "start command. 
Write \"\\>\" to end the annotation.", +			    reader); +		} else { +			// Make sure no arguments apart from the "name" argument are given +			// to an annotation end +			Variant::mapType &map = commands.top().arguments.asMap(); +			if (!map.empty()) { +				if (map.count("name") == 0 || map.size() > 1U) { +					logger.error( +					    "An annotation end command may not have any arguments " +					    "other than \"name\""); +					return res; +				} +			} + +			// If we got here, this is a valid ANNOTATION_END command, issue it +			reader.peek(c); +			reader.consumePeek(); +			return State::ANNOTATION_END; +		} +	} + +	// If we're starting an annotation, return the command as annotation start +	// instead of command +	if (isAnnotation && res == State::COMMAND) { +		return State::ANNOTATION_START; +	} +	return res; +} + +void OsmlStreamParser::parseBlockComment() +{ +	Token token; +	size_t depth = 1; +	while (tokenizer.read(reader, token)) { +		if (token.type == Tokens.BlockCommentEnd) { +			depth--; +			if (depth == 0) { +				return; +			} +		} +		if (token.type == Tokens.BlockCommentStart) { +			depth++; +		} +	} + +	// Issue an error if the file ends while we are in a block comment +	logger.error("File ended while being in a block comment", reader); +} + +void OsmlStreamParser::parseLineComment() +{ +	char c; +	while (reader.read(c)) { +		if (c == '\n') { +			return; +		} +	} +} + +bool OsmlStreamParser::checkIssueData(DataHandler &handler) +{ +	if (!handler.isEmpty()) { +		data = handler.toVariant(reader.getSourceId()); +		location = data.getLocation(); +		reader.resetPeek(); +		return true; +	} +	return false; +} + +bool OsmlStreamParser::checkIssueFieldStart() +{ +	// Fetch the current command, and check whether we're currently inside a +	// field of this command +	Command &cmd = commands.top(); +	if (!cmd.inField) { +		// If this is a range command, we're now implicitly inside the field of +		// this command -- we'll have to issue a field start command! 
+		if (cmd.hasRange) { +			cmd.inField = true; +			cmd.inRangeField = true; +			reader.resetPeek(); +			return true; +		} + +		// This was not a range command, so obviously we're now inside within +		// a field of some command -- so unroll the commands stack until a +		// command with open field is reached +		while (!commands.top().inField) { +			commands.pop(); +		} +	} +	return false; +} + +bool OsmlStreamParser::closeField() +{ +	// Try to end an open field of the current command -- if the current command +	// is not inside an open field, end this command and try to close the next +	// one +	for (int i = 0; i < 2 && commands.size() > 1; i++) { +		Command &cmd = commands.top(); +		if (!cmd.inRangeField) { +			if (cmd.inField) { +				cmd.inField = false; +				if (cmd.inDefaultField) { +					commands.pop(); +				} +				return true; +			} +			commands.pop(); +		} else { +			return false; +		} +	} +	return false; +} + +OsmlStreamParser::State OsmlStreamParser::parse() +{ +	// Handler for incomming data +	DataHandler handler; + +	// Read tokens until the outer loop should be left +	Token token; +	while (tokenizer.peek(reader, token)) { +		const TokenTypeId type = token.type; + +		// Special handling for Backslash and Text +		if (type == Tokens.Backslash || type == Tokens.AnnotationStart) { +			// Before appending anything to the output data or starting a new +			// command, check whether FIELD_START has to be issued, as the +			// current command is a command with range +			if (checkIssueFieldStart()) { +				location = token.location; +				return State::FIELD_START; +			} + +			// Check whether a command starts now, without advancing the peek +			// cursor +			char c; +			if (!reader.fetchPeek(c)) { +				logger.error("Trailing backslash at the end of the file.", +				             token); +				return State::END; +			} + +			// Try to parse a command +			if (Utils::isIdentifierStartCharacter(c)) { +				// Make sure to issue any data before it is to late +				if 
(checkIssueData(handler)) { +					return State::DATA; +				} + +				// Parse the actual command +				State res = parseCommand(token.location.getStart(), +				                         type == Tokens.AnnotationStart); +				switch (res) { +					case State::ERROR: +						throw LoggableException( +						    "Last error was irrecoverable, ending parsing " +						    "process"); +					case State::NONE: +						continue; +					default: +						return res; +				} +			} + +			// This was not a special character, just append the given character +			// to the data buffer, use the escape character start as start +			// location and the peek offset as end location +			reader.peek(c);  // Peek the previously fetched character + +			// If this was an annotation start token, add the parsed < to the +			// output +			if (type == Tokens.AnnotationStart) { +				handler.append('<', token.location.getStart(), +				               token.location.getStart() + 1); +			} + +			handler.append(c, token.location.getStart(), +			               reader.getPeekOffset()); +			reader.consumePeek(); +			continue; +		} else if (type == TextToken) { +			// Check whether FIELD_START has to be issued before appending text +			if (checkIssueFieldStart()) { +				location = token.location; +				return State::FIELD_START; +			} + +			// Append the text to the data handler +			handler.append(token.content, token.location.getStart(), +			               token.location.getEnd()); + +			reader.consumePeek(); +			continue; +		} + +		// A non-text token was reached, make sure all pending data commands +		// have been issued +		if (checkIssueData(handler)) { +			return State::DATA; +		} + +		// We will handle the token now, consume the peeked characters +		reader.consumePeek(); + +		// Update the location to the current token location +		location = token.location; + +		if (token.type == Tokens.LineComment) { +			parseLineComment(); +		} else if (token.type == Tokens.BlockCommentStart) { +			parseBlockComment(); +		} 
else if (token.type == Tokens.FieldStart) { +			Command &cmd = commands.top(); +			if (!cmd.inField) { +				cmd.inField = true; +				return State::FIELD_START; +			} +			logger.error( +			    "Got field start token \"{\", but no command for which to " +			    "start the field. Write \"\\{\" to insert this sequence as " +			    "text.", +			    token); +		} else if (token.type == Tokens.FieldEnd) { +			if (closeField()) { +				return State::FIELD_END; +			} +			logger.error( +			    "Got field end token \"}\", but there is no field to end. " +			    "Write \"\\}\" to insert this sequence as text.", +			    token); +		} else if (token.type == Tokens.DefaultFieldStart) { +			// Try to start a default field the first time the token is reached +			Command &topCmd = commands.top(); +			if (!topCmd.inField) { +				topCmd.inField = true; +				topCmd.inDefaultField = true; +				return State::FIELD_START; +			} +			logger.error( +			    "Got default field start token \"{!\", but no command for " +			    "which to start the field. 
Write \"\\{!\" to insert this " +			    "sequence as text", +			    token); +		} else if (token.type == Tokens.AnnotationEnd) { +			// We got a single annotation end token "\>" -- simply issue the +			// ANNOTATION_END event +			Variant annotationName = Variant::fromString(""); +			annotationName.setLocation(token.location); +			pushCommand(annotationName, Variant::mapType{}, false); +			return State::ANNOTATION_END; +		} else { +			logger.error("Unexpected token \"" + token.content + "\"", token); +		} +	} + +	// Issue available data +	if (checkIssueData(handler)) { +		return State::DATA; +	} + +	// Make sure all open commands and fields have been ended at the end of the +	// stream +	while (commands.size() > 1) { +		Command &cmd = commands.top(); +		if (cmd.inField || cmd.hasRange) { +			logger.error("Reached end of stream, but command \"" + +			                 cmd.name.asString() + "\" has not been ended", +			             cmd.name); +		} +		commands.pop(); +	} + +	location = SourceLocation{reader.getSourceId(), reader.getOffset()}; +	return State::END; +} + +const Variant &OsmlStreamParser::getCommandName() const +{ +	return commands.top().name; +} + +const Variant &OsmlStreamParser::getCommandArguments() const +{ +	return commands.top().arguments; +} + +bool OsmlStreamParser::inDefaultField() const +{ +	return commands.top().inRangeField || commands.top().inDefaultField; +} +} + diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp new file mode 100644 index 0000000..dc3034c --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -0,0 +1,385 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsmlStreamParser.hpp + * + * Provides low-level classes for reading the TeX-esque osml + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ +#define _OUSIA_OSML_STREAM_PARSER_HPP_ + +#include <stack> + +#include <core/common/Variant.hpp> +#include <core/parser/utils/Tokenizer.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incoming file is + * syntactically valid and tries to recover from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsmlStreamParser { +public: +	/** +	 * Enum used to indicate which state the OsmlStreamParser class is in +	 * after calling the "parse" function. +	 */ +	enum class State { +		/** +	     * State returned if a fully featured command has been read. 
A command +	     * consists of the command name and its arguments (which optionally +	     * includes the name). +	     */ +		COMMAND, + +		/** +	     * State returned if data is given. The reader must decide which field +	     * or command this should be routed to. Trailing or leading whitespace +	     * has been removed. Only returned if the data is non-empty. +	     */ +		DATA, + +		/** +	     * A user-defined entity has been found. The entity sequence is stored +	     * in the command name. +	     */ +		ENTITY, + +		/** +	     * State returned if an annotation was started. An annotation consists +	     * of the command name and its arguments (which optionally include the +	     * name). +	     */ +		ANNOTATION_START, + +		/** +	     * State returned if an annotation ends. The reader indicates which +	     * annotation ends. +	     */ +		ANNOTATION_END, + +		/** +	     * State returned if a new field started. The reader assures that the +	     * current field ends before a new field is started and that the field +	     * is not started if data has been given outside of a field. The +	     * field number is set to the current field index. +	     */ +		FIELD_START, + +		/** +	     * State returned if the current field ends. The reader assures that a +	     * field was actually open. +	     */ +		FIELD_END, + +		/** +	     * The end of the stream has been reached. +	     */ +		END, + +		/** +	     * Returned from internal functions if nothing should be done. +	     */ +		NONE, + +		/** +	     * Returned from internal functions to indicate irrecoverable errors. +	     */ +		ERROR +	}; + +	/** +	 * Entry used for the command stack. +	 */ +	struct Command { +		/** +		 * Name and location of the current command. +		 */ +		Variant name; + +		/** +		 * Arguments that were passed to the command. +		 */ +		Variant arguments; + +		/** +		 * Set to true if this is a command with clear begin and end. 
+		 */ +		bool hasRange : 1; + +		/** +		 * Set to true if we are currently inside a field of this command. +		 */ +		bool inField : 1; + +		/** +		 * Set to true if we are currently in the range field of the command +		 * (implies inField being set to true). +		 */ +		bool inRangeField : 1; + +		/** +		 * Set to true if we are currently in a field that has been especially +		 * marked as default field (using the "|" syntax). +		 */ +		bool inDefaultField : 1; + +		/** +		 * Default constructor. Leaves name and arguments default-constructed +		 * and clears all four flags (inDefaultField is value-initialized, +		 * which also yields false). +		 */ +		Command() +		    : hasRange(false), +		      inField(false), +		      inRangeField(false), +		      inDefaultField() +		{ +		} + +		/** +		 * Constructor of the Command class. +		 * +		 * @param name is a string variant with name and location of the +		 * command. +		 * @param arguments is a map variant with the arguments given to the +		 * command. +		 * @param hasRange should be set to true if this is a command with +		 * explicit range. +		 * @param inField is set to true if we currently are inside a field +		 * of this command. +		 * @param inRangeField is set to true if we currently are inside the +		 * outer field of a ranged command. +		 * @param inDefaultField is set to true if we currently are in a +		 * specially marked default field. +		 */ +		Command(Variant name, Variant arguments, bool hasRange, +		        bool inField, bool inRangeField, bool inDefaultField) +		    : name(std::move(name)), +		      arguments(std::move(arguments)), +		      hasRange(hasRange), +		      inField(inField), +		      inRangeField(inRangeField), +		      inDefaultField(inDefaultField) +		{ +		} +	}; + +private: +	/** +	 * Reference to the CharReader instance from which the incoming bytes are +	 * read. +	 */ +	CharReader &reader; + +	/** +	 * Reference to the logger instance to which all error messages are sent. +	 */ +	Logger &logger; + +	/** +	 * Tokenizer instance used to read individual tokens from the text. 
+	 */ +	Tokenizer tokenizer; + +	/** +	 * Stack containing the current commands. +	 */ +	std::stack<Command> commands; + +	/** +	 * Variant containing the data that has been read (always is a string, +	 * contains the exact location of the data in the source file). +	 */ +	Variant data; + +	/** +	 * Contains the location of the last token. +	 */ +	SourceLocation location; + +	/** +	 * Contains the field index of the current command. +	 */ +	size_t fieldIdx; + +	/** +	 * Function used internally to parse an identifier. +	 * +	 * @param start is the start byte offset of the identifier (including the +	 * backslash). +	 * @param allowNSSep should be set to true if the namespace separator is +	 * allowed in the identifier name. Issues error if the namespace separator +	 * is placed incorrectly. +	 * @return a Variant describing the parsed identifier (presumably its name +	 * and location -- TODO confirm against the implementation). +	 */ +	Variant parseIdentifier(size_t start, bool allowNSSep = false); + +	/** +	 * Function used internally to handle the special "\begin" command. +	 */ +	State parseBeginCommand(); + +	/** +	 * Function used internally to handle the special "\end" command. +	 */ +	State parseEndCommand(); + +	/** +	 * Pushes the parsed command onto the command stack. +	 */ +	void pushCommand(Variant commandName, Variant commandArguments, +	                 bool hasRange); + +	/** +	 * Parses the command arguments. +	 */ +	Variant parseCommandArguments(Variant commandArgName); + +	/** +	 * Function used internally to parse a command. +	 * +	 * @param start is the start byte offset of the command (including the +	 * backslash) +	 * @param isAnnotation if true, the command is not returned as command, but +	 * as annotation start. +	 * @return a State value. NOTE(review): this text previously described a +	 * boolean result ("true if a command was actually parsed"), which does not +	 * match the declared return type -- confirm the exact states returned +	 * against the implementation. +	 */ +	State parseCommand(size_t start, bool isAnnotation); + +	/** +	 * Function used internally to parse a block comment. +	 */ +	void parseBlockComment(); + +	/** +	 * Function used internally to parse a generic comment. 
+	 */ +	void parseLineComment(); + +	/** +	 * Checks whether there is any data pending to be issued, if yes, issues it. +	 * +	 * @param handler is the data handler that contains the data that may be +	 * returned to the user. +	 * @return true if there was any data and DATA should be returned by the +	 * parse function, false otherwise. +	 */ +	bool checkIssueData(DataHandler &handler); + +	/** +	 * Called before any data is appended to the internal data handler. Checks +	 * whether a new field should be started or implicitly ended. +	 * +	 * @return true if FIELD_START should be returned by the parse function. +	 */ +	bool checkIssueFieldStart(); + +	/** +	 * Closes a currently open field. Note that the command will be removed from +	 * the internal command stack if the field that is being closed is a +	 * field marked as default field. +	 * +	 * @return true if the field could be closed, false if there was no field +	 * to close. +	 */ +	bool closeField(); + +public: +	/** +	 * Constructor of the OsmlStreamParser class. Attaches the new +	 * OsmlStreamParser to the given CharReader and Logger instances. +	 * +	 * @param reader is the reader instance from which incoming characters +	 * should be read. +	 * @param logger is the logger instance to which errors should be written. +	 */ +	OsmlStreamParser(CharReader &reader, Logger &logger); + +	/** +	 * Continues parsing. Returns one of the states defined in the State enum. +	 * Callers should stop once the State::END state is reached. Use the getter +	 * functions to get more information about the current state, such as the +	 * command name or the data or the current field index. +	 * +	 * @return the new state the parser has reached. +	 */ +	State parse(); + +	/** +	 * Returns a reference to the internally stored data. Only valid if +	 * State::DATA was returned by the "parse" function. +	 * +	 * @return a reference to a variant containing the data parsed by the +	 * "parse" function. 
+	 */ +	const Variant &getData() const { return data; } + +	/** +	 * Returns a reference to the internally stored command name. Only valid if +	 * State::COMMAND was returned by the "parse" function. +	 * +	 * @return a reference to a variant containing name and location of the +	 * parsed command. +	 */ +	const Variant &getCommandName() const; + +	/** +	 * Returns a reference to the internally stored command arguments. Only +	 * valid if State::COMMAND was returned by the "parse" function. +	 * +	 * @return a reference to a variant containing arguments given to the +	 * command. +	 */ +	const Variant &getCommandArguments() const; + +	/** +	 * Returns true if the current field is the "default" field. This is true if +	 * the parser either is in the outer range of a range command or inside a +	 * field that has been especially marked as "default" field (using the "|" +	 * syntax). +	 */ +	bool inDefaultField() const; + +	/** +	 * Returns a reference to the location of the last token. +	 * +	 * @return the last internal token location. +	 */ +	const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */ +  | 
