diff options
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-15 21:32:54 +0100 | 
|---|---|---|
| committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-15 21:32:54 +0100 | 
| commit | 8e5e08c4f293434585d2a88f7f331f8ce49b67b9 (patch) | |
| tree | fa82a937b1ea80f45d7955938c333f68f8a0f3f6 /src/formats/osxml | |
| parent | 2544749215bc2465bfeca431e271110ca86d8a83 (diff) | |
| parent | 40f4666c43211d9071a827ad8a2524688e7f678f (diff) | |
Merge branch 'astoecke_parser_stack_new'
Conflicts:
	application/src/core/parser/stack/DocumentHandler.cpp
	application/src/core/parser/stack/DocumentHandler.hpp
Diffstat (limited to 'src/formats/osxml')
| -rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.cpp | 144 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.hpp | 67 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 547 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 217 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 98 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.hpp | 55 | 
6 files changed, 1128 insertions, 0 deletions
diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp new file mode 100644 index 0000000..e37446a --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.cpp @@ -0,0 +1,144 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/Location.hpp> +#include <core/common/CharReader.hpp> +#include <core/common/Utils.hpp> + +#include "OsxmlAttributeLocator.hpp" + +namespace ousia { + +/** + * Enum used internally in the statemachine of the xml argument parser. + */ +enum class XmlAttributeState { +	IN_TAG_NAME, +	SEARCH_ATTR, +	IN_ATTR_NAME, +	HAS_ATTR_NAME, +	HAS_ATTR_EQUALS, +	IN_ATTR_DATA +}; + +std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate( +    CharReader &reader, size_t offs) +{ +	std::map<std::string, SourceLocation> res; + +	// Fork the reader, we don't want to mess up the XML parsing process, do we? +	CharReaderFork readerFork = reader.fork(); + +	// Move the read cursor to the start location, abort if this does not work +	if (offs != readerFork.seek(offs)) { +		return res; +	} + +	// Now all we need to do is to implement one half of an XML parser. As this +	// is inherently complicated we'll totaly fail at it. Don't care. 
All we +	// want to get is those darn offsets for pretty error messages... (and we +	// can assume the XML is valid as it was already read by expat) +	XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; +	char c; +	std::stringstream attrName; +	while (readerFork.read(c)) { +		// Abort at the end of the tag +		if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { +			return res; +		} + +		// One state machine to rule them all, one state machine to find them, +		// One state machine to bring them all and in the darkness bind them +		// (the byte offsets) +		switch (state) { +			case XmlAttributeState::IN_TAG_NAME: +				if (Utils::isWhitespace(c)) { +					res.emplace("$tag", +					            SourceLocation{reader.getSourceId(), offs + 1, +					                           readerFork.getOffset() - 1}); +					state = XmlAttributeState::SEARCH_ATTR; +				} +				break; +			case XmlAttributeState::SEARCH_ATTR: +				if (!Utils::isWhitespace(c)) { +					state = XmlAttributeState::IN_ATTR_NAME; +					attrName << c; +				} +				break; +			case XmlAttributeState::IN_ATTR_NAME: +				if (Utils::isWhitespace(c)) { +					state = XmlAttributeState::HAS_ATTR_NAME; +				} else if (c == '=') { +					state = XmlAttributeState::HAS_ATTR_EQUALS; +				} else { +					attrName << c; +				} +				break; +			case XmlAttributeState::HAS_ATTR_NAME: +				if (!Utils::isWhitespace(c)) { +					if (c == '=') { +						state = XmlAttributeState::HAS_ATTR_EQUALS; +						break; +					} +					// Well, this is a strange XML file... We expected to +					// see a '=' here! Try to continue with the +					// "HAS_ATTR_EQUALS" state as this state will hopefully +					// inlcude some error recovery +				} else { +					// Skip whitespace here +					break; +				} +			// Fallthrough +			case XmlAttributeState::HAS_ATTR_EQUALS: +				if (!Utils::isWhitespace(c)) { +					if (c == '"') { +						// Here we are! We have found the beginning of an +						// attribute. 
Let's quickly lock the current offset away +						// in the result map +						res.emplace(attrName.str(), +						            SourceLocation{reader.getSourceId(), +						                           readerFork.getOffset()}); +						state = XmlAttributeState::IN_ATTR_DATA; +					} else { +						// No, this XML file is not well formed. Assume we're in +						// an attribute name once again +						attrName.str(std::string{&c, 1}); +						state = XmlAttributeState::IN_ATTR_NAME; +					} +				} +				break; +			case XmlAttributeState::IN_ATTR_DATA: +				if (c == '"') { +					// We're at the end of the attribute data, set the end +					// location +					auto it = res.find(attrName.str()); +					if (it != res.end()) { +						it->second.setEnd(readerFork.getOffset() - 1); +					} + +					// Reset the attribute name and restart the search +					attrName.str(std::string{}); +					state = XmlAttributeState::SEARCH_ATTR; +				} +				break; +		} +	} +	return res; +} +} + diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp new file mode 100644 index 0000000..f9a3437 --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.hpp @@ -0,0 +1,67 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/** + * @file OsxmlAttributeLocator.hpp + * + * Contains a class used for locating the byte offsets of the attributes given + * in a XML tag. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ +#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ + +#include <map> + +namespace ousia { + +// Forward declarations +class CharReader; +class SourceLocation; + +/** + * Class containing one static function for locating the byte offsets of the + * attributes in a XML tag. This are not retrieved by our xml parser, so we have + * to do this manually. + */ +class OsxmlAttributeLocator { +public: +	/** +	 * Function used to reconstruct the location of the attributes of a XML tag +	 * in the source code. This is necessary, as the xml parser only returns an +	 * offset to the begining of a tag and not to the position of the individual +	 * arguments. +	 * +	 * @param reader is the char reader from which the character data should be +	 * read. +	 * @param offs is a byte offset in the xml file pointing at the "<" +	 * character of the tag. +	 * @return a map from attribute keys to the corresponding location +	 * (including range) of the atribute. Also contains the location of the +	 * tagname in the form of the virtual attribute "$tag". 
+	 */ +	static std::map<std::string, SourceLocation> locate(CharReader &reader, +	                                                    size_t offs); +}; + +} + +#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */ + diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp new file mode 100644 index 0000000..7404960 --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -0,0 +1,547 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <expat.h> + +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Variant.hpp> +#include <core/common/VariantReader.hpp> +#include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> + +#include "OsxmlAttributeLocator.hpp" +#include "OsxmlEventParser.hpp" + +namespace ousia { + +/* Class OsxmlEventParser */ + +/** + * Class containing data used by the internal functions. + */ +class OsxmlEventParserData { +public: +	/** +	 * Contains the current depth of the parsing process. +	 */ +	ssize_t depth; + +	/** +	 * Set to a value larger or equal to zero if the parser is currently inside +	 * an annotation end tag -- the value represents the depth in which the +	 * tag was opened. 
+	 */ +	ssize_t annotationEndTagDepth; + +	/** +	 * Current character data buffer. +	 */ +	std::vector<char> textBuf; + +	/** +	 * Current whitespace buffer (for the trimming whitspace mode) +	 */ +	std::vector<char> whitespaceBuf; + +	/** +	 * Flag indicating whether a whitespace character was present (for the +	 * collapsing whitespace mode). +	 */ +	bool hasWhitespace; + +	/** +	 * Current character data start. +	 */ +	size_t textStart; + +	/** +	 * Current character data end. +	 */ +	size_t textEnd; + +	/** +	 * Default constructor. +	 */ +	OsxmlEventParserData(); + +	/** +	 * Increments the depth. +	 */ +	void incrDepth(); + +	/** +	 * Decrement the depth and reset the annotationEndTagDepth flag. +	 */ +	void decrDepth(); + +	/** +	 * Returns true if we're currently inside an end tag. +	 */ +	bool inAnnotationEndTag(); + +	/** +	 * Returns true if character data is available. +	 * +	 * @return true if character data is available. +	 */ +	bool hasText(); + +	/** +	 * Returns a Variant containing the character data and its location. +	 * +	 * @return a string variant containing the text data and the character +	 * location. +	 */ +	Variant getText(SourceId sourceId); +}; + +/* Class GuardedExpatXmlParser */ + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class GuardedExpatXmlParser { +private: +	/** +	 * Internal pointer to the XML_Parser instance. +	 */ +	XML_Parser parser; + +public: +	/** +	 * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS +	 * from the expat library. Throws a parser exception if the XML parser +	 * cannot be initialized. +	 * +	 * @param encoding is the protocol-defined encoding passed to expat (or +	 * nullptr if expat should determine the encoding by itself). 
+	 */ +	GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) +	{ +		parser = XML_ParserCreate(encoding); +		if (!parser) { +			throw LoggableException{ +			    "Internal error: Could not create expat XML parser!"}; +		} +	} + +	/** +	 * Destuctor of the GuardedExpatXmlParser, frees the XML parser instance. +	 */ +	~GuardedExpatXmlParser() +	{ +		if (parser) { +			XML_ParserFree(parser); +			parser = nullptr; +		} +	} + +	/** +	 * Returns the XML_Parser pointer. +	 */ +	XML_Parser operator&() { return parser; } +}; + +/** + * Name of the special outer tag used for allowing multiple top-level elements + * in an xml file. + */ +static const std::string TOP_LEVEL_TAG{"ousia"}; + +/** + * Prefix used to indicate the start of an annoation (note the trailing colon) + */ +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; + +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; + +/** + * Synchronizes the position of the xml parser with the default location of the + * logger instance. + * + * @param p is a pointer at the xml parser instance. + * @param len is the length of the string that should be refered to. + * @return the SourceLocation that has been set in the logger. + */ +static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) +{ +	// Fetch the OsxmlEventParser instance +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Fetch the current location in the XML file and set the default location +	// in the logger +	size_t offs = XML_GetCurrentByteIndex(p); +	SourceLocation loc = +	    SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; +	parser->getLogger().setDefaultLocation(loc); + +	// Return the fetched location +	return loc; +} + +/** + * Callback called by eXpat whenever a start handler is reached. 
+ */ +static void xmlStartElementHandler(void *ref, const XML_Char *name, +                                   const XML_Char **attrs) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// If there is any text data in the buffer, issue that first +	if (parser->getData().hasText()) { +		parser->getEvents().data( +		    parser->getData().getText(parser->getReader().getSourceId())); +	} + +	// Read the argument locations -- this is only a stupid and slow hack, +	// but it is necessary, as expat doesn't give use the byte offset of the +	// arguments. +	std::map<std::string, SourceLocation> attributeOffsets = +	    OsxmlAttributeLocator::locate(parser->getReader(), +	                                  XML_GetCurrentByteIndex(p)); + +	// Update the logger position +	SourceLocation loc = xmlSyncLoggerPosition(p); + +	// Fetch the location of the name +	SourceLocation nameLoc = loc; +	auto it = attributeOffsets.find("$tag"); +	if (it != attributeOffsets.end()) { +		nameLoc = it->second; +	} +	// Increment the current depth +	parser->getData().incrDepth(); + +	// Make sure we're currently not inside an annotation end tag -- this would +	// be highly illegal! 
+	if (parser->getData().inAnnotationEndTag()) { +		parser->getLogger().error( +		    "No tags allowed inside an annotation end tag", nameLoc); +		return; +	} + +	// Assemble the arguments +	Variant::mapType args; +	const XML_Char **attr = attrs; +	while (*attr) { +		// Convert the C string to a std::string +		const std::string key{*(attr++)}; + +		// Search the location of the key +		SourceLocation keyLoc; +		auto it = attributeOffsets.find(key); +		if (it != attributeOffsets.end()) { +			keyLoc = it->second; +		} + +		// Parse the string, pass the location of the key +		std::pair<bool, Variant> value = VariantReader::parseGenericString( +		    *(attr++), parser->getLogger(), keyLoc.getSourceId(), +		    keyLoc.getStart()); + +		// Set the overall location of the parsed element to the attribute +		// location +		value.second.setLocation(keyLoc); + +		// Store the keys in the map +		args.emplace(key, value.second).second; +	} + +	// Fetch the name of the tag, check for special tags +	std::string nameStr(name); +	if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { +		// We're in the top-level and the magic tag is reached -- just +		// ignore it and issue a warning for each argument that has been given +		for (const auto &arg : args) { +			parser->getLogger().warning(std::string("Ignoring attribute \"") + +			                                arg.first + +			                                std::string("\" for magic tag \"") + +			                                TOP_LEVEL_TAG + std::string("\""), +			                            arg.second); +		} +	} else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { +		// Assemble a name variant containing the name minus the prefix +		Variant nameVar = +		    Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); +		nameVar.setLocation(nameLoc); + +		// Issue the "annotationStart" event +		parser->getEvents().annotationStart(nameVar, args); +	} else if (Utils::startsWith(nameStr, 
ANNOTATION_END_PREFIX)) { +		// Assemble a name variant containing the name minus the prefix +		nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size()); + +		// Discard a potentially leading colon +		if (!nameStr.empty() && nameStr[0] == ':') { +			nameStr = nameStr.substr(1); +		} + +		// Assemble the variant containing the name and its location +		Variant nameVar = Variant::fromString(nameStr); +		nameVar.setLocation(nameLoc); + +		// Check whether a "name" attribute was given +		Variant elementName; +		for (const auto &arg : args) { +			if (arg.first == "name") { +				elementName = arg.second; +			} else { +				parser->getLogger().warning( +				    std::string("Ignoring attribute \"") + arg.first + +				        "\" in annotation end tag", +				    arg.second); +			} +		} + +		// Set the annotationEndTagDepth to disallow any further tags to be +		// opened inside the annotation end tag. +		parser->getData().annotationEndTagDepth = parser->getData().depth; + +		// Issue the "annotationEnd" event +		parser->getEvents().annotationEnd(nameVar, args); +	} else { +		// Just issue a "commandStart" event in any other case +		Variant nameVar = Variant::fromString(nameStr); +		nameVar.setLocation(nameLoc); +		parser->getEvents().command(nameVar, args); +	} +} + +static void xmlEndElementHandler(void *ref, const XML_Char *name) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Synchronize the position of the logger with teh position +	xmlSyncLoggerPosition(p); + +	// Abort as long as we're in an annotation end tag +	if (parser->getData().inAnnotationEndTag()) { +		parser->getData().decrDepth(); +		return; +	} + +	// Decrement the current depth +	parser->getData().decrDepth(); + +	// If there is any text data in the buffer, issue that first +	if (parser->getData().hasText()) { +		parser->getEvents().data( +		  
  parser->getData().getText(parser->getReader().getSourceId())); +	} + +	// Abort if the special ousia tag ends here +	std::string nameStr{name}; +	if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { +		return; +	} + +	// Issue the "fieldEnd" event +	parser->getEvents().fieldEnd(); +} + +static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Abort as long as we're in an annotation end tag +	if (parser->getData().inAnnotationEndTag()) { +		return; +	} + +	// Convert the signed (smell the 90's C library here?) length to an usigned +	// value +	size_t ulen = len > 0 ? static_cast<size_t>(len) : 0; + +	// Synchronize the logger position +	SourceLocation loc = xmlSyncLoggerPosition(p, ulen); + +	// Fetch some variables for convenience +	const WhitespaceMode mode = parser->getWhitespaceMode(); +	OsxmlEventParserData &data = parser->getData(); +	std::vector<char> &textBuf = data.textBuf; +	std::vector<char> &whitespaceBuf = data.whitespaceBuf; +	bool &hasWhitespace = data.hasWhitespace; +	size_t &textStart = data.textStart; +	size_t &textEnd = data.textEnd; + +	size_t pos = loc.getStart(); +	for (size_t i = 0; i < ulen; i++, pos++) { +		switch (mode) { +			case WhitespaceMode::PRESERVE: +				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                    textStart, textEnd); +				break; +			case WhitespaceMode::TRIM: +				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                  textStart, textEnd, +				                                  whitespaceBuf); +				break; +			case WhitespaceMode::COLLAPSE: +				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                    textStart, textEnd, +				          
                          hasWhitespace); +				break; +		} +	} +} + +/* Class OsxmlEvents */ + +OsxmlEvents::~OsxmlEvents() {} + +/* Class OsxmlEventParser */ + +OsxmlEventParserData::OsxmlEventParserData() +    : depth(0), +      annotationEndTagDepth(-1), +      hasWhitespace(false), +      textStart(0), +      textEnd(0) +{ +} + +void OsxmlEventParserData::incrDepth() { depth++; } + +void OsxmlEventParserData::decrDepth() +{ +	if (depth > 0) { +		depth--; +	} +	if (depth < annotationEndTagDepth) { +		annotationEndTagDepth = -1; +	} +} + +bool OsxmlEventParserData::inAnnotationEndTag() +{ +	return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); +} + +bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } + +Variant OsxmlEventParserData::getText(SourceId sourceId) +{ +	// Create a variant containing the string data and the location +	Variant var = +	    Variant::fromString(std::string{textBuf.data(), textBuf.size()}); +	var.setLocation({sourceId, textStart, textEnd}); + +	// Reset the text buffers +	textBuf.clear(); +	whitespaceBuf.clear(); +	hasWhitespace = false; +	textStart = 0; +	textEnd = 0; + +	// Return the variant +	return var; +} + +/* Class OsxmlEventParser */ + +OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, +                                   Logger &logger) +    : reader(reader), +      events(events), +      logger(logger), +      whitespaceMode(WhitespaceMode::TRIM), +      data(new OsxmlEventParserData()) +{ +} + +OsxmlEventParser::~OsxmlEventParser() {} + +void OsxmlEventParser::parse() +{ +	// Create the parser object +	GuardedExpatXmlParser p{"UTF-8"}; + +	// Reset the depth +	data->depth = 0; + +	// Pass the reference to this parser instance to the XML handler +	XML_SetUserData(&p, this); +	XML_UseParserAsHandlerArg(&p); + +	// Set the callback functions +	XML_SetStartElementHandler(&p, xmlStartElementHandler); +	XML_SetEndElementHandler(&p, xmlEndElementHandler); +	
XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + +	// Feed data into expat while there is data to process +	constexpr size_t BUFFER_SIZE = 64 * 1024; +	while (true) { +		// Fetch a buffer from expat for the input data +		char *buf = static_cast<char *>(XML_GetBuffer(&p, BUFFER_SIZE)); +		if (!buf) { +			throw OusiaException{"Internal error: XML parser out of memory!"}; +		} + +		// Read into the buffer +		size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + +		// Parse the data and handle any XML error as exception +		if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { +			throw LoggableException{ +			    "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, +			    xmlSyncLoggerPosition(&p)}; +		} + +		// Abort once there are no more bytes in the stream +		if (bytesRead == 0) { +			break; +		} +	} +} + +void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ +	this->whitespaceMode = whitespaceMode; +} + +WhitespaceMode OsxmlEventParser::getWhitespaceMode() const +{ +	return whitespaceMode; +} + +CharReader &OsxmlEventParser::getReader() const { return reader; } + +Logger &OsxmlEventParser::getLogger() const { return logger; } + +OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } + +OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } +} + diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp new file mode 100644 index 0000000..e39245f --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -0,0 +1,217 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlEventParser.hpp + * + * The OsxmlEventParser class is responsible for parsing an XML file and calling + * the corresponding event handler functions if an XML item is found. Event + * handling is performed using a listener interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OSXML_EVENT_PARSER_HPP_ +#define _OSXML_EVENT_PARSER_HPP_ + +#include <memory> +#include <string> + +#include <core/common/Whitespace.hpp> + +namespace ousia { + +// Forward declarations +class Logger; +class Variant; +class OsxmlEventParserData; + +/** + * Interface which defines the callback functions which are called by the + * OsxmlEventParser whenever an event occurs. + */ +class OsxmlEvents { +public: +	/** +	 * Virtual destructor. +	 */ +	virtual ~OsxmlEvents(); + +	/** +	 * Called whenever a command starts. Note that this implicitly always starts +	 * the default field of the command. +	 * +	 * @param name is a string variant containing name and location of the +	 * command. +	 * @param args is a map containing the arguments that were given to the +	 * command. +	 */ +	virtual void command(const Variant &name, const Variant::mapType &args) = 0; + +	/** +	 * Called whenever an annotation starts. Note that this implicitly always +	 * starts the default field of the annotation. +	 * +	 * @param className is a string variant containing the name of the +	 * annotation class and the location of the annotation definition. +	 * @param args is a map variant containing the arguments that were given +	 * to the annotation definition. 
+	 */ +	virtual void annotationStart(const Variant &className, +	                             const Variant::mapType &args) = 0; + +	/** +	 * Called whenever the range of an annotation ends. The callee must +	 * disambiguate the actual annotation that is finished here. +	 * +	 * @param className is a string variant containing the name of the +	 * annotation class that should end here. May be empty (or nullptr), if no +	 * elementName has been specified at the end of the annotation. +	 * @param elementName is the name of the annotation element that should be +	 * ended here. May be empty (or nullptr), if no elementName has been +	 * specified at the end of the annotation. +	 */ +	virtual void annotationEnd(const Variant &className, +	                           const Variant &elementName) = 0; + +	/** +	 * Called whenever the default field which was implicitly started by +	 * commandStart or annotationStart ends. Note that this does not end the +	 * range of an annotation, but the default field of the annotation. To +	 * signal the end of the annotation this, the annotationEnd method will be +	 * invoked. +	 */ +	virtual void fieldEnd() = 0; + +	/** +	 * Called whenever data is found. Whitespace data is handled as specified +	 * and the data has been parsed to the specified variant type. This function +	 * is not called if the parsing failed, the parser prints an error message +	 * instead. +	 * +	 * @param data is the already parsed data that should be passed to the +	 * handler. +	 */ +	virtual void data(const Variant &data) = 0; +}; + +/** + * The OsxmlEventParser class is a wrapper around eXpat which implements the + * specialities of the osxml formats class (like annotation ranges). It notifies + * a specified event handler whenever a command, annotation or data has been + * reached. + */ +class OsxmlEventParser { +private: +	/** +	 * Reference at the internal CharReader instance. 
+	 */ +	CharReader &reader; + +	/** +	 * Set of callback functions to be called whenever an event is triggered. +	 */ +	OsxmlEvents &events; + +	/** +	 * Reference at the Logger object to which error messages or warnings should +	 * be logged. +	 */ +	Logger &logger; + +	/** +	 * Current whitespace mode. +	 */ +	WhitespaceMode whitespaceMode; + +	/** +	 * Data to be used by the internal functions. +	 */ +	std::unique_ptr<OsxmlEventParserData> data; + +public: +	/** +	 * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents +	 * of which the callback functions are called. +	 * +	 * @param reader is a reference to the CharReader instance from which the +	 * XML should be read. +	 * @param events is a refence at an instance of the OsxmlEvents class. All +	 * events are forwarded to this class. +	 * @param logger is the Logger instance to which log messages should be +	 * written. +	 */ +	OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + +	/** +	 * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type) +	 */ +	~OsxmlEventParser(); + +	/** +	 * Performs the actual parsing. Reads the XML using eXpat and calles the +	 * callbacks in the event listener instance whenever something interesting +	 * happens. +	 */ +	void parse(); + +	/** +	 * Sets the whitespace handling mode. +	 * +	 * @param whitespaceMode defines how whitespace in the data should be +	 * handled. +	 */ +	void setWhitespaceMode(WhitespaceMode whitespaceMode); + +	/** +	 * Returns the current whitespace handling mode. +	 * +	 * @return the currently set whitespace handling mode. +	 */ +	WhitespaceMode getWhitespaceMode() const; + +	/** +	 * Returns the internal CharReader reference. +	 * +	 * @return the CharReader reference. +	 */ +	CharReader &getReader() const; + +	/** +	 * Returns the internal Logger reference. +	 * +	 * @return the internal Logger reference. 
+	 */ +	Logger &getLogger() const; + +	/** +	 * Returns the internal OsxmlEvents reference. +	 * +	 * @return the internal OsxmlEvents reference. +	 */ +	OsxmlEvents &getEvents() const; + +	/** +	 * Returns a reference at the internal data. +	 */ +	OsxmlEventParserData &getData() const; +}; +} + +#endif /* _OSXML_EVENT_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp new file mode 100644 index 0000000..c216855 --- /dev/null +++ b/src/formats/osxml/OsxmlParser.cpp @@ -0,0 +1,98 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/stack/GenericParserStates.hpp> +#include <core/parser/stack/Stack.hpp> +#include <core/parser/ParserContext.hpp> + +#include "OsxmlEventParser.hpp" +#include "OsxmlParser.hpp" + +namespace ousia { + +using namespace parser_stack; + +/** + * Class containing the actual OsxmlParser implementation. + */ +class OsxmlParserImplementation : public OsxmlEvents { +private: +	/** +	 * Actual xml parser -- converts the xml stream into a set of events. +	 */ +	OsxmlEventParser parser; + +	/** +	 * Pushdown automaton responsible for converting the xml events into an +	 * actual Node tree. +	 */ +	Stack stack; + +public: +	/** +	 * Constructor of the OsxmlParserImplementation class. 
+	 * +	 * @param reader is a reference to the CharReader instance from which the +	 * XML should be read. +	 * @param ctx is a reference to the ParserContext instance that should be +	 * used. +	 */ +	OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) +	    : parser(reader, *this, ctx.getLogger()), +	      stack(ctx, GenericParserStates) +	{ +	} + +	/** +	 * Starts the actual parsing process. +	 */ +	void parse() { parser.parse(); } + +	void command(const Variant &name, const Variant::mapType &args) override +	{ +		stack.command(name, args); +		stack.fieldStart(true); +	} + +	void annotationStart(const Variant &name, +	                     const Variant::mapType &args) override +	{ +		stack.annotationStart(name, args); +		stack.fieldStart(true); +	} + +	void annotationEnd(const Variant &className, +	                   const Variant &elementName) override +	{ +		stack.annotationEnd(className, elementName); +	} + +	void fieldEnd() override { stack.fieldEnd(); } + +	void data(const Variant &data) override { stack.data(data); } +}; + +/* Class OsxmlParser */ + +void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ +	OsxmlParserImplementation impl(reader, ctx); +	impl.parse(); +} +} + diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp new file mode 100644 index 0000000..0fbf83c --- /dev/null +++ b/src/formats/osxml/OsxmlParser.hpp @@ -0,0 +1,55 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_PARSER_HPP_ +#define _OUSIA_OSXML_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * The OsxmlParser class implements parsing the various types of Ousía XML + * documents using the OsxmlEventParser and Stack classes. + */ +class OsxmlParser : public Parser { +protected: +	/** +	 * Parses the given input stream as an XML file. Nothing is returned; the +	 * parse events are fed into a Stack that operates on the given context. +	 * +	 * @param reader is the CharReader from which the input should be read. +	 * @param ctx is a reference to the ParserContext instance that should be +	 * used. +	 */ +	void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSXML_PARSER_HPP_ */ +  | 
