/* Ousía Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" namespace ousia { /* Class OsxmlEventParser */ /** * Class containing data used by the internal functions. */ class OsxmlEventParserData { public: /** * Current character data buffer. */ TokenizedData data; /** * Contains the current depth of the parsing process. */ ssize_t depth; /** * Set to a value larger or equal to zero if the parser is currently inside * an annotation end tag -- the value represents the depth in which the * tag was opened. */ ssize_t annotationEndTagDepth; /** * Constructor taking the sourceId of the file from which the XML is being * parsed. * * @param sourceId is the source if of the XML file from which the data is * currently being parsed. */ OsxmlEventParserData(SourceId sourceId); /** * Increments the depth. */ void incrDepth(); /** * Decrement the depth and reset the annotationEndTagDepth flag. */ void decrDepth(); /** * Returns true if we're currently inside an end tag. */ bool inAnnotationEndTag(); /** * Returns true if character data is available. * * @return true if character data is available. */ bool hasText(); }; /* Class GuardedExpatXmlParser */ /** * Wrapper class around the XML_Parser pointer which safely frees it whenever * the scope is left (e.g. because an exception was thrown). */ class GuardedExpatXmlParser { private: /** * Internal pointer to the XML_Parser instance. */ XML_Parser parser; public: /** * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS * from the expat library. Throws a parser exception if the XML parser * cannot be initialized. * * @param encoding is the protocol-defined encoding passed to expat (or * nullptr if expat should determine the encoding by itself). */ GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) { parser = XML_ParserCreate(encoding); if (!parser) { throw LoggableException{ "Internal error: Could not create expat XML parser!"}; } } /** * Destuctor of the GuardedExpatXmlParser, frees the XML parser instance. */ ~GuardedExpatXmlParser() { if (parser) { XML_ParserFree(parser); parser = nullptr; } } /** * Returns the XML_Parser pointer. */ XML_Parser operator&() { return parser; } }; /** * Name of the special outer tag used for allowing multiple top-level elements * in an xml file. */ static const std::string TOP_LEVEL_TAG{"ousia"}; /** * Prefix used to indicate the start of an annoation (note the trailing colon). */ static const std::string ANNOTATION_START_PREFIX{"a:start:"}; /** * Prefix used to indicate the end of an annotation. */ static const std::string ANNOTATION_END_PREFIX{"a:end"}; /** * Synchronizes the position of the xml parser with the default location of the * logger instance. * * @param p is a pointer at the xml parser instance. * @param len is the length of the string that should be refered to. * @return the SourceLocation that has been set in the logger. */ static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) { // Fetch the OsxmlEventParser instance OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); // Fetch the current location in the XML file and set the default location // in the logger size_t offs = XML_GetCurrentByteIndex(p); SourceLocation loc = SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; parser->getLogger().setDefaultLocation(loc); // Return the fetched location return loc; } /** * Callback called by eXpat whenever a start handler is reached. */ static void xmlStartElementHandler(void *ref, const XML_Char *name, const XML_Char **attrs) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { TokenizedData &data = parser->getData().data; parser->getEvents().data(data); data.clear(); } // Read the argument locations -- this is only a stupid and slow hack, // but it is necessary, as expat doesn't give use the byte offset of the // arguments. std::map attributeOffsets = OsxmlAttributeLocator::locate(parser->getReader(), XML_GetCurrentByteIndex(p)); // Update the logger position SourceLocation loc = xmlSyncLoggerPosition(p); // Fetch the location of the name SourceLocation nameLoc = loc; auto it = attributeOffsets.find("$tag"); if (it != attributeOffsets.end()) { nameLoc = it->second; } // Increment the current depth parser->getData().incrDepth(); // Make sure we're currently not inside an annotation end tag -- this would // be highly illegal! if (parser->getData().inAnnotationEndTag()) { parser->getLogger().error( "No tags allowed inside an annotation end tag", nameLoc); return; } // Assemble the arguments Variant::mapType args; const XML_Char **attr = attrs; while (*attr) { // Convert the C string to a std::string const std::string key{*(attr++)}; // Ignore xml namespace declarations if (Utils::startsWith(key, "xmlns:") && parser->getData().depth == 1) { attr++; continue; } // Search the location of the key SourceLocation keyLoc; auto it = attributeOffsets.find(key); if (it != attributeOffsets.end()) { keyLoc = it->second; } // Parse the string, pass the location of the key std::pair value = VariantReader::parseGenericString( *(attr++), parser->getLogger(), keyLoc.getSourceId(), keyLoc.getStart()); // Set the overall location of the parsed element to the attribute // location value.second.setLocation(keyLoc); // Store the keys in the map args.emplace(key, value.second); } // Fetch the name of the tag, check for special tags std::string nameStr(name); if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { // We're in the top-level and the magic tag is reached -- just // ignore it and issue a warning for each argument that has been given for (const auto &arg : args) { parser->getLogger().warning(std::string("Ignoring attribute \"") + arg.first + std::string("\" for magic tag \"") + TOP_LEVEL_TAG + std::string("\""), arg.second); } } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { // Assemble a name variant containing the name minus the prefix Variant nameVar = Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); nameVar.setLocation(nameLoc); // Issue the "annotationStart" event parser->getEvents().annotationStart(nameVar, args); } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) { // Assemble a name variant containing the name minus the prefix nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size()); // Discard a potentially leading colon if (!nameStr.empty() && nameStr[0] == ':') { nameStr = nameStr.substr(1); } // Assemble the variant containing the name and its location Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); // Check whether a "name" attribute was given Variant elementName; for (const auto &arg : args) { if (arg.first == "name") { elementName = arg.second; } else { parser->getLogger().warning( std::string("Ignoring attribute \"") + arg.first + "\" in annotation end tag", arg.second); } } // Set the annotationEndTagDepth to disallow any further tags to be // opened inside the annotation end tag. parser->getData().annotationEndTagDepth = parser->getData().depth; // Issue the "annotationEnd" event parser->getEvents().annotationEnd(nameVar, args); } else { // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); parser->getEvents().commandStart(nameVar, args); } } static void xmlEndElementHandler(void *ref, const XML_Char *name) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); // Synchronize the position of the logger with teh position xmlSyncLoggerPosition(p); // Abort as long as we're in an annotation end tag if (parser->getData().inAnnotationEndTag()) { parser->getData().decrDepth(); return; } // Decrement the current depth parser->getData().decrDepth(); // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { TokenizedData &data = parser->getData().data; parser->getEvents().data(data); data.clear(); } // Abort if the special ousia tag ends here std::string nameStr{name}; if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { return; } // Issue the "rangeEnd" event parser->getEvents().rangeEnd(); } static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); // Abort as long as we're in an annotation end tag if (parser->getData().inAnnotationEndTag()) { return; } // Convert the signed (smell the 90's C library here?) length to an usigned // value size_t ulen = len > 0 ? static_cast(len) : 0; // Synchronize the logger position SourceLocation loc = xmlSyncLoggerPosition(p, ulen); // Append the data to the buffer parser->getData().data.append(std::string(s, ulen), loc.getStart()); } /* Class OsxmlEvents */ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) : data(sourceId), depth(0), annotationEndTagDepth(-1) { } void OsxmlEventParserData::incrDepth() { depth++; } void OsxmlEventParserData::decrDepth() { if (depth > 0) { depth--; } if (depth < annotationEndTagDepth) { annotationEndTagDepth = -1; } } bool OsxmlEventParserData::inAnnotationEndTag() { return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } bool OsxmlEventParserData::hasText() { return !data.empty(); } /* Class OsxmlEventParser */ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger) : reader(reader), events(events), logger(logger), data(new OsxmlEventParserData(reader.getSourceId())) { } OsxmlEventParser::~OsxmlEventParser() {} void OsxmlEventParser::parse() { // Create the parser object GuardedExpatXmlParser p{"UTF-8"}; // Reset the depth data->depth = 0; // Pass the reference to this parser instance to the XML handler XML_SetUserData(&p, this); XML_UseParserAsHandlerArg(&p); // Set the callback functions XML_SetStartElementHandler(&p, xmlStartElementHandler); XML_SetEndElementHandler(&p, xmlEndElementHandler); XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); // Feed data into expat while there is data to process constexpr size_t BUFFER_SIZE = 64 * 1024; while (true) { // Fetch a buffer from expat for the input data char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); if (!buf) { throw OusiaException{"Internal error: XML parser out of memory!"}; } // Read into the buffer size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); // Parse the data and handle any XML error as exception if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { throw LoggableException{ "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, xmlSyncLoggerPosition(&p)}; } // Abort once there are no more bytes in the stream if (bytesRead == 0) { break; } } } CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } }