/*
Ousía
Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include "OsxmlEventParser.hpp"
namespace ousia {
/**
 * Plain state holder used by the internal expat callback functions: tracks the
 * current element nesting depth and whether the parser is currently inside an
 * annotation end tag.
 */
class OsxmlEventParserData {
public:
    /**
     * Contains the current depth of the parsing process.
     */
    ssize_t depth;

    /**
     * Set to a value larger or equal to zero if the parser is currently inside
     * an annotation end tag -- the value represents the depth in which the
     * tag was opened. A negative value means that no annotation end tag is
     * currently open.
     */
    ssize_t annotationEndTagDepth;

    /**
     * Default constructor. Starts at depth zero, outside of any annotation
     * end tag.
     */
    OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {}

    /**
     * Increments the depth.
     */
    void incrDepth() { depth++; }

    /**
     * Decrements the depth (never below zero) and resets the
     * annotationEndTagDepth flag once the depth drops below the depth at which
     * the annotation end tag was opened.
     */
    void decrDepth()
    {
        if (depth > 0) {
            depth--;
        }
        if (depth < annotationEndTagDepth) {
            annotationEndTagDepth = -1;
        }
    }

    /**
     * Returns true if we're currently inside an annotation end tag.
     *
     * The explicit check for a non-negative annotationEndTagDepth is required:
     * the "not set" sentinel is -1 and the depth is never negative, so the
     * bare comparison would always be true.
     */
    bool inAnnotationEndTag() const
    {
        return annotationEndTagDepth >= 0 && depth >= annotationEndTagDepth;
    }
};
namespace {
/**
 * Wrapper class around the XML_Parser pointer which safely frees it whenever
 * the scope is left (e.g. because an exception was thrown).
 *
 * The wrapper is the sole owner of the parser instance and therefore cannot be
 * copied -- a copy would free the same XML_Parser twice.
 */
class ScopedExpatXmlParser {
private:
    /**
     * Internal pointer to the XML_Parser instance.
     */
    XML_Parser parser;

public:
    /**
     * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreate
     * from the expat library. Throws a LoggableException if the XML parser
     * cannot be initialized.
     *
     * @param encoding is the protocol-defined encoding passed to expat (or
     * nullptr if expat should determine the encoding by itself).
     */
    ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
    {
        parser = XML_ParserCreate(encoding);
        if (!parser) {
            throw LoggableException{
                "Internal error: Could not create expat XML parser!"};
        }
    }

    /**
     * Copying is disabled, the parser handle has exactly one owner.
     */
    ScopedExpatXmlParser(const ScopedExpatXmlParser &) = delete;
    ScopedExpatXmlParser &operator=(const ScopedExpatXmlParser &) = delete;

    /**
     * Destructor of the ScopedExpatXmlParser, frees the XML parser instance.
     */
    ~ScopedExpatXmlParser()
    {
        if (parser) {
            XML_ParserFree(parser);
            parser = nullptr;
        }
    }

    /**
     * Returns the XML_Parser pointer. Overloading the address-of operator
     * allows the wrapper to be passed to the expat functions as "&p".
     */
    XML_Parser operator&() { return parser; }
};
/**
 * States of the small state machine that scans a start tag for the byte
 * offsets of its attributes.
 */
enum class XmlAttributeState {
    /** Reading the tag name that directly follows the "<" character. */
    IN_TAG_NAME,

    /** Skipping whitespace while looking for the next attribute name. */
    SEARCH_ATTR,

    /** Collecting the characters of an attribute name. */
    IN_ATTR_NAME,

    /** The attribute name ended with whitespace, waiting for the "=". */
    HAS_ATTR_NAME,

    /** The "=" has been read, waiting for the opening quote. */
    HAS_ATTR_EQUALS,

    /** Inside the quoted attribute value. */
    IN_ATTR_DATA,
};
/**
* Function used to reconstruct the location of the attributes of a XML tag in
* the source code. This is necessary, as the xml parser only returns an offset
* to the begining of a tag and not to the position of the individual arguments.
*
* @param reader is the char reader from which the character data should be
* read.
* @param offs is a byte offset in the xml file pointing at the "<" character of
* the tag.
* @return a map from attribute keys to the corresponding location (including
* range) of the atribute. Also contains the location of the tagname in the
* form of the virtual attribute "$tag".
*/
static std::map xmlReconstructAttributeOffsets(
CharReader &reader, size_t offs)
{
std::map res;
// Fork the reader, we don't want to mess up the XML parsing process, do we?
CharReaderFork readerFork = reader.fork();
// Move the read cursor to the start location, abort if this does not work
if (!location.isValid() || offs != readerFork.seek(offs)) {
return res;
}
// Now all we need to do is to implement one half of an XML parser. As this
// is inherently complicated we'll totaly fail at it. Don't care. All we
// want to get is those darn offsets for pretty error messages... (and we
// can assume the XML is valid as it was already read by expat)
XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
char c;
std::stringstream attrName;
while (readerFork.read(c)) {
// Abort at the end of the tag
if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
return res;
}
// One state machine to rule them all, one state machine to find them,
// One state machine to bring them all and in the darkness bind them
// (the byte offsets)
switch (state) {
case XmlAttributeState::IN_TAG_NAME:
if (Utils::isWhitespace(c)) {
res.emplace("$tag",
SourceLocation{reader.getSourceId(), offs + 1,
readerFork.getOffset() - 1});
state = XmlAttributeState::SEARCH_ATTR;
}
break;
case XmlAttributeState::SEARCH_ATTR:
if (!Utils::isWhitespace(c)) {
state = XmlAttributeState::IN_ATTR_NAME;
attrName << c;
}
break;
case XmlAttributeState::IN_ATTR_NAME:
if (Utils::isWhitespace(c)) {
state = XmlAttributeState::HAS_ATTR_NAME;
} else if (c == '=') {
state = XmlAttributeState::HAS_ATTR_EQUALS;
} else {
attrName << c;
}
break;
case XmlAttributeState::HAS_ATTR_NAME:
if (!Utils::isWhitespace(c)) {
if (c == '=') {
state = XmlAttributeState::HAS_ATTR_EQUALS;
break;
}
// Well, this is a strange XML file... We expected to
// see a '=' here! Try to continue with the
// "HAS_ATTR_EQUALS" state as this state will hopefully
// inlcude some error recovery
} else {
// Skip whitespace here
break;
}
// Fallthrough
case XmlAttributeState::HAS_ATTR_EQUALS:
if (!Utils::isWhitespace(c)) {
if (c == '"') {
// Here we are! We have found the beginning of an
// attribute. Let's quickly lock the current offset away
// in the result map
res.emplace(attrName.str(),
SourceLocation{reader.getSourceId(),
readerFork.getOffset()});
state = XmlAttributeState::IN_ATTR_DATA;
} else {
// No, this XML file is not well formed. Assume we're in
// an attribute name once again
attrName.str(std::string{&c, 1});
state = XmlAttributeState::IN_ATTR_NAME;
}
}
break;
case XmlAttributeState::IN_ATTR_DATA:
if (c == '"') {
// We're at the end of the attribute data, set the end
// location
auto it = res.find(attrName.str());
if (it != res.end()) {
it->second.setEnd(readerFork.getOffset() - 1);
}
// Reset the attribute name and restart the search
attrName.str(std::string{});
state = XmlAttributeState::SEARCH_ATTR;
}
break;
}
}
return res;
}
/**
 * Synchronizes the position of the xml parser with the default location of the
 * logger instance.
 *
 * @param p is a pointer at the xml parser instance. Its user data must point
 * at the OsxmlEventParser instance that drives the parser.
 * @param len is the length of the string that should be referred to.
 * @return the SourceLocation that has been set in the logger.
 */
static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0)
{
    // Fetch the OsxmlEventParser instance
    OsxmlEventParser *parser =
        static_cast<OsxmlEventParser *>(XML_GetUserData(p));

    // Fetch the current location in the XML file. expat reports a negative
    // index if no position is available, fall back to the start of the file
    // instead of wrapping around to a huge unsigned offset.
    const auto index = XML_GetCurrentByteIndex(p);
    const size_t offs = index < 0 ? 0 : static_cast<size_t>(index);

    // Set the default location in the logger
    SourceLocation loc{parser->getCharReader().getSourceId(), offs,
                       offs + len};
    parser->getLogger().setDefaultLocation(loc);

    // Return the fetched location
    return loc;
}
/**
 * Tag name prefix which marks the start of an annotation.
 */
static const std::string ANNOTATION_START_PREFIX = "a:start:";

/**
 * Tag name prefix which marks the end of an annotation.
 */
static const std::string ANNOTATION_END_PREFIX = "a:end";
/**
* Callback called by eXpat whenever a start handler is reached.
*/
static void xmlStartElementHandler(void *ref, const XML_Char *name,
const XML_Char **attrs)
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast(ref);
OsxmlEventParser *parser = static_cast(XML_GetUserData(p));
// Read the argument locations -- this is only a stupid and slow hack,
// but it is necessary, as expat doesn't give use the byte offset of the
// arguments.
std::map attributeOffsets =
xmlReconstructXMLAttributeOffsets(*userData->reader,
XML_GetCurrentByteIndex(p));
// Update the logger position
SourceLocation loc = xmlSyncLoggerPosition(p);
// Fetch the location of the name
SourceLocation nameLoc = loc;
auto it = attributeOffsets.find("$tag");
if (it != attributeOffsets.end()) {
nameLoc = it->second;
}
// Increment the current depth
parser->getData().incrDepth();
// Make sure we're currently not inside an annotation end tag -- this would
// be highly illegal!
if (parser->getData().inAnnotationEndTag()) {
logger.error("No tags allowed inside an annotation end tag", nameLoc);
return;
}
// Assemble the arguments
Variant::mapType args;
const XML_Char **attr = attrs;
while (*attr) {
// Convert the C string to a std::string
const std::string key{*(attr++)};
// Search the location of the key
SourceLocation keyLoc;
auto it = attributeOffsets.find(key);
if (it != attributeOffsets.end()) {
keyLoc = it->second;
}
// Parse the string, pass the location of the key
std::pair value = VariantReader::parseGenericString(
*(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(),
keyLoc.getStart());
// Set the overall location of the parsed element to the attribute
// location
value.second->setLocation(keyLoc);
// Store the
if (!args.emplace(key, value.second).second) {
parser->getLogger().warning(
std::string("Attribute \"") + key +
"\" defined multiple times, only using first definition",
keyLoc);
}
}
// Fetch the name of the tag, check for special tags
std::string nameStr(name);
if (nameStr == "ousia" && parser->getData().depth == 1) {
// We're in the top-level and the magic "ousia" tag is reached -- just
// ignore it and issue a warning for each argument that has been given
for (const auto &arg : args) {
parser->getLogger().warning(
std::string("Ignoring attribute \"") + arg.first +
std::string("\" for magic tag \"ousia\""),
arg.second);
}
} else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) {
// Assemble a name variant containing the name minus the prefix
Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size());
nameVar.setLocation(nameLoc);
// Issue the "annotationStart" event
parser->getEvents().annotationStart(nameVar, args);
} else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) {
// Assemble a name variant containing the name minus the prefix
nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size());
// Discard a potentially leading colon
if (!nameStr.empty() && nameStr[0] == ':') {
nameStr = nameStr.substr(1);
}
// Assemble the variant containing the name and its location
Variant nameVar = Variant::fromString(nameStr);
nameVar.setLocation(nameLoc);
// Check whether a "name" attribute was given
Variant elementName;
for (const auto &arg : args) {
if (arg.first == "name") {
elementName = arg.second;
} else {
parser->getLogger().warning(
std::string("Ignoring attribute \"") + arg.first +
"\" in annotation end tag",
arg.second);
}
}
// Set the annotationEndTagDepth to disallow any further tags to be
// opened inside the annotation end tag.
parser->getData().annotationEndTagDepth = parser->getData().depth;
// Issue the "annotationEnd" event
parser->getEvents().annotationEnd(nameVar, args);
} else {
// Just issue a "commandStart" event in any other case
Variant nameVar = Variant::fromString(nameStr);
nameVar.setLocation(nameLoc);
parser->getEvents().commandStart(nameVar, args);
}
}
static void xmlEndElementHandler(void *p, const XML_Char *name)
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast(ref);
OsxmlEventParser *parser = static_cast(XML_GetUserData(p));
// Synchronize the position of the logger with teh position
xmlSyncLoggerPosition(parser);
// Decrement the current depth
parser->getData().decrDepth();
// Abort as long as we're in an annotation end tag
if (parser->getData().inAnnotationEndTag()) {
return;
}
// Abort if the special ousia tag ends here
if (nameStr == "ousia" && parser->getData().depth == 0) {
return;
}
// Issue the "fieldEnd" event
parser->getEvents().fieldEnd();
}
static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len)
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast(ref);
OsxmlEventParser *parser = static_cast(XML_GetUserData(p));
// TODO
/* size_t ulen = len > 0 ? static_cast(len) : 0;
syncLoggerPosition(parser, ulen);
const std::string data = Utils::trim(std::string{s, ulen});
if (!data.empty()) {
stack->data(data);
}*/
}
}
/* Class OsxmlEventParser */
/**
 * Constructor of the OsxmlEventParser class. Initializes the reader, events
 * and logger members from the given arguments, sets the whitespace mode to
 * WhitespaceMode::COLLAPSE and allocates a fresh OsxmlEventParserData
 * instance for the internal parser state.
 *
 * @param reader is used to initialize the reader member.
 * @param events is used to initialize the events member.
 * @param logger is used to initialize the logger member.
 */
OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
Logger &logger)
: reader(reader),
events(events),
logger(logger),
whitespaceMode(WhitespaceMode::COLLAPSE),
data(new OsxmlEventParserData())
{
}
/**
 * Reads the XML document from the given reader in chunks, feeds it into expat
 * and lets the registered handlers translate it into events.
 *
 * NOTE(review): the attribute locations and the source id used by the handlers
 * are taken from getCharReader(), not from the reader passed to this function
 * -- presumably both are expected to refer to the same reader, confirm against
 * the callers.
 *
 * @param reader is the reader from which the raw XML data is read.
 * @throws OusiaException if expat cannot provide an input buffer.
 * @throws LoggableException if the parser cannot be created or the document
 * is not well-formed.
 */
void OsxmlEventParser::parse(CharReader &reader)
{
    // Create the parser object
    ScopedExpatXmlParser p{"UTF-8"};

    // Reset the parser state, a previous (possibly aborted) run must not
    // influence this one
    OsxmlEventParserData &state = getData();
    state.depth = 0;
    state.annotationEndTagDepth = -1;

    // Pass the reference to this parser instance to the XML handlers
    XML_SetUserData(&p, this);
    XML_UseParserAsHandlerArg(&p);

    // Set the callback functions
    XML_SetStartElementHandler(&p, xmlStartElementHandler);
    XML_SetEndElementHandler(&p, xmlEndElementHandler);
    XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler);

    // Feed data into expat while there is data to process
    constexpr size_t BUFFER_SIZE = 64 * 1024;
    while (true) {
        // Fetch a buffer from expat for the input data
        char *buf =
            static_cast<char *>(XML_GetBuffer(&p, static_cast<int>(BUFFER_SIZE)));
        if (!buf) {
            throw OusiaException{"Internal error: XML parser out of memory!"};
        }

        // Read into the buffer
        const size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE);

        // Parse the data and handle any XML error as exception. An empty read
        // marks the final chunk.
        if (!XML_ParseBuffer(&p, static_cast<int>(bytesRead),
                             bytesRead == 0)) {
            throw LoggableException{
                "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))},
                xmlSyncLoggerPosition(&p)};
        }

        // Abort once there are no more bytes in the stream
        if (bytesRead == 0) {
            break;
        }
    }
}
/**
 * Stores the given whitespace mode in the whitespaceMode member.
 *
 * @param whitespaceMode is the new whitespace mode.
 */
void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
{
this->whitespaceMode = whitespaceMode;
}
/**
 * Returns the reader member that was set in the constructor.
 */
CharReader &OsxmlEventParser::getCharReader() { return reader; }
/**
 * Returns the logger member.
 */
Logger &OsxmlEventParser::getLogger() { return logger; }
/**
 * Returns the events member on which the parser events are issued.
 */
OsxmlEvents &OsxmlEventParser::getEvents() { return events; }
/**
 * Returns a reference to the internal parser state held by the data member.
 */
OsxmlEventParserData &OsxmlEventParser::getData() { return *data; }
}