summaryrefslogtreecommitdiff
path: root/src/formats
diff options
context:
space:
mode:
author Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-15 00:02:54 +0100
committer Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-15 00:02:54 +0100
commit 2659b4595d809cbd69a77e5ff7e2fc08d225f065 (patch)
tree be6a39fcf7d7070494076832a2e652ea1aa4c91e /src/formats
parent 974afd3fdc54380a43445a180263fb162e1ff2c0 (diff)
Tidied OsxmlEventParser up, implemented correct whitespace handling, started to write unit tests for the osxml parser
Diffstat (limited to 'src/formats')
-rw-r--r-- src/formats/osxml/OsxmlAttributeLocator.cpp 144
-rw-r--r-- src/formats/osxml/OsxmlAttributeLocator.hpp 67
-rw-r--r-- src/formats/osxml/OsxmlEventParser.cpp 425
-rw-r--r-- src/formats/osxml/OsxmlEventParser.hpp 44
4 files changed, 462 insertions, 218 deletions
diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp
new file mode 100644
index 0000000..e37446a
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.cpp
@@ -0,0 +1,144 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/Location.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+
+namespace ousia {
+
+/**
+ * Enum used internally in the state machine of the XML argument parser.
+ */
+enum class XmlAttributeState {
+ IN_TAG_NAME, // initial state: reading the tag name until the first whitespace
+ SEARCH_ATTR, // skipping whitespace until an attribute name begins
+ IN_ATTR_NAME, // collecting the characters of an attribute name
+ HAS_ATTR_NAME, // name was ended by whitespace, '=' is expected next
+ HAS_ATTR_EQUALS, // '=' was read, the opening '"' is expected next
+ IN_ATTR_DATA // inside the quoted value until the closing '"' is read
+};
+
+std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate(
+ CharReader &reader, size_t offs)
+{
+ std::map<std::string, SourceLocation> res;
+
+ // Work on a fork of the reader so the ongoing XML parsing is not disturbed
+ CharReaderFork readerFork = reader.fork();
+
+ // Position the fork at offs; return the empty map if seek() ends elsewhere
+ if (offs != readerFork.seek(offs)) {
+ return res;
+ }
+
+ // Minimal scanner for a single start tag (not a full XML parser): it only
+ // collects byte offsets for error messages and assumes expat has already
+ // accepted the input. NOTE(review): only '"' is recognised as value
+ // delimiter, single-quoted values take the recovery branch below -- confirm.
+ XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
+ char c;
+ std::stringstream attrName; // NOTE(review): <sstream> is not included directly in this file
+ while (readerFork.read(c)) {
+ // A '>' that is not part of attribute data ends the tag and the scan
+ if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
+ return res;
+ }
+
+ // NOTE(review): because of the check above, a tag name that is directly
+ // followed by '>' (no whitespace) returns before "$tag" is stored --
+ // confirm callers cope with a missing "$tag" entry. State machine follows:
+ switch (state) {
+ case XmlAttributeState::IN_TAG_NAME:
+ if (Utils::isWhitespace(c)) {
+ res.emplace("$tag",
+ SourceLocation{reader.getSourceId(), offs + 1,
+ readerFork.getOffset() - 1});
+ state = XmlAttributeState::SEARCH_ATTR;
+ }
+ break;
+ case XmlAttributeState::SEARCH_ATTR:
+ if (!Utils::isWhitespace(c)) {
+ state = XmlAttributeState::IN_ATTR_NAME;
+ attrName << c;
+ }
+ break;
+ case XmlAttributeState::IN_ATTR_NAME:
+ if (Utils::isWhitespace(c)) {
+ state = XmlAttributeState::HAS_ATTR_NAME;
+ } else if (c == '=') {
+ state = XmlAttributeState::HAS_ATTR_EQUALS;
+ } else {
+ attrName << c;
+ }
+ break;
+ case XmlAttributeState::HAS_ATTR_NAME:
+ if (!Utils::isWhitespace(c)) {
+ if (c == '=') {
+ state = XmlAttributeState::HAS_ATTR_EQUALS;
+ break;
+ }
+ // Any other character where '=' was expected: fall through
+ // to the HAS_ATTR_EQUALS handling, whose else-branch
+ // restarts the attribute name with this character (the
+ // previously collected name is replaced)
+ } else {
+ // Whitespace between the name and '=' is skipped
+ break;
+ }
+ // Intentional fallthrough (non-whitespace character other than '=')
+ case XmlAttributeState::HAS_ATTR_EQUALS:
+ if (!Utils::isWhitespace(c)) {
+ if (c == '"') {
+ // Opening quote of the attribute value: store a location
+ // built from the current fork offset under the collected
+ // name; its end is set when the closing quote is read
+ res.emplace(attrName.str(),
+ SourceLocation{reader.getSourceId(),
+ readerFork.getOffset()});
+ state = XmlAttributeState::IN_ATTR_DATA;
+ } else {
+ // Not a '"': treat c as the first character of a new attribute name.
+ // NOTE(review): after str(s) a default-mode stringstream writes from the start again, so the next << likely overwrites c -- verify
+ attrName.str(std::string{&c, 1});
+ state = XmlAttributeState::IN_ATTR_NAME;
+ }
+ }
+ break;
+ case XmlAttributeState::IN_ATTR_DATA:
+ if (c == '"') {
+ // Closing quote: complete the location stored for this
+ // attribute name by setting its end
+ auto it = res.find(attrName.str());
+ if (it != res.end()) {
+ it->second.setEnd(readerFork.getOffset() - 1);
+ }
+
+ // Clear the collected name and look for the next attribute
+ attrName.str(std::string{});
+ state = XmlAttributeState::SEARCH_ATTR;
+ }
+ break;
+ }
+ }
+ return res;
+}
+}
+
diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp
new file mode 100644
index 0000000..f9a3437
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.hpp
@@ -0,0 +1,67 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlAttributeLocator.hpp
+ *
+ * Contains a class used for locating the byte offsets of the attributes given
+ * in an XML tag.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+
+#include <map>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class SourceLocation;
+
+/**
+ * Class containing one static function for locating the byte offsets of the
+ * attributes in an XML tag. These are not provided by our XML parser, so we
+ * have to do this manually.
+ */
+class OsxmlAttributeLocator {
+public:
+ /**
+ * Function used to reconstruct the location of the attributes of an XML
+ * tag in the source code. This is necessary, as the XML parser only returns
+ * an offset to the beginning of a tag and not to the position of the
+ * individual attributes.
+ *
+ * @param reader is the char reader from which the character data should be
+ * read.
+ * @param offs is a byte offset in the XML file pointing at the "<"
+ * character of the tag.
+ * @return a map from attribute keys to the corresponding location (including
+ * range) of the attribute. The tag name location is stored as the virtual
+ * attribute "$tag" (only emitted once whitespace follows the tag name).
+ */
+ static std::map<std::string, SourceLocation> locate(CharReader &reader,
+ size_t offs);
+};
+
+}
+
+#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index 2ef170e..b4aff77 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -18,14 +18,22 @@
#include <expat.h>
+#include <vector>
+
+#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Variant.hpp>
+#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
+#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
namespace ousia {
+/* Class OsxmlEventParser */
+
/**
* Class containing data used by the internal functions.
*/
@@ -44,40 +52,74 @@ public:
ssize_t annotationEndTagDepth;
/**
+ * Current character data buffer.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Current whitespace buffer (for the trimming whitespace mode)
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Flag indicating whether a whitespace character was present (for the
+ * collapsing whitespace mode).
+ */
+ bool hasWhitespace;
+
+ /**
+ * Current character data start.
+ */
+ size_t textStart;
+
+ /**
+ * Current character data end.
+ */
+ size_t textEnd;
+
+ /**
* Default constructor.
*/
- OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {}
+ OsxmlEventParserData();
/**
* Increments the depth.
*/
- void incrDepth() { depth++; }
+ void incrDepth();
/**
* Decrement the depth and reset the annotationEndTagDepth flag.
*/
- void decrDepth()
- {
- if (depth > 0) {
- depth--;
- }
- if (depth < annotationEndTagDepth) {
- annotationEndTagDepth = -1;
- }
- }
+ void decrDepth();
/**
* Returns true if we're currently inside an end tag.
*/
- bool inAnnotationEndTag() { depth >= annotationEndTagDepth; }
+ bool inAnnotationEndTag();
+
+ /**
+ * Returns true if character data is available.
+ *
+ * @return true if character data is available.
+ */
+ bool hasText();
+
+ /**
+ * Returns a Variant containing the character data and its location.
+ *
+ * @return a string variant containing the text data and the character
+ * location.
+ */
+ Variant getText(SourceId sourceId);
};
-namespace {
+/* Class GuardedExpatXmlParser */
+
/**
* Wrapper class around the XML_Parser pointer which safely frees it whenever
* the scope is left (e.g. because an exception was thrown).
*/
-class ScopedExpatXmlParser {
+class GuardedExpatXmlParser {
private:
/**
* Internal pointer to the XML_Parser instance.
@@ -86,14 +128,14 @@ private:
public:
/**
- * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS
+ * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS
* from the expat library. Throws a parser exception if the XML parser
* cannot be initialized.
*
* @param encoding is the protocol-defined encoding passed to expat (or
* nullptr if expat should determine the encoding by itself).
*/
- ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
+ GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
{
parser = XML_ParserCreate(encoding);
if (!parser) {
@@ -103,9 +145,9 @@ public:
}
/**
- * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance.
+ * Destructor of the GuardedExpatXmlParser, frees the XML parser instance.
*/
- ~ScopedExpatXmlParser()
+ ~GuardedExpatXmlParser()
{
if (parser) {
XML_ParserFree(parser);
@@ -120,134 +162,20 @@ public:
};
/**
- * Enum used internally in the statemachine of the micro-xml argument parser.
+ * Name of the special outer tag used for allowing multiple top-level elements
+ * in an xml file.
*/
-enum class XmlAttributeState {
- IN_TAG_NAME,
- SEARCH_ATTR,
- IN_ATTR_NAME,
- HAS_ATTR_NAME,
- HAS_ATTR_EQUALS,
- IN_ATTR_DATA
-};
+static const std::string TOP_LEVEL_TAG{"ousia"};
/**
- * Function used to reconstruct the location of the attributes of a XML tag in
- * the source code. This is necessary, as the xml parser only returns an offset
- * to the begining of a tag and not to the position of the individual arguments.
- *
- * @param reader is the char reader from which the character data should be
- * read.
- * @param offs is a byte offset in the xml file pointing at the "<" character of
- * the tag.
- * @return a map from attribute keys to the corresponding location (including
- * range) of the atribute. Also contains the location of the tagname in the
- * form of the virtual attribute "$tag".
+ * Prefix used to indicate the start of an annotation (note the trailing colon)
*/
-static std::map<std::string, SourceLocation> xmlReconstructAttributeOffsets(
- CharReader &reader, size_t offs)
-{
- std::map<std::string, SourceLocation> res;
-
- // Fork the reader, we don't want to mess up the XML parsing process, do we?
- CharReaderFork readerFork = reader.fork();
-
- // Move the read cursor to the start location, abort if this does not work
- if (!location.isValid() || offs != readerFork.seek(offs)) {
- return res;
- }
-
- // Now all we need to do is to implement one half of an XML parser. As this
- // is inherently complicated we'll totaly fail at it. Don't care. All we
- // want to get is those darn offsets for pretty error messages... (and we
- // can assume the XML is valid as it was already read by expat)
- XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
- char c;
- std::stringstream attrName;
- while (readerFork.read(c)) {
- // Abort at the end of the tag
- if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
- return res;
- }
+static const std::string ANNOTATION_START_PREFIX{"a:start:"};
- // One state machine to rule them all, one state machine to find them,
- // One state machine to bring them all and in the darkness bind them
- // (the byte offsets)
- switch (state) {
- case XmlAttributeState::IN_TAG_NAME:
- if (Utils::isWhitespace(c)) {
- res.emplace("$tag",
- SourceLocation{reader.getSourceId(), offs + 1,
- readerFork.getOffset() - 1});
- state = XmlAttributeState::SEARCH_ATTR;
- }
- break;
- case XmlAttributeState::SEARCH_ATTR:
- if (!Utils::isWhitespace(c)) {
- state = XmlAttributeState::IN_ATTR_NAME;
- attrName << c;
- }
- break;
- case XmlAttributeState::IN_ATTR_NAME:
- if (Utils::isWhitespace(c)) {
- state = XmlAttributeState::HAS_ATTR_NAME;
- } else if (c == '=') {
- state = XmlAttributeState::HAS_ATTR_EQUALS;
- } else {
- attrName << c;
- }
- break;
- case XmlAttributeState::HAS_ATTR_NAME:
- if (!Utils::isWhitespace(c)) {
- if (c == '=') {
- state = XmlAttributeState::HAS_ATTR_EQUALS;
- break;
- }
- // Well, this is a strange XML file... We expected to
- // see a '=' here! Try to continue with the
- // "HAS_ATTR_EQUALS" state as this state will hopefully
- // inlcude some error recovery
- } else {
- // Skip whitespace here
- break;
- }
- // Fallthrough
- case XmlAttributeState::HAS_ATTR_EQUALS:
- if (!Utils::isWhitespace(c)) {
- if (c == '"') {
- // Here we are! We have found the beginning of an
- // attribute. Let's quickly lock the current offset away
- // in the result map
- res.emplace(attrName.str(),
- SourceLocation{reader.getSourceId(),
- readerFork.getOffset()});
- state = XmlAttributeState::IN_ATTR_DATA;
- } else {
- // No, this XML file is not well formed. Assume we're in
- // an attribute name once again
- attrName.str(std::string{&c, 1});
- state = XmlAttributeState::IN_ATTR_NAME;
- }
- }
- break;
- case XmlAttributeState::IN_ATTR_DATA:
- if (c == '"') {
- // We're at the end of the attribute data, set the end
- // location
- auto it = res.find(attrName.str());
- if (it != res.end()) {
- it->second.setEnd(readerFork.getOffset() - 1);
- }
-
- // Reset the attribute name and restart the search
- attrName.str(std::string{});
- state = XmlAttributeState::SEARCH_ATTR;
- }
- break;
- }
- }
- return res;
-}
+/**
+ * Prefix used to indicate the end of an annotation.
+ */
+static const std::string ANNOTATION_END_PREFIX{"a:end"};
/**
* Synchronizes the position of the xml parser with the default location of the
@@ -268,23 +196,13 @@ static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0)
size_t offs = XML_GetCurrentByteIndex(p);
SourceLocation loc =
SourceLocation{parser->getReader().getSourceId(), offs, offs + len};
- parser->getLogger().setDefaultLocation(location);
+ parser->getLogger().setDefaultLocation(loc);
// Return the fetched location
return loc;
}
/**
- * Prefix used to indicate the start of an annoation,
- */
-static const std::string ANNOTATION_START_PREFIX{"a:start:"};
-
-/**
- * Prefix used to indicate the end of an annotation.
- */
-static const std::string ANNOTATION_END_PREFIX{"a:end"};
-
-/**
* Callback called by eXpat whenever a start handler is reached.
*/
static void xmlStartElementHandler(void *ref, const XML_Char *name,
@@ -292,14 +210,21 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast<XML_Parser>(ref);
- OsxmlEventParser *parser = static_cast<XMLUserData *>(XML_GetUserData(p));
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // If there is any text data in the buffer, issue that first
+ if (parser->getData().hasText()) {
+ parser->getEvents().data(
+ parser->getData().getText(parser->getReader().getSourceId()));
+ }
// Read the argument locations -- this is only a stupid and slow hack,
// but it is necessary, as expat doesn't give use the byte offset of the
// arguments.
std::map<std::string, SourceLocation> attributeOffsets =
- xmlReconstructXMLAttributeOffsets(*userData->reader,
- XML_GetCurrentByteIndex(p));
+ OsxmlAttributeLocator::locate(parser->getReader(),
+ XML_GetCurrentByteIndex(p));
// Update the logger position
SourceLocation loc = xmlSyncLoggerPosition(p);
@@ -316,7 +241,8 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// Make sure we're currently not inside an annotation end tag -- this would
// be highly illegal!
if (parser->getData().inAnnotationEndTag()) {
- logger.error("No tags allowed inside an annotation end tag", nameLoc);
+ parser->getLogger().error(
+ "No tags allowed inside an annotation end tag", nameLoc);
return;
}
@@ -336,36 +262,33 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// Parse the string, pass the location of the key
std::pair<bool, Variant> value = VariantReader::parseGenericString(
- *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(),
+ *(attr++), parser->getLogger(), keyLoc.getSourceId(),
keyLoc.getStart());
// Set the overall location of the parsed element to the attribute
// location
- value.second->setLocation(keyLoc);
-
- // Store the
- if (!args.emplace(key, value.second).second) {
- parser->getLogger().warning(
- std::string("Attribute \"") + key +
- "\" defined multiple times, only using first definition",
- keyLoc);
- }
+ value.second.setLocation(keyLoc);
+
+ // Store the keys in the map
+ args.emplace(key, value.second).second;
}
// Fetch the name of the tag, check for special tags
std::string nameStr(name);
- if (nameStr == "ousia" && parser->getData().depth == 1) {
- // We're in the top-level and the magic "ousia" tag is reached -- just
+ if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) {
+ // We're in the top-level and the magic tag is reached -- just
// ignore it and issue a warning for each argument that has been given
for (const auto &arg : args) {
- parser->getLogger().warning(
- std::string("Ignoring attribute \"") + arg.first +
- std::string("\" for magic tag \"ousia\""),
- arg.second);
+ parser->getLogger().warning(std::string("Ignoring attribute \"") +
+ arg.first +
+ std::string("\" for magic tag \"") +
+ TOP_LEVEL_TAG + std::string("\""),
+ arg.second);
}
} else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) {
// Assemble a name variant containing the name minus the prefix
- Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size());
+ Variant nameVar =
+ Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size()));
nameVar.setLocation(nameLoc);
// Issue the "annotationStart" event
@@ -410,25 +333,34 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
}
}
-static void xmlEndElementHandler(void *p, const XML_Char *name)
+static void xmlEndElementHandler(void *ref, const XML_Char *name)
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast<XML_Parser>(ref);
- OsxmlEventParser *parser = static_cast<XMLUserData *>(XML_GetUserData(p));
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
// Synchronize the position of the logger with teh position
- xmlSyncLoggerPosition(parser);
-
- // Decrement the current depth
- parser->getData().decrDepth();
+ xmlSyncLoggerPosition(p);
// Abort as long as we're in an annotation end tag
if (parser->getData().inAnnotationEndTag()) {
+ parser->getData().decrDepth();
return;
}
+ // Decrement the current depth
+ parser->getData().decrDepth();
+
+ // If there is any text data in the buffer, issue that first
+ if (parser->getData().hasText()) {
+ parser->getEvents().data(
+ parser->getData().getText(parser->getReader().getSourceId()));
+ }
+
// Abort if the special ousia tag ends here
- if (nameStr == "ousia" && parser->getData().depth == 0) {
+ std::string nameStr{name};
+ if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) {
return;
}
@@ -436,20 +368,105 @@ static void xmlEndElementHandler(void *p, const XML_Char *name)
parser->getEvents().fieldEnd();
}
-static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len)
+static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
{
// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
XML_Parser p = static_cast<XML_Parser>(ref);
- OsxmlEventParser *parser = static_cast<XMLUserData *>(XML_GetUserData(p));
-
- // TODO
-/* size_t ulen = len > 0 ? static_cast<size_t>(len) : 0;
- syncLoggerPosition(parser, ulen);
- const std::string data = Utils::trim(std::string{s, ulen});
- if (!data.empty()) {
- stack->data(data);
- }*/
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // Abort as long as we're in an annotation end tag
+ if (parser->getData().inAnnotationEndTag()) {
+ return;
+ }
+
+ // Convert the signed (smell the 90's C library here?) length to an unsigned
+ // value
+ size_t ulen = len > 0 ? static_cast<size_t>(len) : 0;
+
+ // Synchronize the logger position
+ SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
+
+ // Fetch some variables for convenience
+ const WhitespaceMode mode = parser->getWhitespaceMode();
+ OsxmlEventParserData &data = parser->getData();
+ std::vector<char> &textBuf = data.textBuf;
+ std::vector<char> &whitespaceBuf = data.whitespaceBuf;
+ bool &hasWhitespace = data.hasWhitespace;
+ size_t &textStart = data.textStart;
+ size_t &textEnd = data.textEnd;
+
+ size_t pos = loc.getStart();
+ for (size_t i = 0; i < ulen; i++, pos++) {
+ switch (mode) {
+ case WhitespaceMode::PRESERVE:
+ PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd);
+ break;
+ case WhitespaceMode::TRIM:
+ TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd,
+ whitespaceBuf);
+ break;
+ case WhitespaceMode::COLLAPSE:
+ CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd,
+ hasWhitespace);
+ break;
+ }
+ }
+}
+
+/* Class OsxmlEvents */
+
+OsxmlEvents::~OsxmlEvents() {}
+
+/* Class OsxmlEventParser */
+
+OsxmlEventParserData::OsxmlEventParserData()
+ : depth(0),
+ annotationEndTagDepth(-1), // -1: inAnnotationEndTag() reports false
+ hasWhitespace(false),
+ textStart(0),
+ textEnd(0)
+{ // textBuf and whitespaceBuf are not listed above and start out empty
+}
+
+void OsxmlEventParserData::incrDepth() { depth++; } // unconditional increment; decrDepth() is the guarded counterpart
+
+void OsxmlEventParserData::decrDepth()
+{
+ if (depth > 0) { // the depth is never decremented below zero
+ depth--;
+ }
+ if (depth < annotationEndTagDepth) { // dropped below the recorded depth: reset the marker
+ annotationEndTagDepth = -1;
+ }
+}
+
+bool OsxmlEventParserData::inAnnotationEndTag()
+{
+ return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); // always false while the marker is -1 (or 0)
}
+
+bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } // true as soon as textBuf holds at least one character
+
+Variant OsxmlEventParserData::getText(SourceId sourceId)
+{
+ // Copy the buffered characters into a string variant and attach the location (sourceId, textStart, textEnd)
+ Variant var =
+ Variant::fromString(std::string{textBuf.data(), textBuf.size()});
+ var.setLocation({sourceId, textStart, textEnd});
+
+ // Reset all buffering state; hasText() returns false afterwards
+ textBuf.clear();
+ whitespaceBuf.clear();
+ hasWhitespace = false;
+ textStart = 0;
+ textEnd = 0;
+
+ // Hand the assembled variant to the caller
+ return var;
}
/* Class OsxmlEventParser */
@@ -459,21 +476,22 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
+ whitespaceMode(WhitespaceMode::TRIM),
data(new OsxmlEventParserData())
{
}
-void OsxmlEventParser::parse(CharReader &reader)
+OsxmlEventParser::~OsxmlEventParser() {}
+
+void OsxmlEventParser::parse()
{
// Create the parser object
- ScopedExpatXmlParser p{"UTF-8"};
+ GuardedExpatXmlParser p{"UTF-8"};
// Reset the depth
- depth = 0;
+ data->depth = 0;
- // Pass the reference to the ParserStack to the XML handler
- XMLUserData data(&stack, &reader);
+ // Pass the reference to this parser instance to the XML handler
XML_SetUserData(&p, this);
XML_UseParserAsHandlerArg(&p);
@@ -498,7 +516,7 @@ void OsxmlEventParser::parse(CharReader &reader)
if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) {
throw LoggableException{
"XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))},
- xmlSyncLoggerPosition(p)};
+ xmlSyncLoggerPosition(&p)};
}
// Abort once there are no more bytes in the stream
@@ -513,12 +531,17 @@ void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
this->whitespaceMode = whitespaceMode;
}
-CharReader &OsxmlEventParser::getCharReader() { return charReader; }
+WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
+{
+ return whitespaceMode;
+}
+
+CharReader &OsxmlEventParser::getReader() const { return reader; }
-Logger &OsxmlEventParser::getLogger() { return logger; }
+Logger &OsxmlEventParser::getLogger() const { return logger; }
-OsxmlEvents &OsxmlEventParser::getEvents() { return events; }
+OsxmlEvents &OsxmlEventParser::getEvents() const { return events; }
-OsxmlEventParserData &OsxmlEventParser::getData() { return *data; }
+OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; }
}
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index 5319ca6..aa20ea9 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -42,7 +42,7 @@ class Variant;
class OsxmlEventParserData;
/**
- * Interface which defines the callback functions which are called by the
+ * Interface which defines the callback functions which are called by the
* OsxmlEventParser whenever an event occurs.
*/
class OsxmlEvents {
@@ -50,13 +50,13 @@ public:
/**
* Virtual destructor.
*/
- virtual ~OsxmlEvents() {}
+ virtual ~OsxmlEvents();
/**
* Called whenever a command starts. Note that this implicitly always starts
* the default field of the command.
*
- * @param name is a string variant containing name and location of the
+ * @param name is a string variant containing name and location of the
* command.
* @param args is a map variant containing the arguments that were given
* to the command.
@@ -67,12 +67,12 @@ public:
* Called whenever an annotation starts. Note that this implicitly always
* starts the default field of the annotation.
*
- * @param name is a string variant containing the name of the annotation
+ * @param name is a string variant containing the name of the annotation
* class and the location of the annotation definition.
* @param args is a map variant containing the arguments that were given
* to the annotation definition.
*/
- virtual void annotationStart(Variant name, Variant args);
+ virtual void annotationStart(Variant name, Variant args) = 0;
/**
* Called whenever the range of an annotation ends. The callee must
@@ -85,12 +85,12 @@ public:
* ended here. May be empty (or nullptr), if no elementName has been
* specified at the end of the annotation.
*/
- virtual void annotationEnd(Variant name, Variant elementName);
+ virtual void annotationEnd(Variant name, Variant elementName) = 0;
/**
- * Called whenever the default field which was implicitly started by
+ * Called whenever the default field which was implicitly started by
* commandStart or annotationStart ends. Note that this does not end the
- * range of an annotation, but the default field of the annotation. To
+ * range of an annotation, but the default field of the annotation. To
* signal the end of the annotation this, the annotationEnd method will be
* invoked.
*/
@@ -102,11 +102,10 @@ public:
* is not called if the parsing failed, the parser prints an error message
* instead.
*
- * @param data is the already parsed data that should be passed to the
+ * @param data is the already parsed data that should be passed to the
* handler.
*/
virtual void data(Variant data) = 0;
-
};
/**
@@ -148,7 +147,7 @@ public:
* Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents
* of which the callback functions are called.
*
- * @param reader is a reference to the CharReader instance from which the
+ * @param reader is a reference to the CharReader instance from which the
* XML should be read.
* @param events is a refence at an instance of the OsxmlEvents class. All
* events are forwarded to this class.
@@ -158,6 +157,11 @@ public:
OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger);
/**
+ * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type)
+ */
+ ~OsxmlEventParser();
+
+ /**
* Performs the actual parsing. Reads the XML using eXpat and calles the
* callbacks in the event listener instance whenever something interesting
* happens.
@@ -167,38 +171,44 @@ public:
/**
* Sets the whitespace handling mode.
*
- * @param whitespaceMode defines how whitespace in the data should be
+ * @param whitespaceMode defines how whitespace in the data should be
* handled.
*/
void setWhitespaceMode(WhitespaceMode whitespaceMode);
/**
+ * Returns the current whitespace handling mode.
+ *
+ * @return the currently set whitespace handling mode.
+ */
+ WhitespaceMode getWhitespaceMode() const;
+
+ /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
*/
- CharReader &getCharReader();
+ CharReader &getReader() const;
/**
* Returns the internal Logger reference.
*
* @return the internal Logger reference.
*/
- Logger &getLogger();
+ Logger &getLogger() const;
/**
* Returns the internal OsxmlEvents reference.
*
* @return the internal OsxmlEvents reference.
*/
- OsxmlEvents &getEvents();
+ OsxmlEvents &getEvents() const;
/**
* Returns a reference at the internal data.
*/
- OsxmlEventParserData &getData();
+ OsxmlEventParserData &getData() const;
};
-
}
#endif /* _OSXML_EVENT_PARSER_HPP_ */