summaryrefslogtreecommitdiff
path: root/src/formats/osxml
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats/osxml')
-rw-r--r--src/formats/osxml/OsxmlEventParser.cpp138
-rw-r--r--src/formats/osxml/OsxmlEventParser.hpp48
-rw-r--r--src/formats/osxml/OsxmlParser.cpp30
3 files changed, 60 insertions, 156 deletions
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..79a8dbe 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,7 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -40,6 +40,11 @@ namespace ousia {
class OsxmlEventParserData {
public:
/**
+ * Current character data buffer.
+ */
+ TokenizedData data;
+
+ /**
* Contains the current depth of the parsing process.
*/
ssize_t depth;
@@ -52,35 +57,13 @@ public:
ssize_t annotationEndTagDepth;
/**
- * Current character data buffer.
- */
- std::vector<char> textBuf;
-
- /**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
- * Current character data start.
- */
- size_t textStart;
-
- /**
- * Current character data end.
- */
- size_t textEnd;
-
- /**
- * Default constructor.
+ * Constructor taking the sourceId of the file from which the XML is being
+ * parsed.
+ *
+ * @param sourceId is the source id of the XML file from which the data is
+ * currently being parsed.
*/
- OsxmlEventParserData();
+ OsxmlEventParserData(SourceId sourceId);
/**
* Increments the depth.
@@ -103,14 +86,6 @@ public:
* @return true if character data is available.
*/
bool hasText();
-
- /**
- * Returns a Variant containing the character data and its location.
- *
- * @return a string variant containing the text data and the character
- * location.
- */
- Variant getText(SourceId sourceId);
};
/* Class GuardedExpatXmlParser */
@@ -168,7 +143,7 @@ public:
static const std::string TOP_LEVEL_TAG{"ousia"};
/**
- * Prefix used to indicate the start of an annoation (note the trailing colon)
+ * Prefix used to indicate the start of an annotation (note the trailing colon).
*/
static const std::string ANNOTATION_START_PREFIX{"a:start:"};
@@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Read the argument locations -- this is only a stupid and slow hack,
@@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// Just issue a "commandStart" event in any other case
Variant nameVar = Variant::fromString(nameStr);
nameVar.setLocation(nameLoc);
- parser->getEvents().command(nameVar, args);
+ parser->getEvents().commandStart(nameVar, args);
}
}
@@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Abort if the special ousia tag ends here
@@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
return;
}
- // Issue the "fieldEnd" event
- parser->getEvents().fieldEnd();
+ // Issue the "rangeEnd" event
+ parser->getEvents().rangeEnd();
}
static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
@@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
// Synchronize the logger position
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
- // Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
- OsxmlEventParserData &data = parser->getData();
- std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
- }
+ // Append the data to the buffer
+ parser->getData().data.append(std::string(s, ulen), loc.getStart());
}
/* Class OsxmlEvents */
@@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
-OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId)
+ : data(sourceId), depth(0), annotationEndTagDepth(-1)
{
}
@@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag()
return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth);
}
-bool OsxmlEventParserData::hasText() { return !textBuf.empty(); }
-
-Variant OsxmlEventParserData::getText(SourceId sourceId)
-{
- // Create a variant containing the string data and the location
- Variant var =
- Variant::fromString(std::string{textBuf.data(), textBuf.size()});
- var.setLocation({sourceId, textStart, textEnd});
-
- // Reset the text buffers
- textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
- textStart = 0;
- textEnd = 0;
-
- // Return the variant
- return var;
-}
+bool OsxmlEventParserData::hasText() { return !data.empty(); }
/* Class OsxmlEventParser */
@@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
- data(new OsxmlEventParserData())
+ data(new OsxmlEventParserData(reader.getSourceId()))
{
}
@@ -532,16 +460,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..4c5a485 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -61,7 +59,8 @@ public:
* @param args is a map containing the arguments that were given to the
* command.
*/
- virtual void command(const Variant &name, const Variant::mapType &args) = 0;
+ virtual void commandStart(const Variant &name,
+ const Variant::mapType &args) = 0;
/**
* Called whenever an annotation starts. Note that this implicitly always
@@ -90,24 +89,17 @@ public:
const Variant &elementName) = 0;
/**
- * Called whenever the default field which was implicitly started by
- * commandStart or annotationStart ends. Note that this does not end the
- * range of an annotation, but the default field of the annotation. To
- * signal the end of the annotation this, the annotationEnd method will be
- * invoked.
+ * Called whenever the command or annotation tags end.
*/
- virtual void fieldEnd() = 0;
+ virtual void rangeEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a TokenizedData instance containing the string data that
+ * was found in the XML file.
*/
- virtual void data(const Variant &data) = 0;
+ virtual void data(const TokenizedData &data) = 0;
};
/**
@@ -135,11 +127,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +158,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
@@ -207,7 +179,9 @@ public:
OsxmlEvents &getEvents() const;
/**
- * Returns a reference at the internal data.
+ * Used internally to fetch a reference to the internal data.
+ *
+ * @return a reference to the internal OsxmlEventParserData structure.
*/
OsxmlEventParserData &getData() const;
};
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
index c216855..10cc77a 100644
--- a/src/formats/osxml/OsxmlParser.cpp
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -16,6 +16,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <core/common/Variant.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/parser/stack/Callbacks.hpp>
#include <core/parser/stack/GenericParserStates.hpp>
#include <core/parser/stack/Stack.hpp>
#include <core/parser/ParserContext.hpp>
@@ -30,7 +33,7 @@ using namespace parser_stack;
/**
* Class containing the actual OsxmlParser implementation.
*/
-class OsxmlParserImplementation : public OsxmlEvents {
+class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks {
private:
/**
* Actual xml parser -- converts the xml stream into a set of events.
@@ -54,7 +57,7 @@ public:
*/
OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)
: parser(reader, *this, ctx.getLogger()),
- stack(ctx, GenericParserStates)
+ stack(*this, ctx, GenericParserStates)
{
}
@@ -63,17 +66,16 @@ public:
*/
void parse() { parser.parse(); }
- void command(const Variant &name, const Variant::mapType &args) override
+ void commandStart(const Variant &name,
+ const Variant::mapType &args) override
{
- stack.command(name, args);
- stack.fieldStart(true);
+ stack.commandStart(name, args, true);
}
void annotationStart(const Variant &name,
const Variant::mapType &args) override
{
- stack.annotationStart(name, args);
- stack.fieldStart(true);
+ stack.annotationStart(name, args, true);
}
void annotationEnd(const Variant &className,
@@ -82,9 +84,19 @@ public:
stack.annotationEnd(className, elementName);
}
- void fieldEnd() override { stack.fieldEnd(); }
+ void rangeEnd() override { stack.rangeEnd(); }
- void data(const Variant &data) override { stack.data(data); }
+ void data(const TokenizedData &data) override { stack.data(data); }
+
+ TokenId registerToken(const std::string &token) override
+ {
+ return Tokens::Empty;
+ }
+
+ void unregisterToken(TokenId id) override
+ {
+ // Do nothing here
+ }
};
/* Class OsxmlParser */