diff options
Diffstat (limited to 'src/formats/osxml')
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 138 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 48 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 30 |
3 files changed, 60 insertions, 156 deletions
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..79a8dbe 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,7 @@ #include <core/common/Variant.hpp> #include <core/common/VariantReader.hpp> #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include <core/parser/utils/TokenizedData.hpp> #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -40,6 +40,11 @@ namespace ousia { class OsxmlEventParserData { public: /** + * Current character data buffer. + */ + TokenizedData data; + + /** * Contains the current depth of the parsing process. */ ssize_t depth; @@ -52,35 +57,13 @@ public: ssize_t annotationEndTagDepth; /** - * Current character data buffer. - */ - std::vector<char> textBuf; - - /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector<char> whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - - /** - * Current character data start. - */ - size_t textStart; - - /** - * Current character data end. - */ - size_t textEnd; - - /** - * Default constructor. + * Constructor taking the sourceId of the file from which the XML is being + * parsed. + * + * @param sourceId is the source if of the XML file from which the data is + * currently being parsed. */ - OsxmlEventParserData(); + OsxmlEventParserData(SourceId sourceId); /** * Increments the depth. @@ -103,14 +86,6 @@ public: * @return true if character data is available. */ bool hasText(); - - /** - * Returns a Variant containing the character data and its location. - * - * @return a string variant containing the text data and the character - * location. - */ - Variant getText(SourceId sourceId); }; /* Class GuardedExpatXmlParser */ @@ -168,7 +143,7 @@ public: static const std::string TOP_LEVEL_TAG{"ousia"}; /** - * Prefix used to indicate the start of an annoation (note the trailing colon) + * Prefix used to indicate the start of an annoation (note the trailing colon). */ static const std::string ANNOTATION_START_PREFIX{"a:start:"}; @@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Read the argument locations -- this is only a stupid and slow hack, @@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); - parser->getEvents().command(nameVar, args); + parser->getEvents().commandStart(nameVar, args); } } @@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) // If there is any text data in the buffer, issue that first if (parser->getData().hasText()) { - parser->getEvents().data( - parser->getData().getText(parser->getReader().getSourceId())); + TokenizedData &data = parser->getData().data; + parser->getEvents().data(data); + data.clear(); } // Abort if the special ousia tag ends here @@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name) return; } - // Issue the "fieldEnd" event - parser->getEvents().fieldEnd(); + // Issue the "rangeEnd" event + parser->getEvents().rangeEnd(); } static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) @@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) // Synchronize the logger position SourceLocation loc = xmlSyncLoggerPosition(p, ulen); - // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); - OsxmlEventParserData &data = parser->getData(); - std::vector<char> &textBuf = data.textBuf; - std::vector<char> &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } - } + // Append the data to the buffer + parser->getData().data.append(std::string(s, ulen), loc.getStart()); } /* Class OsxmlEvents */ @@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ -OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) +OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) + : data(sourceId), depth(0), annotationEndTagDepth(-1) { } @@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag() return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } -bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } - -Variant OsxmlEventParserData::getText(SourceId sourceId) -{ - // Create a variant containing the string data and the location - Variant var = - Variant::fromString(std::string{textBuf.data(), textBuf.size()}); - var.setLocation({sourceId, textStart, textEnd}); - - // Reset the text buffers - textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; - textStart = 0; - textEnd = 0; - - // Return the variant - return var; -} +bool OsxmlEventParserData::hasText() { return !data.empty(); } /* Class OsxmlEventParser */ @@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), - data(new OsxmlEventParserData()) + data(new OsxmlEventParserData(reader.getSourceId())) { } @@ -532,16 +460,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..4c5a485 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include <memory> #include <string> -#include <core/common/Whitespace.hpp> - namespace ousia { // Forward declarations @@ -61,7 +59,8 @@ public: * @param args is a map containing the arguments that were given to the * command. */ - virtual void command(const Variant &name, const Variant::mapType &args) = 0; + virtual void commandStart(const Variant &name, + const Variant::mapType &args) = 0; /** * Called whenever an annotation starts. Note that this implicitly always @@ -90,24 +89,17 @@ public: const Variant &elementName) = 0; /** - * Called whenever the default field which was implicitly started by - * commandStart or annotationStart ends. Note that this does not end the - * range of an annotation, but the default field of the annotation. To - * signal the end of the annotation this, the annotationEnd method will be - * invoked. + * Called whenever the command or annotation tags end. */ - virtual void fieldEnd() = 0; + virtual void rangeEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. * - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a TokenizedData instance containing the string data that + * was found in the XML file. */ - virtual void data(const Variant &data) = 0; + virtual void data(const TokenizedData &data) = 0; }; /** @@ -135,11 +127,6 @@ private: Logger &logger; /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - - /** * Data to be used by the internal functions. */ std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +158,6 @@ public: void parse(); /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - - /** * Returns the internal CharReader reference. * * @return the CharReader reference. @@ -207,7 +179,9 @@ public: OsxmlEvents &getEvents() const; /** - * Returns a reference at the internal data. + * Used internally to fetch a reference at the internal data. + * + * @return a reference at the internal OsxmlEventParserData structure. */ OsxmlEventParserData &getData() const; }; diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c216855..10cc77a 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,6 +16,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <core/common/Variant.hpp> +#include <core/common/CharReader.hpp> +#include <core/parser/stack/Callbacks.hpp> #include <core/parser/stack/GenericParserStates.hpp> #include <core/parser/stack/Stack.hpp> #include <core/parser/ParserContext.hpp> @@ -30,7 +33,7 @@ using namespace parser_stack; /** * Class containing the actual OsxmlParser implementation. */ -class OsxmlParserImplementation : public OsxmlEvents { +class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks { private: /** * Actual xml parser -- converts the xml stream into a set of events. @@ -54,7 +57,7 @@ public: */ OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) : parser(reader, *this, ctx.getLogger()), - stack(ctx, GenericParserStates) + stack(*this, ctx, GenericParserStates) { } @@ -63,17 +66,16 @@ public: */ void parse() { parser.parse(); } - void command(const Variant &name, const Variant::mapType &args) override + void commandStart(const Variant &name, + const Variant::mapType &args) override { - stack.command(name, args); - stack.fieldStart(true); + stack.commandStart(name, args, true); } void annotationStart(const Variant &name, const Variant::mapType &args) override { - stack.annotationStart(name, args); - stack.fieldStart(true); + stack.annotationStart(name, args, true); } void annotationEnd(const Variant &className, @@ -82,9 +84,19 @@ public: stack.annotationEnd(className, elementName); } - void fieldEnd() override { stack.fieldEnd(); } + void rangeEnd() override { stack.rangeEnd(); } - void data(const Variant &data) override { stack.data(data); } + void data(const TokenizedData &data) override { stack.data(data); } + + TokenId registerToken(const std::string &token) override + { + return Tokens::Empty; + } + + void unregisterToken(TokenId id) override + { + // Do nothing here + } }; /* Class OsxmlParser */ |