diff options
Diffstat (limited to 'src/formats/osxml')
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 138 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 48 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 30 | 
3 files changed, 60 insertions, 156 deletions
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..79a8dbe 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,7 @@  #include <core/common/Variant.hpp>  #include <core/common/VariantReader.hpp>  #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include <core/parser/utils/TokenizedData.hpp>  #include "OsxmlAttributeLocator.hpp"  #include "OsxmlEventParser.hpp" @@ -40,6 +40,11 @@ namespace ousia {  class OsxmlEventParserData {  public:  	/** +	 * Current character data buffer. +	 */ +	TokenizedData data; + +	/**  	 * Contains the current depth of the parsing process.  	 */  	ssize_t depth; @@ -52,35 +57,13 @@ public:  	ssize_t annotationEndTagDepth;  	/** -	 * Current character data buffer. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Current whitespace buffer (for the trimming whitspace mode) -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Flag indicating whether a whitespace character was present (for the -	 * collapsing whitespace mode). -	 */ -	bool hasWhitespace; - -	/** -	 * Current character data start. -	 */ -	size_t textStart; - -	/** -	 * Current character data end. -	 */ -	size_t textEnd; - -	/** -	 * Default constructor. +	 * Constructor taking the sourceId of the file from which the XML is being +	 * parsed. +	 * +	 * @param sourceId is the source if of the XML file from which the data is +	 * currently being parsed.  	 */ -	OsxmlEventParserData(); +	OsxmlEventParserData(SourceId sourceId);  	/**  	 * Increments the depth. @@ -103,14 +86,6 @@ public:  	 * @return true if character data is available.  	 */  	bool hasText(); - -	/** -	 * Returns a Variant containing the character data and its location. -	 * -	 * @return a string variant containing the text data and the character -	 * location. -	 */ -	Variant getText(SourceId sourceId);  };  /* Class GuardedExpatXmlParser */ @@ -168,7 +143,7 @@ public:  static const std::string TOP_LEVEL_TAG{"ousia"};  /** - * Prefix used to indicate the start of an annoation (note the trailing colon) + * Prefix used to indicate the start of an annoation (note the trailing colon).   */  static const std::string ANNOTATION_START_PREFIX{"a:start:"}; @@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,  	// If there is any text data in the buffer, issue that first  	if (parser->getData().hasText()) { -		parser->getEvents().data( -		    parser->getData().getText(parser->getReader().getSourceId())); +		TokenizedData &data = parser->getData().data; +		parser->getEvents().data(data); +		data.clear();  	}  	// Read the argument locations -- this is only a stupid and slow hack, @@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,  		// Just issue a "commandStart" event in any other case  		Variant nameVar = Variant::fromString(nameStr);  		nameVar.setLocation(nameLoc); -		parser->getEvents().command(nameVar, args); +		parser->getEvents().commandStart(nameVar, args);  	}  } @@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)  	// If there is any text data in the buffer, issue that first  	if (parser->getData().hasText()) { -		parser->getEvents().data( -		    parser->getData().getText(parser->getReader().getSourceId())); +		TokenizedData &data = parser->getData().data; +		parser->getEvents().data(data); +		data.clear();  	}  	// Abort if the special ousia tag ends here @@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)  		return;  	} -	// Issue the "fieldEnd" event -	parser->getEvents().fieldEnd(); +	// Issue the "rangeEnd" event +	parser->getEvents().rangeEnd();  }  static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) @@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)  	// Synchronize the logger position  	SourceLocation loc = xmlSyncLoggerPosition(p, ulen); -	// Fetch some variables for convenience -	const WhitespaceMode mode = parser->getWhitespaceMode(); -	OsxmlEventParserData &data = parser->getData(); -	std::vector<char> &textBuf = data.textBuf; -	std::vector<char> &whitespaceBuf = data.whitespaceBuf; -	bool &hasWhitespace = data.hasWhitespace; -	size_t &textStart = data.textStart; -	size_t &textEnd = data.textEnd; - -	size_t pos = loc.getStart(); -	for (size_t i = 0; i < ulen; i++, pos++) { -		switch (mode) { -			case WhitespaceMode::PRESERVE: -				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd); -				break; -			case WhitespaceMode::TRIM: -				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                  textStart, textEnd, -				                                  whitespaceBuf); -				break; -			case WhitespaceMode::COLLAPSE: -				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd, -				                                    hasWhitespace); -				break; -		} -	} +	// Append the data to the buffer +	parser->getData().data.append(std::string(s, ulen), loc.getStart());  }  /* Class OsxmlEvents */ @@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {}  /* Class OsxmlEventParser */ -OsxmlEventParserData::OsxmlEventParserData() -    : depth(0), -      annotationEndTagDepth(-1), -      hasWhitespace(false), -      textStart(0), -      textEnd(0) +OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId) +    : data(sourceId), depth(0), annotationEndTagDepth(-1)  {  } @@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag()  	return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth);  } -bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } - -Variant OsxmlEventParserData::getText(SourceId sourceId) -{ -	// Create a variant containing the string data and the location -	Variant var = -	    Variant::fromString(std::string{textBuf.data(), textBuf.size()}); -	var.setLocation({sourceId, textStart, textEnd}); - -	// Reset the text buffers -	textBuf.clear(); -	whitespaceBuf.clear(); -	hasWhitespace = false; -	textStart = 0; -	textEnd = 0; - -	// Return the variant -	return var; -} +bool OsxmlEventParserData::hasText() { return !data.empty(); }  /* Class OsxmlEventParser */ @@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,      : reader(reader),        events(events),        logger(logger), -      whitespaceMode(WhitespaceMode::COLLAPSE), -      data(new OsxmlEventParserData()) +      data(new OsxmlEventParserData(reader.getSourceId()))  {  } @@ -532,16 +460,6 @@ void OsxmlEventParser::parse()  	}  } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ -	this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ -	return whitespaceMode; -} -  CharReader &OsxmlEventParser::getReader() const { return reader; }  Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..4c5a485 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@  #include <memory>  #include <string> -#include <core/common/Whitespace.hpp> -  namespace ousia {  // Forward declarations @@ -61,7 +59,8 @@ public:  	 * @param args is a map containing the arguments that were given to the  	 * command.  	 */ -	virtual void command(const Variant &name, const Variant::mapType &args) = 0; +	virtual void commandStart(const Variant &name, +	                          const Variant::mapType &args) = 0;  	/**  	 * Called whenever an annotation starts. Note that this implicitly always @@ -90,24 +89,17 @@ public:  	                           const Variant &elementName) = 0;  	/** -	 * Called whenever the default field which was implicitly started by -	 * commandStart or annotationStart ends. Note that this does not end the -	 * range of an annotation, but the default field of the annotation. To -	 * signal the end of the annotation this, the annotationEnd method will be -	 * invoked. +	 * Called whenever the command or annotation tags end.  	 */ -	virtual void fieldEnd() = 0; +	virtual void rangeEnd() = 0;  	/** -	 * Called whenever data is found. Whitespace data is handled as specified -	 * and the data has been parsed to the specified variant type. This function -	 * is not called if the parsing failed, the parser prints an error message -	 * instead. +	 * Called whenever string data is found.  	 * -	 * @param data is the already parsed data that should be passed to the -	 * handler. +	 * @param data is a TokenizedData instance containing the string data that +	 * was found in the XML file.  	 */ -	virtual void data(const Variant &data) = 0; +	virtual void data(const TokenizedData &data) = 0;  };  /** @@ -135,11 +127,6 @@ private:  	Logger &logger;  	/** -	 * Current whitespace mode. -	 */ -	WhitespaceMode whitespaceMode; - -	/**  	 * Data to be used by the internal functions.  	 */  	std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +158,6 @@ public:  	void parse();  	/** -	 * Sets the whitespace handling mode. -	 * -	 * @param whitespaceMode defines how whitespace in the data should be -	 * handled. -	 */ -	void setWhitespaceMode(WhitespaceMode whitespaceMode); - -	/** -	 * Returns the current whitespace handling mode. -	 * -	 * @return the currently set whitespace handling mode. -	 */ -	WhitespaceMode getWhitespaceMode() const; - -	/**  	 * Returns the internal CharReader reference.  	 *  	 * @return the CharReader reference. @@ -207,7 +179,9 @@ public:  	OsxmlEvents &getEvents() const;  	/** -	 * Returns a reference at the internal data. +	 * Used internally to fetch a reference at the internal data. +	 * +	 * @return a reference at the internal OsxmlEventParserData structure.  	 */  	OsxmlEventParserData &getData() const;  }; diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c216855..10cc77a 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,6 +16,9 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ +#include <core/common/Variant.hpp> +#include <core/common/CharReader.hpp> +#include <core/parser/stack/Callbacks.hpp>  #include <core/parser/stack/GenericParserStates.hpp>  #include <core/parser/stack/Stack.hpp>  #include <core/parser/ParserContext.hpp> @@ -30,7 +33,7 @@ using namespace parser_stack;  /**   * Class containing the actual OsxmlParser implementation.   */ -class OsxmlParserImplementation : public OsxmlEvents { +class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks {  private:  	/**  	 * Actual xml parser -- converts the xml stream into a set of events. @@ -54,7 +57,7 @@ public:  	 */  	OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)  	    : parser(reader, *this, ctx.getLogger()), -	      stack(ctx, GenericParserStates) +	      stack(*this, ctx, GenericParserStates)  	{  	} @@ -63,17 +66,16 @@ public:  	 */  	void parse() { parser.parse(); } -	void command(const Variant &name, const Variant::mapType &args) override +	void commandStart(const Variant &name, +	                  const Variant::mapType &args) override  	{ -		stack.command(name, args); -		stack.fieldStart(true); +		stack.commandStart(name, args, true);  	}  	void annotationStart(const Variant &name,  	                     const Variant::mapType &args) override  	{ -		stack.annotationStart(name, args); -		stack.fieldStart(true); +		stack.annotationStart(name, args, true);  	}  	void annotationEnd(const Variant &className, @@ -82,9 +84,19 @@ public:  		stack.annotationEnd(className, elementName);  	} -	void fieldEnd() override { stack.fieldEnd(); } +	void rangeEnd() override { stack.rangeEnd(); } -	void data(const Variant &data) override { stack.data(data); } +	void data(const TokenizedData &data) override { stack.data(data); } + +	TokenId registerToken(const std::string &token) override +	{ +		return Tokens::Empty; +	} + +	void unregisterToken(TokenId id) override +	{ +		// Do nothing here +	}  };  /* Class OsxmlParser */  | 
