diff options
Diffstat (limited to 'src/formats')
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 157 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 85 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 63 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 31 | 
4 files changed, 90 insertions, 246 deletions
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..d4cdbf8 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -94,92 +94,11 @@ public:  static const PlainFormatTokens OsmlTokens; -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: -	/** -	 * Internal character buffer. -	 */ -	std::vector<char> buf; - -	/** -	 * Start location of the character data. -	 */ -	SourceOffset start; - -	/** -	 * End location of the character data. -	 */ -	SourceOffset end; - -public: -	/** -	 * Default constructor, initializes start and end with zeros. -	 */ -	DataHandler() : start(0), end(0) {} - -	/** -	 * Returns true if the internal buffer is empty. -	 * -	 * @return true if no characters were added to the internal buffer, false -	 * otherwise. -	 */ -	bool isEmpty() { return buf.empty(); } - -	/** -	 * Appends a single character to the internal buffer. -	 * -	 * @param c is the character that should be added to the internal buffer. -	 * @param charStart is the start position of the character. -	 * @param charEnd is the end position of the character. -	 */ -	void append(char c, SourceOffset charStart, SourceOffset charEnd) -	{ -		if (isEmpty()) { -			start = charStart; -		} -		buf.push_back(c); -		end = charEnd; -	} - -	/** -	 * Appends a string to the internal buffer. -	 * -	 * @param s is the string that should be added to the internal buffer. -	 * @param stringStart is the start position of the string. -	 * @param stringEnd is the end position of the string. -	 */ -	void append(const std::string &s, SourceOffset stringStart, -	            SourceOffset stringEnd) -	{ -		if (isEmpty()) { -			start = stringStart; -		} -		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); -		end = stringEnd; -	} - -	/** -	 * Converts the internal buffer to a variant with attached location -	 * information. -	 * -	 * @param sourceId is the source id which is needed for building the -	 * location information. -	 * @return a Variant with the internal buffer content as string and -	 * the correct start and end location. -	 */ -	Variant toVariant(SourceId sourceId) -	{ -		Variant res = Variant::fromString(std::string(buf.data(), buf.size())); -		res.setLocation({sourceId, start, end}); -		return res; -	} -}; -  OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) -    : reader(reader), logger(logger), tokenizer(OsmlTokens) +    : reader(reader), +      logger(logger), +      tokenizer(OsmlTokens), +      data(reader.getSourceId())  {  	// Place an intial command representing the complete file on the stack  	commands.push(Command{"", Variant::mapType{}, true, true, true, false}); @@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)  Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  {  	bool first = true; -	bool hasCharSiceNSSep = false; +	bool hasCharSinceNSSep = false;  	std::vector<char> identifier;  	size_t end = reader.getPeekOffset();  	char c, c2; @@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  		if ((first && Utils::isIdentifierStartCharacter(c)) ||  		    (!first && Utils::isIdentifierCharacter(c))) {  			identifier.push_back(c); -		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && +		} else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&  		           Utils::isIdentifierStartCharacter(c2)) {  			identifier.push_back(c);  		} else { @@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  		// This is no longer the first character  		first = false; -		// Advance the hasCharSiceNSSep flag -		hasCharSiceNSSep = allowNSSep && (c != ':'); +		// Advance the hasCharSinceNSSep flag +		hasCharSinceNSSep = allowNSSep && (c != ':');  		end = reader.getPeekOffset();  		reader.consumePeek(); @@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()  {  	Token token;  	size_t depth = 1; -	while (tokenizer.read(reader, token)) { +	while (tokenizer.read(reader, token, data)) { +		// Throw the comment data away +		data.clear(); +  		if (token.id == OsmlTokens.BlockCommentEnd) {  			depth--;  			if (depth == 0) { @@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()  	}  } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData()  { -	if (!handler.isEmpty()) { -		data = handler.toVariant(reader.getSourceId()); +	if (!data.empty()) {  		location = data.getLocation();  		reader.resetPeek();  		return true; @@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()  OsmlStreamParser::State OsmlStreamParser::parse()  { -	// Handler for incomming data -	DataHandler handler; +	// Reset the data handler +	data.clear();  	// Read tokens until the outer loop should be left  	Token token; -	while (tokenizer.peek(reader, token)) { +	while (tokenizer.peek(reader, token, data)) {  		const TokenId type = token.id;  		// Special handling for Backslash and Text @@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()  			// Try to parse a command  			if (Utils::isIdentifierStartCharacter(c)) {  				// Make sure to issue any data before it is to late -				if (checkIssueData(handler)) { +				if (checkIssueData()) {  					return State::DATA;  				} @@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()  			// If this was an annotation start token, add the parsed < to the  			// output  			if (type == OsmlTokens.AnnotationStart) { -				handler.append('<', token.location.getStart(), -				               token.location.getStart() + 1); +				data.append('<', token.location.getStart(), +				            token.location.getStart() + 1);  			} -			handler.append(c, token.location.getStart(), -			               reader.getPeekOffset()); +			data.append(c, token.location.getStart(), reader.getPeekOffset());  			reader.consumePeek();  			continue;  		} else if (type == Tokens::Data) { @@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse()  				location = token.location;  				return State::FIELD_START;  			} - -			// Append the text to the data handler -			handler.append(token.content, token.location.getStart(), -			               token.location.getEnd()); -  			reader.consumePeek();  			continue;  		}  		// A non-text token was reached, make sure all pending data commands  		// have been issued -		if (checkIssueData(handler)) { +		if (checkIssueData()) {  			return State::DATA;  		} @@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse()  			Command &cmd = commands.top();  			if (!cmd.inField) {  				cmd.inField = true; -				return State::FIELD_START;  			} -			logger.error( +			return State::FIELD_START; +/*			logger.error(  			    "Got field start token \"{\", but no command for which to "  			    "start the field. Write \"\\{\" to insert this sequence as "  			    "text.", -			    token); +			    token);*/  		} else if (token.id == OsmlTokens.FieldEnd) { -			if (closeField()) { +			closeField(); +			return State::FIELD_END; +/*			if (closeField()) {  				return State::FIELD_END;  			}  			logger.error(  			    "Got field end token \"}\", but there is no field to end. "  			    "Write \"\\}\" to insert this sequence as text.", -			    token); +			    token);*/  		} else if (token.id == OsmlTokens.DefaultFieldStart) {  			// Try to start a default field the first time the token is reached  			Command &topCmd = commands.top();  			if (!topCmd.inField) {  				topCmd.inField = true;  				topCmd.inDefaultField = true; -				return State::FIELD_START;  			} -			logger.error( +			return State::FIELD_START; +/*			logger.error(  			    "Got default field start token \"{!\", but no command for "  			    "which to start the field. Write \"\\{!\" to insert this "  			    "sequence as text", -			    token); +			    token);*/  		} else if (token.id == OsmlTokens.AnnotationEnd) {  			// We got a single annotation end token "\>" -- simply issue the  			// ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()  	}  	// Issue available data -	if (checkIssueData(handler)) { +	if (checkIssueData()) {  		return State::DATA;  	} @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()  	return State::END;  } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ +	TokenizedData dataFork = data; +	Variant text = dataFork.text(mode); +	location = text.getLocation(); +	return text; +} +  const Variant &OsmlStreamParser::getCommandName() const  {  	return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@  #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_  #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <memory>  #include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp>  #include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp>  namespace ousia {  // Forward declarations  class CharReader;  class Logger; -class DataHandler; +class OsmlStreamParserImpl;  /**   * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public:  		Variant arguments;  		/** -		 * Set to true if this is a command with clear begin and end. -		 */ -		bool hasRange : 1; - -		/** -		 * Set to true if we are currently inside a field of this command. -		 */ -		bool inField : 1; - -		/** -		 * Set to true if we are currently in the range field of the command -		 * (implies inField being set to true). +		 * Vector used as stack for holding the number of opening/closing braces +		 * and the corresponding "isDefaultField" flag.  		 */ -		bool inRangeField : 1; +		std::vector<bool> fields;  		/** -		 * Set to true if we are currently in a field that has been especially -		 * marked as default field (using the "|") syntax. +		 * Set to true if this is a command with clear begin and end.  		 */ -		bool inDefaultField : 1; +		bool hasRange;  		/**  		 * Default constructor. @@ -164,7 +155,6 @@ public:  		Command()  		    : hasRange(false),  		      inField(false), -		      inRangeField(false),  		      inDefaultField()  		{  		} @@ -178,15 +168,10 @@ public:  		 * command.  		 * @param hasRange should be set to true if this is a command with  		 * explicit range. -		 * @param inField is set to true if we currently are inside a field -		 * of this command. -		 * @param inRangeField is set to true if we currently are inside the -		 * outer field of a ranged command.  		 * @param inDefaultField is set to true if we currently are in a  		 * specially marked default field.  		 */ -		Command(Variant name, Variant arguments, bool hasRange, -		        bool inField, bool inRangeField, bool inDefaultField) +		Command(Variant name, Variant arguments, bool hasRange)  		    : name(std::move(name)),  		      arguments(std::move(arguments)),  		      hasRange(hasRange), @@ -215,25 +200,20 @@ private:  	Tokenizer tokenizer;  	/** -	 * Stack containing the current commands. -	 */ -	std::stack<Command> commands; - -	/** -	 * Variant containing the data that has been read (always is a string, -	 * contains the exact location of the data in the source file). +	 * Variant containing the tokenized data that was returned from the +	 * tokenizer as data.  	 */ -	Variant data; +	TokenizedData data;  	/** -	 * Contains the location of the last token. +	 * Stack containing the current commands.  	 */ -	SourceLocation location; +	std::stack<Command> commands;  	/** -	 * Contains the field index of the current command. +	 * Pointer at   	 */ -	size_t fieldIdx; +	std::unique_ptr<OsmlStreamParserImpl> impl;  	/**  	 * Function used internall to parse an identifier. @@ -291,12 +271,10 @@ private:  	/**  	 * Checks whether there is any data pending to be issued, if yes, issues it.  	 * -	 * @param handler is the data handler that contains the data that may be -	 * returned to the user.  	 * @return true if there was any data and DATA should be returned by the  	 * parse function, false otherwise.  	 */ -	bool checkIssueData(DataHandler &handler); +	bool checkIssueData();  	/**  	 * Called before any data is appended to the internal data handler. Checks @@ -328,6 +306,12 @@ public:  	OsmlStreamParser(CharReader &reader, Logger &logger);  	/** +	 * Destructor of the OsmlStreamParser, needed to destroy the incomplete +	 * OsmlStreamParserImpl. +	 */ +	~OsmlStreamParser(); + +	/**  	 * Continues parsing. Returns one of the states defined in the State enum.  	 * Callers should stop once the State::END state is reached. Use the getter  	 * functions to get more information about the current state, such as the @@ -344,7 +328,19 @@ public:  	 * @return a reference at a variant containing the data parsed by the  	 * "parse" function.  	 */ -	const Variant &getData() const { return data; } +	const TokenizedData &getData() const { return data; } + +	/** +	 * Returns the complete content of the internal TokenizedData instance as +	 * a single string Variant. This method is mainly used in the unit tests for +	 * this class, it simply calls the text() method of TokenizedData. +	 * +	 * @param mode is the WhitespaceMode that should be used for returning the +	 * text. +	 * @return a string variant containing the text content of the internal +	 * TokenizedData instance or a nullptr variant if there is no text. +	 */ +	Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE);  	/**  	 * Returns a reference at the internally stored command name. Only valid if @@ -371,13 +367,6 @@ public:  	 * syntax).  	 */  	bool inDefaultField() const; - -	/** -	 * Returns a reference at the char reader. -	 * -	 * @return the last internal token location. -	 */ -	const SourceLocation &getLocation() const { return location; }  };  } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..855f80d 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,6 @@  #include <core/common/Variant.hpp>  #include <core/common/VariantReader.hpp>  #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp>  #include "OsxmlAttributeLocator.hpp"  #include "OsxmlEventParser.hpp" @@ -57,17 +56,6 @@ public:  	std::vector<char> textBuf;  	/** -	 * Current whitespace buffer (for the trimming whitspace mode) -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Flag indicating whether a whitespace character was present (for the -	 * collapsing whitespace mode). -	 */ -	bool hasWhitespace; - -	/**  	 * Current character data start.  	 */  	size_t textStart; @@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)  	SourceLocation loc = xmlSyncLoggerPosition(p, ulen);  	// Fetch some variables for convenience -	const WhitespaceMode mode = parser->getWhitespaceMode();  	OsxmlEventParserData &data = parser->getData();  	std::vector<char> &textBuf = data.textBuf; -	std::vector<char> &whitespaceBuf = data.whitespaceBuf; -	bool &hasWhitespace = data.hasWhitespace; -	size_t &textStart = data.textStart; -	size_t &textEnd = data.textEnd; - -	size_t pos = loc.getStart(); -	for (size_t i = 0; i < ulen; i++, pos++) { -		switch (mode) { -			case WhitespaceMode::PRESERVE: -				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd); -				break; -			case WhitespaceMode::TRIM: -				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                  textStart, textEnd, -				                                  whitespaceBuf); -				break; -			case WhitespaceMode::COLLAPSE: -				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, -				                                    textStart, textEnd, -				                                    hasWhitespace); -				break; -		} + +	// Update start and end position +	if (textBuf.empty()) { +		data.textStart = loc.getStart();  	} +	data.textEnd = loc.getEnd(); + +	// Insert the data into the text buffer +	textBuf.insert(textBuf.end(), &s[0], &s[ulen]);  }  /* Class OsxmlEvents */ @@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {}  /* Class OsxmlEventParser */  OsxmlEventParserData::OsxmlEventParserData() -    : depth(0), -      annotationEndTagDepth(-1), -      hasWhitespace(false), -      textStart(0), -      textEnd(0) +    : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0)  {  } @@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId)  	// Reset the text buffers  	textBuf.clear(); -	whitespaceBuf.clear(); -	hasWhitespace = false;  	textStart = 0;  	textEnd = 0; @@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,      : reader(reader),        events(events),        logger(logger), -      whitespaceMode(WhitespaceMode::COLLAPSE),        data(new OsxmlEventParserData())  {  } @@ -532,16 +497,6 @@ void OsxmlEventParser::parse()  	}  } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ -	this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ -	return whitespaceMode; -} -  CharReader &OsxmlEventParser::getReader() const { return reader; }  Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..e3fd5d4 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@  #include <memory>  #include <string> -#include <core/common/Whitespace.hpp> -  namespace ousia {  // Forward declarations @@ -99,13 +97,10 @@ public:  	virtual void fieldEnd() = 0;  	/** -	 * Called whenever data is found. Whitespace data is handled as specified -	 * and the data has been parsed to the specified variant type. This function -	 * is not called if the parsing failed, the parser prints an error message -	 * instead. +	 * Called whenever string data is found.  	 * -	 * @param data is the already parsed data that should be passed to the -	 * handler. +	 * @param data is a Variant containing the string data that was found in the +	 * XML file.  	 */  	virtual void data(const Variant &data) = 0;  }; @@ -135,11 +130,6 @@ private:  	Logger &logger;  	/** -	 * Current whitespace mode. -	 */ -	WhitespaceMode whitespaceMode; - -	/**  	 * Data to be used by the internal functions.  	 */  	std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +161,6 @@ public:  	void parse();  	/** -	 * Sets the whitespace handling mode. -	 * -	 * @param whitespaceMode defines how whitespace in the data should be -	 * handled. -	 */ -	void setWhitespaceMode(WhitespaceMode whitespaceMode); - -	/** -	 * Returns the current whitespace handling mode. -	 * -	 * @return the currently set whitespace handling mode. -	 */ -	WhitespaceMode getWhitespaceMode() const; - -	/**  	 * Returns the internal CharReader reference.  	 *  	 * @return the CharReader reference.  | 
