Diffstat (limited to 'src')
20 files changed, 812 insertions, 866 deletions
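Before the per-file hunks, here is a minimal sketch of how the reworked tokenizer API introduced by this change is meant to fit together. It is assembled only from declarations visible in this diff (Tokenizer.hpp, TokenizedData.hpp, Token.hpp); the driver function, the example token strings and all variable names are hypothetical:

#include <core/common/CharReader.hpp>
#include <core/common/Token.hpp>
#include <core/parser/utils/Tokenizer.hpp>
#include <core/parser/utils/TokenizedData.hpp>

using namespace ousia;

// Hypothetical driver: register one primary and one non-primary token and
// alternate between primary tokens and tokenized text segments.
void sketch(CharReader &reader)
{
	Tokenizer tokenizer;
	TokenId star = tokenizer.registerToken("*", true);   // primary token
	TokenId dash = tokenizer.registerToken("--", false); // non-primary token

	Token token;
	while (true) {
		// Each read() call either yields a primary token or collects text
		// (with non-primary token marks) into the TokenizedData buffer
		TokenizedData data(reader.getSourceId());
		if (!tokenizer.read(reader, token, data)) {
			break; // end of input
		}
		if (token.id == star) {
			continue; // primary tokens arrive as single Token instances
		}
		// Data tokens carry the collected text; non-primary marks such as
		// "--" are only reported if they appear in the enabled TokenSet
		TokenizedDataReader r = data.reader();
		Token t;
		while (!r.atEnd()) {
			if (!r.read(t, TokenSet{dash}, WhitespaceMode::COLLAPSE)) {
				break;
			}
		}
	}
}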
diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp index 8bcdbb5..8bcdbb5 100644 --- a/src/core/parser/utils/Token.cpp +++ b/src/core/common/Token.cpp diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp index f907450..07d7c8f 100644 --- a/src/core/parser/utils/Token.hpp +++ b/src/core/common/Token.hpp @@ -30,6 +30,7 @@  #include <cstdint>  #include <limits>  #include <string> +#include <unordered_set>  #include <core/common/Location.hpp> @@ -46,6 +47,11 @@ using TokenId = uint32_t;  using TokenLength = uint16_t;  /** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set<TokenId>; + +/**   * Namespace containing constants for TokenId instances with special meaning.   */  namespace Tokens { @@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;  /**   * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. + * newlines occur with optionally any amount of whitespace between them. The + * paragraph token is not repeated until more text is reached.   */  constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;  /** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amount of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; + +/**   * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. + * this line is larger than the indentation of the previous line.   */ -constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4; +constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; + +/** + * Token which represents an unindentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Unindent = std::numeric_limits<TokenId>::max() - 6;  /**   * Maximum token id to be used. Tokens allocated for users should not surpass @@ -109,6 +129,17 @@ struct Token {  	Token() : id(Tokens::Empty) {}  	/** +	 * Constructor of a "data" token with no explicit content. +	 * +	 * @param location is the location of the extracted string content in the +	 * source file. +	 */ +	Token(SourceLocation location) +	    : id(Tokens::Data), location(location) +	{ +	} + +	/**  	 * Constructor of the Token struct.  	 *  	 * @param id represents the token id. @@ -129,6 +160,14 @@ struct Token {  	Token(TokenId id) : id(id) {}  	/** +	 * Returns true if this token is special. +	 * +	 * @return true if the TokenId indicates that this token is a "special" +	 * token. +	 */ +	bool isSpecial() const { return id > Tokens::MaxTokenId; } + +	/**  	 * The getLocation function allows the tokens to be directly passed as  	 * parameter to Logger or LoggableException instances.  	 * diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. 
- -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include <string> -#include <vector> - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: -	/** -	 * Start position of the extracted text. -	 */ -	size_t textStart; - -	/** -	 * End position of the extracted text. -	 */ -	size_t textEnd; - -	/** -	 * Buffer containing the extracted text. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Constructor of the TextHandlerBase base class. Initializes the start and -	 * end position with zeros. -	 */ -	WhitespaceHandler() : textStart(0), textEnd(0) {} - -	/** -	 * Returns true if this whitespace handler has found any text and a text -	 * token could be emitted. -	 * -	 * @return true if the internal data buffer is non-empty. -	 */ -	bool hasText() { return !textBuf.empty(); } - -	/** -	 * Returns the content of the WhitespaceHandler as string. -	 */ -	std::string toString() const -	{ -		return std::string(textBuf.data(), textBuf.size()); -	} -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Appends the given character to the internal text buffer, does not -	 * eliminate whitespace. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd); -	} - -	/** -	 * Static version of PreservingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. -	 * @param textEnd is a reference at the text end variable that is to be -	 * used. 
-	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd) -	{ -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; -		textBuf.push_back(c); -	} -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Buffer used internally to temporarily store all whitespace characters. -	 * They are only added to the output buffer if another non-whitespace -	 * character is reached. -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * whitespace characters at the begin and end of the text. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); -	} - -	/** -	 * Static version of TrimmingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. -	 * @param textEnd is a reference at the text end variable that is to be -	 * used. -	 * @param whitespaceBuf is a reference at the buffer for storing whitespace -	 * characters. -	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd, std::vector<char> &whitespaceBuf) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				whitespaceBuf.push_back(c); -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (!whitespaceBuf.empty()) { -			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), -			               whitespaceBuf.end()); -			whitespaceBuf.clear(); -		} -		textBuf.push_back(c); -	} -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Flag set to true if a whitespace character was reached. -	 */ -	bool hasWhitespace = false; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * redundant whitespace characters. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); -	} - -	/** -	 * Static version of CollapsingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. 
-	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. -	 * @param textEnd is a reference at the text end variable that is to be -	 * used. -	 * @param hasWhitespace is a reference at the "hasWhitespace" flag. -	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd, bool &hasWhitespace) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				hasWhitespace = true; -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (hasWhitespace) { -			textBuf.push_back(' '); -			hasWhitespace = false; -		} -		textBuf.push_back(c); -	} -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. - */ -template <typename WhitespaceHandler, typename Buffer> -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, -                                      size_t start) -{ -	for (auto elem : buf) { -		handler.append(elem, start, start + 1); -		start++; -	} -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index bb04bd3..d44176a 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@  #include <core/model/Domain.hpp>  #include <core/model/Project.hpp>  #include <core/model/Typesystem.hpp> +#include <core/parser/utils/TokenizedData.hpp>  #include <core/parser/ParserScope.hpp>  #include <core/parser/ParserContext.hpp> @@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field,  	return valid && scope().resolveValue(data, type, logger);  } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data(TokenizedData &data)  { +	// TODO: Handle this correctly +	Variant text = data.text(WhitespaceMode::TRIM); +	if (text == nullptr) { +		// For now, accept "no data" as success +		return true; +	} +  	// We're past the region in which explicit fields can be defined in the  	// parent structure element  	scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data)  	// If it is a primitive field directly, try to parse the content.  	if (field->isPrimitive()) {  		// Add it as primitive content. 
-		if (!convertData(field, data, logger())) { +		if (!convertData(field, text, logger())) {  			return false;  		} -		parent->createChildDocumentPrimitive(data, fieldIdx); +		parent->createChildDocumentPrimitive(text, fieldIdx);  		return true;  	} @@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data)  	for (auto primitiveField : defaultFields) {  		// Then try to parse the content using the type specification.  		forks.emplace_back(logger().fork()); -		if (!convertData(primitiveField, data, forks.back())) { +		if (!convertData(primitiveField, text, forks.back())) {  			continue;  		} @@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data)  		createPath(fieldIdx, path, parent);  		// Then create the primitive element -		parent->createChildDocumentPrimitive(data); +		parent->createChildDocumentPrimitive(text);  		return true;  	} @@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data)  	if (defaultFields.empty()) {  		logger().error("Got data, but structure \"" + name() +  		                   "\" does not have any primitive field", -		               data); +		               text);  	} else {  		logger().error("Could not read data with any of the possible fields:", -		               data); +		               text);  		size_t f = 0;  		for (auto field : defaultFields) {  			logger().note(std::string("Field ") + @@ -471,4 +479,4 @@ namespace RttiTypes {  const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>(                                 "DocumentField").parent(&Node);  } -}
\ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 862081c..dda7d8b 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -167,7 +167,7 @@ public:  	bool start(Variant::mapType &args) override;  	void end() override; -	bool data(Variant &data) override; +	bool data(TokenizedData &data) override;  	bool fieldStart(bool &isDefault, size_t fieldIdx) override; @@ -213,4 +213,4 @@ extern const Rtti DocumentField;  }  } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
\ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..3d413e8 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@  #include <core/common/Exceptions.hpp>  #include <core/common/Logger.hpp> +#include <core/parser/utils/TokenizedData.hpp>  #include <core/parser/ParserContext.hpp>  #include "Callbacks.hpp" @@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className,  	return true;  } -bool EmptyHandler::data(Variant &data) +bool EmptyHandler::data(TokenizedData &data)  {  	// Support any data  	return true; @@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className,  	return false;  } -bool StaticHandler::data(Variant &data) +bool StaticHandler::data(TokenizedData &data)  { -	logger().error("Did not expect any data here", data); -	return false; +	if (data.text(WhitespaceMode::TRIM) != nullptr) { +		logger().error("Did not expect any data here", data); +		return false; +	} +	return true;  }  /* Class StaticFieldHandler */ @@ -227,12 +231,19 @@ void StaticFieldHandler::end()  	}  } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data(TokenizedData &data)  { +	Variant text = data.text(WhitespaceMode::TRIM); +	if (text == nullptr) { +		// Providing no data here is ok as long as the "doHandle" callback +		// function has already been called +		return handled; +	} +  	// Call the doHandle function if this has not been done before  	if (!handled) {  		handled = true; -		doHandle(data, args); +		doHandle(text, args);  		return true;  	} @@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data)  	logger().error(  	    std::string("Found data, but the corresponding argument \"") + argName +  	        std::string("\" was already specified"), -	    data); +	    text);  	// Print the location at which the attribute was originally specified  	auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..929466d 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -31,6 +31,7 @@ namespace ousia {  class ParserScope;  class ParserContext;  class Logger; +class TokenizedData;  namespace parser_stack { @@ -158,40 +159,63 @@ protected:  	 */  	const std::string &name() const; -public: -	/** -	 * Virtual destructor. -	 */ -	virtual ~Handler(); -  	/**  	 * Calls the corresponding function in the Callbacks instance. Sets the  	 * whitespace mode that specifies how string data should be processed. The  	 * calls to this function are placed on a stack by the underlying Stack -	 * class. +	 * class. This function should be called from the "fieldStart" callback and +	 * the "start" callback. If no whitespace mode is pushed in the "start" +	 * method the whitespace mode "TRIM" is implicitly assumed.  	 *  	 * @param whitespaceMode specifies one of the three WhitespaceMode constants  	 * PRESERVE, TRIM or COLLAPSE.  	 */ -	void setWhitespaceMode(WhitespaceMode whitespaceMode); +	void pushWhitespaceMode(WhitespaceMode whitespaceMode);  	/** -	 * Calls the corresponding function in the Callbacks instance. -	 * Registers the given token as token that should be reported to the handler -	 * using the "token" function. -	 * -	 * @param token is the token string that should be reported. +	 * Pops a previously pushed whitespace mode. 
Calls to this function should +	 * occur in the "end" callback and the "fieldEnd" callback. This function +	 * can only undo pushes that were performed by the pushWhitespaceMode() +	 * method of the same handler.  	 */ -	void registerToken(const std::string &token); +	void popWhitespaceMode();  	/** -	 * Calls the corresponding function in the Callbacks instance. -	 * Unregisters the given token, it will no longer be reported to the handler -	 * using the "token" function. +	 * Calls the corresponding function in the Callbacks instance. Pushes a +	 * list of tokens that should be reported to this handler instance via the +	 * "token" method onto a stack maintained by the underlying Stack class. +	 * This function should be called from the "fieldStart" callback and the +	 * "start" callback.  	 * -	 * @param token is the token string that should be unregistered. +	 * @param tokens is a list of tokens that should be reported to this handler +	 * instance via the "token" method.  	 */ -	void unregisterToken(const std::string &token); +	void pushTokens(const std::vector<std::string> &tokens); + +	/** +	 * Pops a previously pushed token list. Calls to this function should +	 * occur in the "end" callback and the "fieldEnd" callback. This function +	 * can only undo pushes that were performed by the pushTokens() method of +	 * the same handler. +	 */ +	void popTokens(); + + +	/** +	 * Calls the corresponding function in the Callbacks instance. This method +	 * registers the given tokens as tokens that are generally available, tokens +	 * must be explicitly enabled using the "pushTokens" and "popTokens" methods. +	 * Tokens that have not been registered are not guaranteed to be reported, +	 * even though they are enabled. +	 */ +	void registerTokens(const std::vector<std::string> &tokens); + +public: +	/** +	 * Virtual destructor. +	 */ +	virtual ~Handler();  	/**  	 * Returns the command name for which the handler was created. @@ -299,11 +323,11 @@ public:  	 * Handler instance. Should return true if the data could be handled, false  	 * otherwise.  	 * -	 * @param data is a string variant containing the character data and its -	 * location. +	 * @param data is an instance of TokenizedData containing the segmented +	 * character data and its location.  	 * @return true if the data could be handled, false otherwise.  	 */ -	virtual bool data(Variant &data) = 0; +	virtual bool data(TokenizedData &data) = 0;  };  /** @@ -333,7 +357,7 @@ public:  	                     Variant::mapType &args) override;  	bool annotationEnd(const Variant &className,  	                   const Variant &elementName) override; -	bool data(Variant &data) override; +	bool data(TokenizedData &data) override;  	/**  	 * Creates an instance of the EmptyHandler class. 
@@ -359,7 +383,7 @@ public:  	                     Variant::mapType &args) override;  	bool annotationEnd(const Variant &className,  	                   const Variant &elementName) override; -	bool data(Variant &data) override; +	bool data(TokenizedData &data) override;  };  /** @@ -412,7 +436,7 @@ protected:  public:  	bool start(Variant::mapType &args) override;  	void end() override; -	bool data(Variant &data) override; +	bool data(TokenizedData &data) override;  };  }  } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..309c9a0 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,6 +19,7 @@  #include <core/common/Logger.hpp>  #include <core/common/Utils.hpp>  #include <core/common/Exceptions.hpp> +#include <core/parser/utils/TokenizedData.hpp>  #include <core/parser/ParserScope.hpp>  #include <core/parser/ParserContext.hpp> @@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args)  	}  } -void Stack::data(const Variant &data) +void Stack::data(TokenizedData data)  { -	// End handlers that already had a default field and are currently not -	// active. -	endOverdueHandlers(); +	// TODO: Rewrite this function for token handling +	// TODO: This loop needs to be refactored out +	while (!data.atEnd()) { +		// End handlers that already had a default field and are currently not +		// active. +		endOverdueHandlers(); -	while (true) { -		// Check whether there is any command the data can be sent to +		const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + +		// Check whether there is any command the data can be sent to -- if +		// not, the data may only consist of whitespace  		if (stack.empty()) { -			throw LoggableException("No command here to receive data.", data); +			if (hasNonWhitespaceText) { +				throw LoggableException("No command here to receive data.", data); +			} +			return;  		}  		// Fetch the current command handler information @@ -440,7 +449,10 @@  			// If the "hadDefaultField" flag is set, we already issued an error  			// message  			if (!info.hadDefaultField) { -				logger().error("Did not expect any data here", data); +				if (hasNonWhitespaceText) { +					logger().error("Did not expect any data here", data); +				} +				return;  			}  		} @@ -454,8 +466,16 @@  			// Pass the data to the current Handler instance  			bool valid = false;  			try { -				Variant dataCopy = data; -				valid = info.handler->data(dataCopy); +				// Create a fork of the TokenizedData and let the handler work +				// on it +				TokenizedData dataFork = data; +				valid = info.handler->data(dataFork); + +				// If the data was validly handled by the handler, commit the +				// change +				if (valid) { +					data = dataFork; +				}  			}  			catch (LoggableException ex) {  				loggerFork.log(ex); @@ -482,6 +502,19 @@  	}  } +void Stack::data(const Variant &stringData) +{ +	// Fetch the SourceLocation of the given stringData variant +	SourceLocation loc = stringData.getLocation(); + +	// Create a TokenizedData instance and feed the given string data into it +	TokenizedData tokenizedData(loc.getSourceId()); +	tokenizedData.append(stringData.asString(), loc.getStart()); + +	// Call the actual "data" method +	data(tokenizedData); +} +  void Stack::fieldStart(bool isDefault)  {  	// Make sure the current handler stack is not empty @@ -584,4 +617,4 @@ void 
Stack::token(Variant token)  	// TODO  }  } -}
\ No newline at end of file +} diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..cd29b28 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -44,6 +44,7 @@ namespace ousia {  // Forward declarations  class ParserContext;  class Logger; +class TokenizedData;  namespace parser_stack { @@ -292,13 +293,24 @@ public:  	void command(const Variant &name, const Variant::mapType &args);  	/** -	 * Function that shuold be called whenever character data is found in the +	 * Function that should be called whenever character data is found in the  	 * input stream. May only be called if there currently is a command on the  	 * stack.  	 * -	 * @param data is a string variant containing the data that has been found. +	 * @param data is a TokenizedData instance containing the pre-segmented data +	 * that should be read. +	 */ +	void data(TokenizedData data); + +	/** +	 * Function that should be called whenever character data is found in the +	 * input stream. The given string variant is converted into a TokenizedData +	 * instance internally. +	 * +	 * @param stringData is a string variant containing the data that has been +	 * found.  	 */ -	void data(const Variant &data); +	void data(const Variant &stringData);  	/**  	 * Function that should be called whenever a new field starts. Fields of the diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..aaebe7d 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -127,7 +127,7 @@ public:  	 * read.  	 * @return a pair containing start and end source offset.  	 */ -	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) +	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const  	{  		// Special treatment for the last character  		const size_t count = lens.size(); @@ -157,7 +157,31 @@  	/**  	 * Returns the number of characters for which offsets are stored.  	 */ -	size_t size() { return lens.size(); } +	size_t size() const { return lens.size(); } + +	/** +	 * Trims the SourceOffsetVector to the given number of characters. Offsets +	 * stored for characters beyond the trimmed region are discarded. +	 * +	 * @param length is the number of characters to which the +	 * SourceOffsetVector should be trimmed. +	 */ +	void trim(size_t length) { +		if (length < size()) { +			lens.resize(length); +			offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); +		} +	} + +	/** +	 * Resets the SourceOffsetVector to the state it had when it was +	 * constructed. +	 */ +	void clear() { +		lens.clear(); +		offsets.clear(); +		lastEnd = 0; +	}  };  } diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia {  /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {}  /* Class DynamicTokenTree */  bool TokenTrie::registerToken(const std::string &token, -                              TokenId type) noexcept +                              TokenId id) noexcept  {  	// Abort if the token is empty -- this would taint the root node  	if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token,  	}  	// If the resulting node already has a type set, we're screwed. 
-	if (node->type != Tokens::Empty) { +	if (node->id != Tokens::Empty) {  		return false;  	}  	// Otherwise just set the type to the given type. -	node->type = type; +	node->id = id;  	return true;  } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept  		// Reset the subtree handler if this node has another type  		node = it->second.get(); -		if ((node->type != Tokens::Empty || node->children.size() > 1) && +		if ((node->id != Tokens::Empty || node->children.size() > 1) &&  		    (i + 1 != token.size())) {  			subtreeRoot = node;  			subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept  	}  	// If the node type is already Tokens::Empty, we cannot do anything here -	if (node->type == Tokens::Empty) { +	if (node->id == Tokens::Empty) {  		return false;  	}  	// If the target node has children, we cannot delete the subtree. Set the  	// type to Tokens::Empty instead  	if (!node->children.empty()) { -		node->type = Tokens::Empty; +		node->id = Tokens::Empty;  		return true;  	} @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept  		}  		node = it->second.get();  	} -	return node->type; +	return node->id;  }  } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@  #include <limits>  #include <unordered_map> -#include "Token.hpp" +#include <core/common/Token.hpp>  namespace ousia { @@ -75,10 +75,9 @@ public:  		ChildMap children;  		/** -		 * Reference at the corresponding token descriptor. Set to nullptr if -		 * no token is attached to this node. +		 * Id of the token represented by this node.  		 */ -		TokenId type; +		TokenId id;  		/**  		 * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public:  	 *  	 * @param token is the character sequence that should be registered as  	 * token. -	 * @param type is the descriptor that should be set for this token. +	 * @param id is the descriptor that should be set for this token.  	 * @return true if the operation is successful, false otherwise.  	 */ -	bool registerToken(const std::string &token, TokenId type) noexcept; +	bool registerToken(const std::string &token, TokenId id) noexcept;  	/**  	 * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..0ec56af 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -110,19 +110,19 @@ private:  	std::vector<char> buf;  	/** -	 * Vector containing all token marks. +	 * Vector storing all the character offsets efficiently.  	 */ -	std::vector<TokenMark> marks; +	SourceOffsetVector offsets;  	/** -	 * Vector storing all the character offsets efficiently. +	 * Vector containing all token marks.  	 */ -	SourceOffsetVector offsets; +	mutable std::vector<TokenMark> marks;  	/**  	 * Flag indicating whether the internal "marks" vector is sorted.  	 
*/ -	bool sorted; +	mutable bool sorted;  public:  	/** @@ -150,9 +153,12 @@  		// Extend the text regions, interpolate the source position (this may  		// yield incorrect results)  		const size_t size = buf.size(); -		for (SourceOffset offs = offsStart; offs < offsStart + data.size(); -		     offs++) { -			offsets.storeOffset(offs, offs + 1); +		for (size_t i = 0; i < data.size(); i++) { +			if (offsStart != InvalidSourceOffset) { +				offsets.storeOffset(offsStart + i, offsStart + i + 1); +			} else { +				offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); +			}  		}  		return size; @@ -213,7 +216,7 @@  	 * available.  	 */  	bool next(Token &token, WhitespaceMode mode, -	          const std::unordered_set<TokenId> &tokens, size_t &cursor) +	          const std::unordered_set<TokenId> &tokens, size_t &cursor) const  	{  		// Sort the "marks" vector if it has not been sorted yet.  		if (!sorted) { @@ -222,10 +225,11 @@  		}  		// Fetch the next larger TokenMark instance, make sure the token is in -		// the "enabled" list +		// the "enabled" list and within the buffer range  		auto it =  		    std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); -		while (it != marks.end() && tokens.count(it->id) == 0) { +		while (it != marks.end() && (tokens.count(it->id) == 0 || +		                             it->bufStart + it->len > buf.size())) {  			it++;  		} @@ -304,11 +308,58 @@  	}  	/** +	 * Resets the TokenizedDataImpl instance to the state it had when it was +	 * constructed. +	 */ +	void clear() +	{ +		buf.clear(); +		marks.clear(); +		offsets.clear(); +		sorted = true; +	} + +	/** +	 * Trims the length of the TokenizedDataImpl instance to the given length. +	 * +	 * @param length is the number of characters to which the TokenizedData +	 * instance should be trimmed. +	 */ +	void trim(size_t length) +	{ +		if (length < size()) { +			buf.resize(length); +			offsets.trim(length); +		} +	} + +	/**  	 * Returns the current size of the internal buffer.  	 *  	 * @return the size of the internal character buffer.  	 */ -	size_t getSize() { return buf.size(); } +	size_t size() const { return buf.size(); } + +	/** +	 * Returns true if no data is in the data buffer. +	 * +	 * @return true if the "buf" instance has no data. +	 */ +	bool empty() const { return buf.empty(); } + +	/** +	 * Returns the current location of all data in the buffer. +	 * +	 * @return the location of the entire data represented by this instance. 
+	 */ +	SourceLocation getLocation() const +	{ +		if (empty()) { +			return SourceLocation{sourceId}; +		} +		return SourceLocation{sourceId, offsets.loadOffset(0).first, +		                      offsets.loadOffset(size()).second}; +	}  };  /* Class TokenizedData */ @@ -335,7 +386,7 @@  void TokenizedData::mark(TokenId id, TokenLength len)  { -	impl->mark(id, impl->getSize() - len, len); +	impl->mark(id, impl->size() - len, len);  }  void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) @@ -343,23 +394,67 @@  	impl->mark(id, bufStart, len);  } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear()  { -	return impl->next(token, mode, tokens, cursor); +	impl->clear();  } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const +{ +	return impl->getLocation(); +} + +TokenizedDataReader TokenizedData::reader() const +{ +	return TokenizedDataReader(impl, 0, 0); +} + +/* Class TokenizedDataReader */ + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ +	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const { return readCursor >= impl->size(); } + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode) +{ +	peekCursor = readCursor; +	return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode) +{ +	return impl->next(token, mode, tokens, peekCursor); +} + +Variant TokenizedDataReader::text(WhitespaceMode mode)  {  	// Copy the current cursor position to not update the actual cursor position  	// if the operation was not successful -	size_t cursorCopy = cursor; +	size_t cursorCopy = readCursor; +	Token token; -	if (!impl->next(token, mode, tokens, cursorCopy) || +	if (!impl->next(token, mode, TokenSet{}, cursorCopy) ||  	    token.id != Tokens::Data) { -		return false; +		return Variant{nullptr};  	} -	// There is indeed a text token, update the internal cursor position +	// There is indeed a text token, update the internal cursor position and +	// return the token as variant. -	cursor = cursorCopy; +	readCursor = cursorCopy; +	Variant res = Variant::fromString(token.content); +	res.setLocation(token.getLocation()); +	return res;  }  } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..85b80ae 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,42 +36,29 @@  #include <unordered_set>  #include <core/common/Location.hpp> +#include <core/common/Variant.hpp>  #include <core/common/Whitespace.hpp> - -#include "Token.hpp" +#include <core/common/Token.hpp>  namespace ousia {  // Forward declaration  class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork;  /**   * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. 
+ * The data stored in TokenizedData is shared between all copies of an + * instance and can be read using TokenizedDataReader objects created via the + * reader() method.   */  class TokenizedData {  private:  	/** -	 * Shared pointer pointing at the internal data. This data is shared when -	 * copying TokenizedData instances, which corresponds to forking a -	 * TokenizedData instance. +	 * Shared pointer pointing at the internal data. This data is shared with +	 * all the TokenizedDataReader instances.  	 */  	std::shared_ptr<TokenizedDataImpl> impl; -	/** -	 * Contains all currently enabled token ids. -	 */ -	std::unordered_set<TokenId> tokens; - -	/** -	 * Position from which the last element was read from the internal buffer. -	 * This information is not shared with the other instances of TokenizedData -	 * pointing at the same location. -	 */ -	size_t cursor; -  public:  	/**  	 * Default constructor, creates a new instance of TokenizedData, sets the @@ -136,25 +123,121 @@ public:  	void mark(TokenId id, size_t bufStart, TokenLength len);  	/** -	 * Enables a single token id. Enabled tokens will no longer be returned as -	 * text. Instead, when querying for the next token, TokenizedData will -	 * return them as token and not as part of a Text token. +	 * Resets the TokenizedData instance to the state it had when it was +	 * constructed. +	 */ +	void clear(); + +	/** +	 * Trims the length of the TokenizedData instance to the given length. Note +	 * that this function does not remove any token matches for performance +	 * reasons, it merely renders them inaccessible. Appending new data after +	 * calling trim will make the token marks accessible again. Thus this method +	 * should be the last function called to modify the data buffer and the +	 * token marks.  	 * -	 * @param id is the TokenId of the token that should be enabled. +	 * @param length is the number of characters to which the TokenizedData +	 * instance should be trimmed.  	 */ -	void enableToken(TokenId id) { tokens.insert(id); } +	void trim(size_t length);  	/** -	 * Enables a set of token ids. Enabled tokens will no longer be returned as -	 * text. Instead, when querying for the next token, TokenizedData will -	 * return them as token and not as part of a Text token. +	 * Returns the number of characters currently represented by this +	 * TokenizedData instance. +	 */ +	size_t size() const; + +	/** +	 * Returns true if the TokenizedData instance is empty, false otherwise.  	 * -	 * @param ids is the TokenId of the token that should be enabled. +	 * @return true if no data is stored inside the TokenizedData instance.  	 */ -	void enableToken(const std::unordered_set<TokenId> &ids) -	{ -		tokens.insert(ids.begin(), ids.end()); -	} +	bool empty() const; + +	/** +	 * Returns the location of the entire TokenizedData instance. +	 * +	 * @return the location of the entire data represented by this instance. +	 */ +	SourceLocation getLocation() const; + +	/** +	 * Returns a TokenizedDataReader instance that can be used to access the +	 * data. +	 * +	 * @return a new TokenizedDataReader instance pointing at the beginning of +	 * the internal buffer. +	 */ +	TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader class provides read and peek access to the data and + * tokens stored in a TokenizedData instance, maintaining independent read and + * peek cursors. + */ +class TokenizedDataReader { +private: +	friend TokenizedData; + +	/** +	 * Shared pointer pointing at the internal data. This data is shared with +	 * all the TokenizedDataReader instances. +	 */ +	std::shared_ptr<const TokenizedDataImpl> impl; + +	/** +	 * Position from which the last element was read from the internal buffer. 
+	 */ +	size_t readCursor; + +	/** +	 * Position from which the last element was peeked from the internal buffer. +	 */ +	size_t peekCursor; + +	/** +	 * Private constructor of TokenizedDataReader, taking a reference to the +	 * internal TokenizedDataImpl structure storing the data that is accessed by +	 * the reader. +	 * +	 * @param impl is the TokenizedDataImpl instance that holds the actual data. +	 * @param readCursor is the cursor position from which tokens and text are +	 * read. +	 * @param peekCursor is the cursor position from which tokens and text are +	 * peeked. +	 */ +	TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl, +	                    size_t readCursor, size_t peekCursor); + +public: +	/** +	 * Returns a new TokenizedDataReaderFork from which tokens and text can be +	 * read without advancing this reader instance. +	 */ +	TokenizedDataReaderFork fork(); + +	/** +	 * Returns true if this TokenizedData instance is at the end. +	 * +	 * @return true if the end of the TokenizedData instance has been reached. +	 */ +	bool atEnd() const; + +	/** +	 * Stores the next token in the given token reference, returns true if the +	 * operation was successful, false if there are no more tokens. Advances the +	 * internal read cursor and resets the peek cursor. +	 * +	 * @param token is an output parameter into which the read token will be +	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens. +	 * @param tokens is the set of token identifiers, representing the currently +	 * enabled tokens. +	 * @param mode is the whitespace mode that should be used when a text token +	 * is returned. +	 * @return true if the operation was successful and there is a next token, +	 * false if there are no more tokens. +	 */ +	bool read(Token &token, const TokenSet &tokens = TokenSet{}, +	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);  	/**  	 * Stores the next token in the given token reference, returns true if the  	 * operation was successful, false if there are no more tokens.  	 *  	 * @param token is an output parameter into which the read token will be  	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens. +	 * @param tokens is the set of token identifiers, representing the currently +	 * enabled tokens.  	 * @param mode is the whitespace mode that should be used when a text token  	 * is returned.  	 * @return true if the operation was successful and there is a next token,  	 * false if there are no more tokens.  	 */ -	bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	bool peek(Token &token, const TokenSet &tokens = TokenSet{}, +	          WhitespaceMode mode = WhitespaceMode::COLLAPSE); + +	/** +	 * Consumes the peeked tokens, the read cursor will now be at the position +	 * of the peek cursor. +	 */ +	void consumePeek() { readCursor = peekCursor; } + +	/** +	 * Resets the peek cursor to the position of the read cursor. +	 */ +	void resetPeek() { peekCursor = readCursor; }  	/**  	 * Stores the next text token in the given token reference, returns true if @@ -178,12 +275,53 @@  	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.  	 * @param mode is the whitespace mode that should be used when a text token  	 * is returned. -	 * @return true if the operation was successful and there is a next token, -	 * false if there are no more tokens. +	 * @return a string variant with the data if there is any data or a nullptr +	 * variant if there is no text.  	 
 */ -	bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);  }; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader; it allows read/peek progress to be committed back to + * the parent reader. + */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: +	friend TokenizedDataReader; + +	/** +	 * Reference pointing at the parent TokenizedDataReader to which changes may +	 * be committed. +	 */ +	TokenizedDataReader &parent; + +	/** +	 * Private constructor of TokenizedDataReaderFork, taking a reference to the +	 * internal TokenizedDataImpl structure storing the data that is accessed by +	 * the reader and a reference at the parent TokenizedDataReader. +	 * +	 * @param parent is the TokenizedDataReader instance to which the current +	 * read/peek progress may be committed. +	 * @param impl is the TokenizedDataImpl instance that holds the actual data. +	 * @param readCursor is the cursor position from which tokens and text are +	 * read. +	 * @param peekCursor is the cursor position from which tokens and text are +	 * peeked. +	 */ +	TokenizedDataReaderFork(TokenizedDataReader &parent, +	                        std::shared_ptr<TokenizedDataImpl> impl, +	                        size_t readCursor, size_t peekCursor) +	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) +	{ +	} + +public: +	/** +	 * Commits the read/peek progress to the underlying parent. +	 */ +	void commit() { parent = *this; } +};  } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..51787cd 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@  #include <core/common/CharReader.hpp>  #include <core/common/Exceptions.hpp>  #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include "TokenizedData.hpp"  #include "Tokenizer.hpp"  namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch {  	Token token;  	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. +	 * Position at which this token starts in the TokenizedData instance.  	 */ -	size_t textLength; +	size_t dataStartOffset;  	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. +	 * Set to true if the matched token is a primary token.  	 */ -	size_t textEnd; +	bool primary;  	/**  	 * Constructor of the TokenMatch class.  	 */ -	TokenMatch() : textLength(0), textEnd(0) {} +	TokenMatch() : dataStartOffset(0), primary(false) {}  	/**  	 * Returns true if this TokenMatch instance actually represents a match. +	 * +	 * @return true if the TokenMatch actually has a match. +	 */ +	bool hasMatch() const { return token.id != Tokens::Empty; } + +	/** +	 * Returns the length of the matched token. +	 * +	 * @return the length of the token string.  	 */ -	bool hasMatch() { return token.id != Tokens::Empty; } +	size_t size() const { return token.content.size(); }  };  /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private:  	size_t start;  	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. 
*/ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; +	size_t dataStartOffset;  public:  	/**  	 * Constructor of the TokenLookup class.  	 *  	 * @param node is the current node. -	 * @param start is the start position. -	 * @param textLength is the text buffer length of the previous text token. -	 * @param textEnd is the current end location of the previous text token. +	 * @param start is the start position in the source file. +	 * @param dataStartOffset is the current length of the TokenizedData buffer.  	 */ -	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, -	            size_t textEnd) -	    : node(node), start(start), textLength(textLength), textEnd(textEnd) +	TokenLookup(const TokenTrie::Node *node, size_t start, +	            size_t dataStartOffset) +	    : node(node), start(start), dataStartOffset(dataStartOffset)  	{  	}  	/**  	 * Tries to extend the current path in the token trie with the given -	 * character. If a complete token is matched, stores this match in the -	 * tokens list (in case it is longer than any previous token). +	 * character. If a complete token is matched, stores the match in the given +	 * TokenMatch reference and returns true.  	 *  	 * @param c is the character that should be appended to the current prefix.  	 * @param lookups is a list to which new TokenLookup instances are added -- @@ -123,73 +122,48 @@  	 * Tokenizer.  	 * @param end is the end byte offset of the current character.  	 * @param sourceId is the source id of this file. +	 * @return true if a token was matched, false otherwise.  	 */ -	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, -	             const std::vector<std::string> &tokens, SourceOffset end, -	             SourceId sourceId) +	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, +	             const std::vector<Tokenizer::TokenDescriptor> &tokens, +	             SourceOffset end, SourceId sourceId)  	{ -		// Check whether we can continue the current token path with the given -		// character without visiting an already visited node +		// Set to true once a token has been matched +		bool res = false; + +		// Check whether we can continue the current token path, if not, abort  		auto it = node->children.find(c);  		if (it == node->children.end()) { -			return; +			return res;  		}  		// Check whether the new node represents a complete token and whether it  		// is longer than the current token. If yes, replace the current token.  		node = it->second.get(); -		if (node->type != Tokens::Empty) { -			const std::string &str = tokens[node->type]; -			size_t len = str.size(); -			if (len > match.token.content.size()) { -				match.token = -				    Token{node->type, str, {sourceId, start, end}}; -				match.textLength = textLength; -				match.textEnd = textEnd; -			} +		if (node->id != Tokens::Empty) { +			const Tokenizer::TokenDescriptor &descr = tokens[node->id]; +			match.token = Token(node->id, descr.string, +			                    SourceLocation(sourceId, start, end)); +			match.dataStartOffset = dataStartOffset; +			match.primary = descr.primary; +			res = true;  		}  		// If this state can possibly be advanced, store it in the states list.  		
if (!node->children.empty()) {  			lookups.emplace_back(*this);  		} +		return res;  	}  }; - -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, -                           SourceId sourceId) -{ -	if (match.hasMatch()) { -		match.token.content = -		    std::string{handler.textBuf.data(), match.textLength}; -		match.token.location = -		    SourceLocation{sourceId, handler.textStart, match.textEnd}; -	} else { -		match.token.content = handler.toString(); -		match.token.location = -		    SourceLocation{sourceId, handler.textStart, handler.textEnd}; -	} -	match.token.id = Tokens::Data; -}  }  /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) -    : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template <typename TextHandler, bool read> -bool Tokenizer::next(CharReader &reader, Token &token) +template <bool read> +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  {  	// If we're in the read mode, reset the char reader peek position to the  	// current read position @@ -199,43 +173,68 @@  	// Prepare the lookups in the token trie  	const TokenTrie::Node *root = trie.getRoot(); -	TokenMatch match; +	TokenMatch bestMatch;  	std::vector<TokenLookup> lookups;  	std::vector<TokenLookup> nextLookups; -	// Instantiate the text handler -	TextHandler textHandler; -  	// Peek characters from the reader and try to advance the current token tree  	// cursor  	char c; +	const size_t initialDataSize = data.size();  	size_t charStart = reader.getPeekOffset();  	const SourceId sourceId = reader.getSourceId();  	while (reader.peek(c)) {  		const size_t charEnd = reader.getPeekOffset(); -		const size_t textLength = textHandler.textBuf.size(); -		const size_t textEnd = textHandler.textEnd; +		const size_t dataStartOffset = data.size();  		// If we do not have a match yet, start a new lookup from the root -		if (!match.hasMatch()) { -			TokenLookup{root, charStart, textLength, textEnd}.advance( -			    c, nextLookups, match, tokens, charEnd, sourceId); +		if (!bestMatch.hasMatch()) { +			lookups.emplace_back(root, charStart, dataStartOffset);  		}  		// Try to advance all other lookups with the new character +		TokenMatch match;  		for (TokenLookup &lookup : lookups) { -			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); +			// Continue if the current lookup could not be advanced +			if (!lookup.advance(c, nextLookups, match, tokens, charEnd, +			                    sourceId)) { +				continue; +			} + +			// If the matched token is primary, check whether it is better than +			// the current best match, if yes, replace the best match. 
In any +			// case just continue +			if (match.primary) { +				if (match.size() > bestMatch.size()) { +					bestMatch = match; +				} +				continue; +			} + +			// Otherwise -- if the matched token is a non-primary token (and no +			// primary token has been found until now) -- mark the match in the +			// TokenizedData +			if (!bestMatch.hasMatch()) { +				data.mark(match.token.id, data.size() - match.size() + 1, +				          match.size()); +			}  		}  		// We have found a token and there are no more states to advance or the  		// text handler has found something -- abort to return the new token -		if (match.hasMatch()) { -			if ((nextLookups.empty() || textHandler.hasText())) { +		if (bestMatch.hasMatch()) { +			if ((nextLookups.empty() || data.size() > initialDataSize)) {  				break;  			}  		} else {  			// Record all incoming characters -			textHandler.append(c, charStart, charEnd); +			data.append(c, charStart, charEnd); + +			// Special token processing +			// TODO: Build a special state machine for this in another class +			if (c == '\n') { +				data.mark(Tokens::Newline, 1); +			}  		}  		// Swap the lookups and the nextLookups list @@ -246,60 +245,53 @@  		charStart = charEnd;  	} -	// If we found text, emit that text -	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { -		buildDataToken(textHandler, match, sourceId); +	// If we found data, emit a corresponding data token +	if (data.size() > initialDataSize && +	    (!bestMatch.hasMatch() || +	     bestMatch.dataStartOffset > initialDataSize)) { +		// If we have a "bestMatch" which starts after text data has started, +		// trim the TokenizedData to this offset +		if (bestMatch.dataStartOffset > initialDataSize) { +			data.trim(bestMatch.dataStartOffset); +		} + +		// Create a token containing the data location +		bestMatch.token = Token{data.getLocation()};  	}  	// Move the read/peek cursor to the end of the token, abort if an error  	// happens while doing so -	if (match.hasMatch()) { +	if (bestMatch.hasMatch()) {  		// Make sure we have a valid location -		if (match.token.location.getEnd() == InvalidSourceOffset) { +		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {  			throw OusiaException{"Token end position offset out of range"};  		}  		// Seek to the end of the current token -		const size_t end = match.token.location.getEnd(); +		const size_t end = bestMatch.token.location.getEnd();  		if (read) {  			reader.seek(end);  		} else {  			reader.seekPeekCursor(end);  		} -		token = match.token; +		token = bestMatch.token;  	} else {  		token = Token{};  	} -	return match.hasMatch(); +	return bestMatch.hasMatch();  } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)  { -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingWhitespaceHandler, true>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingWhitespaceHandler, true>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingWhitespaceHandler, true>(reader, token); -	} -	return false; +	return next<true>(reader, token, data);  } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)  { -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingWhitespaceHandler, false>(reader, token); -		case WhitespaceMode::TRIM: -			
-			return next<TrimmingWhitespaceHandler, false>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingWhitespaceHandler, false>(reader, token);
-	}
-	return false;
+	return next<false>(reader, token, data);
 }
 
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
 {
 	// Abort if an empty token should be registered
 	if (token.empty()) {
@@ -309,8 +301,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// Search for a new slot in the tokens list
 	TokenId type = Tokens::Empty;
 	for (size_t i = nextTokenId; i < tokens.size(); i++) {
-		if (tokens[i].empty()) {
-			tokens[i] = token;
+		if (!tokens[i].valid()) {
+			tokens[i] = TokenDescriptor(token, primary);
 			type = i;
 			break;
 		}
@@ -320,62 +312,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
 	// override the special token type handles
 	if (type == Tokens::Empty) {
 		type = tokens.size();
-		if (type == Tokens::Data || type == Tokens::Empty) {
+		if (type >= Tokens::MaxTokenId) {
 			throw OusiaException{"Token type ids depleted!"};
 		}
-		tokens.emplace_back(token);
+		tokens.emplace_back(token, primary);
 	}
 	nextTokenId = type + 1;
 
-	// Try to register the token in the trie -- if this fails, remove it
-	// from the tokens list
+	// Try to register the token in the trie -- if this fails, remove it from
+	// the tokens list
 	if (!trie.registerToken(token, type)) {
-		tokens[type] = std::string{};
+		tokens[type] = TokenDescriptor();
 		nextTokenId = type;
 		return Tokens::Empty;
 	}
 	return type;
 }
 
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
 {
 	// Unregister the token from the trie, abort if an invalid type is given
-	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
-		tokens[type] = std::string{};
-		nextTokenId = type;
+	if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+		tokens[id] = TokenDescriptor();
+		nextTokenId = id;
 		return true;
 	}
 	return false;
 }
 
-std::string Tokenizer::getTokenString(TokenId type)
-{
-	if (type < tokens.size()) {
-		return tokens[type];
-	}
-	return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
 
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
 {
-	whitespaceMode = mode;
+	if (id < tokens.size()) {
+		return tokens[id];
+	}
+	return EmptyTokenDescriptor;
 }
 
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
 }
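The reworked next() above always keeps the longest primary match in bestMatch, while non-primary matches are merely marked inside the accompanying TokenizedData and never produce a standalone token. A minimal usage sketch of the resulting interface (not part of this patch; the CharReader string constructor and the input "a**b" are assumptions for illustration):

    CharReader reader{"a**b"};                              // assumed constructor
    Tokenizer tokenizer;
    TokenId strong = tokenizer.registerToken("**", true);   // primary
    TokenId weak = tokenizer.registerToken("*", false);     // non-primary

    Token token;
    TokenizedData data{reader.getSourceId()};
    while (tokenizer.read(reader, token, data)) {
        if (token.id == Tokens::Data) {
            // "a" and "b" end up here; a lone "*" would only be marked
            // inside "data" instead of ending the data token
        } else if (token.id == strong) {
            // the longest primary match "**" wins over the shorter "*"
        }
        data.clear();  // reuse the buffer, as parseBlockComment() does below
    }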
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..2ddb9c9 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
 /**
  * @file Tokenizer.hpp
  *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
  *
  * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
  */
@@ -33,39 +33,75 @@
 #include <vector>
 
 #include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
 
-#include "Token.hpp"
 #include "TokenTrie.hpp"
 
 namespace ousia {
 
 // Forward declarations
 class CharReader;
+class TokenizedData;
 
 /**
  * The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows tokens to be registered and unregistered while
+ * parsing. Note that the Tokenizer always tries to extract the longest
+ * possible token from the input. Tokens can be registered as primary or
+ * non-primary tokens. If a token is registered as a primary token, it is
+ * returned as a single Token instance if it occurs. In the non-primary case
+ * the token is returned as part of a segmented TokenizedData instance.
  */
 class Tokenizer {
-private:
+public:
 	/**
-	 * Internally used token trie. This object holds all registered tokens.
+	 * Internally used structure describing a registered token.
 	 */
-	TokenTrie trie;
+	struct TokenDescriptor {
+		/**
+		 * String describing the token.
+		 */
+		std::string string;
+
+		/**
+		 * Set to true if this token is primary.
+		 */
+		bool primary;
+
+		/**
+		 * Constructor of the TokenDescriptor class.
+		 *
+		 * @param string is the string representation of the registered token.
+		 * @param primary specifies whether the token is a primary token that
+		 * should be returned as a single token, or a secondary token, that
+		 * should be returned as part of TokenizedData.
+		 */
+		TokenDescriptor(const std::string &string, bool primary)
+		    : string(string), primary(primary)
+		{
+		}
+
+		/**
+		 * Default constructor.
+		 */
+		TokenDescriptor() : primary(false) {}
+
+		/**
+		 * Returns true if the TokenDescriptor represents a valid token.
+		 */
+		bool valid() const { return !string.empty(); }
+	};
+private:
 	/**
-	 * Flag defining whether whitespaces should be preserved or not.
+	 * Internally used token trie. This object holds all registered tokens.
 	 */
-	WhitespaceMode whitespaceMode;
+	TokenTrie trie;
 
 	/**
 	 * Vector containing all registered token types.
 	 */
-	std::vector<std::string> tokens;
+	std::vector<TokenDescriptor> tokens;
 
 	/**
 	 * Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:
 	/**
 	 * Templated function used internally to read the current token. The
-	 * function is templated in order to force code generation for all six
-	 * combiations of whitespace modes and reading/peeking.
+	 * function is templated in order to force optimized code generation for
+	 * both reading and peeking.
 	 *
-	 * @tparam TextHandler is the type to be used for the textHandler instance.
-	 * @tparam read specifies whether the function should start from and advance
-	 * the read pointer of the char reader.
+	 * @tparam read specifies whether the method should read the token or just
+	 * peek.
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is the token structure into which the token information
 	 * should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return false if the end of the stream has been reached, true otherwise.
 	 */
-	template <typename TextHandler, bool read>
-	bool next(CharReader &reader, Token &token);
+	template <bool read>
+	bool next(CharReader &reader, Token &token, TokenizedData &data);
 
public:
 	/**
 	 * Constructor of the Tokenizer class.
-	 *
-	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	Tokenizer();
 
 	/**
-	 * Registers the given string as a token. Returns a const pointer at a
-	 * TokenDescriptor that will be used to reference the newly created token.
+	 * Registers the given string as a token. Returns a unique identifier
+	 * describing the registered token.
 	 *
 	 * @param token is the token string that should be registered.
-	 * @return a unique identifier for the registered token or EmptyToken if
+	 * @param primary specifies whether the token is a primary token -- if true,
+	 * the token will be returned as a single, standalone token. Otherwise the
+	 * token will be returned as part of a "TokenizedData" structure.
+	 * @return a unique identifier for the registered token or Tokens::Empty if
 	 * an error occurred.
 	 */
-	TokenId registerToken(const std::string &token);
+	TokenId registerToken(const std::string &token, bool primary = true);
 
 	/**
 	 * Unregisters the token belonging to the given TokenId.
 	 *
-	 * @param type is the token type that should be unregistered. The
-	 *TokenId
-	 * must have been returned by registerToken.
+	 * @param id is the token that should be unregistered. The TokenId must
+	 * have been returned by registerToken.
 	 * @return true if the operation was successful, false otherwise (e.g.
-	 * because the given TokenDescriptor was already unregistered).
+	 * because the token with the given TokenId was already unregistered).
 	 */
-	bool unregisterToken(TokenId type);
+	bool unregisterToken(TokenId id);
 
 	/**
-	 * Returns the token that was registered under the given TokenId id or
-	 *an
-	 * empty string if an invalid TokenId id is given.
+	 * Returns the token that was registered under the given TokenId or an
+	 * invalid TokenDescriptor if an invalid TokenId is given.
 	 *
-	 * @param type is the TokenId id for which the corresponding token
-	 *string
+	 * @param id is the TokenId for which the corresponding TokenDescriptor
 	 * should be returned.
-	 * @return the registered token string or an empty string if the given type
-	 * was invalid.
-	 */
-	std::string getTokenString(TokenId type);
-
-	/**
-	 * Sets the whitespace mode.
-	 *
-	 * @param whitespaceMode defines how whitespace should be treated in text
-	 * tokens.
-	 */
-	void setWhitespaceMode(WhitespaceMode mode);
-
-	/**
-	 * Returns the current value of the whitespace mode.
-	 *
-	 * @return the whitespace mode.
+	 * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+	 * the given TokenId is invalid.
 	 */
-	WhitespaceMode getWhitespaceMode();
+	const TokenDescriptor &lookupToken(TokenId id) const;
 
 	/**
 	 * Reads a new token from the CharReader and stores it in the given
-	 * Token instance.
+	 * Token instance. If the token has the id Tokens::Data, use the "getData"
+	 * method to fetch a reference to the underlying TokenizedData instance
+	 * storing the data.
 	 *
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is a reference to the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool read(CharReader &reader, Token &token);
+	bool read(CharReader &reader, Token &token, TokenizedData &data);
 
 	/**
 	 * The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
 	 * read.
 	 * @param token is a reference to the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool peek(CharReader &reader, Token &token);
+	bool peek(CharReader &reader, Token &token, TokenizedData &data);
 };
 }
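Since getTokenString() is gone, callers now retrieve the whole descriptor via lookupToken(). A short sketch (not from this patch; the concrete token string "{" and the out-of-range id are made up, and valid() is assumed to be const-qualified as in the struct above):

    Tokenizer tokenizer;
    TokenId id = tokenizer.registerToken("{", true);

    const Tokenizer::TokenDescriptor &desc = tokenizer.lookupToken(id);
    if (desc.valid()) {
        // desc.string == "{" and desc.primary == true
    }

    // An unknown id does not throw; it yields the shared invalid
    // descriptor instead:
    const Tokenizer::TokenDescriptor &bad = tokenizer.lookupToken(9999);
    // bad.valid() == false, bad.string is empty

Returning a reference to the file-static EmptyTokenDescriptor keeps the failure path allocation-free, at the cost of handing out a shared object.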
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..d4cdbf8 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -94,92 +94,11 @@ public:
 
 static const PlainFormatTokens OsmlTokens;
 
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
-	/**
-	 * Internal character buffer.
-	 */
-	std::vector<char> buf;
-
-	/**
-	 * Start location of the character data.
-	 */
-	SourceOffset start;
-
-	/**
-	 * End location of the character data.
-	 */
-	SourceOffset end;
-
-public:
-	/**
-	 * Default constructor, initializes start and end with zeros.
-	 */
-	DataHandler() : start(0), end(0) {}
-
-	/**
-	 * Returns true if the internal buffer is empty.
-	 *
-	 * @return true if no characters were added to the internal buffer, false
-	 * otherwise.
-	 */
-	bool isEmpty() { return buf.empty(); }
-
-	/**
-	 * Appends a single character to the internal buffer.
-	 *
-	 * @param c is the character that should be added to the internal buffer.
-	 * @param charStart is the start position of the character.
-	 * @param charEnd is the end position of the character.
-	 */
-	void append(char c, SourceOffset charStart, SourceOffset charEnd)
-	{
-		if (isEmpty()) {
-			start = charStart;
-		}
-		buf.push_back(c);
-		end = charEnd;
-	}
-
-	/**
-	 * Appends a string to the internal buffer.
-	 *
-	 * @param s is the string that should be added to the internal buffer.
-	 * @param stringStart is the start position of the string.
-	 * @param stringEnd is the end position of the string.
-	 */
-	void append(const std::string &s, SourceOffset stringStart,
-	            SourceOffset stringEnd)
-	{
-		if (isEmpty()) {
-			start = stringStart;
-		}
-		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
-		end = stringEnd;
-	}
-
-	/**
-	 * Converts the internal buffer to a variant with attached location
-	 * information.
-	 *
-	 * @param sourceId is the source id which is needed for building the
-	 * location information.
-	 * @return a Variant with the internal buffer content as string and
-	 * the correct start and end location.
-	 */
-	Variant toVariant(SourceId sourceId)
-	{
-		Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
-		res.setLocation({sourceId, start, end});
-		return res;
-	}
-};
-
 OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
-    : reader(reader), logger(logger), tokenizer(OsmlTokens)
+    : reader(reader),
+      logger(logger),
+      tokenizer(OsmlTokens),
+      data(reader.getSourceId())
 {
 	// Place an initial command representing the complete file on the stack
 	commands.push(Command{"", Variant::mapType{}, true, true, true, false});
@@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
 Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
 {
 	bool first = true;
-	bool hasCharSiceNSSep = false;
+	bool hasCharSinceNSSep = false;
 	std::vector<char> identifier;
 	size_t end = reader.getPeekOffset();
 	char c, c2;
@@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
 		if ((first && Utils::isIdentifierStartCharacter(c)) ||
 		    (!first && Utils::isIdentifierCharacter(c))) {
 			identifier.push_back(c);
-		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+		} else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
 		           Utils::isIdentifierStartCharacter(c2)) {
 			identifier.push_back(c);
 		} else {
@@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
 		// This is no longer the first character
 		first = false;
 
-		// Advance the hasCharSiceNSSep flag
-		hasCharSiceNSSep = allowNSSep && (c != ':');
+		// Advance the hasCharSinceNSSep flag
+		hasCharSinceNSSep = allowNSSep && (c != ':');
 
 		end = reader.getPeekOffset();
 		reader.consumePeek();
@@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()
 {
 	Token token;
 	size_t depth = 1;
-	while (tokenizer.read(reader, token)) {
+	while (tokenizer.read(reader, token, data)) {
+		// Throw the comment data away
+		data.clear();
+
 		if (token.id == OsmlTokens.BlockCommentEnd) {
 			depth--;
 			if (depth == 0) {
@@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()
 	}
 }
 
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData()
 {
-	if (!handler.isEmpty()) {
-		data = handler.toVariant(reader.getSourceId());
+	if (!data.empty()) {
 		location = data.getLocation();
 		reader.resetPeek();
 		return true;
@@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()
 
 OsmlStreamParser::State OsmlStreamParser::parse()
 {
-	// Handler for incomming data
-	DataHandler handler;
+	// Reset the data handler
+	data.clear();
 
 	// Read tokens until the outer loop should be left
 	Token token;
-	while (tokenizer.peek(reader, token)) {
+	while (tokenizer.peek(reader, token, data)) {
 		const TokenId type = token.id;
 
 		// Special handling for Backslash and Text
@@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 			// Try to parse a command
 			if (Utils::isIdentifierStartCharacter(c)) {
 				// Make sure to issue any data before it is too late
-				if (checkIssueData(handler)) {
+				if (checkIssueData()) {
 					return State::DATA;
 				}
 
@@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 			// If this was an annotation start token, add the parsed < to the
 			// output
 			if (type == OsmlTokens.AnnotationStart) {
-				handler.append('<', token.location.getStart(),
-				               token.location.getStart() + 1);
+				data.append('<', token.location.getStart(),
+				            token.location.getStart() + 1);
 			}
-			handler.append(c, token.location.getStart(),
-			               reader.getPeekOffset());
+			data.append(c, token.location.getStart(), reader.getPeekOffset());
 			reader.consumePeek();
 			continue;
 		} else if (type == Tokens::Data) {
@@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 				location = token.location;
 				return State::FIELD_START;
 			}
-
-			// Append the text to the data handler
-			handler.append(token.content, token.location.getStart(),
-			               token.location.getEnd());
-
 			reader.consumePeek();
 			continue;
 		}
 
 		// A non-text token was reached, make sure all pending data commands
 		// have been issued
-		if (checkIssueData(handler)) {
+		if (checkIssueData()) {
 			return State::DATA;
 		}
@@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 			Command &cmd = commands.top();
 			if (!cmd.inField) {
 				cmd.inField = true;
-				return State::FIELD_START;
 			}
-			logger.error(
+			return State::FIELD_START;
+/*			logger.error(
 			    "Got field start token \"{\", but no command for which to "
 			    "start the field. Write \"\\{\" to insert this sequence as "
 			    "text.",
-			    token);
+			    token);*/
 		} else if (token.id == OsmlTokens.FieldEnd) {
-			if (closeField()) {
+			closeField();
+			return State::FIELD_END;
+/*			if (closeField()) {
 				return State::FIELD_END;
 			}
 			logger.error(
 			    "Got field end token \"}\", but there is no field to end. "
 			    "Write \"\\}\" to insert this sequence as text.",
-			    token);
+			    token);*/
 		} else if (token.id == OsmlTokens.DefaultFieldStart) {
 			// Try to start a default field the first time the token is reached
 			Command &topCmd = commands.top();
 			if (!topCmd.inField) {
 				topCmd.inField = true;
 				topCmd.inDefaultField = true;
-				return State::FIELD_START;
 			}
-			logger.error(
+			return State::FIELD_START;
+/*			logger.error(
 			    "Got default field start token \"{!\", but no command for "
 			    "which to start the field. Write \"\\{!\" to insert this "
 			    "sequence as text",
-			    token);
+			    token);*/
 		} else if (token.id == OsmlTokens.AnnotationEnd) {
 			// We got a single annotation end token "\>" -- simply issue the
 			// ANNOTATION_END event
@@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 	}
 
 	// Issue available data
-	if (checkIssueData(handler)) {
+	if (checkIssueData()) {
 		return State::DATA;
 	}
@@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()
 	return State::END;
 }
 
+Variant OsmlStreamParser::getText(WhitespaceMode mode)
+{
+	TokenizedData dataFork = data;
+	Variant text = dataFork.text(mode);
+	location = text.getLocation();
+	return text;
+}
+
 const Variant &OsmlStreamParser::getCommandName() const
 {
 	return commands.top().name;
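Note that getText() above works on a copy: dataFork receives the buffered TokenizedData, which suggests that text() consumes or modifies the instance it is called on, so forking keeps getData() usable afterwards. A caller-side sketch (assumed usage, not from this patch):

    if (parser.parse() == OsmlStreamParser::State::DATA) {
        // Whitespace handling is now decided at access time; the same
        // buffered data can be rendered with different modes:
        Variant collapsed = parser.getText(WhitespaceMode::COLLAPSE);
        Variant preserved = parser.getText(WhitespaceMode::PRESERVE);
    }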
Write \"\\{!\" to insert this "  			    "sequence as text", -			    token); +			    token);*/  		} else if (token.id == OsmlTokens.AnnotationEnd) {  			// We got a single annotation end token "\>" -- simply issue the  			// ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()  	}  	// Issue available data -	if (checkIssueData(handler)) { +	if (checkIssueData()) {  		return State::DATA;  	} @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()  	return State::END;  } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ +	TokenizedData dataFork = data; +	Variant text = dataFork.text(mode); +	location = text.getLocation(); +	return text; +} +  const Variant &OsmlStreamParser::getCommandName() const  {  	return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@  #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_  #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <memory>  #include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp>  #include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp>  namespace ousia {  // Forward declarations  class CharReader;  class Logger; -class DataHandler; +class OsmlStreamParserImpl;  /**   * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public:  		Variant arguments;  		/** -		 * Set to true if this is a command with clear begin and end. -		 */ -		bool hasRange : 1; - -		/** -		 * Set to true if we are currently inside a field of this command. -		 */ -		bool inField : 1; - -		/** -		 * Set to true if we are currently in the range field of the command -		 * (implies inField being set to true). +		 * Vector used as stack for holding the number of opening/closing braces +		 * and the corresponding "isDefaultField" flag.  		 */ -		bool inRangeField : 1; +		std::vector<bool> fields;  		/** -		 * Set to true if we are currently in a field that has been especially -		 * marked as default field (using the "|") syntax. +		 * Set to true if this is a command with clear begin and end.  		 */ -		bool inDefaultField : 1; +		bool hasRange;  		/**  		 * Default constructor. @@ -164,7 +155,6 @@ public:  		Command()  		    : hasRange(false),  		      inField(false), -		      inRangeField(false),  		      inDefaultField()  		{  		} @@ -178,15 +168,10 @@ public:  		 * command.  		 * @param hasRange should be set to true if this is a command with  		 * explicit range. -		 * @param inField is set to true if we currently are inside a field -		 * of this command. -		 * @param inRangeField is set to true if we currently are inside the -		 * outer field of a ranged command.  		 * @param inDefaultField is set to true if we currently are in a  		 * specially marked default field.  		 */ -		Command(Variant name, Variant arguments, bool hasRange, -		        bool inField, bool inRangeField, bool inDefaultField) +		Command(Variant name, Variant arguments, bool hasRange)  		    : name(std::move(name)),  		      arguments(std::move(arguments)),  		      hasRange(hasRange), @@ -215,25 +200,20 @@ private:  	Tokenizer tokenizer;  	/** -	 * Stack containing the current commands. 
-	 */
-	std::stack<Command> commands;
-
-	/**
-	 * Variant containing the data that has been read (always is a string,
-	 * contains the exact location of the data in the source file).
+	 * TokenizedData instance containing the data that was returned from
+	 * the tokenizer.
 	 */
-	Variant data;
+	TokenizedData data;
 
 	/**
-	 * Contains the location of the last token.
+	 * Stack containing the current commands.
 	 */
-	SourceLocation location;
+	std::stack<Command> commands;
 
 	/**
-	 * Contains the field index of the current command.
+	 * Pointer to the internal OsmlStreamParserImpl instance.
 	 */
-	size_t fieldIdx;
+	std::unique_ptr<OsmlStreamParserImpl> impl;
 
 	/**
 	 * Function used internally to parse an identifier.
@@ -291,12 +271,10 @@
 	/**
 	 * Checks whether there is any data pending to be issued, if yes, issues it.
 	 *
-	 * @param handler is the data handler that contains the data that may be
-	 * returned to the user.
 	 * @return true if there was any data and DATA should be returned by the
 	 * parse function, false otherwise.
 	 */
-	bool checkIssueData(DataHandler &handler);
+	bool checkIssueData();
 
 	/**
 	 * Called before any data is appended to the internal data handler. Checks
@@ -328,6 +306,12 @@ public:
 	OsmlStreamParser(CharReader &reader, Logger &logger);
 
 	/**
+	 * Destructor of the OsmlStreamParser, needed to destroy the forward
+	 * declared (incomplete) OsmlStreamParserImpl.
+	 */
+	~OsmlStreamParser();
+
+	/**
 	 * Continues parsing. Returns one of the states defined in the State enum.
 	 * Callers should stop once the State::END state is reached. Use the getter
 	 * functions to get more information about the current state, such as the
@@ -344,7 +328,19 @@ public:
 	 * @return a reference to a variant containing the data parsed by the
 	 * "parse" function.
 	 */
-	const Variant &getData() const { return data; }
+	const TokenizedData &getData() const { return data; }
+
+	/**
+	 * Returns the complete content of the internal TokenizedData instance as
+	 * a single string Variant. This method is mainly used in the unit tests
+	 * for this class; it simply calls the text() method of TokenizedData.
+	 *
+	 * @param mode is the WhitespaceMode that should be used for returning the
+	 * text.
+	 * @return a string variant containing the text content of the internal
+	 * TokenizedData instance or a nullptr variant if there is no text.
+	 */
+	Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Returns a reference to the internally stored command name. Only valid if
@@ -371,13 +367,6 @@ public:
 	 * syntax).
 	 */
 	bool inDefaultField() const;
-
-	/**
-	 * Returns a reference at the char reader.
-	 *
-	 * @return the last internal token location.
-	 */
-	const SourceLocation &getLocation() const { return location; }
 };
 }
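A sketch of a consuming loop built only from the members visible in this header; State::DATA and State::END are the enum values returned by the implementation above, and everything else is elided:

    OsmlStreamParser parser{reader, logger};   // reader/logger set up elsewhere

    bool done = false;
    while (!done) {
        OsmlStreamParser::State state = parser.parse();
        if (state == OsmlStreamParser::State::DATA) {
            Variant text = parser.getText(WhitespaceMode::COLLAPSE);
            // process the character data ...
        } else if (state == OsmlStreamParser::State::END) {
            done = true;
        }
        // command/field/annotation states elided
    }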
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..855f80d 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,6 @@
 #include <core/common/Variant.hpp>
 #include <core/common/VariantReader.hpp>
 #include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
 
 #include "OsxmlAttributeLocator.hpp"
 #include "OsxmlEventParser.hpp"
@@ -57,17 +56,6 @@ public:
 	std::vector<char> textBuf;
 
 	/**
-	 * Current whitespace buffer (for the trimming whitespace mode)
-	 */
-	std::vector<char> whitespaceBuf;
-
-	/**
-	 * Flag indicating whether a whitespace character was present (for the
-	 * collapsing whitespace mode).
-	 */
-	bool hasWhitespace;
-
-	/**
 	 * Current character data start.
 	 */
 	size_t textStart;
@@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
 	SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
 
 	// Fetch some variables for convenience
-	const WhitespaceMode mode = parser->getWhitespaceMode();
 	OsxmlEventParserData &data = parser->getData();
 	std::vector<char> &textBuf = data.textBuf;
-	std::vector<char> &whitespaceBuf = data.whitespaceBuf;
-	bool &hasWhitespace = data.hasWhitespace;
-	size_t &textStart = data.textStart;
-	size_t &textEnd = data.textEnd;
-
-	size_t pos = loc.getStart();
-	for (size_t i = 0; i < ulen; i++, pos++) {
-		switch (mode) {
-			case WhitespaceMode::PRESERVE:
-				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
-				                                    textStart, textEnd);
-				break;
-			case WhitespaceMode::TRIM:
-				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
-				                                  textStart, textEnd,
-				                                  whitespaceBuf);
-				break;
-			case WhitespaceMode::COLLAPSE:
-				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
-				                                    textStart, textEnd,
-				                                    hasWhitespace);
-				break;
-		}
+
+	// Update start and end position
+	if (textBuf.empty()) {
+		data.textStart = loc.getStart();
 	}
+	data.textEnd = loc.getEnd();
+
+	// Insert the data into the text buffer
+	textBuf.insert(textBuf.end(), &s[0], &s[ulen]);
 }
 
 /* Class OsxmlEvents */
 
@@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {}
 
 /* Class OsxmlEventParser */
 
 OsxmlEventParserData::OsxmlEventParserData()
-    : depth(0),
-      annotationEndTagDepth(-1),
-      hasWhitespace(false),
-      textStart(0),
-      textEnd(0)
+    : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0)
 {
 }
@@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId)
 
 	// Reset the text buffers
 	textBuf.clear();
-	whitespaceBuf.clear();
-	hasWhitespace = false;
 	textStart = 0;
 	textEnd = 0;
@@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
     : reader(reader),
       events(events),
       logger(logger),
-      whitespaceMode(WhitespaceMode::COLLAPSE),
       data(new OsxmlEventParserData())
 {
 }
@@ -532,16 +497,6 @@ void OsxmlEventParser::parse()
 	}
 }
 
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
-	this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
-	return whitespaceMode;
-}
-
 CharReader &OsxmlEventParser::getReader() const { return reader; }
 
 Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..e3fd5d4 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
 #include <memory>
 #include <string>
 
-#include <core/common/Whitespace.hpp>
-
 namespace ousia {
 
 // Forward declarations
@@ -99,13 +97,10 @@ public:
 	virtual void fieldEnd() = 0;
 
 	/**
-	 * Called whenever data is found. Whitespace data is handled as specified
-	 * and the data has been parsed to the specified variant type. This function
-	 * is not called if the parsing failed, the parser prints an error message
-	 * instead.
+	 * Called whenever string data is found.
 	 *
-	 * @param data is the already parsed data that should be passed to the
-	 * handler.
+	 * @param data is a Variant containing the string data that was found in the
+	 * XML file.
 	 */
 	virtual void data(const Variant &data) = 0;
 };
@@ -135,11 +130,6 @@ private:
 	Logger &logger;
 
 	/**
-	 * Current whitespace mode.
-	 */
-	WhitespaceMode whitespaceMode;
-
-	/**
 	 * Data to be used by the internal functions.
 	 */
 	std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +161,6 @@ public:
 	void parse();
 
 	/**
-	 * Sets the whitespace handling mode.
-	 *
-	 * @param whitespaceMode defines how whitespace in the data should be
-	 * handled.
-	 */
-	void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
-	/**
-	 * Returns the current whitespace handling mode.
-	 *
-	 * @return the currently set whitespace handling mode.
-	 */
-	WhitespaceMode getWhitespaceMode() const;
-
-	/**
 	 * Returns the internal CharReader reference.
 	 *
 	 * @return the CharReader reference.
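With the whitespace modes removed, xmlCharacterDataHandler reduces to plain buffering: whitespace is resolved later, when the buffered text is converted into a Variant (and, on the osml side, by TokenizedData's text() method). An illustrative standalone model of that buffering (the struct stands in for OsxmlEventParserData; the helper function is not part of the patch):

    #include <cstddef>
    #include <vector>

    struct Data {                     // stand-in for OsxmlEventParserData
        std::vector<char> textBuf;
        size_t textStart = 0;
        size_t textEnd = 0;
    };

    void appendCharacterData(Data &data, const char *s, size_t len,
                             size_t start, size_t end)
    {
        if (data.textBuf.empty()) {
            data.textStart = start;   // the first chunk fixes the start offset
        }
        data.textEnd = end;           // every chunk advances the end offset
        data.textBuf.insert(data.textBuf.end(), s, s + len);
    }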
