diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/common/CharReader.cpp | 20 | ||||
| -rw-r--r-- | src/core/common/CharReader.hpp | 20 | ||||
| -rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 8 | ||||
| -rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.cpp | 261 | ||||
| -rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.hpp | 144 | 
5 files changed, 347 insertions, 106 deletions
diff --git a/src/core/common/CharReader.cpp b/src/core/common/CharReader.cpp index 4d3638c..3e95280 100644 --- a/src/core/common/CharReader.cpp +++ b/src/core/common/CharReader.cpp @@ -468,15 +468,27 @@ bool CharReader::read(char &c)  	return res;  } +bool CharReader::fetch(char &c) +{ +	return buffer->fetch(readCursor, c); +} + +bool CharReader::fetchPeek(char &c) +{ +	if (coherent) { +		return fetch(c); +	} +	return buffer->fetch(peekCursor, c); +} +  bool CharReader::expect(char c)  { -	char actual = 0; -	peek(actual); -	if (c == actual) { +	char actual; +	if (fetch(actual) && (actual == c)) { +		peek(actual);  		consumePeek();  		return true;  	} -	resetPeek();  	return false;  } diff --git a/src/core/common/CharReader.hpp b/src/core/common/CharReader.hpp index 64c80af..a90d337 100644 --- a/src/core/common/CharReader.hpp +++ b/src/core/common/CharReader.hpp @@ -490,6 +490,26 @@ public:  	bool read(char &c);  	/** +	 * Returns the current character at the read cursor without advancing it. +	 * +	 * @param c is a reference to the character into which the result should be +	 * written. +	 * @return true if the operation was successful, false if the cursor is at +	 * the end of the file. +	 */ +	bool fetch(char &c); + +	/** +	 * Returns the current character at the peek cursor without advancing it. +	 * +	 * @param c is a reference to the character into which the result should be +	 * written. +	 * @return true if the operation was successful, false if the cursor is at +	 * the end of the file. +	 */ +	bool fetchPeek(char &c); + +	/**  	 * Peeks a character, checks whether this character equals the given  	 * character -- and if yes -- consumes the peek, otherwise resets it.  	 * diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index 0b4dd39..0cac2e8 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -85,6 +85,14 @@ struct DynamicToken {  	 * @param type is the id corresponding to the type of the token.  	 */  	DynamicToken(TokenTypeId type) : type(type) {} + +	/** +	 * The getLocation function allows the tokens to be directly passed as +	 * parameter to Logger or LoggableException instances. +	 * +	 * @return a reference at the location field +	 */ +	const SourceLocation &getLocation() const { return location; }  };  /** diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index 4469536..1bff24b 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -25,7 +25,56 @@  namespace ousia { -namespace { +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public DynamicTokenizer { +public: +	/** +	 * Id of the backslash token. +	 */ +	TokenTypeId Backslash; + +	/** +	 * Id of the line comment token. +	 */ +	TokenTypeId LineComment; + +	/** +	 * Id of the block comment start token. +	 */ +	TokenTypeId BlockCommentStart; + +	/** +	 * Id of the block comment end token. +	 */ +	TokenTypeId BlockCommentEnd; + +	/** +	 * Id of the field start token. +	 */ +	TokenTypeId FieldStart; + +	/** +	 * Id of the field end token. +	 */ +	TokenTypeId FieldEnd; + +	/** +	 * Registers the plain format tokens in the internal tokenizer. +	 */ +	PlainFormatTokens() +	{ +		Backslash = registerToken("\\"); +		LineComment = registerToken("%"); +		BlockCommentStart = registerToken("%{"); +		BlockCommentEnd = registerToken("}%"); +		FieldStart = registerToken("{"); +		FieldEnd = registerToken("}"); +	} +}; + +static const PlainFormatTokens Tokens;  /**   * Class used internally to collect data issued via "DATA" event. @@ -110,17 +159,13 @@ public:  		return res;  	}  }; -}  PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,                                                   Logger &logger) -    : reader(reader), logger(logger), fieldIdx(0) +    : reader(reader), logger(logger), tokenizer(Tokens)  { -	tokenBackslash = tokenizer.registerToken("\\"); -	tokenLinebreak = tokenizer.registerToken("\n"); -	tokenLineComment = tokenizer.registerToken("%"); -	tokenBlockCommentStart = tokenizer.registerToken("%{"); -	tokenBlockCommentEnd = tokenizer.registerToken("}%"); +	// Place an intial command representing the complete file on the stack +	commands.push(Command{"", Variant::mapType{}, true, true, true});  }  Variant PlainFormatStreamReader::parseIdentifier(size_t start) @@ -155,7 +200,7 @@ Variant PlainFormatStreamReader::parseIdentifier(size_t start)  void PlainFormatStreamReader::parseCommand(size_t start)  {  	// Parse the commandName as a first identifier -	commandName = parseIdentifier(start); +	Variant commandName = parseIdentifier(start);  	// Check whether the next character is a '#', indicating the start of the  	// command name @@ -169,6 +214,7 @@ void PlainFormatStreamReader::parseCommand(size_t start)  	}  	// Read the arguments (if they are available), otherwise reset them +	Variant commandArguments;  	if (reader.expect('[')) {  		auto res = VariantReader::parseObject(reader, logger, ']');  		commandArguments = res.second; @@ -187,6 +233,13 @@ void PlainFormatStreamReader::parseCommand(size_t start)  			logger.note("Second occurance is here: ", res.first->second);  		}  	} + +	// Place the command on the command stack, remove the last commands if we're +	// not currently inside a field of these commands +	while (!commands.top().inField) { +		commands.pop(); +	} +	commands.push(Command{commandName, commandArguments, false, false, false});  }  void PlainFormatStreamReader::parseBlockComment() @@ -194,13 +247,13 @@ void PlainFormatStreamReader::parseBlockComment()  	DynamicToken token;  	size_t depth = 1;  	while (tokenizer.read(reader, token)) { -		if (token.type == tokenBlockCommentEnd) { +		if (token.type == Tokens.BlockCommentEnd) {  			depth--;  			if (depth == 0) {  				return;  			}  		} -		if (token.type == tokenBlockCommentStart) { +		if (token.type == Tokens.BlockCommentStart) {  			depth++;  		}  	} @@ -212,7 +265,6 @@ void PlainFormatStreamReader::parseBlockComment()  void PlainFormatStreamReader::parseLineComment()  {  	char c; -	reader.consumePeek();  	while (reader.read(c)) {  		if (c == '\n') {  			return; @@ -220,78 +272,171 @@ void PlainFormatStreamReader::parseLineComment()  	}  } -PlainFormatStreamReader::State PlainFormatStreamReader::parse() +bool PlainFormatStreamReader::checkIssueData(DataHandler &handler)  { -// Macro (sorry for that) used for checking whether there is data to issue, and -// if yes, aborting the loop, allowing for a reentry on a later parse call by -// resetting the peek cursor -#define CHECK_ISSUE_DATA()            \ -	{                                 \ -		if (!dataHandler.isEmpty()) { \ -			reader.resetPeek();       \ -			abort = true;             \ -			break;                    \ -		}                             \ +	if (!handler.isEmpty()) { +		data = handler.toVariant(reader.getSourceId()); +		location = data.getLocation(); +		reader.resetPeek(); +		return true;  	} +	return false; +} -	// Handler for incomming data -	DataHandler dataHandler; +bool PlainFormatStreamReader::checkIssueFieldStart() +{ +	// Fetch the current command, and check whether we're currently inside a +	// field of this command +	Command &cmd = commands.top(); +	if (!cmd.inField) { +		// If this is a range command, we're now implicitly inside the field of +		// this command -- we'll have to issue a field start command! +		if (cmd.hasRange) { +			cmd.inField = true; +			reader.resetPeek(); +			return true; +		} -	// Variable set to true if the parser loop should be left -	bool abort = false; +		// This was not a range command, so obviously we're now inside within +		// a field of some command -- so unroll the commands stack until a +		// command with open field is reached +		while (!commands.top().inField) { +			commands.pop(); +		} +	} +	return false; +} + +PlainFormatStreamReader::State PlainFormatStreamReader::parse() +{ +	// Handler for incomming data +	DataHandler handler;  	// Read tokens until the outer loop should be left  	DynamicToken token; -	while (!abort && tokenizer.peek(reader, token)) { -		// Check whether this backslash just escaped some special or -		// whitespace character or was the beginning of a command -		if (token.type == tokenBackslash) { -			// Check whether this character could be the start of a command +	while (tokenizer.peek(reader, token)) { +		const TokenTypeId type = token.type; + +		// Special handling for Backslash and Text +		if (type == Tokens.Backslash) { +			// Check whether a command starts now, without advancing the peek +			// cursor  			char c; -			reader.consumePeek(); -			reader.peek(c); +			if (!reader.fetchPeek(c)) { +				logger.error("Trailing backslash at the end of the file.", +				             token); +				return State::END; +			} + +			// Try to parse a command  			if (Utils::isIdentifierStartCharacter(c)) { -				CHECK_ISSUE_DATA(); -				reader.resetPeek();  				parseCommand(token.location.getStart()); +				if (checkIssueData(handler)) { +					return State::DATA; +				} +				location = commands.top().name.getLocation();  				return State::COMMAND;  			} +			// Before appending anything to the output data, check whether +			// FIELD_START has to be issued, as the current command is a command +			// with range +			if (checkIssueFieldStart()) { +				location = token.location; +				return State::FIELD_START; +			} +  			// This was not a special character, just append the given character  			// to the data buffer, use the escape character start as start  			// location and the peek offset as end location -			dataHandler.append(c, token.location.getStart(), -			                   reader.getPeekOffset()); -		} else if (token.type == tokenLineComment) { -			CHECK_ISSUE_DATA(); -			reader.consumePeek(); -			parseLineComment(); -		} else if (token.type == tokenBlockCommentStart) { -			CHECK_ISSUE_DATA(); +			reader.peek(c);  // Peek the previously fetched character +			handler.append(c, token.location.getStart(), +			               reader.getPeekOffset());  			reader.consumePeek(); -			parseBlockComment(); -		} else if (token.type == tokenLinebreak) { -			CHECK_ISSUE_DATA(); +			continue; +		} else if (type == TextToken) { +			// Check whether FIELD_START has to be issued before appending text +			if (checkIssueFieldStart()) { +				location = token.location; +				return State::FIELD_START; +			} + +			// Append the text to the data handler +			handler.append(token.content, token.location.getStart(), +			               token.location.getEnd()); +  			reader.consumePeek(); -			return State::LINEBREAK; -		} else if (token.type == TextToken) { -			dataHandler.append(token.content, token.location.getStart(), -			                   token.location.getEnd()); +			continue;  		} -		// Consume the peeked character if we did not abort, otherwise abort -		if (!abort) { -			reader.consumePeek(); +		// A non-text token was reached, make sure all pending data commands +		// have been issued +		if (checkIssueData(handler)) { +			return State::DATA; +		} + +		// We will handle the token now, consume the peeked characters +		reader.consumePeek(); + +		// Update the location to the current token location +		location = token.location; + +		if (token.type == Tokens.LineComment) { +			parseLineComment(); +		} else if (token.type == Tokens.BlockCommentStart) { +			parseBlockComment(); +		} else if (token.type == Tokens.FieldStart) { +			Command &cmd = commands.top(); +			if (!cmd.inField) { +				cmd.inField = true; +				return State::FIELD_START; +			} +			logger.error( +			    "Got field start token \"{\", but no command for which to " +			    "start the field. Did you mean to write \"\\{\"?", +			    token); +		} else if (token.type == Tokens.FieldEnd) { +			// Try to end an open field of the current command -- if the current +			// command is not inside an open field, end this command and try to +			// close the next one +			for (int i = 0; i < 2 && commands.size() > 1; i++) { +				Command &cmd = commands.top(); +				if (!cmd.inRangeField) { +					if (cmd.inField) { +						cmd.inField = false; +						return State::FIELD_END; +					} +					commands.pop(); +				} else { +					break; +				} +			} +			logger.error( +			    "Got field end token \"}\" but there is no field to end. Did you " +			    "mean to write \"\\}\"?", +			    token); +		} else { +			logger.error("Unexpected token \"" + token.content + "\"", token);  		}  	} -	// Send out pending output data, otherwise we are at the end of the stream -	if (!dataHandler.isEmpty()) { -		data = dataHandler.toVariant(reader.getSourceId()); +	// Issue available data +	if (checkIssueData(handler)) {  		return State::DATA;  	} + +	location = SourceLocation{reader.getSourceId(), reader.getOffset()};  	return State::END; -#undef CHECK_ISSUE_DATA +} + +const Variant &PlainFormatStreamReader::getCommandName() +{ +	return commands.top().name; +} + +const Variant &PlainFormatStreamReader::getCommandArguments() +{ +	return commands.top().arguments;  }  } diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp index 737bbe8..4a11b8e 100644 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ b/src/plugins/plain/PlainFormatStreamReader.hpp @@ -16,9 +16,6 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ -#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ -#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ -  /**   * @file PlainFormatStreamReader.hpp   * @@ -29,6 +26,11 @@   * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)   */ +#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ +#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ + +#include <stack> +  #include <core/common/Variant.hpp>  #include "DynamicTokenizer.hpp" @@ -38,6 +40,7 @@ namespace ousia {  // Forward declarations  class CharReader;  class Logger; +class DataHandler;  /**   * The PlainFormatStreamReader class provides a low-level reader for the plain @@ -69,11 +72,6 @@ public:  		DATA,  		/** -		 * State returned if a linebreak has been reached (outside of comments). -		 */ -		LINEBREAK, - -		/**  	     * A user-defined entity has been found. The entity sequence is stored  	     * in the command name.  	     */ @@ -112,6 +110,66 @@ public:  		END  	}; +	/** +	 * Entry used for the command stack. +	 */ +	struct Command { +		/** +		 * Name and location of the current command. +		 */ +		Variant name; + +		/** +		 * Arguments that were passed to the command. +		 */ +		Variant arguments; + +		/** +		 * Set to true if this is a command with clear begin and end. +		 */ +		bool hasRange; + +		/** +		 * Set to true if we are currently inside a field of this command. +		 */ +		bool inField; + +		/** +		 * Set to true if we are currently in the range field of the command +		 * (implies inField being set to true). +		 */ +		bool inRangeField; + +		/** +		 * Default constructor. +		 */ +		Command() : hasRange(false), inField(false), inRangeField(false) {} + +		/** +		 * Constructor of the Command class. +		 * +		 * @param name is a string variant with name and location of the +		 * command. +		 * @param arguments is a map variant with the arguments given to the +		 * command. +		 * @param hasRange should be set to true if this is a command with +		 * explicit range. +		 * @param inField is set to true if we currently are inside a field +		 * of this command. +		 * @param inRangeField is set to true if we currently inside the outer +		 * field of the command. +		 */ +		Command(const Variant &name, const Variant &arguments, bool hasRange, +		        bool inField, bool inRangeField) +		    : name(name), +		      arguments(arguments), +		      hasRange(hasRange), +		      inField(inField), +		      inRangeField(inRangeField) +		{ +		} +	}; +  private:  	/**  	 * Reference to the CharReader instance from which the incomming bytes are @@ -130,16 +188,9 @@ private:  	DynamicTokenizer tokenizer;  	/** -	 * Variant containing the current command name (always is a string variant, -	 * but additionally contains the correct locatino of the name). -	 */ -	Variant commandName; - -	/** -	 * Variant containing the command arguments (always is a map or array -	 * variant, but additionally contains the source location of the arguments). +	 * Stack containing the current commands.  	 */ -	Variant commandArguments; +	std::stack<Command> commands;  	/**  	 * Variant containing the data that has been read (always is a string, @@ -148,29 +199,9 @@ private:  	Variant data;  	/** -	 * Id of the backslash token. -	 */ -	TokenTypeId tokenBackslash; - -	/** -	 * Id of the linebreak token. +	 * Contains the location of the last token.  	 */ -	TokenTypeId tokenLinebreak; - -	/** -	 * Id of the line comment token. -	 */ -	TokenTypeId tokenLineComment; - -	/** -	 * Id of the block comment start token. -	 */ -	TokenTypeId tokenBlockCommentStart; - -	/** -	 * If of the block comment end token. -	 */ -	TokenTypeId tokenBlockCommentEnd; +	SourceLocation location;  	/**  	 * Contains the field index of the current command. @@ -189,7 +220,7 @@ private:  	 * Function used internally to parse a command.  	 *  	 * @param start is the start byte offset of the command (including the -	 * backslash). +	 * backslash)  	 */  	void parseCommand(size_t start); @@ -203,6 +234,24 @@ private:  	 */  	void parseLineComment(); +	/** +	 * Checks whether there is any data pending to be issued, if yes, issues it. +	 * +	 * @param handler is the data handler that contains the data that may be +	 * returned to the user. +	 * @return true if there was any data and DATA should be returned by the +	 * parse function, false otherwise. +	 */ +	bool checkIssueData(DataHandler &handler); + +	/** +	 * Called before any data is appended to the internal data handler. Checks +	 * whether a new field should be started or implicitly ended. +	 * +	 * @return true if FIELD_START should be returned by the parse function. +	 */ +	bool checkIssueFieldStart(); +  public:  	/**  	 * Constructor of the PlainFormatStreamReader class. Attaches the new @@ -224,14 +273,14 @@ public:  	 */  	State parse(); -	/**  +	/**  	 * Returns a reference at the internally stored data. Only valid if  	 * State::DATA was returned by the "parse" function.  	 *  	 * @return a reference at a variant containing the data parsed by the  	 * "parse" function.  	 */ -	const Variant& getData() {return data;} +	const Variant &getData() { return data; }  	/**  	 * Returns a reference at the internally stored command name. Only valid if @@ -240,7 +289,7 @@ public:  	 * @return a reference at a variant containing name and location of the  	 * parsed command.  	 */ -	const Variant& getCommandName() {return commandName;} +	const Variant &getCommandName();  	/**  	 * Returns a reference at the internally stored command name. Only valid if @@ -249,7 +298,14 @@ public:  	 * @return a reference at a variant containing arguments given to the  	 * command.  	 */ -	const Variant& getCommandArguments() {return commandArguments;} +	const Variant &getCommandArguments(); + +	/** +	 * Returns a reference at the char reader. +	 * +	 * @return the last internal token location. +	 */ +	SourceLocation &getLocation() {return location;}  };  }  | 
