Ported PlainFormatStreamReader to DynamicTokenizer

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-08 18:49:02 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-08 18:49:02 +0100
commit: 51f09f4faa7cd4b6a0576758881d322e31e896ba (patch)
tree: 74660d05494ed41a788fcb5d6c7efd8a5407d57c /src/plugins
parent: f066b4887f6f2896fe602f14ede9c02a9f5a7e1a (diff)
2 files changed, 116 insertions, 197 deletions
diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp
index 15ca403..f0721a0 100644
--- a/src/plugins/plain/PlainFormatStreamReader.cpp
+++ b/src/plugins/plain/PlainFormatStreamReader.cpp
@@ -16,9 +16,6 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include <sstream>
-#include <unordered_set>
-
 #include <core/common/CharReader.hpp>
 #include <core/common/Logger.hpp>
 #include <core/common/Utils.hpp>
@@ -27,123 +24,40 @@
 
 namespace ousia {
 
-/* Internally used types, protected from spilling the exports by a namespace */
-
 namespace {
-/**
- * Enum used to specify the state of the parseBlockComment state machine.
- */
-enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT };
-
-/**
- * Class taking care of recording plain text data found withing the file.
- */
-class DataHandler {
-private:
-	/**
-	 * Const reference at the reader, used for reading the current location.
-	 */
-	const CharReader &reader;
-
-	/**
-	 * Flag defining whether whitespaces should be preserved or not.
-	 */
-	const bool preserveWhitespaces;
+struct DataHandler {
+	std::vector<char> buf;
 
-	/**
-	 * Current source range of the data in the buffer.
-	 */
-	SourceLocation location;
+	SourceOffset start;
+	SourceOffset end;
 
-	/**
-	 * Current buffer containing all read characters.
-	 */
-	std::stringstream buffer;
+	DataHandler() : start(0), end(0) {}
 
-	/**
-	 * Set to false, once a non-whitespace character was reached.
-	 */
-	bool empty;
+	bool isEmpty() { return buf.empty(); }
 
-	/**
-	 * Set to true if a whitespace was found -- these are normalized to a single
-	 * space.
-	 */
-	bool hasWhitespace;
-
-public:
-	/**
-	 * Constructor of the DataHandler class.
-	 *
-	 * @param reader is the CharReader that should be used for reading the data
-	 * location.
-	 * @param preserveWhitespaces should be set to true if all whitespaces
-	 * should be preserved (for preformated environments).
-	 */
-	DataHandler(const CharReader &reader, bool preserveWhitespaces = false)
-	    : reader(reader),
-	      preserveWhitespaces(preserveWhitespaces),
-	      location(reader.getSourceId()),
-	      empty(true),
-	      hasWhitespace(false)
+	void append(char c, SourceOffset charStart, SourceOffset charEnd)
 	{
+		if (isEmpty()) {
+			start = charStart;
+		}
+		buf.push_back(c);
+		end = charEnd;
 	}
 
-	/**
-	 * Appends the given character to the internal buffer.
-	 *
-	 * @param c is the character that should be appended.
-	 * @param wasEscaped is set to true if the character was escaped (prepended
-	 * with a backslash), this allows whitespace characters to be explicitly
-	 * included.
-	 */
-	void append(char c, bool wasEscaped = false)
+	void append(const std::string &s, SourceOffset stringStart,
+	            SourceOffset stringEnd)
 	{
-		// Check whether the character is a whitespace
-		const bool isWhitespace =
-		    !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c);
-
-		// Trim leading and trailing whitespaces
-		if (isWhitespace) {
-			if (!empty) {
-				hasWhitespace = true;
-			}
-		} else {
-			// Compress whitespaces to a single space
-			if (hasWhitespace) {
-				buffer << ' ';
-				hasWhitespace = false;
-			}
-
-			// Append the character
-			buffer << c;
-
-			// Update the "empty" flag and set the start and end offset
-			if (empty) {
-				location.setStart(reader.getOffset());
-				empty = false;
-			}
-			location.setEnd(reader.getPeekOffset());
+		if (isEmpty()) {
+			start = stringStart;
 		}
+		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
+		end = stringEnd;
 	}
 
-	/**
-	 * Returns true if no non-whitespace character has been found until now.
-	 *
-	 * @return true if the internal buffer is still empty.
-	 */
-	bool isEmpty() { return empty; }
-
-	/**
-	 * Returns a variant containg the read data and its location.
-	 *
-	 * @return a variant with a string value containing the read data and the
-	 * location being set to
-	 */
-	Variant getData()
+	Variant toVariant(SourceId sourceId)
 	{
-		Variant res = Variant::fromString(buffer.str());
-		res.setLocation(location);
+		Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
+		res.setLocation({sourceId, start, end});
 		return res;
 	}
 };
@@ -153,35 +67,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,
                                                  Logger &logger)
     : reader(reader), logger(logger), fieldIdx(0)
 {
+	tokenBackslash = tokenizer.registerToken("\\");
+	tokenLinebreak = tokenizer.registerToken("\n");
+	tokenLineComment = tokenizer.registerToken("%");
+	tokenBlockCommentStart = tokenizer.registerToken("%{");
+	tokenBlockCommentEnd = tokenizer.registerToken("}%");
 }
 
-/* Comment handling */
-
 void PlainFormatStreamReader::parseBlockComment()
 {
-	char c;
-	BlockCommentState state = BlockCommentState::DEFAULT;
-	while (reader.read(c)) {
-		switch (state) {
-			case BlockCommentState::DEFAULT:
-				if (c == '%') {
-					state = BlockCommentState::HAS_PERCENT;
-				} else if (c == '}') {
-					state = BlockCommentState::HAS_CURLY_CLOSE;
-				}
-				break;
-			case BlockCommentState::HAS_PERCENT:
-				if (c == '{') {
-					parseBlockComment();
-				}
-				state = BlockCommentState::DEFAULT;
-				break;
-			case BlockCommentState::HAS_CURLY_CLOSE:
-				if (c == '%') {
-					return;
-				}
-				state = BlockCommentState::DEFAULT;
-				break;
+	DynamicToken token;
+	size_t depth = 1;
+	while (tokenizer.read(reader, token)) {
+		if (token.type == tokenBlockCommentEnd) {
+			depth--;
+			if (depth == 0) {
+				return;
+			}
+		}
+		if (token.type == tokenBlockCommentStart) {
+			depth++;
 		}
 	}
 
@@ -189,102 +94,84 @@ void PlainFormatStreamReader::parseBlockComment()
 	logger.error("File ended while being in a block comment", reader);
 }
 
-void PlainFormatStreamReader::parseComment()
+void PlainFormatStreamReader::parseLineComment()
 {
 	char c;
-	bool first = true;
 	reader.consumePeek();
 	while (reader.read(c)) {
-		// Continue parsing a block comment if a '{' is found
-		if (c == '{' && first) {
-			parseBlockComment();
-			return;
-		}
 		if (c == '\n') {
 			return;
 		}
-		first = false;
 	}
 }
 
-/* Top level parse function */
-
-static const std::unordered_set<char> EscapeableCharacters{'\\', '<', '>',
-                                                    '{',  '}', '%'};
-
 PlainFormatStreamReader::State PlainFormatStreamReader::parse()
 {
 // Macro (sorry for that) used for checking whether there is data to issue, and
 // if yes, aborting the loop, allowing for a reentry on a later parse call by
 // resetting the peek cursor
-#define CHECK_ISSUE_DATA()      \
-	{                           \
-		if (!dataHandler.isEmpty()) {   \
-			reader.resetPeek(); \
-			abort = true;       \
-			break;              \
-		}                       \
+#define CHECK_ISSUE_DATA()            \
+	{                                 \
+		if (!dataHandler.isEmpty()) { \
+			reader.resetPeek();       \
+			abort = true;             \
+			break;                    \
+		}                             \
 	}
 
-	// Data handler
-	DataHandler dataHandler(reader);
+	// Handler for incoming data
+	DataHandler dataHandler;
 
 	// Variable set to true if the parser loop should be left
 	bool abort = false;
 
-	// Happily add characters to the dataHandler and handle escaping until a
-	// special character is reached. Then go to a specialiced parsing routine
-	char c;
-	while (!abort && reader.peek(c)) {
-		switch (c) {
-			case '\\':
-				reader.peek(c);
-				// Check whether this backslash just escaped some special or
-				// whitespace character or was the beginning of a command
-				if (EscapeableCharacters.count(c) == 0 &&
-				    !Utils::isWhitespace(c)) {
-					CHECK_ISSUE_DATA();
-					// TODO: Parse command (starting from the backslash)
-					return State::COMMAND;
-				}
-				// A character was escaped, add it to the buffer, with the
-				// wasEscaped flag set to true
-				dataHandler.append(c, true);
-				break;
-			case '<':
-				// TODO: Annotations
-				break;
-			case '>':
-				// TODO: Annotations
-				break;
-			case '{':
-				// TODO: Issue start of field
-				break;
-			case '}':
-			// TODO: Issue end of field
-			case '%':
-				CHECK_ISSUE_DATA();
-				parseComment();
-				break;
-			case '\n':
+	// Read tokens until the outer loop should be left
+	DynamicToken token;
+	while (!abort && tokenizer.peek(reader, token)) {
+		// Check whether this backslash just escaped some special or
+		// whitespace character or was the beginning of a command
+		if (token.type == tokenBackslash) {
+			// Check whether this character could be the start of a command
+			char c;
+			reader.consumePeek();
+			reader.peek(c);
+			if (Utils::isIdentifierStart(c)) {
 				CHECK_ISSUE_DATA();
-				reader.consumePeek();
-				return State::LINEBREAK;
-			default:
-				dataHandler.append(c, false);
+				// TODO: Parse a command
+				return State::COMMAND;
+			}
+
+			// This was not a special character, just append the given character
+			// to the data buffer, use the escape character start as start
+			// location and the peek offset as end location
+			dataHandler.append(c, token.location.getStart(),
+			                   reader.getPeekOffset());
+		} else if (token.type == tokenLineComment) {
+			CHECK_ISSUE_DATA();
+			reader.consumePeek();
+			parseLineComment();
+		} else if (token.type == tokenBlockCommentStart) {
+			CHECK_ISSUE_DATA();
+			reader.consumePeek();
+			parseBlockComment();
+		} else if (token.type == tokenLinebreak) {
+			CHECK_ISSUE_DATA();
+			reader.consumePeek();
+			return State::LINEBREAK;
+		} else if (token.type == TextToken) {
+			dataHandler.append(token.content, token.location.getStart(),
+			                   token.location.getEnd());
 		}
 
 		// Consume the peeked character if we did not abort, otherwise abort
 		if (!abort) {
 			reader.consumePeek();
-		} else {
-			break;
 		}
 	}
 
 	// Send out pending output data, otherwise we are at the end of the stream
 	if (!dataHandler.isEmpty()) {
-		data = dataHandler.getData();
+		data = dataHandler.toVariant(reader.getSourceId());
 		return State::DATA;
 	}
 	return State::END;
diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp
index 1a136cd..b2ea378 100644
--- a/src/plugins/plain/PlainFormatStreamReader.hpp
+++ b/src/plugins/plain/PlainFormatStreamReader.hpp
@@ -31,6 +31,8 @@
 
 #include <core/common/Variant.hpp>
 
+#include "DynamicTokenizer.hpp"
+
 namespace ousia {
 
 // Forward declarations
@@ -123,6 +125,11 @@ private:
 	Logger &logger;
 
 	/**
+	 * Tokenizer instance used to read individual tokens from the text.
+	 */
+	DynamicTokenizer tokenizer;
+
+	/**
 	 * Variant containing the current command name (always is a string variant,
 	 * but additionally contains the correct locatino of the name).
 	 */
@@ -141,6 +148,31 @@ private:
 	Variant data;
 
 	/**
+	 * Id of the backslash token.
+	 */
+	TokenTypeId tokenBackslash;
+
+	/**
+	 * Id of the linebreak token.
+	 */
+	TokenTypeId tokenLinebreak;
+
+	/**
+	 * Id of the line comment token.
+	 */
+	TokenTypeId tokenLineComment;
+
+	/**
+	 * Id of the block comment start token.
+	 */
+	TokenTypeId tokenBlockCommentStart;
+
+	/**
+	 * Id of the block comment end token.
+	 */
+	TokenTypeId tokenBlockCommentEnd;
+
+	/**
 	 * Contains the field index of the current command.
 	 */
 	size_t fieldIdx;
@@ -153,7 +185,7 @@ private:
 	/**
 	 * Function used internally to parse a generic comment.
 	 */
-	void parseComment();
+	void parseLineComment();
 
 public:
 	/**
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-08 18:49:02 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-08 18:49:02 +0100
commit	51f09f4faa7cd4b6a0576758881d322e31e896ba (patch)
tree	74660d05494ed41a788fcb5d6c7efd8a5407d57c /src/plugins
parent	f066b4887f6f2896fe602f14ede9c02a9f5a7e1a (diff)