implemented context function, increased performance of read function

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-12-11 00:22:39 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-12-11 00:22:39 +0100
commit: 2990d12ccca8ddbf0761cf84ce29f38de9f3262c (patch)
tree: 58c492a082cce5cfdf7a5bb57b954a608c9b9d14 /src/core
parent: f053b48f925cf65aaf6ca937f89dacf59196c719 (diff)
2 files changed, 226 insertions, 91 deletions
diff --git a/src/core/utils/CharReader.cpp b/src/core/utils/CharReader.cpp
index 12d0043..effc587 100644
--- a/src/core/utils/CharReader.cpp
+++ b/src/core/utils/CharReader.cpp
@@ -18,6 +18,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <sstream>
 
 #include <core/Utils.hpp>
 
@@ -326,7 +327,7 @@ bool Buffer::atEnd(Buffer::CursorId cursor) const
 	       (c.bucket == endBucket && c.bucketOffs == endBucket->size());
 }
 
-bool Buffer::read(Buffer::CursorId cursor, char &c)
+bool Buffer::fetchCharacter(CursorId cursor, char &c, bool incr)
 {
 	Cursor &cur = cursors[cursor];
 	while (true) {
@@ -336,7 +337,9 @@ bool Buffer::read(Buffer::CursorId cursor, char &c)
 		// If there is still data in the current bucket, return this data
 		if (cur.bucketOffs < bucket.size()) {
 			c = bucket[cur.bucketOffs];
-			cur.bucketOffs++;
+			if (incr) {
+				cur.bucketOffs++;
+			}
 			return true;
 		} else if (cur.bucket == endBucket) {
 			// Return false if the end of the stream has been reached, otherwise
@@ -354,6 +357,16 @@ bool Buffer::read(Buffer::CursorId cursor, char &c)
 	}
 }
 
+bool Buffer::read(Buffer::CursorId cursor, char &c)
+{
+	return fetchCharacter(cursor, c, true);
+}
+
+bool Buffer::fetch(CursorId cursor, char &c)
+{
+	return fetchCharacter(cursor, c, false);
+}
+
 /* CharReader::Cursor class */
 
 void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer,
@@ -365,8 +378,6 @@ void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer,
 	// Copy the state
 	line = cursor.line;
 	column = cursor.column;
-	state = cursor.state;
-	lastLinebreak = cursor.lastLinebreak;
 }
 
 /* CharReader class */
@@ -396,73 +407,40 @@ CharReader::~CharReader()
 	buffer->deleteCursor(peekCursor.cursor);
 }
 
-bool CharReader::substituteLinebreaks(Cursor &cursor, char &c)
-{
-	if (c == '\n' || c == '\r') {
-		switch (cursor.state) {
-			case LinebreakState::NONE:
-				// We got a first linebreak character -- output a '\n'
-				if (c == '\n') {
-					cursor.state = LinebreakState::HAS_LF;
-				} else {
-					cursor.state = LinebreakState::HAS_CR;
-				}
-				c = '\n';
-				return true;
-			case LinebreakState::HAS_LF:
-				// If a LF is followed by a LF, output a new linefeed
-				if (c == '\n') {
-					cursor.state = LinebreakState::HAS_LF;
-					return true;
-				}
-
-				// Otherwise, don't handle this character (part of "\n\r")
-				cursor.state = LinebreakState::NONE;
-				return false;
-			case LinebreakState::HAS_CR:
-				// If a CR is followed by a CR, output a new linefeed
-				if (c == '\r') {
-					cursor.state = LinebreakState::HAS_CR;
-					c = '\n';
-					return true;
-				}
-
-				// Otherwise, don't handle this character (part of "\r\n")
-				cursor.state = LinebreakState::NONE;
-				return false;
-		}
-	}
-
-	// No linebreak character, reset the linebreak state
-	cursor.state = LinebreakState::NONE;
-	return true;
-}
-
 bool CharReader::readAtCursor(Cursor &cursor, char &c)
 {
-	while (true) {
-		// Return false if we're at the end of the stream
-		if (!buffer->read(cursor.cursor, c)) {
-			return false;
-		}
+	// Return false if we're at the end of the stream
+	if (!buffer->read(cursor.cursor, c)) {
+		return false;
+	}
 
-		// Substitute linebreak characters with a single '\n'
-		if (substituteLinebreaks(cursor, c)) {
-			if (c == '\n') {
-				// A linebreak was reached, go to the next line
-				cursor.line++;
-				cursor.column = 1;
-				cursor.lastLinebreak = buffer->offset(cursor.cursor);
-			} else {
-				// Ignore UTF-8 continuation bytes
-				if (!((c & 0x80) && !(c & 0x40))) {
-					cursor.column++;
-				}
+	// Substitute linebreak sequences with a single '\n'
+	if (c == '\n' || c == '\r') {
+		// Output a single \n
+		c = '\n';
+
+		// Check whether the next character is a continuation of the
+		// current character
+		char c2;
+		if (buffer->read(cursor.cursor, c2)) {
+			if ((c2 != '\n' && c2 != '\r') || c2 == c) {
+				buffer->moveCursor(cursor.cursor, -1);
 			}
+		}
+	}
 
-			return true;
+	// Count lines and columns
+	if (c == '\n') {
+		// A linebreak was reached, go to the next line
+		cursor.line++;
+		cursor.column = 1;
+	} else {
+		// Ignore UTF-8 continuation bytes
+		if (!((c & 0x80) && !(c & 0x40))) {
+			cursor.column++;
 		}
 	}
+	return true;
 }
 
 bool CharReader::peek(char &c)
@@ -529,6 +507,106 @@ CharReaderFork CharReader::fork()
 	return CharReaderFork(buffer, readCursor, peekCursor, coherent);
 }
 
+CharReader::Context CharReader::getContext(ssize_t maxSize)
+{
+	// Clone the current read cursor
+	Buffer::CursorId cur = buffer->createCursor(readCursor.cursor);
+
+	// Fetch the start position of the search
+	ssize_t offs = buffer->offset(cur);
+	ssize_t start = offs;
+	ssize_t end = offs;
+	char c;
+
+	// Search the beginning of the line with the last non-whitespace character
+	bool hadNonWhitespace = false;
+	bool foundBegin = false;
+	for (ssize_t i = 0; i < maxSize; i++) {
+		// Fetch the character at the current position
+		if (buffer->fetch(cur, c)) {
+			// Abort at a linebreak once a non-whitespace character has been seen
+			if (hadNonWhitespace && (c == '\n' || c == '\r')) {
+				buffer->moveCursor(cur, 1);
+				start++;
+				foundBegin = true;
+				break;
+			}
+		}
+		if (buffer->moveCursor(cur, -1) == 0) {
+			foundBegin = true;
+			break;
+		}
+
+		// Update the start position and the hadNonWhitespace flag
+		hadNonWhitespace = hadNonWhitespace || !Utils::isWhitespace(c);
+		start--;
+	}
+
+	// Search the end of the line
+	buffer->moveCursor(cur, offs - start);
+	bool foundEnd = false;
+	for (ssize_t i = 0; i < maxSize; i++) {
+		// Increment the end counter if a character was read, abort if the end
+		// of the stream has been reached
+		if (buffer->read(cur, c)) {
+			end++;
+		} else {
+			foundEnd = true;
+			break;
+		}
+
+		// Abort on linebreak characters
+		if (c == '\n' || c == '\r') {
+			foundEnd = true;
+			break;
+		}
+	}
+
+	// Calculate the truncated start and end position and limit the number of
+	// characters to the maximum number of characters
+	ssize_t tStart = start;
+	ssize_t tEnd = end;
+	if (tEnd - tStart > maxSize) {
+		tStart = std::max(offs - maxSize / 2, tStart);
+		tEnd = tStart + maxSize;
+	}
+
+	// Try to go to the calculated start position and fetch the actual start
+	// position
+	ssize_t aStart = end + buffer->moveCursor(cur, tStart - end);
+	if (aStart > tStart) {
+		tEnd = tEnd + (aStart - tStart);
+		tStart = aStart;
+	}
+
+	// Read one line
+	std::stringstream ss;
+	size_t relPos = 0;
+	for (ssize_t i = tStart; i < tEnd; i++) {
+		if (buffer->read(cur, c)) {
+			// Break once a linebreak is reached
+			if (c == '\n' || c == '\r') {
+				break;
+			}
+
+			// Add the current character to the output
+			ss << c;
+
+			// Increment the string-relative offset as long as the original
+			// offset is not reached in the for loop
+			if (i < offs) {
+				relPos++;
+			}
+		}
+	}
+
+	// Delete the newly created cursor
+	buffer->deleteCursor(cur);
+
+	return CharReader::Context{ss.str(), relPos, !foundBegin || tStart != start,
+	                           !foundEnd || tEnd != end};
+}
+
 /* Class CharReaderFork */
 
 CharReaderFork::CharReaderFork(std::shared_ptr<Buffer> buffer,
diff --git a/src/core/utils/CharReader.hpp b/src/core/utils/CharReader.hpp
index 3d4c894..5daa21d 100644
--- a/src/core/utils/CharReader.hpp
+++ b/src/core/utils/CharReader.hpp
@@ -206,6 +206,12 @@ private:
 	 */
 	size_t moveBackward(CursorId cursor, size_t relativeOffs);
 
+	/**
+	 * Reads a character from the current cursor position and optionally
+	 * advances.
+	 */
+	bool fetchCharacter(CursorId cursor, char &c, bool incr);
+
 public:
 	/**
 	 * Intializes the Buffer with a reference to a ReadCallback that is used
@@ -311,7 +317,8 @@ public:
 	bool atEnd(CursorId cursor) const;
 
 	/**
-	 * Reads a single character from the ring buffer from the given cursor.
+	 * Reads a single character from the ring buffer from the given cursor and
+	 * moves to the next character.
 	 *
 	 * @param cursor specifies the cursor from which the data should be read.
 	 * The cursor will be advanced by one byte.
@@ -320,6 +327,18 @@ public:
 	 * been reached.
 	 */
 	bool read(CursorId cursor, char &c);
+
+	/**
+	 * Returns a single character from the ring buffer from the current cursor
+	 * position and stays at that position.
+	 *
+	 * @param cursor specifies the cursor from which the data should be read.
+	 * The cursor is not advanced by this call.
+	 * @param c is the character into which the data needs to be read.
+	 * @return true if a character could be fetched, false if the end of the
+	 * stream has been reached.
+	 */
+	bool fetch(CursorId cursor, char &c);
 };
 
 // Forward declaration
@@ -333,13 +352,53 @@ class CharReaderFork;
  * of linebreaks and converts these to a single '\n'.
  */
 class CharReader {
-protected:
+public:
 	/**
-	 * Enum to represent the current state of the internal state machine that
-	 * replaces the linebreaks from multiple platforms to a single '\n'.
+	 * The context struct is used to represent the current context the char
+	 * reader is in. This context can for example be used when building error
+	 * messages.
 	 */
-	enum class LinebreakState { NONE, HAS_LF, HAS_CR };
+	struct Context {
+		/**
+		 * Set to the content of the current line.
+		 */
+		std::string line;
+
+		/**
+		 * Relative position (in characters) within that line.
+		 */
+		size_t relPos;
+
+		/**
+		 * Set to true if the beginning of the line has been truncated (because
+		 * the reader position is too far away from the actual position of the
+		 * line).
+		 */
+		bool truncatedStart;
+
+		/**
+		 * Set to true if the end of the line has been truncated (because the
+		 * reader position is too far away from the actual end position of the
+		 * line).
+		 */
+		bool truncatedEnd;
+
+		Context()
+		    : line(), relPos(0), truncatedStart(false), truncatedEnd(false)
+		{
+		}
 
+		Context(std::string line, size_t relPos, bool truncatedStart,
+		        bool truncatedEnd)
+		    : line(std::move(line)),
+		      relPos(relPos),
+		      truncatedStart(truncatedStart),
+		      truncatedEnd(truncatedEnd)
+		{
+		}
+	};
+
+protected:
 	/**
 	 * Internally used cursor structure for managing the read and the peek
 	 * cursor.
@@ -353,24 +412,12 @@ protected:
 		/**
 		 * Current line the cursor is in.
 		 */
-		size_t line;
+		uint32_t line;
 
 		/**
 		 * Current column the cursor is in.
 		 */
-		size_t column;
-
-		/**
-		 * State of the linebreak replacement statemachine.
-		 */
-		LinebreakState state;
-
-		/**
-		 * Contains the absolute offset in the input stream containing the
-		 * position of the last linebreak. This is used for extracting the
-		 * context (the line) in which an error occured.
-		 */
-		size_t lastLinebreak;
+		uint32_t column;
 
 		/**
 		 * Constructor of the Cursor class.
@@ -378,11 +425,7 @@ protected:
 		 * @param cursor is the underlying cursor in the Buffer instance.
 		 */
 		Cursor(Buffer::CursorId cursor, size_t line, size_t column)
-		    : cursor(cursor),
-		      line(line),
-		      column(column),
-		      state(LinebreakState::NONE),
-		      lastLinebreak(0)
+		    : cursor(cursor), line(line), column(column)
 		{
 		}
 
@@ -434,7 +477,8 @@ protected:
 
 	/**
 	 * Set to true as long the underlying Buffer cursor is at the same position
-	 * for the read and the peek cursor.
+	 * for the read and the peek cursor. This is only used for optimization
+	 * purposes and makes consecutive reads a bit faster.
 	 */
 	bool coherent;
 
@@ -544,14 +588,27 @@ public:
 	 *
 	 * @return the current line number.
 	 */
-	size_t getLine() const { return readCursor.line; }
+	uint32_t getLine() const { return readCursor.line; }
 
 	/**
 	 * Returns the current column (starting with one).
 	 *
 	 * @return the current column number.
 	 */
-	size_t getColumn() const { return readCursor.column; }
+	uint32_t getColumn() const { return readCursor.column; }
+
+	/**
+	 * Returns the current byte offset of the read cursor.
+	 *
+	 * @return the byte position within the stream.
+	 */
+	size_t getOffset() const { return buffer->offset(readCursor.cursor); };
+
+	/**
+	 * Returns the line the read cursor currently is in, but at most the
+	 * given number of characters in the form of a Context structure.
+	 */
+	Context getContext(ssize_t maxSize);
 };
 
 /**
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2014-12-11 00:22:39 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2014-12-11 00:22:39 +0100
commit	2990d12ccca8ddbf0761cf84ce29f38de9f3262c (patch)
tree	58c492a082cce5cfdf7a5bb57b954a608c9b9d14 /src/core
parent	f053b48f925cf65aaf6ca937f89dacf59196c719 (diff)