diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-12-11 00:22:39 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-12-11 00:22:39 +0100 |
commit | 2990d12ccca8ddbf0761cf84ce29f38de9f3262c (patch) | |
tree | 58c492a082cce5cfdf7a5bb57b954a608c9b9d14 /src/core | |
parent | f053b48f925cf65aaf6ca937f89dacf59196c719 (diff) |
implemented context function, increased performance of read function
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/utils/CharReader.cpp | 206 | ||||
-rw-r--r-- | src/core/utils/CharReader.hpp | 111 |
2 files changed, 226 insertions, 91 deletions
diff --git a/src/core/utils/CharReader.cpp b/src/core/utils/CharReader.cpp index 12d0043..effc587 100644 --- a/src/core/utils/CharReader.cpp +++ b/src/core/utils/CharReader.cpp @@ -18,6 +18,7 @@ #include <algorithm> #include <limits> +#include <sstream> #include <core/Utils.hpp> @@ -326,7 +327,7 @@ bool Buffer::atEnd(Buffer::CursorId cursor) const (c.bucket == endBucket && c.bucketOffs == endBucket->size()); } -bool Buffer::read(Buffer::CursorId cursor, char &c) +bool Buffer::fetchCharacter(CursorId cursor, char &c, bool incr) { Cursor &cur = cursors[cursor]; while (true) { @@ -336,7 +337,9 @@ bool Buffer::read(Buffer::CursorId cursor, char &c) // If there is still data in the current bucket, return this data if (cur.bucketOffs < bucket.size()) { c = bucket[cur.bucketOffs]; - cur.bucketOffs++; + if (incr) { + cur.bucketOffs++; + } return true; } else if (cur.bucket == endBucket) { // Return false if the end of the stream has been reached, otherwise @@ -354,6 +357,16 @@ bool Buffer::read(Buffer::CursorId cursor, char &c) } } +bool Buffer::read(Buffer::CursorId cursor, char &c) +{ + return fetchCharacter(cursor, c, true); +} + +bool Buffer::fetch(CursorId cursor, char &c) +{ + return fetchCharacter(cursor, c, false); +} + /* CharReader::Cursor class */ void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer, @@ -365,8 +378,6 @@ void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer, // Copy the state line = cursor.line; column = cursor.column; - state = cursor.state; - lastLinebreak = cursor.lastLinebreak; } /* CharReader class */ @@ -396,73 +407,40 @@ CharReader::~CharReader() buffer->deleteCursor(peekCursor.cursor); } -bool CharReader::substituteLinebreaks(Cursor &cursor, char &c) -{ - if (c == '\n' || c == '\r') { - switch (cursor.state) { - case LinebreakState::NONE: - // We got a first linebreak character -- output a '\n' - if (c == '\n') { - cursor.state = LinebreakState::HAS_LF; - } else { - cursor.state = LinebreakState::HAS_CR; - } - c = '\n'; - return true; - case LinebreakState::HAS_LF: - // If a LF is followed by a LF, output a new linefeed - if (c == '\n') { - cursor.state = LinebreakState::HAS_LF; - return true; - } - - // Otherwise, don't handle this character (part of "\n\r") - cursor.state = LinebreakState::NONE; - return false; - case LinebreakState::HAS_CR: - // If a CR is followed by a CR, output a new linefeed - if (c == '\r') { - cursor.state = LinebreakState::HAS_CR; - c = '\n'; - return true; - } - - // Otherwise, don't handle this character (part of "\r\n") - cursor.state = LinebreakState::NONE; - return false; - } - } - - // No linebreak character, reset the linebreak state - cursor.state = LinebreakState::NONE; - return true; -} - bool CharReader::readAtCursor(Cursor &cursor, char &c) { - while (true) { - // Return false if we're at the end of the stream - if (!buffer->read(cursor.cursor, c)) { - return false; - } + // Return false if we're at the end of the stream + if (!buffer->read(cursor.cursor, c)) { + return false; + } - // Substitute linebreak characters with a single '\n' - if (substituteLinebreaks(cursor, c)) { - if (c == '\n') { - // A linebreak was reached, go to the next line - cursor.line++; - cursor.column = 1; - cursor.lastLinebreak = buffer->offset(cursor.cursor); - } else { - // Ignore UTF-8 continuation bytes - if (!((c & 0x80) && !(c & 0x40))) { - cursor.column++; - } + // Substitute linebreak sequences with a single '\n' + if (c == '\n' || c == '\r') { + // Output a single \n + c = '\n'; + + // Check whether the next character is a continuation of the + // current character + char c2; + if (buffer->read(cursor.cursor, c2)) { + if ((c2 != '\n' && c2 != '\r') || c2 == c) { + buffer->moveCursor(cursor.cursor, -1); } + } + } - return true; + // Count lines and columns + if (c == '\n') { + // A linebreak was reached, go to the next line + cursor.line++; + cursor.column = 1; + } else { + // Ignore UTF-8 continuation bytes + if (!((c & 0x80) && !(c & 0x40))) { + cursor.column++; } } + return true; } bool CharReader::peek(char &c) @@ -529,6 +507,106 @@ CharReaderFork CharReader::fork() return CharReaderFork(buffer, readCursor, peekCursor, coherent); } +CharReader::Context CharReader::getContext(ssize_t maxSize) +{ + // Clone the current read cursor + Buffer::CursorId cur = buffer->createCursor(readCursor.cursor); + + // Fetch the start position of the search + ssize_t offs = buffer->offset(cur); + ssize_t start = offs; + ssize_t end = offs; + char c; + + // Search the beginning of the line with the last non-whitespace character + bool hadNonWhitespace = false; + bool foundBegin = false; + for (ssize_t i = 0; i < maxSize; i++) { + // Fetch the character at the current position + if (buffer->fetch(cur, c)) { + // Abort, at linebreaks if we found a non-linebreak character + if (hadNonWhitespace && (c == '\n' || c == '\r')) { + buffer->moveCursor(cur, 1); + start++; + foundBegin = true; + break; + } + } + if (buffer->moveCursor(cur, -1) == 0) { + foundBegin = true; + break; + } + + // Update the start position and the hadNonWhitespace flag + hadNonWhitespace = hadNonWhitespace || !Utils::isWhitespace(c); + start--; + } + + // Search the end of the line + buffer->moveCursor(cur, offs - start); + bool foundEnd = false; + for (ssize_t i = 0; i < maxSize; i++) { + // Increment the end counter if a character was read, abort if the end + // of the stream has been reached + if (buffer->read(cur, c)) { + end++; + } else { + foundEnd = true; + break; + } + + // Abort on linebreak characters + if (c == '\n' || c == '\r') { + foundEnd = true; + break; + } + } + + // Calculate the truncated start and end position and limit the number of + // characters to the maximum number of characters + ssize_t tStart = start; + ssize_t tEnd = end; + if (tEnd - tStart > maxSize) { + tStart = std::max(offs - maxSize / 2, tStart); + tEnd = tStart + maxSize; + } + + // Try to go to the calculated start position and fetch the actual start + // position + ssize_t aStart = end + buffer->moveCursor(cur, tStart - end); + if (aStart > tStart) { + tEnd = tEnd + (aStart - tStart); + tStart = aStart; + } + + // Read one line + std::stringstream ss; + size_t relPos = 0; + for (ssize_t i = tStart; i < tEnd; i++) { + if (buffer->read(cur, c)) { + // Break once a linebreak is reached + if (c == '\n' || c == '\r') { + break; + } + + // Add the current character to the output + ss << c; + + // Increment the string-relative offset as long as the original + // offset is not reached in the for loop + if (i < offs) { + relPos++; + } + } + } + + // Delete the newly created cursor + buffer->deleteCursor(cur); + + return CharReader::Context{ss.str(), relPos, !foundBegin || tStart != start, + !foundEnd || tEnd != end}; +} + /* Class CharReaderFork */ CharReaderFork::CharReaderFork(std::shared_ptr<Buffer> buffer, diff --git a/src/core/utils/CharReader.hpp b/src/core/utils/CharReader.hpp index 3d4c894..5daa21d 100644 --- a/src/core/utils/CharReader.hpp +++ b/src/core/utils/CharReader.hpp @@ -206,6 +206,12 @@ private: */ size_t moveBackward(CursorId cursor, size_t relativeOffs); + /** + * Reads a character from the current cursor position and optionally + * advances. + */ + bool fetchCharacter(CursorId cursor, char &c, bool incr); + public: /** * Intializes the Buffer with a reference to a ReadCallback that is used @@ -311,7 +317,8 @@ public: bool atEnd(CursorId cursor) const; /** - * Reads a single character from the ring buffer from the given cursor. + * Reads a single character from the ring buffer from the given cursor and + * moves to the next character. * * @param cursor specifies the cursor from which the data should be read. * The cursor will be advanced by one byte. @@ -320,6 +327,18 @@ public: * been reached. */ bool read(CursorId cursor, char &c); + + /** + * Returns a single character from the ring buffer from the current cursor + * position and stays at that position. + * + * @param cursor specifies the cursor from which the data should be read. + * The cursor will be advanced by one byte. + * @param c is the character into which the data needs to be read. + * @return true if a character could be fetched, false if the end of the + * stream has been reached. + */ + bool fetch(CursorId cursor, char &c); }; // Forward declaration @@ -333,13 +352,53 @@ class CharReaderFork; * of linebreaks and converts these to a single '\n'. */ class CharReader { -protected: +public: /** - * Enum to represent the current state of the internal state machine that - * replaces the linebreaks from multiple platforms to a single '\n'. + * The context struct is used to represent the current context the char + * reader is in. This context can for example be used when building error + * messages. */ - enum class LinebreakState { NONE, HAS_LF, HAS_CR }; + struct Context { + /** + * Set to the content of the current line. + */ + std::string line; + + /** + * Relative position (in characters) within that line. + */ + size_t relPos; + + /** + * Set to true if the beginning of the line has been truncated (because + * the reader position is too far away from the actual position of the + * line). + */ + bool truncatedStart; + + /** + * Set to true if the end of the line has been truncated (because the + * reader position is too far away from the actual end position of the + * line. + */ + bool truncatedEnd; + + Context() + : line(), relPos(0), truncatedStart(false), truncatedEnd(false) + { + } + Context(std::string line, size_t relPos, bool truncatedStart, + bool truncatedEnd) + : line(std::move(line)), + relPos(relPos), + truncatedStart(truncatedStart), + truncatedEnd(truncatedEnd) + { + } + }; + +protected: /** * Internally used cursor structure for managing the read and the peek * cursor. @@ -353,24 +412,12 @@ protected: /** * Current line the cursor is in. */ - size_t line; + uint32_t line; /** * Current column the cursor is in. */ - size_t column; - - /** - * State of the linebreak replacement statemachine. - */ - LinebreakState state; - - /** - * Contains the absolute offset in the input stream containing the - * position of the last linebreak. This is used for extracting the - * context (the line) in which an error occured. - */ - size_t lastLinebreak; + uint32_t column; /** * Constructor of the Cursor class. @@ -378,11 +425,7 @@ protected: * @param cursor is the underlying cursor in the Buffer instance. */ Cursor(Buffer::CursorId cursor, size_t line, size_t column) - : cursor(cursor), - line(line), - column(column), - state(LinebreakState::NONE), - lastLinebreak(0) + : cursor(cursor), line(line), column(column) { } @@ -434,7 +477,8 @@ protected: /** * Set to true as long the underlying Buffer cursor is at the same position - * for the read and the peek cursor. + * for the read and the peek cursor. This is only used for optimization + * purposes and makes consecutive reads a bit faster. */ bool coherent; @@ -544,14 +588,27 @@ public: * * @return the current line number. */ - size_t getLine() const { return readCursor.line; } + uint32_t getLine() const { return readCursor.line; } /** * Returns the current column (starting with one). * * @return the current column number. */ - size_t getColumn() const { return readCursor.column; } + uint32_t getColumn() const { return readCursor.column; } + + /** + * Returns the current byte offset of the read cursor. + * + * @return the byte position within the stream. + */ + size_t getOffset() const { return buffer->offset(readCursor.cursor); }; + + /** + * Returns the line the read cursor currently is in, but at most the + * given number of characters in the form of a Context structure. + */ + Context getContext(ssize_t maxSize); }; /** |