summaryrefslogtreecommitdiff
path: root/src/core/common/CharReader.cpp
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2014-12-11 15:26:50 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2014-12-11 15:26:50 +0100
commit3f62168ed0b088eec3cb2903f03966f7d501f564 (patch)
tree781f5bd9b304d9eb931827a26f463575d772983d /src/core/common/CharReader.cpp
parentb74936760e28a92cadfaec47928ea478fe2d72ee (diff)
moved to CharReader everywhere
Diffstat (limited to 'src/core/common/CharReader.cpp')
-rw-r--r--src/core/common/CharReader.cpp640
1 files changed, 640 insertions, 0 deletions
diff --git a/src/core/common/CharReader.cpp b/src/core/common/CharReader.cpp
new file mode 100644
index 0000000..373c0c1
--- /dev/null
+++ b/src/core/common/CharReader.cpp
@@ -0,0 +1,640 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <sstream>
+
+#include "CharReader.hpp"
+#include "Utils.hpp"
+
+namespace ousia {
+
+/* Helper functions */
+
+/**
+ * istreamReadCallback is used internally by the Buffer calss to stream data
+ * from an input stream.
+ *
+ * @param buf is points a the target memory region.
+ * @param size is the requested number of bytes.
+ * @param userData is a pointer at some user defined data.
+ * @return the actual number of bytes read. If the result is smaller than
+ * the requested size, this tells the Buffer that the end of the input
+ * stream is reached.
+ */
+static size_t istreamReadCallback(char *buf, size_t size, void *userData)
+{
+ return (static_cast<std::istream *>(userData))->read(buf, size).gcount();
+}
+
+/* Class Buffer */
+
+Buffer::Buffer(ReadCallback callback, void *userData)
+ : callback(callback),
+ userData(userData),
+ reachedEnd(false),
+ startBucket(buckets.end()),
+ endBucket(buckets.end()),
+ startOffset(0),
+ firstDead(0)
+{
+ // Load a first block of data from the stream
+ stream();
+ startBucket = buckets.begin();
+}
+
+Buffer::Buffer(std::istream &istream) : Buffer(istreamReadCallback, &istream) {}
+
+Buffer::Buffer(const std::string &str)
+ : callback(nullptr),
+ userData(nullptr),
+ reachedEnd(true),
+ startBucket(buckets.end()),
+ endBucket(buckets.end()),
+ startOffset(0),
+ firstDead(0)
+{
+ // Copy the given string into a first buffer and set the start buffer
+ // correctly
+ Bucket &bucket = nextBucket();
+ bucket.resize(str.size());
+ std::copy(str.begin(), str.end(), bucket.begin());
+ startBucket = buckets.begin();
+}
+
+#ifndef NDEBUG
+Buffer::~Buffer()
+{
+ // Make sure all cursors have been deleted
+ for (bool cursor_alive: alive) {
+ assert(!cursor_alive);
+ }
+}
+#endif
+
+void Buffer::advance(BucketIterator &it)
+{
+ it++;
+ if (it == buckets.end()) {
+ it = buckets.begin();
+ }
+}
+
+void Buffer::advance(BucketList::const_iterator &it) const
+{
+ it++;
+ if (it == buckets.cend()) {
+ it = buckets.cbegin();
+ }
+}
+
+Buffer::Bucket &Buffer::nextBucket()
+{
+ constexpr size_t MAXVAL = std::numeric_limits<size_t>::max();
+
+ // Fetch the minimum bucket index
+ size_t minBucketIdx = MAXVAL;
+ for (size_t i = 0; i < cursors.size(); i++) {
+ if (alive[i]) {
+ // Fetch references to the bucket and the cursor
+ const Cursor &cur = cursors[i];
+ const Bucket &bucket = *(cur.bucket);
+
+ // Increment the bucket index by one, if the cursor is at the end
+ // of the bucket (only valid if the LOOKBACK_SIZE is set to zero)
+ size_t bIdx = cur.bucketIdx;
+ if (LOOKBACK_SIZE == 0 && cur.bucketOffs == bucket.size()) {
+ bIdx++;
+ }
+
+ // Decrement the bucket index by one, if the previous bucket still
+ // needs to be reached and cannot be overridden
+ if (bIdx > 0 && cur.bucketOffs < LOOKBACK_SIZE) {
+ bIdx--;
+ }
+
+ // Set the bucket index to the minium
+ minBucketIdx = std::min(minBucketIdx, bIdx);
+ }
+ }
+
+ // If there is space between the current start bucket and the read
+ // cursor, the start bucket can be safely overridden.
+ if (minBucketIdx > 0 && minBucketIdx != MAXVAL) {
+ // All cursor bucket indices will be decreased by one
+ for (size_t i = 0; i < cursors.size(); i++) {
+ cursors[i].bucketIdx--;
+ }
+
+ // Increment the start offset
+ startOffset += startBucket->size();
+
+ // The old start bucket is the new end bucket
+ endBucket = startBucket;
+
+ // Advance the start bucket, wrap around at the end of the list
+ advance(startBucket);
+ } else {
+ // No free bucket, insert a new one before the start bucket
+ endBucket = buckets.emplace(startBucket);
+ }
+ return *endBucket;
+}
+
+Buffer::CursorId Buffer::nextCursor()
+{
+ bool hasCursor = false;
+ CursorId res = 0;
+
+ // Search for the next free cursor starting with minNextCursorId
+ for (size_t i = firstDead; i < alive.size(); i++) {
+ if (!alive[i]) {
+ res = i;
+ hasCursor = true;
+ break;
+ }
+ }
+
+ // Add a new cursor to the cursor list if no cursor is currently free
+ if (!hasCursor) {
+ res = cursors.size();
+ cursors.resize(res + 1);
+ alive.resize(res + 1);
+ }
+
+ // The next dead cursor is at least the next cursor
+ firstDead = res + 1;
+
+ // Mark the new cursor as alive
+ alive[res] = true;
+
+ return res;
+}
+
+void Buffer::stream()
+{
+ // Fetch the bucket into which the data should be inserted, make sure it
+ // has the correct size
+ Bucket &tar = nextBucket();
+ tar.resize(REQUEST_SIZE);
+
+ // Read data from the stream into the target buffer
+ size_t size = callback(tar.data(), REQUEST_SIZE, userData);
+
+ // If not enough bytes were returned, we're at the end of the stream
+ if (size < REQUEST_SIZE) {
+ tar.resize(size);
+ reachedEnd = true;
+ }
+}
+
+Buffer::CursorId Buffer::createCursor()
+{
+ CursorId res = nextCursor();
+ cursors[res].bucket = startBucket;
+ cursors[res].bucketIdx = 0;
+ cursors[res].bucketOffs = 0;
+ return res;
+}
+
+Buffer::CursorId Buffer::createCursor(Buffer::CursorId ref)
+{
+ CursorId res = nextCursor();
+ cursors[res] = cursors[ref];
+ return res;
+}
+
+void Buffer::copyCursor(Buffer::CursorId from, Buffer::CursorId to)
+{
+ cursors[to] = cursors[from];
+}
+
+void Buffer::deleteCursor(Buffer::CursorId cursor)
+{
+ alive[cursor] = false;
+ firstDead = std::min(firstDead, cursor);
+}
+
+size_t Buffer::offset(Buffer::CursorId cursor) const
+{
+ const Cursor &cur = cursors[cursor];
+ size_t offs = startOffset + cur.bucketOffs;
+ BucketList::const_iterator it = startBucket;
+ while (it != cur.bucket) {
+ offs += it->size();
+ advance(it);
+ }
+ return offs;
+}
+
+size_t Buffer::moveForward(CursorId cursor, size_t relativeOffs)
+{
+ size_t offs = relativeOffs;
+ Cursor &cur = cursors[cursor];
+ while (offs > 0) {
+ // Fetch the current bucket of the cursor
+ Bucket &bucket = *(cur.bucket);
+
+ // If there is enough space in the bucket, simply increment the bucket
+ // offset by the given relative offset
+ const size_t space = bucket.size() - cur.bucketOffs;
+ if (space >= offs) {
+ cur.bucketOffs += offs;
+ break;
+ } else {
+ // Go to the end of the current bucket otherwise
+ offs -= space;
+ cur.bucketOffs = bucket.size();
+
+ // Go to the next bucket
+ if (cur.bucket != endBucket) {
+ // Go to the next bucket
+ advance(cur.bucket);
+ cur.bucketIdx++;
+ cur.bucketOffs = 0;
+ } else {
+ // Abort, if there is no more data to stream, otherwise just
+ // load new data
+ if (reachedEnd) {
+ return relativeOffs - offs;
+ }
+ stream();
+ }
+ }
+ }
+ return relativeOffs;
+}
+
+size_t Buffer::moveBackward(CursorId cursor, size_t relativeOffs)
+{
+ size_t offs = relativeOffs;
+ Cursor &cur = cursors[cursor];
+ while (offs > 0) {
+ // If there is enough space in the bucket, simply decrement the bucket
+ // offset by the given relative offset
+ if (cur.bucketOffs >= offs) {
+ cur.bucketOffs -= offs;
+ break;
+ } else {
+ // Go to the beginning of the current bucket otherwise
+ offs -= cur.bucketOffs;
+ cur.bucketOffs = 0;
+
+ // Abort if there is no more bucket to got back to
+ if (cur.bucketIdx == 0) {
+ return relativeOffs - offs;
+ }
+
+ // Go to the previous bucket (wrap around at the beginning of the
+ // list)
+ if (cur.bucket == buckets.begin()) {
+ cur.bucket = buckets.end();
+ }
+ cur.bucket--;
+
+ // Decrement the bucket index, and set the current offset to the
+ // end of the new bucket
+ cur.bucketIdx--;
+ cur.bucketOffs = cur.bucket->size();
+ }
+ }
+ return relativeOffs;
+}
+
+ssize_t Buffer::moveCursor(CursorId cursor, ssize_t relativeOffs)
+{
+ if (relativeOffs > 0) {
+ return moveForward(cursor, relativeOffs);
+ } else if (relativeOffs < 0) {
+ return -moveBackward(cursor, -relativeOffs);
+ } else {
+ return 0;
+ }
+}
+
+bool Buffer::atEnd(Buffer::CursorId cursor) const
+{
+ const Cursor &c = cursors[cursor];
+ return reachedEnd &&
+ (c.bucket == endBucket && c.bucketOffs == endBucket->size());
+}
+
+bool Buffer::fetchCharacter(CursorId cursor, char &c, bool incr)
+{
+ Cursor &cur = cursors[cursor];
+ while (true) {
+ // Reference at the current bucket
+ Bucket &bucket = *(cur.bucket);
+
+ // If there is still data in the current bucket, return this data
+ if (cur.bucketOffs < bucket.size()) {
+ c = bucket[cur.bucketOffs];
+ if (incr) {
+ cur.bucketOffs++;
+ }
+ return true;
+ } else if (cur.bucket == endBucket) {
+ // Return false if the end of the stream has been reached, otherwise
+ // load new data
+ if (reachedEnd) {
+ return false;
+ }
+ stream();
+ }
+
+ // Go to the next bucket
+ cur.bucketIdx++;
+ cur.bucketOffs = 0;
+ advance(cur.bucket);
+ }
+}
+
+bool Buffer::read(Buffer::CursorId cursor, char &c)
+{
+ return fetchCharacter(cursor, c, true);
+}
+
+bool Buffer::fetch(CursorId cursor, char &c)
+{
+ return fetchCharacter(cursor, c, false);
+}
+
+/* CharReader::Cursor class */
+
+void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer,
+ CharReader::Cursor &cursor)
+{
+ // Copy the cursor position
+ buffer->copyCursor(cursor.cursor, this->cursor);
+
+ // Copy the state
+ line = cursor.line;
+ column = cursor.column;
+}
+
+/* CharReader class */
+
+CharReader::CharReader(std::shared_ptr<Buffer> buffer, size_t line,
+ size_t column)
+ : buffer(buffer),
+ readCursor(buffer->createCursor(), line, column),
+ peekCursor(buffer->createCursor(), line, column),
+ coherent(true)
+{
+}
+
+CharReader::CharReader(const std::string &str, size_t line, size_t column)
+ : CharReader(std::shared_ptr<Buffer>{new Buffer{str}}, line, column)
+{
+}
+
+CharReader::CharReader(std::istream &istream, size_t line, size_t column)
+ : CharReader(std::shared_ptr<Buffer>{new Buffer{istream}}, line, column)
+{
+}
+
+CharReader::~CharReader()
+{
+ buffer->deleteCursor(readCursor.cursor);
+ buffer->deleteCursor(peekCursor.cursor);
+}
+
+bool CharReader::readAtCursor(Cursor &cursor, char &c)
+{
+ // Return false if we're at the end of the stream
+ if (!buffer->read(cursor.cursor, c)) {
+ return false;
+ }
+
+ // Substitute linebreak sequences with a single '\n'
+ if (c == '\n' || c == '\r') {
+ // Output a single \n
+ c = '\n';
+
+ // Check whether the next character is a continuation of the
+ // current character
+ char c2;
+ if (buffer->read(cursor.cursor, c2)) {
+ if ((c2 != '\n' && c2 != '\r') || c2 == c) {
+ buffer->moveCursor(cursor.cursor, -1);
+ }
+ }
+ }
+
+ // Count lines and columns
+ if (c == '\n') {
+ // A linebreak was reached, go to the next line
+ cursor.line++;
+ cursor.column = 1;
+ } else {
+ // Ignore UTF-8 continuation bytes
+ if (!((c & 0x80) && !(c & 0x40))) {
+ cursor.column++;
+ }
+ }
+ return true;
+}
+
+bool CharReader::peek(char &c)
+{
+ // If the reader was coherent, update the peek cursor state
+ if (coherent) {
+ peekCursor.assign(buffer, readCursor);
+ coherent = false;
+ }
+
+ // Read a character from the peek cursor
+ return readAtCursor(peekCursor, c);
+}
+
+bool CharReader::read(char &c)
+{
+ // Read a character from the buffer at the current read cursor
+ bool res = readAtCursor(readCursor, c);
+
+ // Set the peek position to the current read position, if reading was not
+ // coherent
+ if (!coherent) {
+ peekCursor.assign(buffer, readCursor);
+ coherent = true;
+ } else {
+ buffer->copyCursor(readCursor.cursor, peekCursor.cursor);
+ }
+
+ // Return the result of the read function
+ return res;
+}
+
+void CharReader::resetPeek()
+{
+ if (!coherent) {
+ peekCursor.assign(buffer, readCursor);
+ coherent = true;
+ }
+}
+
+void CharReader::consumePeek()
+{
+ if (!coherent) {
+ readCursor.assign(buffer, peekCursor);
+ coherent = true;
+ }
+}
+
+bool CharReader::consumeWhitespace()
+{
+ char c;
+ while (peek(c)) {
+ if (!Utils::isWhitespace(c)) {
+ resetPeek();
+ return true;
+ }
+ consumePeek();
+ }
+ return false;
+}
+
+CharReaderFork CharReader::fork()
+{
+ return CharReaderFork(buffer, readCursor, peekCursor, coherent);
+}
+
+CharReader::Context CharReader::getContext(ssize_t maxSize)
+{
+ // Clone the current read cursor
+ Buffer::CursorId cur = buffer->createCursor(readCursor.cursor);
+
+ // Fetch the start position of the search
+ ssize_t offs = buffer->offset(cur);
+ ssize_t start = offs;
+ ssize_t end = offs;
+ char c;
+
+ // Search the beginning of the line with the last non-whitespace character
+ bool hadNonWhitespace = false;
+ bool foundBegin = false;
+ for (ssize_t i = 0; i < maxSize; i++) {
+ // Fetch the character at the current position
+ if (buffer->fetch(cur, c)) {
+ // Abort, at linebreaks if we found a non-linebreak character
+ hadNonWhitespace = hadNonWhitespace || !Utils::isWhitespace(c);
+ if (hadNonWhitespace && (c == '\n' || c == '\r')) {
+ buffer->moveCursor(cur, 1);
+ start++;
+ foundBegin = true;
+ break;
+ }
+ }
+ if (buffer->moveCursor(cur, -1) == 0) {
+ foundBegin = true;
+ break;
+ } else {
+ // Update the start position and the hadNonWhitespace flag
+ start--;
+ }
+ }
+
+ // Search the end of the line
+ buffer->moveCursor(cur, offs - start);
+ bool foundEnd = false;
+ for (ssize_t i = 0; i < maxSize; i++) {
+ // Increment the end counter if a character was read, abort if the end
+ // of the stream has been reached
+ if (buffer->read(cur, c)) {
+ end++;
+ } else {
+ foundEnd = true;
+ break;
+ }
+
+ // Abort on linebreak characters
+ if (c == '\n' || c == '\r') {
+ foundEnd = true;
+ break;
+ }
+ }
+
+ // Calculate the truncated start and end position and limit the number of
+ // characters to the maximum number of characters
+ ssize_t tStart = start;
+ ssize_t tEnd = end;
+ if (tEnd - tStart > maxSize) {
+ tStart = std::max(offs - maxSize / 2, tStart);
+ tEnd = tStart + maxSize;
+ }
+
+ // Try to go to the calculated start position and fetch the actual start
+ // position
+ ssize_t aStart = end + buffer->moveCursor(cur, tStart - end);
+ if (aStart > tStart) {
+ tEnd = tEnd + (aStart - tStart);
+ tStart = aStart;
+ }
+
+ // Read one line
+ std::stringstream ss;
+ size_t relPos = 0;
+ for (ssize_t i = tStart; i < tEnd; i++) {
+ if (buffer->read(cur, c)) {
+ // Break once a linebreak is reached
+ if (c == '\n' || c == '\r') {
+ break;
+ }
+
+ // Add the current character to the output
+ ss << c;
+
+ // Increment the string-relative offset as long as the original
+ // offset is not reached in the for loop
+ if (i < offs) {
+ relPos++;
+ }
+ }
+ }
+
+ // Delete the newly created cursor
+ buffer->deleteCursor(cur);
+
+ return CharReader::Context{ss.str(), relPos, !foundBegin || tStart != start,
+ !foundEnd || tEnd != end};
+}
+
+/* Class CharReaderFork */
+
+CharReaderFork::CharReaderFork(std::shared_ptr<Buffer> buffer,
+ CharReader::Cursor &parentReadCursor,
+ CharReader::Cursor &parentPeekCursor,
+ bool coherent)
+ : CharReader(buffer, 1, 1),
+ parentReadCursor(parentReadCursor),
+ parentPeekCursor(parentPeekCursor)
+{
+ readCursor.assign(buffer, parentReadCursor);
+ peekCursor.assign(buffer, parentPeekCursor);
+ this->coherent = coherent;
+}
+
+void CharReaderFork::commit()
+{
+ parentReadCursor.assign(buffer, readCursor);
+ parentPeekCursor.assign(buffer, peekCursor);
+}
+}
+