/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include "CharReader.hpp" namespace ousia { namespace utils { /* Helper functions */ /** * istreamReadCallback is used internally by the Buffer calss to stream data * from an input stream. * * @param buf is points a the target memory region. * @param size is the requested number of bytes. * @param userData is a pointer at some user defined data. * @return the actual number of bytes read. If the result is smaller than * the requested size, this tells the Buffer that the end of the input * stream is reached. */ static size_t istreamReadCallback(char *buf, size_t size, void *userData) { return (static_cast(userData))->read(buf, size).gcount(); } /* Class Buffer */ Buffer::Buffer(ReadCallback callback, void *userData) : callback(callback), userData(userData), reachedEnd(false), startBucket(buckets.end()), endBucket(buckets.end()), startOffset(0), firstDead(0) { // Load a first block of data from the stream stream(); startBucket = buckets.begin(); } Buffer::Buffer(std::istream &istream) : Buffer(istreamReadCallback, &istream) {} Buffer::Buffer(const std::string &str) : callback(nullptr), userData(nullptr), reachedEnd(true), startBucket(buckets.end()), endBucket(buckets.end()), startOffset(0), firstDead(0) { // Copy the given string into a first buffer and set the start buffer // correctly Bucket &bucket = nextBucket(); bucket.resize(str.size()); std::copy(str.begin(), str.end(), bucket.begin()); startBucket = buckets.begin(); } void Buffer::advance(BucketIterator &it) { it++; if (it == buckets.end()) { it = buckets.begin(); } } void Buffer::advance(BucketList::const_iterator &it) const { it++; if (it == buckets.cend()) { it = buckets.cbegin(); } } Buffer::Bucket &Buffer::nextBucket() { constexpr size_t MAXVAL = std::numeric_limits::max(); // Fetch the minimum bucket index size_t minBucketIdx = MAXVAL; for (size_t i = 0; i < cursors.size(); i++) { if (alive[i]) { // Fetch references to the bucket and the cursor const Cursor &cur = cursors[i]; const Bucket &bucket = *(cur.bucket); // Increment the bucket index by one, if the cursor is at the end // of the bucket (only valid if the LOOKBACK_SIZE is set to zero) size_t bIdx = cur.bucketIdx; if (LOOKBACK_SIZE == 0 && cur.bucketOffs == bucket.size()) { bIdx++; } // Decrement the bucket index by one, if the previous bucket still // needs to be reached and cannot be overridden if (bIdx > 0 && cur.bucketOffs < LOOKBACK_SIZE) { bIdx--; } // Set the bucket index to the minium minBucketIdx = std::min(minBucketIdx, bIdx); } } // If there is space between the current start bucket and the read // cursor, the start bucket can be safely overridden. if (minBucketIdx > 0 && minBucketIdx != MAXVAL) { // All cursor bucket indices will be decreased by one for (size_t i = 0; i < cursors.size(); i++) { cursors[i].bucketIdx--; } // Increment the start offset startOffset += startBucket->size(); // The old start bucket is the new end bucket endBucket = startBucket; // Advance the start bucket, wrap around at the end of the list advance(startBucket); } else { // No free bucket, insert a new one before the start bucket endBucket = buckets.emplace(startBucket); } return *endBucket; } Buffer::CursorId Buffer::nextCursor() { bool hasCursor = false; CursorId res = 0; // Search for the next free cursor starting with minNextCursorId for (size_t i = firstDead; i < alive.size(); i++) { if (!alive[i]) { res = i; hasCursor = true; break; } } // Add a new cursor to the cursor list if no cursor is currently free if (!hasCursor) { res = cursors.size(); cursors.resize(res + 1); alive.resize(res + 1); } // The next dead cursor is at least the next cursor firstDead = res + 1; // Mark the new cursor as alive alive[res] = true; return res; } void Buffer::stream() { // Fetch the bucket into which the data should be inserted, make sure it // has the correct size Bucket &tar = nextBucket(); tar.resize(REQUEST_SIZE); // Read data from the stream into the target buffer size_t size = callback(tar.data(), REQUEST_SIZE, userData); // If not enough bytes were returned, we're at the end of the stream if (size < REQUEST_SIZE) { tar.resize(size); reachedEnd = true; } } Buffer::CursorId Buffer::createCursor() { CursorId res = nextCursor(); cursors[res].bucket = startBucket; cursors[res].bucketIdx = 0; cursors[res].bucketOffs = 0; return res; } Buffer::CursorId Buffer::createCursor(Buffer::CursorId ref) { CursorId res = nextCursor(); cursors[res] = cursors[ref]; return res; } void Buffer::copyCursor(Buffer::CursorId from, Buffer::CursorId to) { cursors[to] = cursors[from]; } void Buffer::deleteCursor(Buffer::CursorId cursor) { alive[cursor] = false; firstDead = std::min(firstDead, cursor); } size_t Buffer::offset(Buffer::CursorId cursor) const { const Cursor &cur = cursors[cursor]; size_t offs = startOffset + cur.bucketOffs; BucketList::const_iterator it = startBucket; while (it != cur.bucket) { offs += it->size(); advance(it); } return offs; } size_t Buffer::moveForward(CursorId cursor, size_t relativeOffs) { size_t offs = relativeOffs; Cursor &cur = cursors[cursor]; while (offs > 0) { // Fetch the current bucket of the cursor Bucket &bucket = *(cur.bucket); // If there is enough space in the bucket, simply increment the bucket // offset by the given relative offset const size_t space = bucket.size() - cur.bucketOffs; if (space >= offs) { cur.bucketOffs += offs; break; } else { // Go to the end of the current bucket otherwise offs -= space; cur.bucketOffs = bucket.size(); // Go to the next bucket if (cur.bucket != endBucket) { // Go to the next bucket advance(cur.bucket); cur.bucketIdx++; cur.bucketOffs = 0; } else { // Abort, if there is no more data to stream, otherwise just // load new data if (reachedEnd) { return relativeOffs - offs; } stream(); } } } return relativeOffs; } size_t Buffer::moveBackward(CursorId cursor, size_t relativeOffs) { size_t offs = relativeOffs; Cursor &cur = cursors[cursor]; while (offs > 0) { // If there is enough space in the bucket, simply decrement the bucket // offset by the given relative offset if (cur.bucketOffs >= offs) { cur.bucketOffs -= offs; break; } else { // Go to the beginning of the current bucket otherwise offs -= cur.bucketOffs; cur.bucketOffs = 0; // Abort if there is no more bucket to got back to if (cur.bucketIdx == 0) { return relativeOffs - offs; } // Go to the previous bucket (wrap around at the beginning of the // list) if (cur.bucket == buckets.begin()) { cur.bucket = buckets.end(); } cur.bucket--; // Decrement the bucket index, and set the current offset to the // end of the new bucket cur.bucketIdx--; cur.bucketOffs = cur.bucket->size(); } } return relativeOffs; } ssize_t Buffer::moveCursor(CursorId cursor, ssize_t relativeOffs) { if (relativeOffs > 0) { return moveForward(cursor, relativeOffs); } else if (relativeOffs < 0) { return -moveBackward(cursor, -relativeOffs); } else { return 0; } } bool Buffer::atEnd(Buffer::CursorId cursor) const { const Cursor &c = cursors[cursor]; return reachedEnd && (c.bucket == endBucket && c.bucketOffs == endBucket->size()); } bool Buffer::read(Buffer::CursorId cursor, char &c) { Cursor &cur = cursors[cursor]; while (true) { // Reference at the current bucket Bucket &bucket = *(cur.bucket); // If there is still data in the current bucket, return this data if (cur.bucketOffs < bucket.size()) { c = bucket[cur.bucketOffs]; cur.bucketOffs++; return true; } else if (cur.bucket == endBucket) { // Return false if the end of the stream has been reached, otherwise // load new data if (reachedEnd) { return false; } stream(); } // Go to the next bucket cur.bucketIdx++; cur.bucketOffs = 0; advance(cur.bucket); } } /* CharReader::Cursor class */ void CharReader::Cursor::assign(std::shared_ptr buffer, CharReader::Cursor &cursor) { // Copy the cursor position buffer->copyCursor(cursor.cursor, this->cursor); // Copy the state line = cursor.line; column = cursor.column; state = cursor.state; lastLinebreak = cursor.lastLinebreak; } /* CharReader class */ CharReader::CharReader(std::shared_ptr buffer) : buffer(buffer), readCursor(buffer->createCursor()), peekCursor(buffer->createCursor()) { } CharReader::CharReader(const std::string &str, size_t line, size_t column) : buffer(new Buffer{str}), readCursor(buffer->createCursor(), line, column), peekCursor(buffer->createCursor(), line, column) { } CharReader::CharReader(std::istream &istream, size_t line, size_t column) : buffer(new Buffer{istream}), readCursor(buffer->createCursor(), line, column), peekCursor(buffer->createCursor(), line, column) { } CharReader::~CharReader() { buffer->deleteCursor(readCursor.cursor); buffer->deleteCursor(peekCursor.cursor); } bool CharReader::substituteLinebreaks(Cursor &cursor, char &c) { if (c == '\n' || c == '\r') { switch (cursor.state) { case LinebreakState::NONE: // We got a first linebreak character -- output a '\n' if (c == '\n') { cursor.state = LinebreakState::HAS_LF; } else { cursor.state = LinebreakState::HAS_CR; } c = '\n'; return true; case LinebreakState::HAS_LF: // If a LF is followed by a LF, output a new linefeed if (c == '\n') { cursor.state = LinebreakState::HAS_LF; return true; } // Otherwise, don't handle this character (part of "\n\r") cursor.state = LinebreakState::NONE; return false; case LinebreakState::HAS_CR: // If a CR is followed by a CR, output a new linefeed if (c == '\r') { cursor.state = LinebreakState::HAS_CR; c = '\n'; return true; } // Otherwise, don't handle this character (part of "\r\n") cursor.state = LinebreakState::NONE; return false; } } // No linebreak character, reset the linebreak state cursor.state = LinebreakState::NONE; return true; } bool CharReader::readAtCursor(Cursor &cursor, char &c) { while (true) { // Return false if we're at the end of the stream if (!buffer->read(cursor.cursor, c)) { return false; } // Substitute linebreak characters with a single '\n' if (substituteLinebreaks(cursor, c)) { if (c == '\n') { // A linebreak was reached, go to the next line cursor.line++; cursor.column = 1; cursor.lastLinebreak = buffer->offset(cursor.cursor); } else { // Ignore UTF-8 continuation bytes if (!((c & 0x80) && !(c & 0x40))) { cursor.column++; } } return true; } } } bool CharReader::peek(char &c) { return readAtCursor(peekCursor, c); } bool CharReader::read(char &c) { return readAtCursor(readCursor, c); } void CharReader::resetPeek() { peekCursor.assign(buffer, readCursor); } void CharReader::consumePeek() { readCursor.assign(buffer, peekCursor); } bool CharReader::consumeWhitespace() { char c; while (peek(c)) { if (!Utils::isWhitespace(c)) { resetPeek(); return true; } consumePeek(); } return false; } CharReaderFork CharReader::fork() { return CharReaderFork(buffer, readCursor, peekCursor); } /* Class CharReaderFork */ CharReaderFork::CharReaderFork(std::shared_ptr buffer, CharReader::Cursor &parentReadCursor, CharReader::Cursor &parentPeekCursor) : CharReader(buffer), parentReadCursor(parentReadCursor), parentPeekCursor(parentPeekCursor) { readCursor.assign(buffer, parentReadCursor); peekCursor.assign(buffer, parentPeekCursor); } void CharReaderFork::commit() { parentReadCursor.assign(buffer, readCursor); parentPeekCursor.assign(buffer, peekCursor); } } }