diff options
Diffstat (limited to 'src/core/utils')
-rw-r--r-- | src/core/utils/CharReader.cpp | 643 | ||||
-rw-r--r-- | src/core/utils/CharReader.hpp | 672 |
2 files changed, 0 insertions, 1315 deletions
diff --git a/src/core/utils/CharReader.cpp b/src/core/utils/CharReader.cpp deleted file mode 100644 index 61616d7..0000000 --- a/src/core/utils/CharReader.cpp +++ /dev/null @@ -1,643 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <algorithm> -#include <cassert> -#include <limits> -#include <sstream> - -#include <core/Utils.hpp> - -#include "CharReader.hpp" - -namespace ousia { -namespace utils { - -/* Helper functions */ - -/** - * istreamReadCallback is used internally by the Buffer calss to stream data - * from an input stream. - * - * @param buf is points a the target memory region. - * @param size is the requested number of bytes. - * @param userData is a pointer at some user defined data. - * @return the actual number of bytes read. If the result is smaller than - * the requested size, this tells the Buffer that the end of the input - * stream is reached. - */ -static size_t istreamReadCallback(char *buf, size_t size, void *userData) -{ - return (static_cast<std::istream *>(userData))->read(buf, size).gcount(); -} - -/* Class Buffer */ - -Buffer::Buffer(ReadCallback callback, void *userData) - : callback(callback), - userData(userData), - reachedEnd(false), - startBucket(buckets.end()), - endBucket(buckets.end()), - startOffset(0), - firstDead(0) -{ - // Load a first block of data from the stream - stream(); - startBucket = buckets.begin(); -} - -Buffer::Buffer(std::istream &istream) : Buffer(istreamReadCallback, &istream) {} - -Buffer::Buffer(const std::string &str) - : callback(nullptr), - userData(nullptr), - reachedEnd(true), - startBucket(buckets.end()), - endBucket(buckets.end()), - startOffset(0), - firstDead(0) -{ - // Copy the given string into a first buffer and set the start buffer - // correctly - Bucket &bucket = nextBucket(); - bucket.resize(str.size()); - std::copy(str.begin(), str.end(), bucket.begin()); - startBucket = buckets.begin(); -} - -#ifndef NDEBUG -Buffer::~Buffer() -{ - // Make sure all cursors have been deleted - for (bool cursor_alive: alive) { - assert(!cursor_alive); - } -} -#endif - -void Buffer::advance(BucketIterator &it) -{ - it++; - if (it == buckets.end()) { - it = buckets.begin(); - } -} - -void Buffer::advance(BucketList::const_iterator &it) const -{ - it++; - if (it == buckets.cend()) { - it = buckets.cbegin(); - } -} - -Buffer::Bucket &Buffer::nextBucket() -{ - constexpr size_t MAXVAL = std::numeric_limits<size_t>::max(); - - // Fetch the minimum bucket index - size_t minBucketIdx = MAXVAL; - for (size_t i = 0; i < cursors.size(); i++) { - if (alive[i]) { - // Fetch references to the bucket and the cursor - const Cursor &cur = cursors[i]; - const Bucket &bucket = *(cur.bucket); - - // Increment the bucket index by one, if the cursor is at the end - // of the bucket (only valid if the LOOKBACK_SIZE is set to zero) - size_t bIdx = cur.bucketIdx; - if (LOOKBACK_SIZE == 0 && cur.bucketOffs == bucket.size()) { - bIdx++; - } - - // Decrement the bucket index by one, if the previous bucket still - // needs to be reached and cannot be overridden - if (bIdx > 0 && cur.bucketOffs < LOOKBACK_SIZE) { - bIdx--; - } - - // Set the bucket index to the minium - minBucketIdx = std::min(minBucketIdx, bIdx); - } - } - - // If there is space between the current start bucket and the read - // cursor, the start bucket can be safely overridden. - if (minBucketIdx > 0 && minBucketIdx != MAXVAL) { - // All cursor bucket indices will be decreased by one - for (size_t i = 0; i < cursors.size(); i++) { - cursors[i].bucketIdx--; - } - - // Increment the start offset - startOffset += startBucket->size(); - - // The old start bucket is the new end bucket - endBucket = startBucket; - - // Advance the start bucket, wrap around at the end of the list - advance(startBucket); - } else { - // No free bucket, insert a new one before the start bucket - endBucket = buckets.emplace(startBucket); - } - return *endBucket; -} - -Buffer::CursorId Buffer::nextCursor() -{ - bool hasCursor = false; - CursorId res = 0; - - // Search for the next free cursor starting with minNextCursorId - for (size_t i = firstDead; i < alive.size(); i++) { - if (!alive[i]) { - res = i; - hasCursor = true; - break; - } - } - - // Add a new cursor to the cursor list if no cursor is currently free - if (!hasCursor) { - res = cursors.size(); - cursors.resize(res + 1); - alive.resize(res + 1); - } - - // The next dead cursor is at least the next cursor - firstDead = res + 1; - - // Mark the new cursor as alive - alive[res] = true; - - return res; -} - -void Buffer::stream() -{ - // Fetch the bucket into which the data should be inserted, make sure it - // has the correct size - Bucket &tar = nextBucket(); - tar.resize(REQUEST_SIZE); - - // Read data from the stream into the target buffer - size_t size = callback(tar.data(), REQUEST_SIZE, userData); - - // If not enough bytes were returned, we're at the end of the stream - if (size < REQUEST_SIZE) { - tar.resize(size); - reachedEnd = true; - } -} - -Buffer::CursorId Buffer::createCursor() -{ - CursorId res = nextCursor(); - cursors[res].bucket = startBucket; - cursors[res].bucketIdx = 0; - cursors[res].bucketOffs = 0; - return res; -} - -Buffer::CursorId Buffer::createCursor(Buffer::CursorId ref) -{ - CursorId res = nextCursor(); - cursors[res] = cursors[ref]; - return res; -} - -void Buffer::copyCursor(Buffer::CursorId from, Buffer::CursorId to) -{ - cursors[to] = cursors[from]; -} - -void Buffer::deleteCursor(Buffer::CursorId cursor) -{ - alive[cursor] = false; - firstDead = std::min(firstDead, cursor); -} - -size_t Buffer::offset(Buffer::CursorId cursor) const -{ - const Cursor &cur = cursors[cursor]; - size_t offs = startOffset + cur.bucketOffs; - BucketList::const_iterator it = startBucket; - while (it != cur.bucket) { - offs += it->size(); - advance(it); - } - return offs; -} - -size_t Buffer::moveForward(CursorId cursor, size_t relativeOffs) -{ - size_t offs = relativeOffs; - Cursor &cur = cursors[cursor]; - while (offs > 0) { - // Fetch the current bucket of the cursor - Bucket &bucket = *(cur.bucket); - - // If there is enough space in the bucket, simply increment the bucket - // offset by the given relative offset - const size_t space = bucket.size() - cur.bucketOffs; - if (space >= offs) { - cur.bucketOffs += offs; - break; - } else { - // Go to the end of the current bucket otherwise - offs -= space; - cur.bucketOffs = bucket.size(); - - // Go to the next bucket - if (cur.bucket != endBucket) { - // Go to the next bucket - advance(cur.bucket); - cur.bucketIdx++; - cur.bucketOffs = 0; - } else { - // Abort, if there is no more data to stream, otherwise just - // load new data - if (reachedEnd) { - return relativeOffs - offs; - } - stream(); - } - } - } - return relativeOffs; -} - -size_t Buffer::moveBackward(CursorId cursor, size_t relativeOffs) -{ - size_t offs = relativeOffs; - Cursor &cur = cursors[cursor]; - while (offs > 0) { - // If there is enough space in the bucket, simply decrement the bucket - // offset by the given relative offset - if (cur.bucketOffs >= offs) { - cur.bucketOffs -= offs; - break; - } else { - // Go to the beginning of the current bucket otherwise - offs -= cur.bucketOffs; - cur.bucketOffs = 0; - - // Abort if there is no more bucket to got back to - if (cur.bucketIdx == 0) { - return relativeOffs - offs; - } - - // Go to the previous bucket (wrap around at the beginning of the - // list) - if (cur.bucket == buckets.begin()) { - cur.bucket = buckets.end(); - } - cur.bucket--; - - // Decrement the bucket index, and set the current offset to the - // end of the new bucket - cur.bucketIdx--; - cur.bucketOffs = cur.bucket->size(); - } - } - return relativeOffs; -} - -ssize_t Buffer::moveCursor(CursorId cursor, ssize_t relativeOffs) -{ - if (relativeOffs > 0) { - return moveForward(cursor, relativeOffs); - } else if (relativeOffs < 0) { - return -moveBackward(cursor, -relativeOffs); - } else { - return 0; - } -} - -bool Buffer::atEnd(Buffer::CursorId cursor) const -{ - const Cursor &c = cursors[cursor]; - return reachedEnd && - (c.bucket == endBucket && c.bucketOffs == endBucket->size()); -} - -bool Buffer::fetchCharacter(CursorId cursor, char &c, bool incr) -{ - Cursor &cur = cursors[cursor]; - while (true) { - // Reference at the current bucket - Bucket &bucket = *(cur.bucket); - - // If there is still data in the current bucket, return this data - if (cur.bucketOffs < bucket.size()) { - c = bucket[cur.bucketOffs]; - if (incr) { - cur.bucketOffs++; - } - return true; - } else if (cur.bucket == endBucket) { - // Return false if the end of the stream has been reached, otherwise - // load new data - if (reachedEnd) { - return false; - } - stream(); - } - - // Go to the next bucket - cur.bucketIdx++; - cur.bucketOffs = 0; - advance(cur.bucket); - } -} - -bool Buffer::read(Buffer::CursorId cursor, char &c) -{ - return fetchCharacter(cursor, c, true); -} - -bool Buffer::fetch(CursorId cursor, char &c) -{ - return fetchCharacter(cursor, c, false); -} - -/* CharReader::Cursor class */ - -void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer, - CharReader::Cursor &cursor) -{ - // Copy the cursor position - buffer->copyCursor(cursor.cursor, this->cursor); - - // Copy the state - line = cursor.line; - column = cursor.column; -} - -/* CharReader class */ - -CharReader::CharReader(std::shared_ptr<Buffer> buffer, size_t line, - size_t column) - : buffer(buffer), - readCursor(buffer->createCursor(), line, column), - peekCursor(buffer->createCursor(), line, column), - coherent(true) -{ -} - -CharReader::CharReader(const std::string &str, size_t line, size_t column) - : CharReader(std::shared_ptr<Buffer>{new Buffer{str}}, line, column) -{ -} - -CharReader::CharReader(std::istream &istream, size_t line, size_t column) - : CharReader(std::shared_ptr<Buffer>{new Buffer{istream}}, line, column) -{ -} - -CharReader::~CharReader() -{ - buffer->deleteCursor(readCursor.cursor); - buffer->deleteCursor(peekCursor.cursor); -} - -bool CharReader::readAtCursor(Cursor &cursor, char &c) -{ - // Return false if we're at the end of the stream - if (!buffer->read(cursor.cursor, c)) { - return false; - } - - // Substitute linebreak sequences with a single '\n' - if (c == '\n' || c == '\r') { - // Output a single \n - c = '\n'; - - // Check whether the next character is a continuation of the - // current character - char c2; - if (buffer->read(cursor.cursor, c2)) { - if ((c2 != '\n' && c2 != '\r') || c2 == c) { - buffer->moveCursor(cursor.cursor, -1); - } - } - } - - // Count lines and columns - if (c == '\n') { - // A linebreak was reached, go to the next line - cursor.line++; - cursor.column = 1; - } else { - // Ignore UTF-8 continuation bytes - if (!((c & 0x80) && !(c & 0x40))) { - cursor.column++; - } - } - return true; -} - -bool CharReader::peek(char &c) -{ - // If the reader was coherent, update the peek cursor state - if (coherent) { - peekCursor.assign(buffer, readCursor); - coherent = false; - } - - // Read a character from the peek cursor - return readAtCursor(peekCursor, c); -} - -bool CharReader::read(char &c) -{ - // Read a character from the buffer at the current read cursor - bool res = readAtCursor(readCursor, c); - - // Set the peek position to the current read position, if reading was not - // coherent - if (!coherent) { - peekCursor.assign(buffer, readCursor); - coherent = true; - } else { - buffer->copyCursor(readCursor.cursor, peekCursor.cursor); - } - - // Return the result of the read function - return res; -} - -void CharReader::resetPeek() -{ - if (!coherent) { - peekCursor.assign(buffer, readCursor); - coherent = true; - } -} - -void CharReader::consumePeek() -{ - if (!coherent) { - readCursor.assign(buffer, peekCursor); - coherent = true; - } -} - -bool CharReader::consumeWhitespace() -{ - char c; - while (peek(c)) { - if (!Utils::isWhitespace(c)) { - resetPeek(); - return true; - } - consumePeek(); - } - return false; -} - -CharReaderFork CharReader::fork() -{ - return CharReaderFork(buffer, readCursor, peekCursor, coherent); -} - -CharReader::Context CharReader::getContext(ssize_t maxSize) -{ - // Clone the current read cursor - Buffer::CursorId cur = buffer->createCursor(readCursor.cursor); - - // Fetch the start position of the search - ssize_t offs = buffer->offset(cur); - ssize_t start = offs; - ssize_t end = offs; - char c; - - // Search the beginning of the line with the last non-whitespace character - bool hadNonWhitespace = false; - bool foundBegin = false; - for (ssize_t i = 0; i < maxSize; i++) { - // Fetch the character at the current position - if (buffer->fetch(cur, c)) { - // Abort, at linebreaks if we found a non-linebreak character - hadNonWhitespace = hadNonWhitespace || !Utils::isWhitespace(c); - if (hadNonWhitespace && (c == '\n' || c == '\r')) { - buffer->moveCursor(cur, 1); - start++; - foundBegin = true; - break; - } - } - if (buffer->moveCursor(cur, -1) == 0) { - foundBegin = true; - break; - } else { - // Update the start position and the hadNonWhitespace flag - start--; - } - } - - // Search the end of the line - buffer->moveCursor(cur, offs - start); - bool foundEnd = false; - for (ssize_t i = 0; i < maxSize; i++) { - // Increment the end counter if a character was read, abort if the end - // of the stream has been reached - if (buffer->read(cur, c)) { - end++; - } else { - foundEnd = true; - break; - } - - // Abort on linebreak characters - if (c == '\n' || c == '\r') { - foundEnd = true; - break; - } - } - - // Calculate the truncated start and end position and limit the number of - // characters to the maximum number of characters - ssize_t tStart = start; - ssize_t tEnd = end; - if (tEnd - tStart > maxSize) { - tStart = std::max(offs - maxSize / 2, tStart); - tEnd = tStart + maxSize; - } - - // Try to go to the calculated start position and fetch the actual start - // position - ssize_t aStart = end + buffer->moveCursor(cur, tStart - end); - if (aStart > tStart) { - tEnd = tEnd + (aStart - tStart); - tStart = aStart; - } - - // Read one line - std::stringstream ss; - size_t relPos = 0; - for (ssize_t i = tStart; i < tEnd; i++) { - if (buffer->read(cur, c)) { - // Break once a linebreak is reached - if (c == '\n' || c == '\r') { - break; - } - - // Add the current character to the output - ss << c; - - // Increment the string-relative offset as long as the original - // offset is not reached in the for loop - if (i < offs) { - relPos++; - } - } - } - - // Delete the newly created cursor - buffer->deleteCursor(cur); - - return CharReader::Context{ss.str(), relPos, !foundBegin || tStart != start, - !foundEnd || tEnd != end}; -} - -/* Class CharReaderFork */ - -CharReaderFork::CharReaderFork(std::shared_ptr<Buffer> buffer, - CharReader::Cursor &parentReadCursor, - CharReader::Cursor &parentPeekCursor, - bool coherent) - : CharReader(buffer, 1, 1), - parentReadCursor(parentReadCursor), - parentPeekCursor(parentPeekCursor) -{ - readCursor.assign(buffer, parentReadCursor); - peekCursor.assign(buffer, parentPeekCursor); - this->coherent = coherent; -} - -void CharReaderFork::commit() -{ - parentReadCursor.assign(buffer, readCursor); - parentPeekCursor.assign(buffer, peekCursor); -} -} -} - diff --git a/src/core/utils/CharReader.hpp b/src/core/utils/CharReader.hpp deleted file mode 100644 index 1306026..0000000 --- a/src/core/utils/CharReader.hpp +++ /dev/null @@ -1,672 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file CharReader.hpp - * - * Used within all parsers to read single characters from an underlying stream. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_CHAR_READER_HPP_ -#define _OUSIA_CHAR_READER_HPP_ - -#include <istream> -#include <list> -#include <memory> -#include <vector> - -namespace ousia { -namespace utils { - -/** - * A chunked ring buffer used in CharReader to provide access to an input stream - * with multiple read cursors. The Buffer automatically expands to the - * size of the spanned by the read cursors while reusing already allocated - * memory. - */ -class Buffer { -public: - /** - * Callback function which is called whenever new data is requested from the - * input stream. - * - * @param buf is points a the target memory region. - * @param size is the requested number of bytes. - * @param userData is a pointer at some user defined data given in the - * constructor. - * @return the actual number of bytes read. If the result is smaller than - * the requested size, this tells the Buffer that the end of the input - * stream is reached. - */ - using ReadCallback = size_t (*)(char *buf, size_t size, void *userData); - - /** - * Handle used to identify a cursor. - */ - using CursorId = size_t; - -private: - /** - * Number of bytes to request from the input stream. Set to 64 KiB because - * this seems to be a nice value for I/O operations according to multiple - * sources. - */ - static constexpr size_t REQUEST_SIZE = 64 * 1024; - - /** - * Number of bytes the buffer guarantees to be capable of looking back - * for extracting the current context. - */ - static constexpr size_t LOOKBACK_SIZE = 128; - - /** - * Type used internally to represent one chunk of memory. - */ - using Bucket = std::vector<char>; - - /** - * Type used internally to represent a bucket container. - */ - using BucketList = std::list<Bucket>; - - /** - * Type used internally for representing iterators in the bucket list. - */ - using BucketIterator = BucketList::iterator; - - /** - * Type used internally to represent a read cursor. - */ - struct Cursor { - /** - * Iterator pointing at the current bucket. - */ - BucketIterator bucket; - - /** - * Index of the bucket relative to the start bucket. - */ - size_t bucketIdx; - - /** - * Current offset within that bucket. - */ - size_t bucketOffs; - }; - - /** - * List of buckets containing the buffered memory. - */ - BucketList buckets; - - /** - * List of cursors used to access the memory. Note that cursors can be - * marked as inactive and reused lateron (to avoid having to resize the - * vector). - */ - std::vector<Cursor> cursors; - - /** - * Bitfield specifying which of the cursors is actually valid. - */ - std::vector<bool> alive; - - /** - * Function to be called whenever new data is needed. Set to nullptr if the - * Buffer is not backed by an input stream. - */ - const ReadCallback callback; - - /** - * User data given in the constructor. - */ - void *userData; - - /** - * Set to true if the input stream is at its end. - */ - bool reachedEnd; - - /** - * Iterator pointing at the current start bucket. - */ - BucketIterator startBucket; - - /** - * Iterator pointing at the last bucket. - */ - BucketIterator endBucket; - - /** - * Byte offset of the start bucket relative to the beginning of the stream. - */ - size_t startOffset; - - /** - * Points at the smallest possible available cursor index, yet does not - * guarantee that this cursor index actuall is free. - */ - CursorId firstDead; - - /** - * Advances the bucket iterator, cares about wrapping around in the ring. - */ - void advance(BucketIterator &it); - - /** - * Advances the bucket iterator, cares about wrapping around in the ring. - */ - void advance(BucketList::const_iterator &it) const; - - /** - * Internally used to find the next free cursor in the cursors vector. The - * cursor is marked as active. - * - * @return the next free cursor index. - */ - CursorId nextCursor(); - - /** - * Returns a reference at the next bucket into which data should be - * inserted. - * - * @return a bucket into which the data can be inserted. - */ - Bucket &nextBucket(); - - /** - * Reads data from the input stream and places it in the next free buffer. - */ - void stream(); - - /** - * Moves the given cursor forward. - */ - size_t moveForward(CursorId cursor, size_t relativeOffs); - - /** - * Moves the given cursor backward. - */ - size_t moveBackward(CursorId cursor, size_t relativeOffs); - - /** - * Reads a character from the current cursor position and optionally - * advances. - */ - bool fetchCharacter(CursorId cursor, char &c, bool incr); - -public: - /** - * Intializes the Buffer with a reference to a ReadCallback that is used - * to fetch data from an underlying input stream. - * - * @param callback is the function that will be called whenever data is read - * from the ring buffer and the buffer does not hold enough data to fulfill - * this read request. - * @param userData is a pointer to user defined data which will be passed to - * the callback function. - */ - Buffer(ReadCallback callback, void *userData); - - /** - * Initializes the Buffer with a reference to an std::istream from which - * data will be read. - * - * @param istream is the input stream from which the data should be read. - */ - Buffer(std::istream &istream); - - /** - * Initializes the Buffer with the contents of the given string, after - * this operation the Buffer has a fixed size. - * - * @param str is the string containing the data that should be copied into - * the ring buffer. - */ - Buffer(const std::string &str); - -#ifndef NDEBUG - /** - * Destructor of the Buffer class. Makes sure that all cursors have been - * freed. - */ - ~Buffer(); -#endif - - // No copy - Buffer(const Buffer &) = delete; - - // No assign - Buffer &operator=(const Buffer &) = delete; - - /** - * Creates a new read cursor positioned at the smallest possible position - * in the ring buffer. - */ - CursorId createCursor(); - - /** - * Creates a new read cursor positioned at the same position as the given - * read cursor. - * - * @param ref is the read cursor that should be used as reference for the - * new read cursor. - */ - CursorId createCursor(CursorId ref); - - /** - * Copies the position of one cursor to another cursor. - * - * @param from is the cursor id of which the position should be copied. - * @param to is the cursor id to which the position should be copied. - */ - void copyCursor(CursorId from, CursorId to); - - /** - * Deletes the cursor with the given id. The cursor may no longer be used - * after this function has been called. - * - * @param cursor is the id of the cursor that should be freed. - */ - void deleteCursor(CursorId cursor); - - /** - * Moves a cursor by offs bytes. Note that moving backwards is theoretically - * limited by the LOOKBACK_SIZE of the Buffer, practically it will most - * likely be limited by the REQUEST_SIZE, so you can got at most 64 KiB - * backwards. - * - * @param cursor is the cursor that should be moved. - * @param relativeOffs is a positive or negative integer number specifying - * the number of bytes the cursor should be moved forward (positive numbers) - * or backwards (negative numbers). - * @return the actual number of bytes the cursor was moved. This number is - * smaller than the relativeOffs given in the constructor if the - */ - ssize_t moveCursor(CursorId cursor, ssize_t relativeOffs); - - /** - * Returns the current byte offset of the given cursor relative to the - * beginning of the stream. - * - * @param cursor is the cursor for which the byte offset relative to the - * beginning of the stream should be returned. - * @return the number of bytes since the beginning of the stream for the - * given cursor. - */ - size_t offset(CursorId cursor) const; - - /** - * Returns true if the given cursor currently is at the end of the stream. - * - * @param cursor is the cursor for which the atEnd flag should be returned. - * @return true if the there are no more bytes for this cursor. If false - * is returned, this means that there may be more bytes in the stream, - * nevertheless the end of the stream may be hit once the next read function - * is called. - */ - bool atEnd(CursorId cursor) const; - - /** - * Reads a single character from the ring buffer from the given cursor and - * moves to the next character. - * - * @param cursor specifies the cursor from which the data should be read. - * The cursor will be advanced by one byte. - * @param c is the character into which the data needs to be read. - * @return true if a character was read, false if the end of the stream has - * been reached. - */ - bool read(CursorId cursor, char &c); - - /** - * Returns a single character from the ring buffer from the current cursor - * position and stays at that position. - * - * @param cursor specifies the cursor from which the data should be read. - * The cursor will be advanced by one byte. - * @param c is the character into which the data needs to be read. - * @return true if a character could be fetched, false if the end of the - * stream has been reached. - */ - bool fetch(CursorId cursor, char &c); -}; - -// Forward declaration -class CharReaderFork; - -/** - * Used within parsers for convenient access to single characters in an input - * stream or buffer. It allows reading and peeking single characters from a - * buffer. Additionally it counts the current column/row (with correct handling - * for UTF-8) and contains an internal state machine that handles the detection - * of linebreaks and converts these to a single '\n'. - */ -class CharReader { -public: - /** - * The context struct is used to represent the current context the char - * reader is in. This context can for example be used when building error - * messages. - */ - struct Context { - /** - * Set to the content of the current line. - */ - std::string line; - - /** - * Relative position (in characters) within that line. - */ - size_t relPos; - - /** - * Set to true if the beginning of the line has been truncated (because - * the reader position is too far away from the actual position of the - * line). - */ - bool truncatedStart; - - /** - * Set to true if the end of the line has been truncated (because the - * reader position is too far away from the actual end position of the - * line. - */ - bool truncatedEnd; - - Context() - : line(), relPos(0), truncatedStart(false), truncatedEnd(false) - { - } - - Context(std::string line, size_t relPos, bool truncatedStart, - bool truncatedEnd) - : line(std::move(line)), - relPos(relPos), - truncatedStart(truncatedStart), - truncatedEnd(truncatedEnd) - { - } - }; - -protected: - /** - * Internally used cursor structure for managing the read and the peek - * cursor. - */ - struct Cursor { - /** - * Corresponding cursor in the underlying buffer instance. - */ - const Buffer::CursorId cursor; - - /** - * Current line the cursor is in. - */ - uint32_t line; - - /** - * Current column the cursor is in. - */ - uint32_t column; - - /** - * Constructor of the Cursor class. - * - * @param cursor is the underlying cursor in the Buffer instance. - */ - Cursor(Buffer::CursorId cursor, size_t line, size_t column) - : cursor(cursor), line(line), column(column) - { - } - - /** - * Assigns one cursor to another. - * - * @param buffer is the underlying buffer instance the internal cursor - * belongs to. - * @param cursor is the cursor from which the state should be copied. - */ - void assign(std::shared_ptr<Buffer> buffer, Cursor &cursor); - }; - -private: - /** - * Substitutes "\r", "\n\r", "\r\n" with a single "\n". - * - * @param cursor is the cursor from which the character should be read. - * @param c a reference to the character that should be written. - * @return true if another character needs to be read. - */ - bool substituteLinebreaks(Cursor &cursor, char &c); - - /** - * Reads a single character from the given cursor. - * - * @param cursor is the cursor from which the character should be read. - * @param c a reference to the character that should be written. - * @return true if a character was read, false if the end of the stream has - * been reached. - */ - bool readAtCursor(Cursor &cursor, char &c); - -protected: - /** - * Reference pointing at the underlying buffer. - */ - std::shared_ptr<Buffer> buffer; - - /** - * Cursor used for reading. - */ - Cursor readCursor; - - /** - * Cursor used for peeking. - */ - Cursor peekCursor; - - /** - * Set to true as long the underlying Buffer cursor is at the same position - * for the read and the peek cursor. This is only used for optimization - * purposes and makes consecutive reads a bit faster. - */ - bool coherent; - - /** - * Protected constructor of the CharReader base class. Creates new read - * and peek cursors for the given buffer. - * - * @param buffer is a reference to the underlying Buffer class responsible - * for allowing to read from a single input stream from multiple locations. - */ - CharReader(std::shared_ptr<Buffer> buffer, size_t line, size_t column); - -public: - /** - * Creates a new CharReader instance from a string. - * - * @param str is a string containing the input data. - * @param line is the start line. - * @param column is the start column. - */ - CharReader(const std::string &str, size_t line = 1, size_t column = 1); - - /** - * Creates a new CharReader instance for an input stream. - * - * @param istream is the input stream from which incomming data should be - * read. - * @param line is the start line. - * @param column is the start column. - */ - CharReader(std::istream &istream, size_t line = 1, size_t column = 1); - - /** - * Deletes the used cursors from the underlying buffer instance. - */ - ~CharReader(); - - // No copy - CharReader(const Buffer &) = delete; - - // No assign - CharReader &operator=(const Buffer &) = delete; - - /** - * Peeks a single character. If called multiple times, returns the - * character after the previously peeked character. - * - * @param c is a reference to the character to which the result should be - * written. - * @return true if the character was successfully read, false if there are - * no more characters to be read in the buffer. - */ - bool peek(char &c); - - /** - * Reads a character from the input data. If "peek" was called - * beforehand resets the peek pointer. - * - * @param c is a reference to the character to which the result should be - * written. - * @return true if the character was successfully read, false if there are - * no more characters to be read in the buffer. - */ - bool read(char &c); - - /** - * Resets the peek pointer to the "read" pointer. - */ - void resetPeek(); - - /** - * Advances the read pointer to the peek pointer -- so if the "peek" - * function was called, "read" will now return the character after - * the last peeked character. - */ - void consumePeek(); - - /** - * Moves the read cursor to the next non-whitespace character. Returns - * false, if the end of the stream was reached. - * - * @return false if the end of the stream was reached, false othrwise. - */ - bool consumeWhitespace(); - - /** - * Creates a new CharReader located at the same position as this CharReader - * instance, yet the new CharReader can be used independently of this - * CharReader. Use the "commit" function of the returned CharReader to - * copy the state of the forked CharReaderFork to this CharReader. - * - * @return a CharReaderFork instance positioned at the same location as this - * CharReader instance. - */ - CharReaderFork fork(); - - /** - * Returns true if there are no more characters as the stream was - * closed. - * - * @return true if there is no more data. - */ - bool atEnd() const { return buffer->atEnd(readCursor.cursor); } - - /** - * Returns the current line (starting with one). - * - * @return the current line number. - */ - uint32_t getLine() const { return readCursor.line; } - - /** - * Returns the current column (starting with one). - * - * @return the current column number. - */ - uint32_t getColumn() const { return readCursor.column; } - - /** - * Returns the current byte offset of the read cursor. - * - * @return the byte position within the stream. - */ - size_t getOffset() const { return buffer->offset(readCursor.cursor); }; - - /** - * Returns the line the read cursor currently is in, but at most the - * given number of characters in the form of a Context structure. - */ - Context getContext(ssize_t maxSize); -}; - -/** - * A CharReaderFork is returned whenever the "fork" function of the CharReader - * class is used. Its "commit" function can be used to move the underlying - * CharReader instance to the location of the CharReaderFork instance. Otherwise - * the read location of the underlying CharReader is left unchanged. - */ -class CharReaderFork : public CharReader { -private: - friend CharReader; - - /** - * The reader cursor of the underlying CharReader instance. - */ - CharReader::Cursor &parentReadCursor; - - /** - * The peek cursor of the underlying CharReader instance. - */ - CharReader::Cursor &parentPeekCursor; - - /** - * Constructor of the CharReaderFork class. - * - * @param buffer is a reference at the parent Buffer instance. - * @param parentPeekCursor is a reference at the parent read cursor. - * @param parentPeekCursor is a reference at the parent peek cursor. - * @param coherent specifies whether the char reader cursors are initialized - * coherently. - */ - CharReaderFork(std::shared_ptr<Buffer> buffer, - CharReader::Cursor &parentReadCursor, - CharReader::Cursor &parentPeekCursor, bool coherent); - -public: - /** - * Moves the read and peek cursor of the parent CharReader to the location - * of the read and peek cursor in the fork. - */ - void commit(); -}; -} - -/** - * Alias of the commonly used CharReader class. - */ -using CharReader = utils::CharReader; - -} - -#endif /* _OUSIA_CHAR_READER_HPP_ */ - |