/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
/**
* @file CharReader.hpp
*
* Used within all parsers to read single characters from an underlying stream.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_CHAR_READER_HPP_
#define _OUSIA_CHAR_READER_HPP_
#include
#include
#include
#include
#include "Location.hpp"
namespace ousia {
/**
* A chunked ring buffer used in CharReader to provide access to an input stream
* with multiple read cursors. The Buffer automatically expands to the size of
* the spanned by the read cursors while reusing already allocated memory.
*/
class Buffer {
public:
/**
* Callback function which is called whenever new data is requested from the
* input stream.
*
* @param buf is points a the target memory region.
* @param size is the requested number of bytes.
* @param userData is a pointer at some user defined data given in the
* constructor.
* @return the actual number of bytes read. If the result is smaller than
* the requested size, this tells the Buffer that the end of the input
* stream is reached.
*/
using ReadCallback = size_t (*)(char *buf, size_t size, void *userData);
/**
* Handle used to identify a cursor.
*/
using CursorId = size_t;
private:
/**
* Number of bytes to request from the input stream. Set to 64 KiB because
* this seems to be a nice value for I/O operations according to multiple
* sources.
*/
static constexpr size_t REQUEST_SIZE = 64 * 1024;
/**
* Number of bytes the buffer guarantees to be capable of looking back
* for extracting the current context.
*/
static constexpr size_t LOOKBACK_SIZE = 128;
/**
* Type used internally to represent one chunk of memory.
*/
using Bucket = std::vector;
/**
* Type used internally to represent a bucket container.
*/
using BucketList = std::list;
/**
* Type used internally for representing iterators in the bucket list.
*/
using BucketIterator = BucketList::iterator;
/**
* Type used internally to represent a read cursor.
*/
struct Cursor {
/**
* Iterator pointing at the current bucket.
*/
BucketIterator bucket;
/**
* Index of the bucket relative to the start bucket.
*/
size_t bucketIdx;
/**
* Current offset within that bucket.
*/
size_t bucketOffs;
};
/**
* List of buckets containing the buffered memory.
*/
BucketList buckets;
/**
* List of cursors used to access the memory. Note that cursors can be
* marked as inactive and reused lateron (to avoid having to resize the
* vector).
*/
std::vector cursors;
/**
* Bitfield specifying which of the cursors is actually valid.
*/
std::vector alive;
/**
* Function to be called whenever new data is needed. Set to nullptr if the
* Buffer is not backed by an input stream.
*/
const ReadCallback callback;
/**
* User data given in the constructor.
*/
void *userData;
/**
* Set to true if the input stream is at its end.
*/
bool reachedEnd;
/**
* Iterator pointing at the current start bucket.
*/
BucketIterator startBucket;
/**
* Iterator pointing at the last bucket.
*/
BucketIterator endBucket;
/**
* Byte offset of the start bucket relative to the beginning of the stream.
*/
size_t startOffset;
/**
* Points at the smallest possible available cursor index, yet does not
* guarantee that this cursor index actuall is free.
*/
CursorId firstDead;
/**
* Advances the bucket iterator, cares about wrapping around in the ring.
*/
void advance(BucketIterator &it);
/**
* Advances the bucket iterator, cares about wrapping around in the ring.
*/
void advance(BucketList::const_iterator &it) const;
/**
* Internally used to find the next free cursor in the cursors vector. The
* cursor is marked as active.
*
* @return the next free cursor index.
*/
CursorId nextCursor();
/**
* Returns a reference at the next bucket into which data should be
* inserted.
*
* @return a bucket into which the data can be inserted.
*/
Bucket &nextBucket();
/**
* Reads data from the input stream and places it in the next free buffer.
*/
void stream();
/**
* Moves the given cursor forward.
*/
size_t moveForward(CursorId cursor, size_t relativeOffs);
/**
* Moves the given cursor backward.
*/
size_t moveBackward(CursorId cursor, size_t relativeOffs);
/**
* Reads a character from the current cursor position and optionally
* advances.
*/
bool fetchCharacter(CursorId cursor, char &c, bool incr);
public:
/**
* Intializes the Buffer with a reference to a ReadCallback that is used
* to fetch data from an underlying input stream.
*
* @param callback is the function that will be called whenever data is read
* from the ring buffer and the buffer does not hold enough data to fulfill
* this read request.
* @param userData is a pointer to user defined data which will be passed to
* the callback function.
*/
Buffer(ReadCallback callback, void *userData);
/**
* Initializes the Buffer with a reference to an std::istream from which
* data will be read.
*
* @param istream is the input stream from which the data should be read.
*/
Buffer(std::istream &istream);
/**
* Initializes the Buffer with the contents of the given string, after
* this operation the Buffer has a fixed size.
*
* @param str is the string containing the data that should be copied into
* the ring buffer.
*/
Buffer(const std::string &str);
#ifndef NDEBUG
/**
* Destructor of the Buffer class. Makes sure that all cursors have been
* freed.
*/
~Buffer();
#endif
// No copy
Buffer(const Buffer &) = delete;
// No assign
Buffer &operator=(const Buffer &) = delete;
/**
* Creates a new read cursor positioned at the smallest possible position
* in the ring buffer.
*/
CursorId createCursor();
/**
* Creates a new read cursor positioned at the same position as the given
* read cursor.
*
* @param ref is the read cursor that should be used as reference for the
* new read cursor.
*/
CursorId createCursor(CursorId ref);
/**
* Copies the position of one cursor to another cursor.
*
* @param from is the cursor id of which the position should be copied.
* @param to is the cursor id to which the position should be copied.
*/
void copyCursor(CursorId from, CursorId to);
/**
* Deletes the cursor with the given id. The cursor may no longer be used
* after this function has been called.
*
* @param cursor is the id of the cursor that should be freed.
*/
void deleteCursor(CursorId cursor);
/**
* Moves a cursor by offs bytes. Note that moving backwards is theoretically
* limited by the LOOKBACK_SIZE of the Buffer, practically it will most
* likely be limited by the REQUEST_SIZE, so you can got at most 64 KiB
* backwards.
*
* @param cursor is the cursor that should be moved.
* @param relativeOffs is a positive or negative integer number specifying
* the number of bytes the cursor should be moved forward (positive numbers)
* or backwards (negative numbers).
* @return the actual number of bytes the cursor was moved. This number is
* smaller than the relativeOffs given in the constructor if the
*/
ssize_t moveCursor(CursorId cursor, ssize_t relativeOffs);
/**
* Returns the current byte offset of the given cursor relative to the
* beginning of the stream.
*
* @param cursor is the cursor for which the byte offset relative to the
* beginning of the stream should be returned.
* @return the number of bytes since the beginning of the stream for the
* given cursor.
*/
size_t offset(CursorId cursor) const;
/**
* Returns true if the given cursor currently is at the end of the stream.
*
* @param cursor is the cursor for which the atEnd flag should be returned.
* @return true if the there are no more bytes for this cursor. If false
* is returned, this means that there may be more bytes in the stream,
* nevertheless the end of the stream may be hit once the next read function
* is called.
*/
bool atEnd(CursorId cursor) const;
/**
* Reads a single character from the ring buffer from the given cursor and
* moves to the next character.
*
* @param cursor specifies the cursor from which the data should be read.
* The cursor will be advanced by one byte.
* @param c is the character into which the data needs to be read.
* @return true if a character was read, false if the end of the stream has
* been reached.
*/
bool read(CursorId cursor, char &c);
/**
* Returns a single character from the ring buffer from the current cursor
* position and stays at that position.
*
* @param cursor specifies the cursor from which the data should be read.
* The cursor will be advanced by one byte.
* @param c is the character into which the data needs to be read.
* @return true if a character could be fetched, false if the end of the
* stream has been reached.
*/
bool fetch(CursorId cursor, char &c);
};
// Forward declaration
class CharReaderFork;
/**
* Used within parsers for convenient access to single characters in an input
* stream or buffer. It allows reading and peeking single characters from a
* buffer. Additionally it counts the current column/row (with correct handling
* for UTF-8) and contains an internal state machine that handles the detection
* of linebreaks and converts these to a single '\n'.
*/
class CharReader {
protected:
/**
* Internally used cursor structure for managing the read and the peek
* cursor.
*/
struct Cursor {
/**
* Corresponding cursor in the underlying buffer instance.
*/
const Buffer::CursorId cursor;
/**
* Current line the cursor is in.
*/
int line;
/**
* Current column the cursor is in.
*/
int column;
/**
* Constructor of the Cursor class.
*
* @param cursor is the underlying cursor in the Buffer instance.
* @param line is the line at which the cursor is positioned.
* @param column is the column at which the cursor is positioned.
*/
Cursor(Buffer::CursorId cursor, int line, int column)
: cursor(cursor), line(line), column(column)
{
}
/**
* Assigns one cursor to another.
*
* @param buffer is the underlying buffer instance the internal cursor
* belongs to.
* @param cursor is the cursor from which the state should be copied.
*/
void assign(std::shared_ptr buffer, Cursor &cursor);
};
private:
/**
* Substitutes "\r", "\n\r", "\r\n" with a single "\n".
*
* @param cursor is the cursor from which the character should be read.
* @param c a reference to the character that should be written.
* @return true if another character needs to be read.
*/
bool substituteLinebreaks(Cursor &cursor, char &c);
/**
* Reads a single character from the given cursor.
*
* @param cursor is the cursor from which the character should be read.
* @param c a reference to the character that should be written.
* @return true if a character was read, false if the end of the stream has
* been reached.
*/
bool readAtCursor(Cursor &cursor, char &c);
/**
* Returns the line the given cursor currently is in, but at most the
* given number of characters in the form of a Context structure.
*
* @param maxSize is the maximum length of the extracted context
* @param referenceCursor is a cursor in the internal buffer pointing at the
* location at which the context should be read.
*/
SourceContext getContextAt(ssize_t maxSize,
Buffer::CursorId referenceCursor);
/**
* Returns the line the at the given byte offset, but at most the
* given number of characters in the form of a Context structure.
*
* @param maxSize is the maximum length of the extracted context
* @param offs is the byte offset for which the context should be read.
* @return the context at the specified position or an empty (invalid)
* context if the context could not be read.
*/
SourceContext getContextAtOffs(ssize_t maxSize, size_t offs);
protected:
/**
* Reference pointing at the underlying buffer.
*/
std::shared_ptr buffer;
/**
* Cursor used for reading.
*/
Cursor readCursor;
/**
* Cursor used for peeking.
*/
Cursor peekCursor;
/**
* Set to true as long the underlying Buffer cursor is at the same position
* for the read and the peek cursor. This is only used for optimization
* purposes and makes consecutive reads a bit faster.
*/
bool coherent;
/**
* Protected constructor of the CharReader base class. Creates new read
* and peek cursors for the given buffer.
*
* @param buffer is a reference to the underlying Buffer class responsible
* for allowing to read from a single input stream from multiple locations.
*/
CharReader(std::shared_ptr buffer, size_t line, size_t column);
public:
/**
* Creates a new CharReader instance from a string.
*
* @param str is a string containing the input data.
* @param line is the start line.
* @param column is the start column.
*/
CharReader(const std::string &str, size_t line = 1, size_t column = 1);
/**
* Creates a new CharReader instance for an input stream.
*
* @param istream is the input stream from which incomming data should be
* read.
* @param line is the start line.
* @param column is the start column.
*/
CharReader(std::istream &istream, size_t line = 1, size_t column = 1);
/**
* Deletes the used cursors from the underlying buffer instance.
*/
~CharReader();
// No copy
CharReader(const Buffer &) = delete;
// No assign
CharReader &operator=(const Buffer &) = delete;
/**
* Peeks a single character. If called multiple times, returns the
* character after the previously peeked character.
*
* @param c is a reference to the character to which the result should be
* written.
* @return true if the character was successfully read, false if there are
* no more characters to be read in the buffer.
*/
bool peek(char &c);
/**
* Reads a character from the input data. If "peek" was called
* beforehand resets the peek pointer.
*
* @param c is a reference to the character to which the result should be
* written.
* @return true if the character was successfully read, false if there are
* no more characters to be read in the buffer.
*/
bool read(char &c);
/**
* Resets the peek pointer to the "read" pointer.
*/
void resetPeek();
/**
* Advances the read pointer to the peek pointer -- so if the "peek"
* function was called, "read" will now return the character after
* the last peeked character.
*/
void consumePeek();
/**
* Moves the read cursor to the next non-whitespace character. Returns
* false, if the end of the stream was reached.
*
* @return false if the end of the stream was reached, false othrwise.
*/
bool consumeWhitespace();
/**
* Creates a new CharReader located at the same position as this CharReader
* instance, yet the new CharReader can be used independently of this
* CharReader. Use the "commit" function of the returned CharReader to
* copy the state of the forked CharReaderFork to this CharReader.
*
* @return a CharReaderFork instance positioned at the same location as this
* CharReader instance.
*/
CharReaderFork fork();
/**
* Reads raw data from the CharReader without any processing. Data is always
* read from the read cursor.
*
* @param buf is the target memory buffer.
* @param size is the number of bytes to be read.
* @return the number of bytes read.
*/
size_t readRaw(char *buf, size_t size);
/**
* Returns true if there are no more characters as the stream was
* closed.
*
* @return true if there is no more data.
*/
bool atEnd() const { return buffer->atEnd(readCursor.cursor); }
/**
* Returns the offset of the read cursor in bytes.
*/
size_t getOffset() const { return buffer->offset(readCursor.cursor); }
/**
* Returns the line number the read cursor currently is at.
*/
int getLine() const { return readCursor.line; }
/**
* Returns the column the read cursor currently is at.
*/
int getColumn() const { return readCursor.column; }
/**
* Returns the current position of the read cursor (line and column).
*/
SourceLocation getLocation() const
{
return SourceLocation(getLine(), getColumn(), getOffset());
}
/**
* Returns the line the read cursor currently is in, but at most the
* given number of characters in the form of a Context structure.
*
* @param maxSize is the maximum length of the extracted context
*/
SourceContext getContext(ssize_t maxSize = 60);
/**
* Function that can be used to provide the context for a certain source
* location. A pointer to this function can be supplied to a Logger instance
* in the pushFile() method. The data should be set to a pointer to the
* CharReader instance.
*
* @param location is the location for which the context should be returned.
* Only the "offs" field within the location is used.
* @param data is a pointer pointing at a CharReader instance.
*/
static SourceContext contextCallback(const SourceLocation &location,
void *data);
};
/**
* A CharReaderFork is returned whenever the "fork" function of the CharReader
* class is used. Its "commit" function can be used to move the underlying
* CharReader instance to the location of the CharReaderFork instance. Otherwise
* the read location of the underlying CharReader is left unchanged.
*/
class CharReaderFork : public CharReader {
private:
friend CharReader;
/**
* The reader cursor of the underlying CharReader instance.
*/
CharReader::Cursor &parentReadCursor;
/**
* The peek cursor of the underlying CharReader instance.
*/
CharReader::Cursor &parentPeekCursor;
/**
* Constructor of the CharReaderFork class.
*
* @param buffer is a reference at the parent Buffer instance.
* @param parentPeekCursor is a reference at the parent read cursor.
* @param parentPeekCursor is a reference at the parent peek cursor.
* @param coherent specifies whether the char reader cursors are initialized
* coherently.
*/
CharReaderFork(std::shared_ptr buffer,
CharReader::Cursor &parentReadCursor,
CharReader::Cursor &parentPeekCursor, bool coherent);
public:
/**
* Moves the read and peek cursor of the parent CharReader to the location
* of the read and peek cursor in the fork.
*/
void commit();
};
}
#endif /* _OUSIA_CHAR_READER_HPP_ */