/*
Ousía
Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file BufferedCharReader.hpp
*
* Contains the BufferedCharReader class which is used for reading/peeking
* single characters from an input stream or string.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_BUFFERED_CHAR_READER_H_
#define _OUSIA_BUFFERED_CHAR_READER_H_
#include <cstdint>
#include <deque>
#include <istream>
#include <string>
namespace ousia {
// TODO: Better split this class into multiple classes with base class
// BufferedCharReader where each sub class represents one method of supplying
// the input data (feeding, initial string, input stream).
/**
* The BufferedCharReader class is used for storing incoming data that
* is fed into the pipeline as well as reading/peeking single characters
* from that buffer. Additionally it counts the current column/row
* (with correct handling for UTF-8) and contains an internal state
* machine that handles the detection of linebreaks and converts these to a
* single '\n'.
*/
class BufferedCharReader {
private:
    /**
     * The ReadCursor structure is responsible for representing the read
     * position within the text and all state machine states belonging to the
     * cursor. There are two types of read cursors: destructive and
     * non-destructive read cursors.
     */
    struct ReadCursor {
        /**
         * The line the cursor currently points to.
         */
        unsigned int line;

        /**
         * The column the cursor currently points to.
         */
        unsigned int column;

        /**
         * The index of the element in the data buffer we're currently reading
         * from.
         */
        unsigned int bufferElem;

        /**
         * The byte position within this data buffer element.
         */
        unsigned int bufferPos;

        /**
         * Specifies whether this is a destructive cursor (bytes are discarded
         * once they were read from the buffer). The flag is fixed at
         * construction time and can never be changed afterwards.
         */
        const bool destructive;

        /**
         * State variable used in the internal state machine of the
         * line feed detection.
         */
        uint8_t lbState;

        /**
         * Constructor of the ReadCursor structure.
         *
         * @param line is the start line.
         * @param column is the start column.
         * @param destructive specifies whether the ReadCursor is destructive
         * (consumes all read characters, as used in the "read cursor") or
         * non-destructive (as used in the "peek cursor").
         */
        ReadCursor(unsigned int line, unsigned int column, bool destructive);

        /**
         * Copies the data from another ReadCursor without overriding the
         * "destructive" flag. Because "destructive" is a const member, the
         * compiler cannot generate a copy assignment operator for this
         * structure; this function is the way to transfer a cursor position.
         *
         * @param cursor is the cursor that should be copied.
         */
        void assign(const ReadCursor &cursor);
    };

    /**
     * Pointer at an (optional) input stream used for reading a chunk of data
     * whenever the input buffer depletes.
     */
    std::istream *inputStream;

    /**
     * The read and the peek cursor.
     */
    ReadCursor readCursor, peekCursor;

    /**
     * Set to true if there is no more input data.
     */
    bool depleted;

    /**
     * Queue containing the data that has been fed into the char reader.
     * ReadCursor::bufferElem selects an element of this queue and
     * ReadCursor::bufferPos a byte within that element.
     */
    std::deque<std::string> buffer;

    /**
     * Substitute any combination of linebreaks in the incoming code with "\n".
     *
     * @param cursor is the cursor whose line feed state machine should be
     * used.
     * @param c points at the current character.
     * @return true if the current character is meant as output, false
     * otherwise.
     */
    bool substituteLinebreaks(ReadCursor &cursor, char *c);

    /**
     * Reads a character from the input buffer and advances the given read
     * cursor.
     *
     * NOTE(review): earlier revisions of this comment described a "hasChar"
     * output parameter (signalling that the call has to be repeated after a
     * line break substitution or at a buffer boundary). No such parameter
     * exists in this signature -- confirm against the implementation how
     * these cases are reported.
     *
     * @param cursor is a reference to the read cursor that should be used
     * for reading.
     * @param c is an output parameter, which will be set to the read
     * character.
     * @return true if there was enough data in the buffer, false otherwise.
     */
    bool readCharacterAtCursor(ReadCursor &cursor, char *c);

    /**
     * Function that is called for each read character -- updates the row and
     * column count.
     *
     * @param c is the character that has been read.
     */
    void updatePositionCounters(const char c);

public:
    /**
     * Constructor of the buffered char reader class with empty buffer as input.
     * This operates the BufferedCharReader in a mode where new data has to be
     * fed using the "feed" function and explicitly closed using the "close"
     * function.
     *
     * @param line is the start line.
     * @param column is the start column.
     */
    BufferedCharReader(int line = 1, int column = 1);

    /**
     * Constructor of the buffered char reader class with a string as input.
     *
     * @param str is a string containing the input data.
     * @param line is the start line.
     * @param column is the start column.
     */
    BufferedCharReader(const std::string &str, int line, int column);

    /**
     * Constructor of the buffered char reader class with a string as input.
     *
     * @param str is a string containing the input data.
     */
    BufferedCharReader(const std::string &str);

    /**
     * Constructor of the buffered char reader class with an input stream as
     * input.
     *
     * @param inputStream is the input stream from which incoming data should
     * be read.
     * @param line is the start line.
     * @param column is the start column.
     */
    BufferedCharReader(std::istream &inputStream, int line = 1, int column = 1);

    /**
     * Peeks a single character. If called multiple times, returns the
     * character after the previously peeked character.
     *
     * @param c is a pointer to the character to which the result should be
     * written.
     * @return true if the character was successfully read, false if there are
     * no more characters to be read in the buffer.
     */
    bool peek(char *c);

    /**
     * Reads a character from the input data. If "peek" was called
     * beforehand resets the peek pointer.
     *
     * @param c is a pointer to the character to which the result should be
     * written.
     * @return true if the character was successfully read, false if there are
     * no more characters to be read in the buffer.
     */
    bool read(char *c);

    /**
     * Advances the read pointer to the peek pointer -- so if the "peek"
     * function was called, "read" will now return the character after
     * the last peeked character.
     */
    void consumePeek();

    /**
     * Moves the read cursor to the next non-whitespace character.
     *
     * @return false if the end of the stream was reached, true otherwise.
     */
    bool consumeWhitespace();

    /**
     * Resets the peek pointer to the "read" pointer.
     */
    void resetPeek();

    /**
     * Feeds new data into the internal buffer of the BufferedCharReader
     * class. Only applicable if the buffered char reader was constructed
     * without an input stream or string.
     *
     * @param data is a string containing the data that should be
     * appended to the internal buffer.
     */
    void feed(const std::string &data);

    /**
     * Tells the buffered char reader that no more data will be fed.
     * Only applicable if the buffered char reader was constructed without an
     * input stream or string.
     */
    void close();

    /**
     * Returns true if there are no more characters as the stream was
     * closed.
     *
     * @return true if there is no more data.
     */
    bool atEnd() const;

    /**
     * Returns the current line (starting with one) of the read cursor.
     *
     * @return the current line number.
     */
    int getLine() const { return readCursor.line; }

    /**
     * Returns the current column (starting with one) of the read cursor.
     *
     * @return the current column number.
     */
    int getColumn() const { return readCursor.column; }
};
}
#endif /* _OUSIA_BUFFERED_CHAR_READER_H_ */