diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-11-24 00:54:18 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-11-24 00:54:18 +0100 |
commit | 6c132ad008d375e5929eae32beba0e5bfe48515c (patch) | |
tree | a00ec1ece0b37a650587deaa510d5397ec2c2d1e /src/core | |
parent | aaaf493e3cddcc2cb0797ca3fe7eca4f12a04453 (diff) |
added initial buffer and input stream mode to BufferedCharReader -- needs some further refactoring still (see TODO)
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/BufferedCharReader.cpp | 144 | ||||
-rw-r--r-- | src/core/BufferedCharReader.hpp | 140 |
2 files changed, 173 insertions, 111 deletions
diff --git a/src/core/BufferedCharReader.cpp b/src/core/BufferedCharReader.cpp index 15aa6c0..cf481df 100644 --- a/src/core/BufferedCharReader.cpp +++ b/src/core/BufferedCharReader.cpp @@ -16,6 +16,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <array> + #include "BufferedCharReader.hpp" namespace ousia { @@ -28,14 +30,18 @@ static const uint8_t LB_STATE_CR = 0x20; static const uint8_t LB_STATE_MASK_CNT = 0x0F; static const uint8_t LB_STATE_MASK_TYPE = 0xF0; -/******************************************************************************* - * Struct BufferedCharReader::ReadCursor - ******************************************************************************/ - -BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) : - destructive(destructive) +/* Struct BufferedCharReader::ReadCursor */ + +BufferedCharReader::ReadCursor::ReadCursor(unsigned int line, + unsigned int column, + bool destructive) + : line(line), + column(column), + bufferElem(0), + bufferPos(0), + destructive(destructive), + lbState(LB_STATE_NONE) { - reset(); } void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor) @@ -47,51 +53,50 @@ void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor) this->lbState = cursor.lbState; } -void BufferedCharReader::ReadCursor::reset() +/* Class BufferedCharReader */ + +BufferedCharReader::BufferedCharReader(int line, int column) + : inputStream(nullptr), + readCursor(line, column, true), + peekCursor(line, column, false), + depleted(false) { - this->line = 1; - this->column = 1; - this->bufferElem = 0; - this->bufferPos = 0; - this->lbState = LB_STATE_NONE; } -/******************************************************************************* - * Class BufferedCharReader - ******************************************************************************/ - -BufferedCharReader::BufferedCharReader() : - readCursor(true), peekCursor(false) +BufferedCharReader::BufferedCharReader(const std::string &str, int line, + int column) + : inputStream(nullptr), + readCursor(line, column, true), + peekCursor(line, column, false), + depleted(true) { - reset(); + buffer.push_back(str); } -void BufferedCharReader::reset() +BufferedCharReader::BufferedCharReader(std::istream &inputStream, int line, + int column) + : inputStream(&inputStream), + readCursor(line, column, true), + peekCursor(line, column, false), + depleted(false) { - readCursor.reset(); - peekCursor.reset(); - buffer.clear(); - closed = false; } -bool BufferedCharReader::feed(const std::string &data) +void BufferedCharReader::feed(const std::string &data) { - // Abort if the BufferedCharReader was closed - if (closed) { - return false; + if (!depleted && !inputStream) { + buffer.push_back(data); } - - // Append the data onto the queue - buffer.push_back(data); - return true; } void BufferedCharReader::close() { - closed = true; + if (!inputStream) { + depleted = true; + } } -bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) +bool BufferedCharReader::substituteLinebreaks(ReadCursor &cursor, char *c) { // Handle line breaks, inserts breakes after the following character // combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n @@ -100,11 +105,11 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR; // Read the last count and the last type from the state - const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT; - const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE; + const uint8_t lastCount = cursor.lbState & LB_STATE_MASK_CNT; + const uint8_t lastType = cursor.lbState & LB_STATE_MASK_TYPE; // Set the current linebreak type and counter in the state - cursor->lbState = ((lastCount + 1) & 1) | type; + cursor.lbState = ((lastCount + 1) & 1) | type; // If either this is the first instance of this character or the same // return character is repeated @@ -116,40 +121,61 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) } // Find the state - cursor->lbState = LB_STATE_NONE; + cursor.lbState = LB_STATE_NONE; return true; } -bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, - char *c) +bool BufferedCharReader::readCharacterAtCursor(ReadCursor &cursor, char *c) { bool hasChar = false; while (!hasChar) { // Abort if the current buffer element does not point to a valid entry - // in the buffer -- we must wait until another data block has been fed - // into the buffer - if (cursor->bufferElem >= buffer.size()) { - return false; + // in the buffer -- we must try to feed another data block into the + // internal buffer + if (cursor.bufferElem >= buffer.size()) { + // Abort if there is no more data or no input stream is set + if (depleted || !inputStream) { + return false; + } + + // Read a buffer of the specified size + constexpr std::streamsize BUFFER_SIZE = 1024; + std::array<char, BUFFER_SIZE> buf; + const std::streamsize cnt = + (*inputStream).read(buf.data(), BUFFER_SIZE).gcount(); + + // If data has been read, append it to the input buffer and try + // again + if (cnt > 0) { + buffer.emplace_back(buf.data()); + continue; + } + + // End of file handling + if (inputStream->fail() || inputStream->eof()) { + depleted = true; + return false; + } } // Fetch the current element the peek pointer points to - const std::string &data = buffer[cursor->bufferElem]; + const std::string &data = buffer[cursor.bufferElem]; // Handle the "no data" case -- either in a destructive or // non-destructive manner. - if (cursor->bufferPos >= data.length()) { - if (cursor->destructive) { + if (cursor.bufferPos >= data.length()) { + if (cursor.destructive) { buffer.pop_front(); } else { - cursor->bufferElem++; + cursor.bufferElem++; } - cursor->bufferPos = 0; + cursor.bufferPos = 0; continue; } // Read the character, advance the buffer position - *c = *(data.data() + cursor->bufferPos); - cursor->bufferPos++; + *c = *(data.data() + cursor.bufferPos); + cursor.bufferPos++; // Substitute linebreaks with a single LF (0x0A) hasChar = substituteLinebreaks(cursor, c); @@ -157,12 +183,12 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, // Update the position counter if (*c == '\n') { - cursor->line++; - cursor->column = 1; + cursor.line++; + cursor.column = 1; } else { // Ignore UTF-8 continuation bytes if (!((*c & 0x80) && !(*c & 0x40))) { - cursor->column++; + cursor.column++; } } @@ -171,13 +197,13 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, bool BufferedCharReader::peek(char *c) { - return readCharacterAtCursor(&peekCursor, c); + return readCharacterAtCursor(peekCursor, c); } bool BufferedCharReader::read(char *c) { resetPeek(); - return readCharacterAtCursor(&readCursor, c); + return readCharacterAtCursor(readCursor, c); } void BufferedCharReader::consumePeek() @@ -200,7 +226,7 @@ void BufferedCharReader::resetPeek() bool BufferedCharReader::atEnd() { - if (closed) { + if (depleted || !inputStream) { if (buffer.size() <= 0) { return true; } else if (buffer.size() == 1) { @@ -209,5 +235,5 @@ bool BufferedCharReader::atEnd() } return false; } - } + diff --git a/src/core/BufferedCharReader.hpp b/src/core/BufferedCharReader.hpp index 0d72347..ec76b03 100644 --- a/src/core/BufferedCharReader.hpp +++ b/src/core/BufferedCharReader.hpp @@ -16,30 +16,39 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +/** + * @file BufferedCharReader.hpp + * + * Contains the BufferedCharReader class which is used for reading/peeking + * single characters from an input stream or string. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + #ifndef _OUSIA_BUFFERED_CHAR_READER_H_ #define _OUSIA_BUFFERED_CHAR_READER_H_ #include <deque> #include <string> +#include <istream> #include <cstdint> namespace ousia { +// TODO: Better split this class into multiple classes with base class +// BufferedCharReader where each sub class represents one method of supplying +// the input data (feeding, initial string, input stream). + /** * The BufferedCharReader class is used for storing incomming data that * is fed into the pipeline as well as reading/peeking single characters * from that buffer. Additionally it counts the current column/row * (with correct handling for UTF-8) and contains an internal state - * machine that handles the detection of linebreaks. - * - * Additionally the BufferedCharReader performs the following tasks: - * 1. Convert the incomming character encoding to UTF-8 (TODO: implement) - * 2. Convert arbitrary linebreaks to a single "\n" + * machine that handles the detection of linebreaks and converts these to a + * single '\n'. */ class BufferedCharReader { - private: - /** * The ReadCursor structure is responsible for representing the read * position within the text an all state machine states belonging to the @@ -48,12 +57,6 @@ private: */ struct ReadCursor { /** - * Specifies whether this is a destructive cursor (bytes are discarded - * once they were read from the buffer). - */ - const bool destructive; - - /** * The line the cursor currently points to. */ unsigned int line; @@ -75,6 +78,12 @@ private: unsigned int bufferPos; /** + * Specifies whether this is a destructive cursor (bytes are discarded + * once they were read from the buffer). + */ + const bool destructive; + + /** * State variable used in the internal state machine of the * line feed detection. */ @@ -83,45 +92,50 @@ private: /** * Constructor of the ReadCursor structure. * + * @param line is the start line. + * @param column is the start column. * @param destructive specifies whether the ReadCursor is destructive * (consumes all read characters, as used in the "read cursor") or * non-destructive (as used in the "peek cursor"). */ - ReadCursor(const bool destructive); + ReadCursor(unsigned int line, unsigned int column, bool destructive); /** * Copys the data from another ReadCursor without overriding the * "destructive" flag. + * + * @param cursor is the cursor that should be copied. */ void assign(const ReadCursor &cursor); - - /** - * Resets the cursor without changing the "destructive" flag. - */ - void reset(); }; /** - * Queue containing the data that has been fed into the char reader. + * Pointer at an (optional) input stream used for reading a chunk of data + * whenever the input buffer depletes. */ - std::deque<std::string> buffer; + std::istream *inputStream; /** - * The read and the peek cursor. + * The read and the peek cursor. */ ReadCursor readCursor, peekCursor; /** - * Determines whether the reader has been closed. + * Set to true if there is no more input data. */ - bool closed; + bool depleted; + + /** + * Queue containing the data that has been fed into the char reader. + */ + std::deque<std::string> buffer; /** * Substitute any combination of linebreaks in the incomming code with "\n". * Returns true if the current character is meant as output, false * otherwise. */ - bool substituteLinebreaks(ReadCursor *cursor, char *c); + bool substituteLinebreaks(ReadCursor &cursor, char *c); /** * Reads a character from the input buffer and advances the given read @@ -137,7 +151,7 @@ private: * @param returns true if there was enough data in the buffer, false * otherwise. */ - bool readCharacterAtCursor(ReadCursor *cursor, char *c); + bool readCharacterAtCursor(ReadCursor &cursor, char *c); /** * Function that is called for each read character -- updates the row and @@ -148,31 +162,35 @@ private: public: /** - * Constructor of the buffered char reader class. + * Constructor of the buffered char reader class with empty buffer as input. + * This operates the BufferedCharReader in a mode where new data has to be + * fed using the "feed" function and explicitly closed using the "close" + * function. + * + * @param line is the start line. + * @param column is the start column. */ - BufferedCharReader(); + BufferedCharReader(int line = 1, int column = 1); - /** - * Resets the reader to its initial state. - */ - void reset(); /** - * Feeds new data into the internal buffer of the BufferedCharReader - * class. + * Constructor of the buffered char reader class with a string as input. * - * @param data is a string containing the data that should be - * appended to the internal buffer. - * @return true if the operation was successful, false otherwise (e.g. - * because the reader is closed). + * @param str is a string containing the input data. + * @param line is the start line. + * @param column is the start column. */ - bool feed(const std::string &data); + BufferedCharReader(const std::string &str, int line = 1, int column = 1); /** - * Marks the end of the input, allowing successors in the pipeline - * to react properly (e.g. creating the end of stream token). + * Constructor of the buffered char reader class with a string as input. + * + * @param inputStream is the input stream from which incomming data should + * be read. + * @param line is the start line. + * @param column is the start column. */ - void close(); + BufferedCharReader(std::istream &inputStream, int line = 1, int column = 1); /** * Peeks a single character. If called multiple times, returns the @@ -209,29 +227,47 @@ public: void resetPeek(); /** + * Feeds new data into the internal buffer of the BufferedCharReader + * class. Only applicable if the buffered char reader was constructed + * without an input stream or string. + * + * @param data is a string containing the data that should be + * appended to the internal buffer. + */ + void feed(const std::string &data); + + /** + * Tells the buffered char reader that no more data will be fed. + * Only applicable if the buffered char reader was constructed without an + * input stream or string. + * + * @param data is a string containing the data that should be + * appended to the internal buffer. + */ + void close(); + + /** * Returns true if there are no more characters as the stream was * closed. + * + * @return true if there is no more data. */ bool atEnd(); /** * Returns the current line (starting with one). + * + * @return the current line number. */ - inline int getLine() - { - return readCursor.line; - } + inline int getLine() { return readCursor.line; } /** * Returns the current column (starting with one). + * + * @return the current column number. */ - inline int getColumn() - { - return readCursor.column; - } - + inline int getColumn() { return readCursor.column; } }; - } #endif /* _OUSIA_BUFFERED_CHAR_READER_H_ */ |