diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/BufferedCharReader.cpp | 144 | ||||
| -rw-r--r-- | src/core/BufferedCharReader.hpp | 140 | 
2 files changed, 173 insertions, 111 deletions
diff --git a/src/core/BufferedCharReader.cpp b/src/core/BufferedCharReader.cpp index 15aa6c0..cf481df 100644 --- a/src/core/BufferedCharReader.cpp +++ b/src/core/BufferedCharReader.cpp @@ -16,6 +16,8 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ +#include <array> +  #include "BufferedCharReader.hpp"  namespace ousia { @@ -28,14 +30,18 @@ static const uint8_t LB_STATE_CR = 0x20;  static const uint8_t LB_STATE_MASK_CNT = 0x0F;  static const uint8_t LB_STATE_MASK_TYPE = 0xF0; -/******************************************************************************* - * Struct BufferedCharReader::ReadCursor - ******************************************************************************/ - -BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) : -		destructive(destructive) +/* Struct BufferedCharReader::ReadCursor */ + +BufferedCharReader::ReadCursor::ReadCursor(unsigned int line, +                                           unsigned int column, +                                           bool destructive) +    : line(line), +      column(column), +      bufferElem(0), +      bufferPos(0), +      destructive(destructive), +      lbState(LB_STATE_NONE)  { -	reset();  }  void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor) @@ -47,51 +53,50 @@ void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor)  	this->lbState = cursor.lbState;  } -void BufferedCharReader::ReadCursor::reset() +/* Class BufferedCharReader */ + +BufferedCharReader::BufferedCharReader(int line, int column) +    : inputStream(nullptr), +      readCursor(line, column, true), +      peekCursor(line, column, false), +      depleted(false)  { -	this->line = 1; -	this->column = 1; -	this->bufferElem = 0; -	this->bufferPos = 0; -	this->lbState = LB_STATE_NONE;  } -/******************************************************************************* - * Class BufferedCharReader - ******************************************************************************/ - -BufferedCharReader::BufferedCharReader() : -	readCursor(true), peekCursor(false) +BufferedCharReader::BufferedCharReader(const std::string &str, int line, +                                       int column) +    : inputStream(nullptr), +      readCursor(line, column, true), +      peekCursor(line, column, false), +      depleted(true)  { -	reset(); +	buffer.push_back(str);  } -void BufferedCharReader::reset() +BufferedCharReader::BufferedCharReader(std::istream &inputStream, int line, +                                       int column) +    : inputStream(&inputStream), +      readCursor(line, column, true), +      peekCursor(line, column, false), +      depleted(false)  { -	readCursor.reset(); -	peekCursor.reset(); -	buffer.clear(); -	closed = false;  } -bool BufferedCharReader::feed(const std::string &data) +void BufferedCharReader::feed(const std::string &data)  { -	// Abort if the BufferedCharReader was closed -	if (closed) { -		return false; +	if (!depleted && !inputStream) { +		buffer.push_back(data);  	} - -	// Append the data onto the queue -	buffer.push_back(data); -	return true;  }  void BufferedCharReader::close()  { -	closed = true; +	if (!inputStream) { +		depleted = true; +	}  } -bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) +bool BufferedCharReader::substituteLinebreaks(ReadCursor &cursor, char *c)  {  	// Handle line breaks, inserts breakes after the following character  	// combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n @@ -100,11 +105,11 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)  		const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR;  		// Read the last count and the last type from the state -		const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT; -		const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE; +		const uint8_t lastCount = cursor.lbState & LB_STATE_MASK_CNT; +		const uint8_t lastType = cursor.lbState & LB_STATE_MASK_TYPE;  		// Set the current linebreak type and counter in the state -		cursor->lbState = ((lastCount + 1) & 1) | type; +		cursor.lbState = ((lastCount + 1) & 1) | type;  		// If either this is the first instance of this character or the same  		// return character is repeated @@ -116,40 +121,61 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)  	}  	// Find the state -	cursor->lbState = LB_STATE_NONE; +	cursor.lbState = LB_STATE_NONE;  	return true;  } -bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, -		char *c) +bool BufferedCharReader::readCharacterAtCursor(ReadCursor &cursor, char *c)  {  	bool hasChar = false;  	while (!hasChar) {  		// Abort if the current buffer element does not point to a valid entry -		// in the buffer -- we must wait until another data block has been fed -		// into the buffer -		if (cursor->bufferElem >= buffer.size()) { -			return false; +		// in the buffer -- we must try to feed another data block into the +		// internal buffer +		if (cursor.bufferElem >= buffer.size()) { +			// Abort if there is no more data or no input stream is set +			if (depleted || !inputStream) { +				return false; +			} + +			// Read a buffer of the specified size +			constexpr std::streamsize BUFFER_SIZE = 1024; +			std::array<char, BUFFER_SIZE> buf; +			const std::streamsize cnt = +			    (*inputStream).read(buf.data(), BUFFER_SIZE).gcount(); + +			// If data has been read, append it to the input buffer and try +			// again +			if (cnt > 0) { +				buffer.emplace_back(buf.data()); +				continue; +			} + +			// End of file handling +			if (inputStream->fail() || inputStream->eof()) { +				depleted = true; +				return false; +			}  		}  		// Fetch the current element the peek pointer points to -		const std::string &data = buffer[cursor->bufferElem]; +		const std::string &data = buffer[cursor.bufferElem];  		// Handle the "no data" case -- either in a destructive or  		// non-destructive manner. -		if (cursor->bufferPos >= data.length()) { -			if (cursor->destructive) { +		if (cursor.bufferPos >= data.length()) { +			if (cursor.destructive) {  				buffer.pop_front();  			} else { -				cursor->bufferElem++; +				cursor.bufferElem++;  			} -			cursor->bufferPos = 0; +			cursor.bufferPos = 0;  			continue;  		}  		// Read the character, advance the buffer position -		*c = *(data.data() + cursor->bufferPos); -		cursor->bufferPos++; +		*c = *(data.data() + cursor.bufferPos); +		cursor.bufferPos++;  		// Substitute linebreaks with a single LF (0x0A)  		hasChar = substituteLinebreaks(cursor, c); @@ -157,12 +183,12 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,  	// Update the position counter  	if (*c == '\n') { -		cursor->line++; -		cursor->column = 1; +		cursor.line++; +		cursor.column = 1;  	} else {  		// Ignore UTF-8 continuation bytes  		if (!((*c & 0x80) && !(*c & 0x40))) { -			cursor->column++; +			cursor.column++;  		}  	} @@ -171,13 +197,13 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,  bool BufferedCharReader::peek(char *c)  { -	return readCharacterAtCursor(&peekCursor, c); +	return readCharacterAtCursor(peekCursor, c);  }  bool BufferedCharReader::read(char *c)  {  	resetPeek(); -	return readCharacterAtCursor(&readCursor, c); +	return readCharacterAtCursor(readCursor, c);  }  void BufferedCharReader::consumePeek() @@ -200,7 +226,7 @@ void BufferedCharReader::resetPeek()  bool BufferedCharReader::atEnd()  { -	if (closed) { +	if (depleted || !inputStream) {  		if (buffer.size() <= 0) {  			return true;  		} else if (buffer.size() == 1) { @@ -209,5 +235,5 @@ bool BufferedCharReader::atEnd()  	}  	return false;  } -  } + diff --git a/src/core/BufferedCharReader.hpp b/src/core/BufferedCharReader.hpp index 0d72347..ec76b03 100644 --- a/src/core/BufferedCharReader.hpp +++ b/src/core/BufferedCharReader.hpp @@ -16,30 +16,39 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ +/** + * @file BufferedCharReader.hpp + * + * Contains the BufferedCharReader class which is used for reading/peeking + * single characters from an input stream or string. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ +  #ifndef _OUSIA_BUFFERED_CHAR_READER_H_  #define _OUSIA_BUFFERED_CHAR_READER_H_  #include <deque>  #include <string> +#include <istream>  #include <cstdint>  namespace ousia { +// TODO: Better split this class into multiple classes with base class +// BufferedCharReader where each sub class represents one method of supplying +// the input data (feeding, initial string, input stream). +  /**   * The BufferedCharReader class is used for storing incomming data that   * is fed into the pipeline as well as reading/peeking single characters   * from that buffer. Additionally it counts the current column/row   * (with correct handling for UTF-8) and contains an internal state - * machine that handles the detection of linebreaks. - * - * Additionally the BufferedCharReader performs the following tasks: - * 1. Convert the incomming character encoding to UTF-8 (TODO: implement) - * 2. Convert arbitrary linebreaks to a single "\n" + * machine that handles the detection of linebreaks and converts these to a + * single '\n'.   */  class BufferedCharReader { -  private: -  	/**  	 * The ReadCursor structure is responsible for representing the read  	 * position within the text an all state machine states belonging to the @@ -48,12 +57,6 @@ private:  	 */  	struct ReadCursor {  		/** -		 * Specifies whether this is a destructive cursor (bytes are discarded -		 * once they were read from the buffer). -		 */ -		const bool destructive; - -		/**  		 * The line the cursor currently points to.  		 */  		unsigned int line; @@ -75,6 +78,12 @@ private:  		unsigned int bufferPos;  		/** +		 * Specifies whether this is a destructive cursor (bytes are discarded +		 * once they were read from the buffer). +		 */ +		const bool destructive; + +		/**  		 * State variable used in the internal state machine of the  		 * line feed detection.  		 */ @@ -83,45 +92,50 @@ private:  		/**  		 * Constructor of the ReadCursor structure.  		 * +		 * @param line is the start line. +		 * @param column is the start column.  		 * @param destructive specifies whether the ReadCursor is destructive  		 * (consumes all read characters, as used in the "read cursor") or  		 * non-destructive (as used in the "peek cursor").  		 */ -		ReadCursor(const bool destructive); +		ReadCursor(unsigned int line, unsigned int column, bool destructive);  		/**  		 * Copys the data from another ReadCursor without overriding the  		 * "destructive" flag. +		 * +		 * @param cursor is the cursor that should be copied.  		 */  		void assign(const ReadCursor &cursor); - -		/** -		 * Resets the cursor without changing the "destructive" flag. -		 */ -		void reset();  	};  	/** -	 * Queue containing the data that has been fed into the char reader. +	 * Pointer at an (optional) input stream used for reading a chunk of data +	 * whenever the input buffer depletes.  	 */ -	std::deque<std::string> buffer; +	std::istream *inputStream;  	/** -	 * The read and the peek cursor.  +	 * The read and the peek cursor.  	 */  	ReadCursor readCursor, peekCursor;  	/** -	 * Determines whether the reader has been closed. +	 * Set to true if there is no more input data.  	 */ -	bool closed; +	bool depleted; + +	/** +	 * Queue containing the data that has been fed into the char reader. +	 */ +	std::deque<std::string> buffer;  	/**  	 * Substitute any combination of linebreaks in the incomming code with "\n".  	 * Returns true if the current character is meant as output, false  	 * otherwise.  	 */ -	bool substituteLinebreaks(ReadCursor *cursor, char *c); +	bool substituteLinebreaks(ReadCursor &cursor, char *c);  	/**  	 * Reads a character from the input buffer and advances the given read @@ -137,7 +151,7 @@ private:  	 * @param returns true if there was enough data in the buffer, false  	 * otherwise.  	 */ -	bool readCharacterAtCursor(ReadCursor *cursor, char *c); +	bool readCharacterAtCursor(ReadCursor &cursor, char *c);  	/**  	 * Function that is called for each read character -- updates the row and @@ -148,31 +162,35 @@ private:  public:  	/** -	 * Constructor of the buffered char reader class. +	 * Constructor of the buffered char reader class with empty buffer as input. +	 * This operates the BufferedCharReader in a mode where new data has to be +	 * fed using the "feed" function and explicitly closed using the "close" +	 * function. +	 * +	 * @param line is the start line. +	 * @param column is the start column.  	 */ -	BufferedCharReader(); +	BufferedCharReader(int line = 1, int column = 1); -	/** -	 * Resets the reader to its initial state. -	 */ -	void reset();  	/** -	 * Feeds new data into the internal buffer of the BufferedCharReader -	 * class. +	 * Constructor of the buffered char reader class with a string as input.  	 * -	 * @param data is a string containing the data that should be -	 * appended to the internal buffer. -	 * @return true if the operation was successful, false otherwise (e.g. -	 * because the reader is closed). +	 * @param str is a string containing the input data. +	 * @param line is the start line. +	 * @param column is the start column.  	 */ -	bool feed(const std::string &data); +	BufferedCharReader(const std::string &str, int line = 1, int column = 1);  	/** -	 * Marks the end of the input, allowing successors in the pipeline -	 * to react properly (e.g. creating the end of stream token). +	 * Constructor of the buffered char reader class with a string as input. +	 * +	 * @param inputStream is the input stream from which incomming data should +	 * be read. +	 * @param line is the start line. +	 * @param column is the start column.  	 */ -	void close(); +	BufferedCharReader(std::istream &inputStream, int line = 1, int column = 1);  	/**  	 * Peeks a single character. If called multiple times, returns the @@ -209,29 +227,47 @@ public:  	void resetPeek();  	/** +	 * Feeds new data into the internal buffer of the BufferedCharReader +	 * class. Only applicable if the buffered char reader was constructed +	 * without an input stream or string. +	 * +	 * @param data is a string containing the data that should be +	 * appended to the internal buffer. +	 */ +	void feed(const std::string &data); + +	/** +	 * Tells the buffered char reader that no more data will be fed. +	 * Only applicable if the buffered char reader was constructed without an +	 * input stream or string. +	 * +	 * @param data is a string containing the data that should be +	 * appended to the internal buffer. +	 */ +	void close(); + +	/**  	 * Returns true if there are no more characters as the stream was  	 * closed. +	 * +	 * @return true if there is no more data.  	 */  	bool atEnd();  	/**  	 * Returns the current line (starting with one). +	 * +	 * @return the current line number.  	 */ -	inline int getLine() -	{ -		return readCursor.line; -	} +	inline int getLine() { return readCursor.line; }  	/**  	 * Returns the current column (starting with one). +	 * +	 * @return the current column number.  	 */ -	inline int getColumn() -	{ -		return readCursor.column; -	} - +	inline int getColumn() { return readCursor.column; }  }; -  }  #endif /* _OUSIA_BUFFERED_CHAR_READER_H_ */  | 
