diff options
Diffstat (limited to 'src/core/utils')
| -rw-r--r-- | src/core/utils/CharReader.cpp | 218 | ||||
| -rw-r--r-- | src/core/utils/CharReader.hpp | 295 | 
2 files changed, 507 insertions, 6 deletions
diff --git a/src/core/utils/CharReader.cpp b/src/core/utils/CharReader.cpp index bf25a01..12d0043 100644 --- a/src/core/utils/CharReader.cpp +++ b/src/core/utils/CharReader.cpp @@ -19,11 +19,31 @@  #include <algorithm>  #include <limits> +#include <core/Utils.hpp> +  #include "CharReader.hpp"  namespace ousia {  namespace utils { +/* Helper functions */ + +/** + * istreamReadCallback is used internally by the Buffer calss to stream data + * from an input stream. + * + * @param buf is points a the target memory region. + * @param size is the requested number of bytes. + * @param userData is a pointer at some user defined data. + * @return the actual number of bytes read. If the result is smaller than + * the requested size, this tells the Buffer that the end of the input + * stream is reached. + */ +static size_t istreamReadCallback(char *buf, size_t size, void *userData) +{ +	return (static_cast<std::istream *>(userData))->read(buf, size).gcount(); +} +  /* Class Buffer */  Buffer::Buffer(ReadCallback callback, void *userData) @@ -40,6 +60,8 @@ Buffer::Buffer(ReadCallback callback, void *userData)  	startBucket = buckets.begin();  } +Buffer::Buffer(std::istream &istream) : Buffer(istreamReadCallback, &istream) {} +  Buffer::Buffer(const std::string &str)      : callback(nullptr),        userData(nullptr), @@ -331,6 +353,202 @@ bool Buffer::read(Buffer::CursorId cursor, char &c)  		advance(cur.bucket);  	}  } + +/* CharReader::Cursor class */ + +void CharReader::Cursor::assign(std::shared_ptr<Buffer> buffer, +                                CharReader::Cursor &cursor) +{ +	// Copy the cursor position +	buffer->copyCursor(cursor.cursor, this->cursor); + +	// Copy the state +	line = cursor.line; +	column = cursor.column; +	state = cursor.state; +	lastLinebreak = cursor.lastLinebreak; +} + +/* CharReader class */ + +CharReader::CharReader(std::shared_ptr<Buffer> buffer, size_t line, +                       size_t column) +    : buffer(buffer), +      readCursor(buffer->createCursor(), line, column), +      peekCursor(buffer->createCursor(), line, column), +      coherent(true) +{ +} + +CharReader::CharReader(const std::string &str, size_t line, size_t column) +    : CharReader(std::shared_ptr<Buffer>{new Buffer{str}}, line, column) +{ +} + +CharReader::CharReader(std::istream &istream, size_t line, size_t column) +    : CharReader(std::shared_ptr<Buffer>{new Buffer{istream}}, line, column) +{ +} + +CharReader::~CharReader() +{ +	buffer->deleteCursor(readCursor.cursor); +	buffer->deleteCursor(peekCursor.cursor); +} + +bool CharReader::substituteLinebreaks(Cursor &cursor, char &c) +{ +	if (c == '\n' || c == '\r') { +		switch (cursor.state) { +			case LinebreakState::NONE: +				// We got a first linebreak character -- output a '\n' +				if (c == '\n') { +					cursor.state = LinebreakState::HAS_LF; +				} else { +					cursor.state = LinebreakState::HAS_CR; +				} +				c = '\n'; +				return true; +			case LinebreakState::HAS_LF: +				// If a LF is followed by a LF, output a new linefeed +				if (c == '\n') { +					cursor.state = LinebreakState::HAS_LF; +					return true; +				} + +				// Otherwise, don't handle this character (part of "\n\r") +				cursor.state = LinebreakState::NONE; +				return false; +			case LinebreakState::HAS_CR: +				// If a CR is followed by a CR, output a new linefeed +				if (c == '\r') { +					cursor.state = LinebreakState::HAS_CR; +					c = '\n'; +					return true; +				} + +				// Otherwise, don't handle this character (part of "\r\n") +				cursor.state = LinebreakState::NONE; +				return false; +		} +	} + +	// No linebreak character, reset the linebreak state +	cursor.state = LinebreakState::NONE; +	return true; +} + +bool CharReader::readAtCursor(Cursor &cursor, char &c) +{ +	while (true) { +		// Return false if we're at the end of the stream +		if (!buffer->read(cursor.cursor, c)) { +			return false; +		} + +		// Substitute linebreak characters with a single '\n' +		if (substituteLinebreaks(cursor, c)) { +			if (c == '\n') { +				// A linebreak was reached, go to the next line +				cursor.line++; +				cursor.column = 1; +				cursor.lastLinebreak = buffer->offset(cursor.cursor); +			} else { +				// Ignore UTF-8 continuation bytes +				if (!((c & 0x80) && !(c & 0x40))) { +					cursor.column++; +				} +			} + +			return true; +		} +	} +} + +bool CharReader::peek(char &c) +{ +	// If the reader was coherent, update the peek cursor state +	if (coherent) { +		peekCursor.assign(buffer, readCursor); +		coherent = false; +	} + +	// Read a character from the peek cursor +	return readAtCursor(peekCursor, c); +} + +bool CharReader::read(char &c) +{ +	// Read a character from the buffer at the current read cursor +	bool res = readAtCursor(readCursor, c); + +	// Set the peek position to the current read position, if reading was not +	// coherent +	if (!coherent) { +		peekCursor.assign(buffer, readCursor); +		coherent = true; +	} else { +		buffer->copyCursor(readCursor.cursor, peekCursor.cursor); +	} + +	// Return the result of the read function +	return res; +} + +void CharReader::resetPeek() +{ +	if (!coherent) { +		peekCursor.assign(buffer, readCursor); +		coherent = true; +	} +} + +void CharReader::consumePeek() +{ +	if (!coherent) { +		readCursor.assign(buffer, peekCursor); +		coherent = true; +	} +} + +bool CharReader::consumeWhitespace() +{ +	char c; +	while (peek(c)) { +		if (!Utils::isWhitespace(c)) { +			resetPeek(); +			return true; +		} +		consumePeek(); +	} +	return false; +} + +CharReaderFork CharReader::fork() +{ +	return CharReaderFork(buffer, readCursor, peekCursor, coherent); +} + +/* Class CharReaderFork */ + +CharReaderFork::CharReaderFork(std::shared_ptr<Buffer> buffer, +                               CharReader::Cursor &parentReadCursor, +                               CharReader::Cursor &parentPeekCursor, +                               bool coherent) +    : CharReader(buffer, 1, 1), +      parentReadCursor(parentReadCursor), +      parentPeekCursor(parentPeekCursor) +{ +	readCursor.assign(buffer, parentReadCursor); +	peekCursor.assign(buffer, parentPeekCursor); +	this->coherent = coherent; +} + +void CharReaderFork::commit() +{ +	parentReadCursor.assign(buffer, readCursor); +	parentPeekCursor.assign(buffer, peekCursor); +}  }  } diff --git a/src/core/utils/CharReader.hpp b/src/core/utils/CharReader.hpp index 8d97d39..3d4c894 100644 --- a/src/core/utils/CharReader.hpp +++ b/src/core/utils/CharReader.hpp @@ -27,7 +27,9 @@  #ifndef _OUSIA_CHAR_READER_HPP_  #define _OUSIA_CHAR_READER_HPP_ +#include <istream>  #include <list> +#include <memory>  #include <vector>  namespace ousia { @@ -45,9 +47,8 @@ public:  	 * Callback function which is called whenever new data is requested from the  	 * input stream.  	 * -	 * @param buf is a pointer at the memory region to which the data should be -	 * writtern. -	 * @param size is the size of the +	 * @param buf is points a the target memory region. +	 * @param size is the requested number of bytes.  	 * @param userData is a pointer at some user defined data given in the  	 * constructor.  	 * @return the actual number of bytes read. If the result is smaller than @@ -219,6 +220,14 @@ public:  	Buffer(ReadCallback callback, void *userData);  	/** +	 * Initializes the Buffer with a reference to an std::istream from which +	 * data will be read. +	 * +	 * @param istream is the input stream from which the data should be read. +	 */ +	Buffer(std::istream &istream); + +	/**  	 * Initializes the Buffer with the contents of the given string, after  	 * this operation the Buffer has a fixed size.  	 * @@ -266,14 +275,16 @@ public:  	/**  	 * Moves a cursor by offs bytes. Note that moving backwards is theoretically -	 * limited by the LOOKBACK_SIZE of the Buffer, practically it will most likely -	 * be limited by the REQUEST_SIZE, so you can got at most 64 KiB backwards. +	 * limited by the LOOKBACK_SIZE of the Buffer, practically it will most +	 * likely be limited by the REQUEST_SIZE, so you can got at most 64 KiB +	 * backwards.  	 *  	 * @param cursor is the cursor that should be moved.  	 * @param relativeOffs is a positive or negative integer number specifying  	 * the number of bytes the cursor should be moved forward (positive numbers)  	 * or backwards (negative numbers). -	 * @return the actual number of bytes the cursor was moved. +	 * @return the actual number of bytes the cursor was moved. This number is +	 * smaller than the relativeOffs given in the constructor if the  	 */  	ssize_t moveCursor(CursorId cursor, ssize_t relativeOffs); @@ -311,6 +322,278 @@ public:  	bool read(CursorId cursor, char &c);  }; +// Forward declaration +class CharReaderFork; + +/** + * Used within parsers for convenient access to single characters in an input + * stream or buffer. It allows reading and peeking single characters from a + * buffer. Additionally it counts the current column/row (with correct handling + * for UTF-8) and contains an internal state machine that handles the detection + * of linebreaks and converts these to a single '\n'. + */ +class CharReader { +protected: +	/** +	 * Enum to represent the current state of the internal state machine that +	 * replaces the linebreaks from multiple platforms to a single '\n'. +	 */ +	enum class LinebreakState { NONE, HAS_LF, HAS_CR }; + +	/** +	 * Internally used cursor structure for managing the read and the peek +	 * cursor. +	 */ +	struct Cursor { +		/** +		 * Corresponding cursor in the underlying buffer instance. +		 */ +		const Buffer::CursorId cursor; + +		/** +		 * Current line the cursor is in. +		 */ +		size_t line; + +		/** +		 * Current column the cursor is in. +		 */ +		size_t column; + +		/** +		 * State of the linebreak replacement statemachine. +		 */ +		LinebreakState state; + +		/** +		 * Contains the absolute offset in the input stream containing the +		 * position of the last linebreak. This is used for extracting the +		 * context (the line) in which an error occured. +		 */ +		size_t lastLinebreak; + +		/** +		 * Constructor of the Cursor class. +		 * +		 * @param cursor is the underlying cursor in the Buffer instance. +		 */ +		Cursor(Buffer::CursorId cursor, size_t line, size_t column) +		    : cursor(cursor), +		      line(line), +		      column(column), +		      state(LinebreakState::NONE), +		      lastLinebreak(0) +		{ +		} + +		/** +		 * Assigns one cursor to another. +		 * +		 * @param buffer is the underlying buffer instance the internal cursor +		 * belongs to. +		 * @param cursor is the cursor from which the state should be copied. +		 */ +		void assign(std::shared_ptr<Buffer> buffer, Cursor &cursor); +	}; + +private: +	/** +	 * Substitutes "\r", "\n\r", "\r\n" with a single "\n". +	 * +	 * @param cursor is the cursor from which the character should be read. +	 * @param c a reference to the character that should be written. +	 * @return true if another character needs to be read. +	 */ +	bool substituteLinebreaks(Cursor &cursor, char &c); + +	/** +	 * Reads a single character from the given cursor. +	 * +	 * @param cursor is the cursor from which the character should be read. +	 * @param c a reference to the character that should be written. +	 * @return true if a character was read, false if the end of the stream has +	 * been reached. +	 */ +	bool readAtCursor(Cursor &cursor, char &c); + +protected: +	/** +	 * Reference pointing at the underlying buffer. +	 */ +	std::shared_ptr<Buffer> buffer; + +	/** +	 * Cursor used for reading. +	 */ +	Cursor readCursor; + +	/** +	 * Cursor used for peeking. +	 */ +	Cursor peekCursor; + +	/** +	 * Set to true as long the underlying Buffer cursor is at the same position +	 * for the read and the peek cursor. +	 */ +	bool coherent; + +	/** +	 * Protected constructor of the CharReader base class. Creates new read +	 * and peek cursors for the given buffer. +	 * +	 * @param buffer is a reference to the underlying Buffer class responsible +	 * for allowing to read from a single input stream from multiple locations. +	 */ +	CharReader(std::shared_ptr<Buffer> buffer, size_t line, size_t column); + +public: +	/** +	 * Creates a new CharReader instance from a string. +	 * +	 * @param str is a string containing the input data. +	 * @param line is the start line. +	 * @param column is the start column. +	 */ +	CharReader(const std::string &str, size_t line = 1, size_t column = 1); + +	/** +	 * Creates a new CharReader instance for an input stream. +	 * +	 * @param istream is the input stream from which incomming data should be +	 * read. +	 * @param line is the start line. +	 * @param column is the start column. +	 */ +	CharReader(std::istream &istream, size_t line = 1, size_t column = 1); + +	/** +	 * Deletes the used cursors from the underlying buffer instance. +	 */ +	~CharReader(); + +	// No copy +	CharReader(const Buffer &) = delete; + +	// No assign +	CharReader &operator=(const Buffer &) = delete; + +	/** +	 * Peeks a single character. If called multiple times, returns the +	 * character after the previously peeked character. +	 * +	 * @param c is a reference to the character to which the result should be +	 * written. +	 * @return true if the character was successfully read, false if there are +	 * no more characters to be read in the buffer. +	 */ +	bool peek(char &c); + +	/** +	 * Reads a character from the input data. If "peek" was called +	 * beforehand resets the peek pointer. +	 * +	 * @param c is a reference to the character to which the result should be +	 * written. +	 * @return true if the character was successfully read, false if there are +	 * no more characters to be read in the buffer. +	 */ +	bool read(char &c); + +	/** +	 * Resets the peek pointer to the "read" pointer. +	 */ +	void resetPeek(); + +	/** +	 * Advances the read pointer to the peek pointer -- so if the "peek" +	 * function was called, "read" will now return the character after +	 * the last peeked character. +	 */ +	void consumePeek(); + +	/** +	 * Moves the read cursor to the next non-whitespace character. Returns +	 * false, if the end of the stream was reached. +	 * +	 * @return false if the end of the stream was reached, false othrwise. +	 */ +	bool consumeWhitespace(); + +	/** +	 * Creates a new CharReader located at the same position as this CharReader +	 * instance, yet the new CharReader can be used independently of this +	 * CharReader. Use the "commit" function of the returned CharReader to +	 * copy the state of the forked CharReaderFork to this CharReader. +	 * +	 * @return a CharReaderFork instance positioned at the same location as this +	 * CharReader instance. +	 */ +	CharReaderFork fork(); + +	/** +	 * Returns true if there are no more characters as the stream was +	 * closed. +	 * +	 * @return true if there is no more data. +	 */ +	bool atEnd() const { return buffer->atEnd(readCursor.cursor); } + +	/** +	 * Returns the current line (starting with one). +	 * +	 * @return the current line number. +	 */ +	size_t getLine() const { return readCursor.line; } + +	/** +	 * Returns the current column (starting with one). +	 * +	 * @return the current column number. +	 */ +	size_t getColumn() const { return readCursor.column; } +}; + +/** + * A CharReaderFork is returned whenever the "fork" function of the CharReader + * class is used. Its "commit" function can be used to move the underlying + * CharReader instance to the location of the CharReaderFork instance. Otherwise + * the read location of the underlying CharReader is left unchanged. + */ +class CharReaderFork : public CharReader { +private: +	friend CharReader; + +	/** +	 * The reader cursor of the underlying CharReader instance. +	 */ +	CharReader::Cursor &parentReadCursor; + +	/** +	 * The peek cursor of the underlying CharReader instance. +	 */ +	CharReader::Cursor &parentPeekCursor; + +	/** +	 * Constructor of the CharReaderFork class. +	 * +	 * @param buffer is a reference at the parent Buffer instance. +	 * @param parentPeekCursor is a reference at the parent read cursor. +	 * @param parentPeekCursor is a reference at the parent peek cursor. +	 * @param coherent specifies whether the char reader cursors are initialized +	 * coherently. +	 */ +	CharReaderFork(std::shared_ptr<Buffer> buffer, +	               CharReader::Cursor &parentReadCursor, +	               CharReader::Cursor &parentPeekCursor, bool coherent); + +public: +	/** +	 * Moves the read and peek cursor of the parent CharReader to the location +	 * of the read and peek cursor in the fork. +	 */ +	void commit(); +};  }  }  | 
