summaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2014-11-24 00:54:18 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2014-11-24 00:54:18 +0100
commit6c132ad008d375e5929eae32beba0e5bfe48515c (patch)
treea00ec1ece0b37a650587deaa510d5397ec2c2d1e /src/core
parentaaaf493e3cddcc2cb0797ca3fe7eca4f12a04453 (diff)
added initial buffer and input stream mode to BufferedCharReader -- needs some further refactoring still (see TODO)
Diffstat (limited to 'src/core')
-rw-r--r--src/core/BufferedCharReader.cpp144
-rw-r--r--src/core/BufferedCharReader.hpp140
2 files changed, 173 insertions, 111 deletions
diff --git a/src/core/BufferedCharReader.cpp b/src/core/BufferedCharReader.cpp
index 15aa6c0..cf481df 100644
--- a/src/core/BufferedCharReader.cpp
+++ b/src/core/BufferedCharReader.cpp
@@ -16,6 +16,8 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <array>
+
#include "BufferedCharReader.hpp"
namespace ousia {
@@ -28,14 +30,18 @@ static const uint8_t LB_STATE_CR = 0x20;
static const uint8_t LB_STATE_MASK_CNT = 0x0F;
static const uint8_t LB_STATE_MASK_TYPE = 0xF0;
-/*******************************************************************************
- * Struct BufferedCharReader::ReadCursor
- ******************************************************************************/
-
-BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) :
- destructive(destructive)
+/* Struct BufferedCharReader::ReadCursor */
+
+BufferedCharReader::ReadCursor::ReadCursor(unsigned int line,
+ unsigned int column,
+ bool destructive)
+ : line(line),
+ column(column),
+ bufferElem(0),
+ bufferPos(0),
+ destructive(destructive),
+ lbState(LB_STATE_NONE)
{
- reset();
}
void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor)
@@ -47,51 +53,50 @@ void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor)
this->lbState = cursor.lbState;
}
-void BufferedCharReader::ReadCursor::reset()
+/* Class BufferedCharReader */
+
+BufferedCharReader::BufferedCharReader(int line, int column)
+ : inputStream(nullptr),
+ readCursor(line, column, true),
+ peekCursor(line, column, false),
+ depleted(false)
{
- this->line = 1;
- this->column = 1;
- this->bufferElem = 0;
- this->bufferPos = 0;
- this->lbState = LB_STATE_NONE;
}
-/*******************************************************************************
- * Class BufferedCharReader
- ******************************************************************************/
-
-BufferedCharReader::BufferedCharReader() :
- readCursor(true), peekCursor(false)
+BufferedCharReader::BufferedCharReader(const std::string &str, int line,
+ int column)
+ : inputStream(nullptr),
+ readCursor(line, column, true),
+ peekCursor(line, column, false),
+ depleted(true)
{
- reset();
+ buffer.push_back(str);
}
-void BufferedCharReader::reset()
+BufferedCharReader::BufferedCharReader(std::istream &inputStream, int line,
+ int column)
+ : inputStream(&inputStream),
+ readCursor(line, column, true),
+ peekCursor(line, column, false),
+ depleted(false)
{
- readCursor.reset();
- peekCursor.reset();
- buffer.clear();
- closed = false;
}
-bool BufferedCharReader::feed(const std::string &data)
+void BufferedCharReader::feed(const std::string &data)
{
- // Abort if the BufferedCharReader was closed
- if (closed) {
- return false;
+ if (!depleted && !inputStream) {
+ buffer.push_back(data);
}
-
- // Append the data onto the queue
- buffer.push_back(data);
- return true;
}
void BufferedCharReader::close()
{
- closed = true;
+ if (!inputStream) {
+ depleted = true;
+ }
}
-bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)
+bool BufferedCharReader::substituteLinebreaks(ReadCursor &cursor, char *c)
{
// Handle line breaks, inserts breaks after the following character
// combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n
@@ -100,11 +105,11 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)
const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR;
// Read the last count and the last type from the state
- const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT;
- const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE;
+ const uint8_t lastCount = cursor.lbState & LB_STATE_MASK_CNT;
+ const uint8_t lastType = cursor.lbState & LB_STATE_MASK_TYPE;
// Set the current linebreak type and counter in the state
- cursor->lbState = ((lastCount + 1) & 1) | type;
+ cursor.lbState = ((lastCount + 1) & 1) | type;
// If either this is the first instance of this character or the same
// return character is repeated
@@ -116,40 +121,61 @@ bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)
}
// Reset the state
- cursor->lbState = LB_STATE_NONE;
+ cursor.lbState = LB_STATE_NONE;
return true;
}
-bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,
- char *c)
+bool BufferedCharReader::readCharacterAtCursor(ReadCursor &cursor, char *c)
{
bool hasChar = false;
while (!hasChar) {
// Abort if the current buffer element does not point to a valid entry
- // in the buffer -- we must wait until another data block has been fed
- // into the buffer
- if (cursor->bufferElem >= buffer.size()) {
- return false;
+ // in the buffer -- we must try to feed another data block into the
+ // internal buffer
+ if (cursor.bufferElem >= buffer.size()) {
+ // Abort if there is no more data or no input stream is set
+ if (depleted || !inputStream) {
+ return false;
+ }
+
+ // Read a buffer of the specified size
+ constexpr std::streamsize BUFFER_SIZE = 1024;
+ std::array<char, BUFFER_SIZE> buf;
+ const std::streamsize cnt =
+ (*inputStream).read(buf.data(), BUFFER_SIZE).gcount();
+
+ // If data has been read, append exactly cnt bytes to the input buffer
+ // (buf is not NUL-terminated by istream::read) and try again
+ if (cnt > 0) {
+ buffer.emplace_back(buf.data(), static_cast<std::size_t>(cnt));
+ continue;
+ }
+
+ // End of file handling
+ if (inputStream->fail() || inputStream->eof()) {
+ depleted = true;
+ return false;
+ }
}
// Fetch the current element the peek pointer points to
- const std::string &data = buffer[cursor->bufferElem];
+ const std::string &data = buffer[cursor.bufferElem];
// Handle the "no data" case -- either in a destructive or
// non-destructive manner.
- if (cursor->bufferPos >= data.length()) {
- if (cursor->destructive) {
+ if (cursor.bufferPos >= data.length()) {
+ if (cursor.destructive) {
buffer.pop_front();
} else {
- cursor->bufferElem++;
+ cursor.bufferElem++;
}
- cursor->bufferPos = 0;
+ cursor.bufferPos = 0;
continue;
}
// Read the character, advance the buffer position
- *c = *(data.data() + cursor->bufferPos);
- cursor->bufferPos++;
+ *c = *(data.data() + cursor.bufferPos);
+ cursor.bufferPos++;
// Substitute linebreaks with a single LF (0x0A)
hasChar = substituteLinebreaks(cursor, c);
@@ -157,12 +183,12 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,
// Update the position counter
if (*c == '\n') {
- cursor->line++;
- cursor->column = 1;
+ cursor.line++;
+ cursor.column = 1;
} else {
// Ignore UTF-8 continuation bytes
if (!((*c & 0x80) && !(*c & 0x40))) {
- cursor->column++;
+ cursor.column++;
}
}
@@ -171,13 +197,13 @@ bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,
bool BufferedCharReader::peek(char *c)
{
- return readCharacterAtCursor(&peekCursor, c);
+ return readCharacterAtCursor(peekCursor, c);
}
bool BufferedCharReader::read(char *c)
{
resetPeek();
- return readCharacterAtCursor(&readCursor, c);
+ return readCharacterAtCursor(readCursor, c);
}
void BufferedCharReader::consumePeek()
@@ -200,7 +226,7 @@ void BufferedCharReader::resetPeek()
bool BufferedCharReader::atEnd()
{
- if (closed) {
+ if (depleted || !inputStream) {
if (buffer.size() <= 0) {
return true;
} else if (buffer.size() == 1) {
@@ -209,5 +235,5 @@ bool BufferedCharReader::atEnd()
}
return false;
}
-
}
+
diff --git a/src/core/BufferedCharReader.hpp b/src/core/BufferedCharReader.hpp
index 0d72347..ec76b03 100644
--- a/src/core/BufferedCharReader.hpp
+++ b/src/core/BufferedCharReader.hpp
@@ -16,30 +16,39 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/**
+ * @file BufferedCharReader.hpp
+ *
+ * Contains the BufferedCharReader class which is used for reading/peeking
+ * single characters from an input stream or string.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
#ifndef _OUSIA_BUFFERED_CHAR_READER_H_
#define _OUSIA_BUFFERED_CHAR_READER_H_
#include <deque>
#include <string>
+#include <istream>
#include <cstdint>
namespace ousia {
+// TODO: Better split this class into multiple classes with base class
+// BufferedCharReader where each sub class represents one method of supplying
+// the input data (feeding, initial string, input stream).
+
/**
* The BufferedCharReader class is used for storing incoming data that
* is fed into the pipeline as well as reading/peeking single characters
* from that buffer. Additionally it counts the current column/row
* (with correct handling for UTF-8) and contains an internal state
- * machine that handles the detection of linebreaks.
- *
- * Additionally the BufferedCharReader performs the following tasks:
- * 1. Convert the incomming character encoding to UTF-8 (TODO: implement)
- * 2. Convert arbitrary linebreaks to a single "\n"
+ * machine that handles the detection of linebreaks and converts these to a
+ * single '\n'.
*/
class BufferedCharReader {
-
private:
-
/**
* The ReadCursor structure is responsible for representing the read
* position within the text and all state machine states belonging to the
@@ -48,12 +57,6 @@ private:
*/
struct ReadCursor {
/**
- * Specifies whether this is a destructive cursor (bytes are discarded
- * once they were read from the buffer).
- */
- const bool destructive;
-
- /**
* The line the cursor currently points to.
*/
unsigned int line;
@@ -75,6 +78,12 @@ private:
unsigned int bufferPos;
/**
+ * Specifies whether this is a destructive cursor (bytes are discarded
+ * once they were read from the buffer).
+ */
+ const bool destructive;
+
+ /**
* State variable used in the internal state machine of the
* line feed detection.
*/
@@ -83,45 +92,50 @@ private:
/**
* Constructor of the ReadCursor structure.
*
+ * @param line is the start line.
+ * @param column is the start column.
* @param destructive specifies whether the ReadCursor is destructive
* (consumes all read characters, as used in the "read cursor") or
* non-destructive (as used in the "peek cursor").
*/
- ReadCursor(const bool destructive);
+ ReadCursor(unsigned int line, unsigned int column, bool destructive);
/**
* Copies the data from another ReadCursor without overriding the
* "destructive" flag.
+ *
+ * @param cursor is the cursor that should be copied.
*/
void assign(const ReadCursor &cursor);
-
- /**
- * Resets the cursor without changing the "destructive" flag.
- */
- void reset();
};
/**
- * Queue containing the data that has been fed into the char reader.
+ * Pointer at an (optional) input stream used for reading a chunk of data
+ * whenever the input buffer depletes.
*/
- std::deque<std::string> buffer;
+ std::istream *inputStream;
/**
- * The read and the peek cursor.
+ * The read and the peek cursor.
*/
ReadCursor readCursor, peekCursor;
/**
- * Determines whether the reader has been closed.
+ * Set to true if there is no more input data.
*/
- bool closed;
+ bool depleted;
+
+ /**
+ * Queue containing the data that has been fed into the char reader.
+ */
+ std::deque<std::string> buffer;
/**
* Substitute any combination of linebreaks in the incoming code with "\n".
* Returns true if the current character is meant as output, false
* otherwise.
*/
- bool substituteLinebreaks(ReadCursor *cursor, char *c);
+ bool substituteLinebreaks(ReadCursor &cursor, char *c);
/**
* Reads a character from the input buffer and advances the given read
@@ -137,7 +151,7 @@ private:
* @return true if there was enough data in the buffer, false
* otherwise.
*/
- bool readCharacterAtCursor(ReadCursor *cursor, char *c);
+ bool readCharacterAtCursor(ReadCursor &cursor, char *c);
/**
* Function that is called for each read character -- updates the row and
@@ -148,31 +162,35 @@ private:
public:
/**
- * Constructor of the buffered char reader class.
+ * Constructor of the buffered char reader class with empty buffer as input.
+ * This operates the BufferedCharReader in a mode where new data has to be
+ * fed using the "feed" function and explicitly closed using the "close"
+ * function.
+ *
+ * @param line is the start line.
+ * @param column is the start column.
*/
- BufferedCharReader();
+ BufferedCharReader(int line = 1, int column = 1);
- /**
- * Resets the reader to its initial state.
- */
- void reset();
/**
- * Feeds new data into the internal buffer of the BufferedCharReader
- * class.
+ * Constructor of the buffered char reader class with a string as input.
*
- * @param data is a string containing the data that should be
- * appended to the internal buffer.
- * @return true if the operation was successful, false otherwise (e.g.
- * because the reader is closed).
+ * @param str is a string containing the input data.
+ * @param line is the start line.
+ * @param column is the start column.
*/
- bool feed(const std::string &data);
+ BufferedCharReader(const std::string &str, int line = 1, int column = 1);
/**
- * Marks the end of the input, allowing successors in the pipeline
- * to react properly (e.g. creating the end of stream token).
+ * Constructor of the buffered char reader class with an input stream as input.
+ *
+ * @param inputStream is the input stream from which incoming data should
+ * be read.
+ * @param line is the start line.
+ * @param column is the start column.
*/
- void close();
+ BufferedCharReader(std::istream &inputStream, int line = 1, int column = 1);
/**
* Peeks a single character. If called multiple times, returns the
@@ -209,29 +227,47 @@ public:
void resetPeek();
/**
+ * Feeds new data into the internal buffer of the BufferedCharReader
+ * class. Only applicable if the buffered char reader was constructed
+ * without an input stream or string.
+ *
+ * @param data is a string containing the data that should be
+ * appended to the internal buffer.
+ */
+ void feed(const std::string &data);
+
+ /**
+ * Tells the buffered char reader that no more data will be fed.
+ * Only applicable if the buffered char reader was constructed without an
+ * input stream or string.
+ *
+ * Calling this function on a reader that was constructed with an input
+ * stream has no effect.
+ */
+ void close();
+
+ /**
* Returns true if there are no more characters as the stream was
* closed.
+ *
+ * @return true if there is no more data.
*/
bool atEnd();
/**
* Returns the current line (starting with one).
+ *
+ * @return the current line number.
*/
- inline int getLine()
- {
- return readCursor.line;
- }
+ inline int getLine() { return readCursor.line; }
/**
* Returns the current column (starting with one).
+ *
+ * @return the current column number.
*/
- inline int getColumn()
- {
- return readCursor.column;
- }
-
+ inline int getColumn() { return readCursor.column; }
};
-
}
#endif /* _OUSIA_BUFFERED_CHAR_READER_H_ */