summaryrefslogtreecommitdiff
path: root/src/core/BufferedCharReader.hpp
blob: b13cde64c6dce23df34a0f50256a6c719f4b15e0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/*
    Ousía
    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef _OUSIA_UTILS_BUFFERED_CHAR_READER_H_
#define _OUSIA_UTILS_BUFFERED_CHAR_READER_H_

#include <deque>
#include <string>
#include <cstdint>

namespace ousia {
namespace utils {

/**
 * The BufferedCharReader class is used for storing incoming data that
 * is fed into the pipeline as well as reading/peeking single characters
 * from that buffer. Additionally it counts the current column/row
 * (with correct handling for UTF-8) and contains an internal state
 * machine that handles the detection of linebreaks.
 *
 * Additionally the BufferedCharReader performs the following tasks:
 * 1. Convert the incoming character encoding to UTF-8 (TODO: implement)
 * 2. Convert arbitrary linebreaks to a single "\n"
 */
class BufferedCharReader {

private:

	/**
	 * The ReadCursor structure represents a read position within the
	 * buffered text together with all state machine states belonging to
	 * that cursor. There are two types of read cursors: destructive and
	 * non-destructive read cursors.
	 */
	struct ReadCursor {
		/**
		 * Specifies whether this is a destructive cursor (bytes are discarded
		 * once they were read from the buffer). Fixed at construction time.
		 */
		const bool destructive;

		/**
		 * The line the cursor currently points to.
		 */
		unsigned int line;

		/**
		 * The column the cursor currently points to.
		 */
		unsigned int column;

		/**
		 * The index of the element in the data buffer we're currently reading
		 * from.
		 */
		unsigned int bufferElem;

		/**
		 * The byte position within this data buffer.
		 */
		unsigned int bufferPos;

		/**
		 * State variable used in the internal state machine of the
		 * line feed detection.
		 */
		uint8_t lbState;

		/**
		 * Constructor of the ReadCursor structure. Declared explicit so that
		 * a plain bool is never silently converted into a cursor.
		 *
		 * @param destructive specifies whether the ReadCursor is destructive
		 * (consumes all read characters, as used in the "read cursor") or
		 * non-destructive (as used in the "peek cursor").
		 */
		explicit ReadCursor(const bool destructive);

		/**
		 * Copies the data from another ReadCursor without overriding the
		 * "destructive" flag.
		 *
		 * @param cursor is the cursor the data should be copied from.
		 */
		void assign(const ReadCursor &cursor);

		/**
		 * Resets the cursor without changing the "destructive" flag.
		 */
		void reset();
	};

	/**
	 * Queue containing the data that has been fed into the char reader.
	 */
	std::deque<std::string> buffer;

	/**
	 * The read and the peek cursor.
	 */
	ReadCursor readCursor, peekCursor;

	/**
	 * Determines whether the reader has been closed.
	 */
	bool closed;

	/**
	 * Substitute any combination of linebreaks in the incoming code with "\n".
	 *
	 * @param cursor is a pointer to the read cursor whose state is used.
	 * @param c is a pointer to the character that is being processed.
	 * @return true if the current character is meant as output, false
	 * otherwise.
	 */
	bool substituteLinebreaks(ReadCursor *cursor, char *c);

	/**
	 * Reads a character from the input buffer and advances the given read
	 * cursor.
	 *
	 * NOTE(review): an earlier revision of this comment described a "hasChar"
	 * output parameter (signalling that no character was produced, e.g.
	 * because line breaks are substituted or a buffer boundary was reached,
	 * and that the function should be called again). That parameter is not
	 * part of this signature -- confirm against the implementation how these
	 * cases are reported to the caller.
	 *
	 * @param cursor is a pointer to the read cursor that should be used
	 * for reading.
	 * @param c is an output parameter, which will be set to the read character.
	 * @return true if there was enough data in the buffer, false
	 * otherwise.
	 */
	bool readCharacterAtCursor(ReadCursor *cursor, char *c);

	/**
	 * Function that is called for each read character -- updates the row and
	 * column count.
	 *
	 * @param c is the character that has been read.
	 */
	void updatePositionCounters(const char c);

public:

	/**
	 * Constructor of the buffered char reader class.
	 */
	BufferedCharReader();

	/**
	 * Resets the reader to its initial state.
	 */
	void reset();

	/**
	 * Feeds new data into the internal buffer of the BufferedCharReader
	 * class.
	 *
	 * @param data is a string containing the data that should be
	 * appended to the internal buffer.
	 * @return true if the operation was successful, false otherwise (e.g.
	 * because the reader is closed).
	 */
	bool feed(const std::string &data);

	/**
	 * Marks the end of the input, allowing successors in the pipeline
	 * to react properly (e.g. creating the end of stream token).
	 */
	void close();

	/**
	 * Peeks a single character. If called multiple times, returns the
	 * character after the previously peeked character.
	 *
	 * @param c is a pointer to the character to which the result should be
	 * written.
	 * @return true if the character was successfully read, false if there are
	 * no more characters to be read in the buffer.
	 */
	bool peek(char *c);

	/**
	 * Reads a character from the input data. If "peek" was called
	 * beforehand resets the peek pointer.
	 *
	 * @param c is a pointer to the character to which the result should be
	 * written.
	 * @return true if the character was successfully read, false if there are
	 * no more characters to be read in the buffer.
	 */
	bool read(char *c);

	/**
	 * Advances the read pointer to the peek pointer -- so if the "peek"
	 * function was called, "read" will now return the character after
	 * the last peeked character.
	 */
	void consumePeek();

	/**
	 * Resets the peek pointer to the "read" pointer.
	 */
	void resetPeek();

	/**
	 * Returns true if there are no more characters as the stream was
	 * closed.
	 */
	bool atEnd();

	/**
	 * Returns the current line (starting with one) of the read cursor.
	 * Does not modify the reader and can thus be used on const instances.
	 */
	inline int getLine() const
	{
		// The cursor stores an unsigned value; the public interface has
		// always returned int, so the conversion is spelled out explicitly.
		return static_cast<int>(readCursor.line);
	}

	/**
	 * Returns the current column (starting with one) of the read cursor.
	 * Does not modify the reader and can thus be used on const instances.
	 */
	inline int getColumn() const
	{
		// Same explicit unsigned -> int conversion as in getLine().
		return static_cast<int>(readCursor.column);
	}

};

}
}

#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */