1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
|
/*
Ousía
Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file BufferedCharReader.hpp
*
* Contains the BufferedCharReader class which is used for reading/peeking
* single characters from an input stream or string.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_BUFFERED_CHAR_READER_H_
#define _OUSIA_BUFFERED_CHAR_READER_H_
#include <deque>
#include <string>
#include <istream>
#include <cstdint>
namespace ousia {
// TODO: Better split this class into multiple classes with base class
// BufferedCharReader where each sub class represents one method of supplying
// the input data (feeding, initial string, input stream).
/**
* The BufferedCharReader class is used for storing incomming data that
* is fed into the pipeline as well as reading/peeking single characters
* from that buffer. Additionally it counts the current column/row
* (with correct handling for UTF-8) and contains an internal state
* machine that handles the detection of linebreaks and converts these to a
* single '\n'.
*/
class BufferedCharReader {
private:
/**
* The ReadCursor structure is responsible for representing the read
* position within the text an all state machine states belonging to the
* cursor. There are two types of read cursors: destructive and
* non-destructive read cursors.
*/
struct ReadCursor {
/**
* The line the cursor currently points to.
*/
unsigned int line;
/**
* The column the cursor currently points to.
*/
unsigned int column;
/**
* The index of the element in the data buffer we're currently reading
* from.
*/
unsigned int bufferElem;
/**
* The byte position within this data buffer.
*/
unsigned int bufferPos;
/**
* Specifies whether this is a destructive cursor (bytes are discarded
* once they were read from the buffer).
*/
const bool destructive;
/**
* State variable used in the internal state machine of the
* line feed detection.
*/
uint8_t lbState;
/**
* Constructor of the ReadCursor structure.
*
* @param line is the start line.
* @param column is the start column.
* @param destructive specifies whether the ReadCursor is destructive
* (consumes all read characters, as used in the "read cursor") or
* non-destructive (as used in the "peek cursor").
*/
ReadCursor(unsigned int line, unsigned int column, bool destructive);
/**
* Copys the data from another ReadCursor without overriding the
* "destructive" flag.
*
* @param cursor is the cursor that should be copied.
*/
void assign(const ReadCursor &cursor);
};
/**
* Pointer at an (optional) input stream used for reading a chunk of data
* whenever the input buffer depletes.
*/
std::istream *inputStream;
/**
* The read and the peek cursor.
*/
ReadCursor readCursor, peekCursor;
/**
* Set to true if there is no more input data.
*/
bool depleted;
/**
* Queue containing the data that has been fed into the char reader.
*/
std::deque<std::string> buffer;
/**
* Substitute any combination of linebreaks in the incomming code with "\n".
* Returns true if the current character is meant as output, false
* otherwise.
*/
bool substituteLinebreaks(ReadCursor &cursor, char *c);
/**
* Reads a character from the input buffer and advances the given read
* cursor.
*
* @param cursor is a reference to the read cursor that should be used
* for reading.
* @param hasChar is set to true, if a character is available, false if
* no character is available (e.g. because line breaks are substituted or
* the end of a buffer boundary is reached -- in this case this function
* should be called again with the same parameters.)
* @param c is a output parameter, which will be set to the read character.
* @param returns true if there was enough data in the buffer, false
* otherwise.
*/
bool readCharacterAtCursor(ReadCursor &cursor, char *c);
/**
* Function that is called for each read character -- updates the row and
* column count.
*/
void updatePositionCounters(const char c);
public:
/**
* Constructor of the buffered char reader class with empty buffer as input.
* This operates the BufferedCharReader in a mode where new data has to be
* fed using the "feed" function and explicitly closed using the "close"
* function.
*
* @param line is the start line.
* @param column is the start column.
*/
BufferedCharReader(int line = 1, int column = 1);
/**
* Constructor of the buffered char reader class with a string as input.
*
* @param str is a string containing the input data.
* @param line is the start line.
* @param column is the start column.
*/
BufferedCharReader(const std::string &str, int line, int column);
/**
* Constructor of the buffered char reader class with a string as input.
*
* @param str is a string containing the input data.
*/
BufferedCharReader(const std::string &str);
/**
* Constructor of the buffered char reader class with a string as input.
*
* @param inputStream is the input stream from which incomming data should
* be read.
* @param line is the start line.
* @param column is the start column.
*/
BufferedCharReader(std::istream &inputStream, int line = 1, int column = 1);
/**
* Peeks a single character. If called multiple times, returns the
* character after the previously peeked character.
*
* @param c is a reference to the character to which the result should be
* writtern.
* @return true if the character was successfully read, false if there are
* no more characters to be read in the buffer.
*/
bool peek(char *c);
/**
* Reads a character from the input data. If "peek" was called
* beforehand resets the peek pointer.
*
* @param c is a reference to the character to which the result should be
* writtern.
* @return true if the character was successfully read, false if there are
* no more characters to be read in the buffer.
*/
bool read(char *c);
/**
* Advances the read pointer to the peek pointer -- so if the "peek"
* function was called, "read" will now return the character after
* the last peeked character.
*/
void consumePeek();
/**
* Moves the read cursor to the next non-whitespace character. Returns
* false, if the end of the stream was reached.
*
* @return false if the end of the stream was reached, false othrwise.
*/
bool consumeWhitespace();
/**
* Resets the peek pointer to the "read" pointer.
*/
void resetPeek();
/**
* Feeds new data into the internal buffer of the BufferedCharReader
* class. Only applicable if the buffered char reader was constructed
* without an input stream or string.
*
* @param data is a string containing the data that should be
* appended to the internal buffer.
*/
void feed(const std::string &data);
/**
* Tells the buffered char reader that no more data will be fed.
* Only applicable if the buffered char reader was constructed without an
* input stream or string.
*
* @param data is a string containing the data that should be
* appended to the internal buffer.
*/
void close();
/**
* Returns true if there are no more characters as the stream was
* closed.
*
* @return true if there is no more data.
*/
bool atEnd() const;
/**
* Returns the current line (starting with one).
*
* @return the current line number.
*/
int getLine() const { return readCursor.line; }
/**
* Returns the current column (starting with one).
*
* @return the current column number.
*/
int getColumn() const { return readCursor.column; }
};
}
#endif /* _OUSIA_BUFFERED_CHAR_READER_H_ */
|