1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
|
/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cassert>
#include <stack>
#include <vector>
#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/parser/utils/Tokenizer.hpp>
#include <core/parser/utils/TokenizedData.hpp>
#include "OsmlStreamParser.hpp"
namespace ousia {
namespace {
/**
* Osml format default tokenizer. Registers the primary tokens in its
* constructor. A single, static instance of this class is created as
* "OsmlTokens", which is copied to the Tokenizer instance of
* OsmlStreamParserImpl.
*/
class OsmlFormatTokens : public Tokenizer {
public:
TokenId Backslash;
TokenId LineComment;
TokenId BlockCommentStart;
TokenId BlockCommentEnd;
TokenId FieldStart;
TokenId FieldEnd;
TokenId DefaultFieldStart;
TokenId AnnotationStart;
TokenId AnnotationEnd;
/**
* Registers the plain format tokens in the internal tokenizer.
*/
OsmlFormatTokens()
{
Backslash = registerToken("\\");
LineComment = registerToken("%");
BlockCommentStart = registerToken("%{");
BlockCommentEnd = registerToken("}%");
FieldStart = registerToken("{");
FieldEnd = registerToken("}");
DefaultFieldStart = registerToken("{!");
AnnotationStart = registerToken("<\\");
AnnotationEnd = registerToken("\\>");
}
};
/**
* File-local, immutable instance of OsmlFormatTokens. The constructor of
* OsmlStreamParserImpl copies it into its own Tokenizer member, and the token
* ids stored in it are compared against the ids of the tokens read by the
* parser.
*/
static const OsmlFormatTokens OsmlTokens;
/**
* Structure describing a single field that was opened inside a command. The
* Command class keeps a stack of these entries.
*/
struct Field {
/**
* Specifies whether this field was marked as default field.
*/
bool defaultField;
/**
* Location at which the field was started. Used for error messages that
* point at fields which are still open.
*/
SourceLocation location;
/**
* Constructor of the Field structure, initializes all member variables with
* the given values.
*
* @param defaultField is a flag specifying whether this field is a default
* field. Defaults to false.
* @param location specifies the location at which the field was started.
* Defaults to a default-constructed SourceLocation.
*/
Field(bool defaultField = false,
const SourceLocation &location = SourceLocation{})
: defaultField(defaultField), location(location)
{
}
};
/**
* Entry used for the command stack.
*/
class Command {
private:
/**
* Name and location of the current command.
*/
Variant name;
/**
* Arguments that were passed to the command.
*/
Variant arguments;
/**
* Vector used as stack for holding the number of opening/closing braces
* and the corresponding "isDefaultField" flag.
*/
std::vector<Field> fields;
/**
* Set to true if this is a command with clear begin and end.
*/
bool hasRange;
public:
/**
* Default constructor, marks this command as normal, non-range command.
*/
Command() : hasRange(false) {}
/**
* Constructor of the Command class.
*
* @param name is a string variant with name and location of the
* command.
* @param arguments is a map variant with the arguments given to the
* command.
* @param hasRange should be set to true if this is a command with
* explicit range.
*/
Command(Variant name, Variant arguments, bool hasRange)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange)
{
}
/**
* Returns a reference at the variant representing name and location of the
* command.
*
* @return a variant containing name and location of the command.
*/
const Variant &getName() const { return name; }
/**
* Returns a reference at the variant containing name, value and location of
* the arguments.
*
* @return the arguments stored for the command.
*/
const Variant &getArguments() const { return arguments; }
/**
* Returns a reference at the internal field list. This list should be used
* for printing error messages when fields are still open although the outer
* range field closes.
*
* @return a const reference at the internal field vector.
*/
const std::vector<Field> &getFields() const { return fields; }
/**
* Returns true if this command is currently in a default field.
*
* @return true if the current field on the field stack was explicitly
* marked as default field. If the field stack is empty, true is returned
* if this is a range command.
*/
bool inDefaultField() const
{
return (!fields.empty() && fields.back().defaultField) ||
(fields.empty() && hasRange);
}
/**
* Returns true if this command currently is in any field.
*
* @return true if a field is on the stack or this is a range commands.
* Range commands always are in a field.
*/
bool inField() const { return !fields.empty() || hasRange; }
/**
* Returns true if this command currently is in a range field.
*
* @return true if the command has a range and no other ranges are on the
* stack.
*/
bool inRangeField() const { return fields.empty() && hasRange; }
/**
* Returns true if this command currently is in a non-range field.
*
* @return true if the command is in a field, but the field is not the field
* constructed by the "range"
*/
bool inNonRangeField() const { return !fields.empty(); }
/**
* Pushes another field onto the field stack of this command.
*
* @param defaultField if true, explicitly marks this field as default
* field.
* @param location is the source location at which the field was started.
* Used for error messages in which the user is notified about an error with
* too few closing fields.
*/
void pushField(bool defaultField = false,
const SourceLocation &location = SourceLocation{})
{
fields.emplace_back(defaultField, location);
}
/**
* Removes another field from the field stack of this command, returns true
* if the operation was successful.
*
* @return true if there was a field to pop on the stack, false otherwise.
*/
bool popField()
{
if (!fields.empty()) {
fields.pop_back();
return true;
}
return false;
}
};
}
/* Class OsmlStreamParserImpl */
/**
* Internal implementation of OsmlStreamParser. All public methods of
* OsmlStreamParser forward to an instance of this class.
*/
class OsmlStreamParserImpl {
public:
/**
* State enum used internally. The result of parse() is converted to
* OsmlStreamParser::State with a static_cast by the public wrapper, so the
* numeric values are expected to match that enum (declared in the header,
* not visible here -- confirm when changing values). RECOVERABLE_ERROR and
* IRRECOVERABLE_ERROR are only used as return values of the internal
* parse* helpers; parse() handles them itself and does not return them.
*/
enum class State : uint8_t {
COMMAND_START = 0,
RANGE_END = 1,
FIELD_START = 2,
FIELD_END = 3,
ANNOTATION_START = 4,
ANNOTATION_END = 5,
DATA = 6,
END = 7,
RECOVERABLE_ERROR = 8,
IRRECOVERABLE_ERROR = 9
};
private:
/**
* Reference to the CharReader instance from which the incoming bytes are
* read.
*/
CharReader &reader;
/**
* Reference at the logger instance to which all error messages are sent.
*/
Logger &logger;
/**
* Tokenizer instance used to read individual tokens from the text. It is
* initialised as a copy of the static OsmlTokens instance.
*/
Tokenizer tokenizer;
/**
* TokenizedData instance receiving the data that is read by the tokenizer
* between two tokens. Cleared at the beginning of each call to parse().
*/
TokenizedData data;
/**
* Stack containing the current commands. The constructor pushes a range
* command with empty name as bottom-most element.
*/
std::stack<Command> commands;
/**
* Variable containing the current location of the parser.
*/
SourceLocation location;
/**
* Function used internally to parse an identifier.
*
* @param start is the start byte offset of the identifier (including the
* backslash).
* @param allowNSSep should be set to true if the namespace separator is
* allowed in the identifier name. Issues error if the namespace separator
* is placed incorrectly.
* @return a string variant containing the (possibly empty) identifier and
* its location.
*/
Variant parseIdentifier(size_t start, bool allowNSSep = false);
/**
* Function used internally to handle the special "\begin" command.
*
* @return an internal State specifying whether an error occurred (return
* values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
* command was actually started (return value State::COMMAND_START).
*/
State parseBeginCommand();
/**
* Function used internally to handle the special "\end" command.
*
* @return an internal State specifying whether an error occurred (return
* values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
* command was actually ended (return value State::RANGE_END).
*/
State parseEndCommand();
/**
* Parses the command arguments. Handles errors if the name of the command
* was given using the hash notation and as a name field.
*
* @param commandArgName is the name argument that was given using the hash
* notation.
* @return a map variant containing the arguments.
*/
Variant parseCommandArguments(Variant commandArgName);
/**
* Function used internally to parse a command.
*
* @param start is the start byte offset of the command (including the
* backslash)
* @param isAnnotation if true, the command is not returned as command, but
* as annotation start.
* @return the internal State describing what was parsed: COMMAND_START,
* ANNOTATION_START, ANNOTATION_END, RANGE_END, or one of the two error
* states.
*/
State parseCommand(size_t start, bool isAnnotation);
/**
* Function used internally to parse a block comment.
*/
void parseBlockComment();
/**
* Function used internally to parse a line comment. Reads up to and
* including the next newline character.
*/
void parseLineComment();
/**
* Pushes the parsed command onto the command stack. Commands at the top
* of the stack which are not in a field are removed first.
*/
void pushCommand(Variant commandName, Variant commandArguments,
bool hasRange);
/**
* Checks whether there is any data pending to be issued, if yes, resets the
* currently peeked characters and returns true.
*
* @return true if there was any data and DATA should be returned by the
* parse function, false otherwise.
*/
bool checkIssueData();
/**
* Returns a reference at the current command at the top of the command
* stack. Calls std::stack::top(), so the stack must not be empty.
*
* @return a reference at the top command in the command stack.
*/
Command &cmd() { return commands.top(); }
/**
* Const version of cmd(). Returns a reference at the current command at
* the top of the command stack.
*
* @return a const reference at the top command in the command stack.
*/
const Command &cmd() const { return commands.top(); }
public:
/**
* Constructor of the OsmlStreamParserImpl class. Attaches the new
* OsmlStreamParserImpl to the given CharReader and Logger instances.
*
* @param reader is the reader instance from which incoming characters
* should be read.
* @param logger is the logger instance to which errors should be written.
*/
OsmlStreamParserImpl(CharReader &reader, Logger &logger);
/**
* Continues parsing until the next event can be reported.
*
* @return the State describing the event that was found.
*/
State parse();
/**
* Registers the given token in the internal tokenizer.
*
* @return the id returned by the tokenizer.
*/
TokenId registerToken(const std::string &token);
/**
* Unregisters the token with the given id from the internal tokenizer.
*/
void unregisterToken(TokenId id);
/**
* @return a reference at the internal TokenizedData instance.
*/
const TokenizedData &getData() const { return data; }
/**
* @return the name of the command at the top of the command stack.
*/
const Variant &getCommandName() const { return cmd().getName(); }
/**
* @return the arguments of the command at the top of the command stack.
*/
const Variant &getCommandArguments() const { return cmd().getArguments(); }
/**
* @return the current location of the parser.
*/
const SourceLocation &getLocation() const { return location; }
/**
* @return true if the command at the top of the command stack is in its
* range field.
*/
bool inRangeCommand() const { return cmd().inRangeField(); };
/**
* @return true if the command at the top of the command stack is in a
* default field.
*/
bool inDefaultField() const { return cmd().inDefaultField(); }
};
/* Class OsmlStreamParserImpl */
/**
 * Attaches the parser to the given reader and logger. The tokenizer starts as
 * a copy of the static OsmlTokens instance and the data buffer is bound to
 * the source id of the reader.
 *
 * @param reader is the reader from which the characters are read.
 * @param logger is the logger to which error messages are written.
 */
OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger)
    : reader(reader),
      logger(logger),
      tokenizer(OsmlTokens),
      data(reader.getSourceId())
{
    // Bottom-most stack entry: an unnamed range command without arguments, so
    // that cmd() always has an element to return
    commands.push(Command("", Variant::mapType{}, true));
}
/**
* Reads an identifier from the reader. Characters are peeked one at a time and
* only consumed once they have been accepted as part of the identifier; when a
* character is rejected, the peek cursor is reset and reading stops.
*
* @param start is the start offset that is stored in the location of the
* returned variant.
* @param allowNSSep if true, a ':' between two identifier parts is accepted
* as namespace separator and an error is logged for a misplaced ':'.
* @return a string variant containing the identifier (empty if no character
* was accepted). Its location spans from "start" to the peek offset recorded
* after the last accepted character.
*/
Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep)
{
// "first" selects the stricter start-character check for the first character
bool first = true;
// Only ever becomes true if allowNSSep is set, see the update at the loop end
bool hasCharSinceNSSep = false;
std::vector<char> identifier;
// End offset of the identifier, updated after each accepted character
size_t end = reader.getPeekOffset();
char c, c2;
while (reader.peek(c)) {
// Abort if this character is not a valid identifier character
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
// Accept the character if it may end an identifier or if another
// identifier character follows (fetchPeek looks ahead into c2)
if (Utils::isIdentifierEndCharacter(c) ||
(reader.fetchPeek(c2) && Utils::isIdentifierCharacter(c2))) {
identifier.push_back(c);
} else {
// Break if a non-identifier-end character is reached and the
// next character is a non-identifier character
reader.resetPeek();
break;
}
} else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
// Namespace separator with an identifier part before and after it
identifier.push_back(c);
} else {
// Rejected character: complain about a misplaced namespace separator
// if separators are allowed at all, then stop without consuming it
if (c == ':' && allowNSSep) {
logger.error(
"Expected character before and after namespace separator "
"\":\"",
reader);
}
reader.resetPeek();
break;
}
// This is no longer the first character
first = false;
// Advance the hasCharSinceNSSep flag
hasCharSinceNSSep = allowNSSep && (c != ':');
// Remember the end offset and consume the accepted character
end = reader.getPeekOffset();
reader.consumePeek();
}
// Return the identifier at its location
Variant res =
Variant::fromString(std::string(identifier.data(), identifier.size()));
res.setLocation({reader.getSourceId(), start, end});
return res;
}
/**
 * Handles the special "\begin" command: reads "{name}" or "{name#id}" followed
 * by the optional arguments and pushes the named command as range command.
 *
 * @return State::COMMAND_START if the command was started,
 * State::RECOVERABLE_ERROR if the opening brace is missing and
 * State::IRRECOVERABLE_ERROR if the name or the closing brace is missing.
 */
OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand()
{
    // Whitespace may precede the mandatory opening brace
    reader.consumeWhitespace();
    const bool hasOpeningBrace = reader.expect('{');
    if (!hasOpeningBrace) {
        logger.error("Expected \"{\" after \\begin", reader);
        return State::RECOVERABLE_ERROR;
    }

    // Name of the command that is opened, namespace separators are allowed
    Variant name = parseIdentifier(reader.getOffset(), true);
    if (name.asString().empty()) {
        logger.error("Expected identifier", name);
        return State::IRRECOVERABLE_ERROR;
    }

    // An optional '#' introduces the name argument. An empty identifier
    // after the '#' is reported, but parsing continues.
    const SourceOffset hashOffset = reader.getOffset();
    Variant nameArg;
    if (reader.expect('#')) {
        nameArg = parseIdentifier(hashOffset);
        if (nameArg.asString().empty()) {
            logger.error("Expected identifier after \"#\"", nameArg);
        }
    }

    // The name part has to be terminated by a closing brace
    const bool hasClosingBrace = reader.expect('}');
    if (!hasClosingBrace) {
        logger.error("Expected \"}\"", reader);
        return State::IRRECOVERABLE_ERROR;
    }

    // Read the arguments and place the range command on the command stack
    Variant args = parseCommandArguments(std::move(nameArg));
    pushCommand(std::move(name), std::move(args), true);
    return State::COMMAND_START;
}
/**
 * Handles the special "\end" command: reads "{name}", unwinds the command
 * stack up to the innermost range command and ends this command if its name
 * matches.
 *
 * @return State::RANGE_END if the range command was ended,
 * State::RECOVERABLE_ERROR if the opening brace is missing and
 * State::IRRECOVERABLE_ERROR for every other error.
 */
OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand()
{
    // The opening brace is mandatory
    if (!reader.expect('{')) {
        logger.error("Expected \"{\" after \\end", reader);
        return State::RECOVERABLE_ERROR;
    }

    // Name of the command that is to be ended, must not be empty
    Variant endName = parseIdentifier(reader.getOffset(), true);
    if (endName.asString().empty()) {
        logger.error("Expected identifier", endName);
        return State::IRRECOVERABLE_ERROR;
    }

    // The name has to be followed by a closing brace
    if (!reader.expect('}')) {
        logger.error("Expected \"}\"", reader);
        return State::IRRECOVERABLE_ERROR;
    }

    // Drop commands until the innermost range command is at the top of the
    // stack. A command that still has an open field must not be crossed.
    for (; !cmd().inRangeField(); commands.pop()) {
        if (!cmd().inField()) {
            continue;
        }
        logger.error(std::string("\\end in open field of command \"") +
                         cmd().getName().asString() + "\"",
                     endName);
        for (const Field &openField : cmd().getFields()) {
            logger.note(std::string("Still open field started here: "),
                        openField.location);
        }
        return State::IRRECOVERABLE_ERROR;
    }

    // Only the top-level command is left, there is nothing that could be ended
    if (commands.size() == 1) {
        logger.error(std::string("Cannot end command \"") +
                         endName.asString() + "\" here, no command open",
                     endName);
        return State::IRRECOVERABLE_ERROR;
    }

    // The name given to "\end" has to match the name of the open command
    const Variant &openName = getCommandName();
    if (openName.asString() != endName.asString()) {
        logger.error(std::string("Trying to end command \"") +
                         endName.asString() +
                         "\", but open command is \"" +
                         openName.asString() + "\"",
                     endName);
        logger.note("Open command started here:", openName);
        return State::IRRECOVERABLE_ERROR;
    }

    // Report the location of the end name and remove the ended command
    location = endName.getLocation();
    commands.pop();
    return State::RANGE_END;
}
/**
 * Parses the optional argument list "[...]" of a command and merges the name
 * that was given using the hash notation into the resulting map.
 *
 * If "name" is specified both using the hash notation and inside the argument
 * list, an error is logged and the value from the argument list is kept.
 *
 * @param commandArgName is the name argument that was given using the hash
 * notation. It is only used if it is a string variant.
 * @return a map variant containing the arguments.
 */
Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName)
{
    // Parse the arguments using the universal VariantReader
    Variant commandArguments;
    if (reader.expect('[')) {
        auto res = VariantReader::parseObject(reader, logger, ']');
        commandArguments = res.second;
    } else {
        commandArguments = Variant::mapType{};
    }

    // Insert the parsed name, make sure "name" was not specified in the
    // arguments
    if (commandArgName.isString()) {
        auto &map = commandArguments.asMap();
        auto it = map.find("name");
        if (it == map.end()) {
            map.emplace("name", std::move(commandArgName));
        } else {
            // Look the key up before inserting: emplace() is allowed to
            // move from its arguments even if the key already exists, which
            // would leave commandArgName in a moved-from state for the note
            // below.
            logger.error("Name argument specified multiple times",
                         SourceLocation{}, MessageMode::NO_CONTEXT);
            logger.note("First occurance is here: ", commandArgName);
            logger.note("Second occurance is here: ", it->second);
        }
    }
    return commandArguments;
}
/**
* Parses a command starting at the current reader position. Handles the
* special "begin" and "end" commands, ordinary commands with optional "#name"
* and argument list, and the ">" character that turns an ordinary command into
* an annotation end.
*
* @param start is the start byte offset of the command (including the
* backslash), used as start of the command name location.
* @param isAnnotation if true, a successfully started command is reported as
* State::ANNOTATION_START instead of State::COMMAND_START.
* @return State::COMMAND_START, State::ANNOTATION_START,
* State::ANNOTATION_END, State::RANGE_END or one of the error states.
*/
OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand(
size_t start, bool isAnnotation)
{
// Parse the commandName as a first identifier
Variant commandName = parseIdentifier(start, true);
if (commandName.asString().empty()) {
logger.error("Empty command name", reader);
return State::RECOVERABLE_ERROR;
}
// Handle the special "begin" and "end" commands. Only the part before the
// first namespace separator is compared.
const auto commandNameComponents =
Utils::split(commandName.asString(), ':');
const bool isBegin = commandNameComponents[0] == "begin";
const bool isEnd = commandNameComponents[0] == "end";
// Parse the begin or end command
State res = State::COMMAND_START;
if (isBegin || isEnd) {
// A namespace separator is reported, but parsing continues
if (commandNameComponents.size() > 1) {
logger.error(
"Special commands \"\\begin\" and \"\\end\" may not contain a "
"namespace separator \":\"",
commandName);
}
if (isBegin) {
res = parseBeginCommand();
} else if (isEnd) {
res = parseEndCommand();
}
} else {
// Check whether the next character is a '#', indicating the start of
// the command name
Variant commandArgName;
start = reader.getOffset();
if (reader.expect('#')) {
commandArgName = parseIdentifier(start);
if (commandArgName.asString().empty()) {
logger.error("Expected identifier after \"#\"", commandArgName);
}
}
// Parse the arguments
Variant commandArguments =
parseCommandArguments(std::move(commandArgName));
// Push the command onto the command stack
pushCommand(std::move(commandName), std::move(commandArguments), false);
}
// Check whether a ">" character is the next character that is to be read.
// In that case the current command could be an annotation end command!
// fetch() is used for the check; the character is only consumed further
// below, once the annotation end has been accepted.
char c;
if (reader.fetch(c) && c == '>') {
// Ignore the character after a begin or end command
if (isBegin || isEnd) {
logger.warning(
"Ignoring annotation end character \">\" after special "
"commands \"begin\" or \"end\". Write \"\\>\" to end a "
"\"begin\"/\"end\" enclosed annotation.",
reader);
return res;
}
// If this should be an annotation, ignore the character
if (isAnnotation) {
logger.warning(
"Ignoring annotation end character \">\" after annotation "
"start command. Write \"\\>\" to end the annotation.",
reader);
} else {
// Make sure no arguments apart from the "name" argument are given
// to an annotation end
const Variant::mapType &map = getCommandArguments().asMap();
if (!map.empty()) {
if (map.count("name") == 0 || map.size() > 1U) {
logger.error(
"An annotation end command may not have any arguments "
"other than \"name\"",
reader);
return res;
}
}
// If we got here, this is a valid ANNOTATION_END command, issue it.
// Peek and consume the ">" character that was fetched above.
reader.peek(c);
reader.consumePeek();
return State::ANNOTATION_END;
}
}
// If we're starting an annotation, return the command as annotation start
// instead of command
if (isAnnotation && res == State::COMMAND_START) {
return State::ANNOTATION_START;
}
return res;
}
void OsmlStreamParserImpl::parseBlockComment()
{
Token token;
TokenizedData commentData;
size_t depth = 1;
while (tokenizer.read(reader, token, commentData)) {
// Throw the comment data away
commentData.clear();
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
return;
}
}
if (token.id == OsmlTokens.BlockCommentStart) {
depth++;
}
}
// Issue an error if the file ends while we are in a block comment
logger.error("File ended while being in a block comment", reader);
}
/**
 * Skips a line comment: reads characters from the reader up to and including
 * the next newline character or until the reader has no more characters.
 */
void OsmlStreamParserImpl::parseLineComment()
{
    char ch;
    while (reader.read(ch) && ch != '\n') {
        // Nothing to do, the comment text is discarded
    }
}
/**
 * Places a new command on the command stack and sets the parser location to
 * the location of the command name.
 *
 * @param commandName is a variant with name and location of the command.
 * @param commandArguments is a variant with the arguments of the command.
 * @param hasRange is true if the command is a range command.
 */
void OsmlStreamParserImpl::pushCommand(Variant commandName,
                                       Variant commandArguments, bool hasRange)
{
    // Commands which are not inside one of their fields cannot contain the
    // new command, so they are removed from the stack first
    while (!cmd().inField()) {
        commands.pop();
    }

    // Remember where the command is located before the name is moved away
    location = commandName.getLocation();

    // Create the new stack entry
    commands.emplace(std::move(commandName), std::move(commandArguments),
                     hasRange);
}
/**
 * Checks whether data is pending. If so, the parser location is set to the
 * location of the data and the peek cursor of the reader is reset.
 *
 * @return true if data is pending and State::DATA should be returned by the
 * caller, false otherwise.
 */
bool OsmlStreamParserImpl::checkIssueData()
{
    if (data.empty()) {
        return false;
    }
    location = data.getLocation();
    reader.resetPeek();
    return true;
}
/**
* Continues parsing until the next event can be reported. Tokens are peeked
* from the tokenizer; text between tokens is collected in "data". Pending data
* is always issued (State::DATA) before the token following it is handled, in
* which case the peek cursor is reset and the token is read again by the next
* call.
*
* @return the State describing the event that was found. State::END is
* returned once the end of the stream is reached.
* @throws LoggableException if one of the command parsing functions reports
* State::IRRECOVERABLE_ERROR.
*/
OsmlStreamParserImpl::State OsmlStreamParserImpl::parse()
{
// Reset the data handler
data.clear();
// Read tokens until the outer loop should be left
Token token;
while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
if (type == OsmlTokens.Backslash ||
type == OsmlTokens.AnnotationStart) {
// Check whether a command starts now, without advancing the peek
// cursor
char c;
if (!reader.fetchPeek(c)) {
// NOTE(review): this returns END directly, without issuing data
// collected so far and without the open command/field checks at
// the end of this function -- confirm that this is intended.
logger.error("Trailing backslash at the end of the file.",
token);
return State::END;
}
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is too late
if (checkIssueData()) {
return State::DATA;
}
// Parse the actual command
State res = parseCommand(token.location.getStart(),
type == OsmlTokens.AnnotationStart);
switch (res) {
case State::IRRECOVERABLE_ERROR:
throw LoggableException(
"Last error was irrecoverable, ending parsing "
"process");
case State::RECOVERABLE_ERROR:
// The error has been logged, go on with the next token
continue;
default:
return res;
}
}
// This was not a special character, just append the given character
// to the data buffer, use the escape character start as start
// location and the peek offset as end location
reader.peek(c); // Peek the previously fetched character
// If this was an annotation start token, add the parsed < to the
// output
SourceOffset charStart = token.location.getStart();
SourceOffset charEnd = reader.getPeekOffset();
if (type == OsmlTokens.AnnotationStart) {
data.append('<', charStart, charStart + 1);
charStart = charStart + 1;
}
// Append the character to the output data, mark it as protected
data.append(c, charStart, charEnd, true);
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
// Plain data token: consume it and go on with the next token
reader.consumePeek();
continue;
} else if (type == OsmlTokens.LineComment) {
reader.consumePeek();
parseLineComment();
continue;
} else if (type == OsmlTokens.BlockCommentStart) {
reader.consumePeek();
parseBlockComment();
continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
if (checkIssueData()) {
return State::DATA;
}
// We will handle the token now, consume the peeked characters
reader.consumePeek();
// Synchronize the location with the current token location
location = token.location;
if (token.id == OsmlTokens.FieldStart) {
cmd().pushField(false, token.location);
return State::FIELD_START;
} else if (token.id == OsmlTokens.FieldEnd) {
// Remove all commands from the list that currently are not in any
// field
while (!cmd().inField()) {
commands.pop();
}
// If the remaining command is in a non-range field, close this
// field
if (cmd().inNonRangeField()) {
cmd().popField();
return State::FIELD_END;
}
// Otherwise log an error and go on with the next token
logger.error(
"Got field end token \"}\", but there is no field to end.",
token);
} else if (token.id == OsmlTokens.DefaultFieldStart) {
cmd().pushField(true, token.location);
return State::FIELD_START;
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event. A command with empty name and no arguments
// is pushed for it.
Variant annotationName = Variant::fromString("");
annotationName.setLocation(token.location);
pushCommand(annotationName, Variant::mapType{}, false);
return State::ANNOTATION_END;
} else {
// Any other token is reported and skipped
logger.error("Unexpected token \"" + token.content + "\"", token);
}
}
// Issue available data
if (checkIssueData()) {
return State::DATA;
}
// Make sure all open commands and fields have been ended at the end of the
// stream
while (true) {
// The bottom-most stack element is the top-level command, which is
// never removed from the stack
bool topLevelCommand = commands.size() == 1U;
if (cmd().inField()) {
// If the stream ended with an open range field, issue information
// about the range field
if (cmd().inRangeField() && !topLevelCommand) {
// Inform about the still open command itself
logger.error("Reached end of stream, but command \"" +
getCommandName().asString() +
"\" has not been ended",
getCommandName());
} else {
// Issue information about still open fields
const std::vector<Field> &fields = cmd().getFields();
if (!fields.empty()) {
logger.error(
std::string(
"Reached end of stream, but field is still open."),
fields.back().location);
}
}
}
if (!topLevelCommand) {
commands.pop();
} else {
break;
}
}
// Set the location to the current reader offset and report the end
location = SourceLocation{reader.getSourceId(), reader.getOffset()};
return State::END;
}
/**
* Registers the given token string in the internal tokenizer.
*
* @param token is the token string that should be registered.
* @return the id returned by Tokenizer::registerToken.
*/
TokenId OsmlStreamParserImpl::registerToken(const std::string &token)
{
// NOTE(review): the meaning of the second argument (false) is defined by
// Tokenizer::registerToken, which is not visible here -- confirm.
return tokenizer.registerToken(token, false);
}
/**
 * Unregisters the token with the given id from the internal tokenizer. In
 * builds with assertions enabled, a failing unregistration triggers an
 * assertion.
 *
 * @param id is the id of the token that should be unregistered.
 */
void OsmlStreamParserImpl::unregisterToken(TokenId id)
{
    // The call must not be placed inside of assert(): the argument of
    // assert() is not evaluated if NDEBUG is defined, so the token would
    // never be unregistered in release builds.
    const bool unregistered = tokenizer.unregisterToken(id);
    assert(unregistered);
    (void)unregistered;  // Silences the unused warning in NDEBUG builds
}
/* Class OsmlStreamParser */
/**
* Constructor of the OsmlStreamParser class. Creates the internal
* OsmlStreamParserImpl instance for the given reader and logger.
*
* @param reader is the reader instance passed to the implementation.
* @param logger is the logger instance passed to the implementation.
*/
OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
: impl(new OsmlStreamParserImpl(reader, logger))
{
}
/**
* Destructor of the OsmlStreamParser class. Defined here, where
* OsmlStreamParserImpl is a complete type.
*/
OsmlStreamParser::~OsmlStreamParser()
{
// Stub needed because OsmlStreamParserImpl is incomplete in header
}
/**
* Forwards to OsmlStreamParserImpl::parse().
*
* @return the state returned by the implementation, converted to
* OsmlStreamParser::State by its numeric value.
*/
OsmlStreamParser::State OsmlStreamParser::parse()
{
return static_cast<State>(impl->parse());
}
/**
* Forwards to OsmlStreamParserImpl::getData().
*
* @return a reference at the TokenizedData instance of the implementation.
*/
const TokenizedData &OsmlStreamParser::getData() const
{
return impl->getData();
}
/**
* Forwards to OsmlStreamParserImpl::getCommandName().
*
* @return the name of the command at the top of the command stack.
*/
const Variant &OsmlStreamParser::getCommandName() const
{
return impl->getCommandName();
}
/**
* Forwards to OsmlStreamParserImpl::getCommandArguments().
*
* @return the arguments of the command at the top of the command stack.
*/
const Variant &OsmlStreamParser::getCommandArguments() const
{
return impl->getCommandArguments();
}
/**
* Forwards to OsmlStreamParserImpl::getLocation().
*
* @return the current location stored by the implementation.
*/
const SourceLocation &OsmlStreamParser::getLocation() const
{
return impl->getLocation();
}
/**
* Forwards to OsmlStreamParserImpl::inDefaultField().
*
* @return true if the command at the top of the command stack is in a default
* field.
*/
bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); }
/**
* Forwards to OsmlStreamParserImpl::inRangeCommand().
*
* @return true if the command at the top of the command stack is in its range
* field.
*/
bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); }
/**
* Forwards to OsmlStreamParserImpl::registerToken().
*
* @param token is the token string that should be registered.
* @return the id returned by the implementation.
*/
TokenId OsmlStreamParser::registerToken(const std::string &token)
{
return impl->registerToken(token);
}
/**
* Forwards to OsmlStreamParserImpl::unregisterToken().
*
* @param id is the id of the token that should be unregistered.
*/
void OsmlStreamParser::unregisterToken(TokenId id)
{
impl->unregisterToken(id);
}
}
|