summaryrefslogtreecommitdiff
path: root/src/plugins/plain/PlainFormatStreamReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/plain/PlainFormatStreamReader.cpp')
-rw-r--r--src/plugins/plain/PlainFormatStreamReader.cpp641
1 files changed, 0 insertions, 641 deletions
diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp
deleted file mode 100644
index 05769f0..0000000
--- a/src/plugins/plain/PlainFormatStreamReader.cpp
+++ /dev/null
@@ -1,641 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Logger.hpp>
-#include <core/common/Utils.hpp>
-#include <core/common/VariantReader.hpp>
-
-#include "PlainFormatStreamReader.hpp"
-
-namespace ousia {
-
-/**
- * Plain format default tokenizer.
- */
-class PlainFormatTokens : public DynamicTokenizer {
-public:
- /**
- * Id of the backslash token.
- */
- TokenTypeId Backslash;
-
- /**
- * Id of the line comment token.
- */
- TokenTypeId LineComment;
-
- /**
- * Id of the block comment start token.
- */
- TokenTypeId BlockCommentStart;
-
- /**
- * Id of the block comment end token.
- */
- TokenTypeId BlockCommentEnd;
-
- /**
- * Id of the field start token.
- */
- TokenTypeId FieldStart;
-
- /**
- * Id of the field end token.
- */
- TokenTypeId FieldEnd;
-
- /**
- * Registers the plain format tokens in the internal tokenizer.
- */
- PlainFormatTokens()
- {
- Backslash = registerToken("\\");
- LineComment = registerToken("%");
- BlockCommentStart = registerToken("%{");
- BlockCommentEnd = registerToken("}%");
- FieldStart = registerToken("{");
- FieldEnd = registerToken("}");
- }
-};
-
-static const PlainFormatTokens Tokens;
-
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
- /**
- * Internal character buffer.
- */
- std::vector<char> buf;
-
- /**
- * Start location of the character data.
- */
- SourceOffset start;
-
- /**
- * End location of the character data.
- */
- SourceOffset end;
-
-public:
- /**
- * Default constructor, initializes start and end with zeros.
- */
- DataHandler() : start(0), end(0) {}
-
- /**
- * Returns true if the internal buffer is empty.
- *
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
- */
- bool isEmpty() { return buf.empty(); }
-
- /**
- * Appends a single character to the internal buffer.
- *
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
- */
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
- {
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
- }
-
- /**
- * Appends a string to the internal buffer.
- *
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
- */
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
- {
- if (isEmpty()) {
- start = stringStart;
- }
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
- }
-
- /**
- * Converts the internal buffer to a variant with attached location
- * information.
- *
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
- */
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
-};
-
-PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,
- Logger &logger)
- : reader(reader), logger(logger), tokenizer(Tokens)
-{
- // Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true});
-}
-
-Variant PlainFormatStreamReader::parseIdentifier(size_t start, bool allowNSSep)
-{
- bool first = true;
- bool hasCharSiceNSSep = false;
- std::vector<char> identifier;
- size_t end = reader.getPeekOffset();
- char c, c2;
- while (reader.peek(c)) {
- // Abort if this character is not a valid identifer character
- if ((first && Utils::isIdentifierStartCharacter(c)) ||
- (!first && Utils::isIdentifierCharacter(c))) {
- identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) {
- identifier.push_back(c);
- } else {
- if (c == ':' && allowNSSep) {
- logger.error(
- "Expected character before and after namespace separator \":\"",
- reader);
- }
- reader.resetPeek();
- break;
- }
-
- // This is no longer the first character
- first = false;
-
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
-
- end = reader.getPeekOffset();
- reader.consumePeek();
- }
-
- // Return the identifier at its location
- Variant res =
- Variant::fromString(std::string(identifier.data(), identifier.size()));
- res.setLocation({reader.getSourceId(), start, end});
- return res;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseBeginCommand()
-{
- // Expect a '{' after the command
- reader.consumeWhitespace();
- if (!reader.expect('{')) {
- logger.error("Expected \"{\" after \\begin", reader);
- return State::NONE;
- }
-
- // Parse the name of the command that should be opened
- Variant commandName = parseIdentifier(reader.getOffset(), true);
- if (commandName.asString().empty()) {
- logger.error("Expected identifier", commandName);
- return State::ERROR;
- }
-
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- SourceOffset start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
- }
- }
-
- if (!reader.expect('}')) {
- logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Parse the arguments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
-
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), true);
-
- return State::COMMAND;
-}
-
-static bool checkStillInField(const PlainFormatStreamReader::Command &cmd,
- const Variant &endName, Logger &logger)
-{
- if (cmd.inField && !cmd.inRangeField) {
- logger.error(std::string("\\end in open field of command \"") +
- cmd.name.asString() + std::string("\""),
- endName);
- logger.note(std::string("Open command started here:"), cmd.name);
- return true;
- }
- return false;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseEndCommand()
-{
- // Expect a '{' after the command
- if (!reader.expect('{')) {
- logger.error("Expected \"{\" after \\end", reader);
- return State::NONE;
- }
-
- // Fetch the name of the command that should be ended here
- Variant name = parseIdentifier(reader.getOffset(), true);
-
- // Make sure the given command name is not empty
- if (name.asString().empty()) {
- logger.error("Expected identifier", name);
- return State::ERROR;
- }
-
- // Make sure the command name is terminated with a '}'
- if (!reader.expect('}')) {
- logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Unroll the command stack up to the last range command
- while (!commands.top().hasRange) {
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
- commands.pop();
- }
-
- // Make sure we're not in an open field of this command
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
-
- // Special error message if the top-level command is reached
- if (commands.size() == 1) {
- logger.error(std::string("Cannot end command \"") + name.asString() +
- std::string("\" here, no command open"),
- name);
- return State::ERROR;
- }
-
- // Inform the about command mismatches
- const Command &cmd = commands.top();
- if (commands.top().name.asString() != name.asString()) {
- logger.error(std::string("Trying to end command \"") +
- cmd.name.asString() +
- std::string("\", but open command is \"") +
- name.asString() + std::string("\""),
- name);
- logger.note("Last command was opened here:", cmd.name);
- return State::ERROR;
- }
-
- // Set the location to the location of the command that was ended, then end
- // the current command
- location = name.getLocation();
- commands.pop();
- return cmd.inRangeField ? State::FIELD_END : State::NONE;
-}
-
-Variant PlainFormatStreamReader::parseCommandArguments(Variant commandArgName)
-{
- // Parse the arguments using the universal VariantReader
- Variant commandArguments;
- if (reader.expect('[')) {
- auto res = VariantReader::parseObject(reader, logger, ']');
- commandArguments = res.second;
- } else {
- commandArguments = Variant::mapType{};
- }
-
- // Insert the parsed name, make sure "name" was not specified in the
- // arguments
- if (commandArgName.isString()) {
- auto res =
- commandArguments.asMap().emplace("name", std::move(commandArgName));
- if (!res.second) {
- logger.error("Name argument specified multiple times",
- SourceLocation{}, MessageMode::NO_CONTEXT);
- logger.note("First occurance is here: ", commandArgName);
- logger.note("Second occurance is here: ", res.first->second);
- }
- }
- return commandArguments;
-}
-
-void PlainFormatStreamReader::pushCommand(Variant commandName,
- Variant commandArguments,
- bool hasRange)
-{
- // Store the location on the stack
- location = commandName.getLocation();
-
- // Place the command on the command stack, remove the last commands if we're
- // not currently inside a field of these commands
- while (!commands.top().inField) {
- commands.pop();
- }
- commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false});
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseCommand(
- size_t start)
-{
- // Parse the commandName as a first identifier
- Variant commandName = parseIdentifier(start, true);
- if (commandName.asString().empty()) {
- logger.error("Empty command name", reader);
- return State::NONE;
- }
-
- // Handle the special "begin" and "end" commands
- const auto commandNameComponents =
- Utils::split(commandName.asString(), ':');
- const bool isBegin = commandNameComponents[0] == "begin";
- const bool isEnd = commandNameComponents[0] == "end";
- if (isBegin || isEnd) {
- if (commandNameComponents.size() > 1) {
- logger.error(
- "Special commands \"\\begin\" and \"\\end\" may not contain a "
- "namespace separator \":\"",
- commandName);
- }
- if (isBegin) {
- return parseBeginCommand();
- } else if (isEnd) {
- return parseEndCommand();
- }
- }
-
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
- }
- }
-
- // Parse the arugments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
-
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), false);
-
- return State::COMMAND;
-}
-
-void PlainFormatStreamReader::parseBlockComment()
-{
- DynamicToken token;
- size_t depth = 1;
- while (tokenizer.read(reader, token)) {
- if (token.type == Tokens.BlockCommentEnd) {
- depth--;
- if (depth == 0) {
- return;
- }
- }
- if (token.type == Tokens.BlockCommentStart) {
- depth++;
- }
- }
-
- // Issue an error if the file ends while we are in a block comment
- logger.error("File ended while being in a block comment", reader);
-}
-
-void PlainFormatStreamReader::parseLineComment()
-{
- char c;
- while (reader.read(c)) {
- if (c == '\n') {
- return;
- }
- }
-}
-
-bool PlainFormatStreamReader::checkIssueData(DataHandler &handler)
-{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
- location = data.getLocation();
- reader.resetPeek();
- return true;
- }
- return false;
-}
-
-bool PlainFormatStreamReader::checkIssueFieldStart()
-{
- // Fetch the current command, and check whether we're currently inside a
- // field of this command
- Command &cmd = commands.top();
- if (!cmd.inField) {
- // If this is a range command, we're now implicitly inside the field of
- // this command -- we'll have to issue a field start command!
- if (cmd.hasRange) {
- cmd.inField = true;
- cmd.inRangeField = true;
- reader.resetPeek();
- return true;
- }
-
- // This was not a range command, so obviously we're now inside within
- // a field of some command -- so unroll the commands stack until a
- // command with open field is reached
- while (!commands.top().inField) {
- commands.pop();
- }
- }
- return false;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parse()
-{
- // Handler for incomming data
- DataHandler handler;
-
- // Read tokens until the outer loop should be left
- DynamicToken token;
- while (tokenizer.peek(reader, token)) {
- const TokenTypeId type = token.type;
-
- // Special handling for Backslash and Text
- if (type == Tokens.Backslash) {
- // Before appending anything to the output data or starting a new
- // command, check whether FIELD_START has to be issued, as the
- // current command is a command with range
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Check whether a command starts now, without advancing the peek
- // cursor
- char c;
- if (!reader.fetchPeek(c)) {
- logger.error("Trailing backslash at the end of the file.",
- token);
- return State::END;
- }
-
- // Try to parse a command
- if (Utils::isIdentifierStartCharacter(c)) {
- // Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // Parse the actual command
- State res = parseCommand(token.location.getStart());
- switch (res) {
- case State::ERROR:
- throw LoggableException(
- "Last error was irrecoverable, ending parsing "
- "process");
- case State::NONE:
- continue;
- default:
- return res;
- }
- }
-
- // This was not a special character, just append the given character
- // to the data buffer, use the escape character start as start
- // location and the peek offset as end location
- reader.peek(c); // Peek the previously fetched character
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
- reader.consumePeek();
- continue;
- } else if (type == TextToken) {
- // Check whether FIELD_START has to be issued before appending text
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
- reader.consumePeek();
- continue;
- }
-
- // A non-text token was reached, make sure all pending data commands
- // have been issued
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // We will handle the token now, consume the peeked characters
- reader.consumePeek();
-
- // Update the location to the current token location
- location = token.location;
-
- if (token.type == Tokens.LineComment) {
- parseLineComment();
- } else if (token.type == Tokens.BlockCommentStart) {
- parseBlockComment();
- } else if (token.type == Tokens.FieldStart) {
- Command &cmd = commands.top();
- if (!cmd.inField) {
- cmd.inField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got field start token \"{\", but no command for which to "
- "start the field. Did you mean \"\\{\"?",
- token);
- } else if (token.type == Tokens.FieldEnd) {
- // Try to end an open field of the current command -- if the current
- // command is not inside an open field, end this command and try to
- // close the next one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- return State::FIELD_END;
- }
- commands.pop();
- } else {
- break;
- }
- }
- logger.error(
- "Got field end token \"}\", but there is no field to end. Did "
- "you mean \"\\}\"?",
- token);
- } else {
- logger.error("Unexpected token \"" + token.content + "\"", token);
- }
- }
-
- // Issue available data
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // Make sure all open commands and fields have been ended at the end of the
- // stream
- while (commands.size() > 1) {
- Command &cmd = commands.top();
- if (cmd.inField || cmd.hasRange) {
- logger.error("Reached end of stream, but command \"" +
- cmd.name.asString() + "\" has not been ended",
- cmd.name);
- }
- commands.pop();
- }
-
- location = SourceLocation{reader.getSourceId(), reader.getOffset()};
- return State::END;
-}
-
-const Variant &PlainFormatStreamReader::getCommandName()
-{
- return commands.top().name;
-}
-
-const Variant &PlainFormatStreamReader::getCommandArguments()
-{
- return commands.top().arguments;
-}
-}
-