/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include "OsmlStreamParser.hpp" namespace ousia { /** * Plain format default tokenizer. */ class PlainFormatTokens : public Tokenizer { public: /** * Id of the backslash token. */ TokenTypeId Backslash; /** * Id of the line comment token. */ TokenTypeId LineComment; /** * Id of the block comment start token. */ TokenTypeId BlockCommentStart; /** * Id of the block comment end token. */ TokenTypeId BlockCommentEnd; /** * Id of the field start token. */ TokenTypeId FieldStart; /** * Id of the field end token. */ TokenTypeId FieldEnd; /** * Id of the default field start token. */ TokenTypeId DefaultFieldStart; /** * Registers the plain format tokens in the internal tokenizer. */ PlainFormatTokens() { Backslash = registerToken("\\"); LineComment = registerToken("%"); BlockCommentStart = registerToken("%{"); BlockCommentEnd = registerToken("}%"); FieldStart = registerToken("{"); FieldEnd = registerToken("}"); DefaultFieldStart = registerToken("{!"); } }; static const PlainFormatTokens Tokens; /** * Class used internally to collect data issued via "DATA" event. */ class DataHandler { private: /** * Internal character buffer. */ std::vector buf; /** * Start location of the character data. */ SourceOffset start; /** * End location of the character data. */ SourceOffset end; public: /** * Default constructor, initializes start and end with zeros. */ DataHandler() : start(0), end(0) {} /** * Returns true if the internal buffer is empty. * * @return true if no characters were added to the internal buffer, false * otherwise. */ bool isEmpty() { return buf.empty(); } /** * Appends a single character to the internal buffer. * * @param c is the character that should be added to the internal buffer. * @param charStart is the start position of the character. * @param charEnd is the end position of the character. */ void append(char c, SourceOffset charStart, SourceOffset charEnd) { if (isEmpty()) { start = charStart; } buf.push_back(c); end = charEnd; } /** * Appends a string to the internal buffer. * * @param s is the string that should be added to the internal buffer. * @param stringStart is the start position of the string. * @param stringEnd is the end position of the string. */ void append(const std::string &s, SourceOffset stringStart, SourceOffset stringEnd) { if (isEmpty()) { start = stringStart; } std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); end = stringEnd; } /** * Converts the internal buffer to a variant with attached location * information. * * @param sourceId is the source id which is needed for building the * location information. * @return a Variant with the internal buffer content as string and * the correct start and end location. */ Variant toVariant(SourceId sourceId) { Variant res = Variant::fromString(std::string(buf.data(), buf.size())); res.setLocation({sourceId, start, end}); return res; } }; OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true, false}); } Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; bool hasCharSiceNSSep = false; std::vector identifier; size_t end = reader.getPeekOffset(); char c, c2; while (reader.peek(c)) { // Abort if this character is not a valid identifer character if ((first && Utils::isIdentifierStartCharacter(c)) || (!first && Utils::isIdentifierCharacter(c))) { identifier.push_back(c); } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { identifier.push_back(c); } else { if (c == ':' && allowNSSep) { logger.error( "Expected character before and after namespace separator " "\":\"", reader); } reader.resetPeek(); break; } // This is no longer the first character first = false; // Advance the hasCharSiceNSSep flag hasCharSiceNSSep = allowNSSep && (c != ':'); end = reader.getPeekOffset(); reader.consumePeek(); } // Return the identifier at its location Variant res = Variant::fromString(std::string(identifier.data(), identifier.size())); res.setLocation({reader.getSourceId(), start, end}); return res; } OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); if (!reader.expect('{')) { logger.error("Expected \"{\" after \\begin", reader); return State::NONE; } // Parse the name of the command that should be opened Variant commandName = parseIdentifier(reader.getOffset(), true); if (commandName.asString().empty()) { logger.error("Expected identifier", commandName); return State::ERROR; } // Check whether the next character is a '#', indicating the start of the // command name Variant commandArgName; SourceOffset start = reader.getOffset(); if (reader.expect('#')) { commandArgName = parseIdentifier(start); if (commandArgName.asString().empty()) { logger.error("Expected identifier after \"#\"", commandArgName); } } if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); return State::ERROR; } // Parse the arguments Variant commandArguments = parseCommandArguments(std::move(commandArgName)); // Push the command onto the command stack pushCommand(std::move(commandName), std::move(commandArguments), true); return State::COMMAND; } static bool checkStillInField(const OsmlStreamParser::Command &cmd, const Variant &endName, Logger &logger) { if (cmd.inField && !cmd.inRangeField) { logger.error(std::string("\\end in open field of command \"") + cmd.name.asString() + std::string("\""), endName); logger.note(std::string("Open command started here:"), cmd.name); return true; } return false; } OsmlStreamParser::State OsmlStreamParser::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { logger.error("Expected \"{\" after \\end", reader); return State::NONE; } // Fetch the name of the command that should be ended here Variant name = parseIdentifier(reader.getOffset(), true); // Make sure the given command name is not empty if (name.asString().empty()) { logger.error("Expected identifier", name); return State::ERROR; } // Make sure the command name is terminated with a '}' if (!reader.expect('}')) { logger.error("Expected \"}\"", reader); return State::ERROR; } // Unroll the command stack up to the last range command while (!commands.top().hasRange) { if (checkStillInField(commands.top(), name, logger)) { return State::ERROR; } commands.pop(); } // Make sure we're not in an open field of this command if (checkStillInField(commands.top(), name, logger)) { return State::ERROR; } // Special error message if the top-level command is reached if (commands.size() == 1) { logger.error(std::string("Cannot end command \"") + name.asString() + std::string("\" here, no command open"), name); return State::ERROR; } // Inform the about command mismatches const Command &cmd = commands.top(); if (commands.top().name.asString() != name.asString()) { logger.error(std::string("Trying to end command \"") + cmd.name.asString() + std::string("\", but open command is \"") + name.asString() + std::string("\""), name); logger.note("Last command was opened here:", cmd.name); return State::ERROR; } // Set the location to the location of the command that was ended, then end // the current command location = name.getLocation(); commands.pop(); return cmd.inRangeField ? State::FIELD_END : State::NONE; } Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; if (reader.expect('[')) { auto res = VariantReader::parseObject(reader, logger, ']'); commandArguments = res.second; } else { commandArguments = Variant::mapType{}; } // Insert the parsed name, make sure "name" was not specified in the // arguments if (commandArgName.isString()) { auto res = commandArguments.asMap().emplace("name", std::move(commandArgName)); if (!res.second) { logger.error("Name argument specified multiple times", SourceLocation{}, MessageMode::NO_CONTEXT); logger.note("First occurance is here: ", commandArgName); logger.note("Second occurance is here: ", res.first->second); } } return commandArguments; } void OsmlStreamParser::pushCommand(Variant commandName, Variant commandArguments, bool hasRange) { // Store the location on the stack location = commandName.getLocation(); // Place the command on the command stack, remove the last commands if we're // not currently inside a field of these commands while (!commands.top().inField) { commands.pop(); } commands.push(Command{std::move(commandName), std::move(commandArguments), hasRange, false, false, false}); } OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); if (commandName.asString().empty()) { logger.error("Empty command name", reader); return State::NONE; } // Handle the special "begin" and "end" commands const auto commandNameComponents = Utils::split(commandName.asString(), ':'); const bool isBegin = commandNameComponents[0] == "begin"; const bool isEnd = commandNameComponents[0] == "end"; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( "Special commands \"\\begin\" and \"\\end\" may not contain a " "namespace separator \":\"", commandName); } if (isBegin) { return parseBeginCommand(); } else if (isEnd) { return parseEndCommand(); } } // Check whether the next character is a '#', indicating the start of the // command name Variant commandArgName; start = reader.getOffset(); if (reader.expect('#')) { commandArgName = parseIdentifier(start); if (commandArgName.asString().empty()) { logger.error("Expected identifier after \"#\"", commandArgName); } } // Parse the arugments Variant commandArguments = parseCommandArguments(std::move(commandArgName)); // Push the command onto the command stack pushCommand(std::move(commandName), std::move(commandArguments), false); return State::COMMAND; } void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; while (tokenizer.read(reader, token)) { if (token.type == Tokens.BlockCommentEnd) { depth--; if (depth == 0) { return; } } if (token.type == Tokens.BlockCommentStart) { depth++; } } // Issue an error if the file ends while we are in a block comment logger.error("File ended while being in a block comment", reader); } void OsmlStreamParser::parseLineComment() { char c; while (reader.read(c)) { if (c == '\n') { return; } } } bool OsmlStreamParser::checkIssueData(DataHandler &handler) { if (!handler.isEmpty()) { data = handler.toVariant(reader.getSourceId()); location = data.getLocation(); reader.resetPeek(); return true; } return false; } bool OsmlStreamParser::checkIssueFieldStart() { // Fetch the current command, and check whether we're currently inside a // field of this command Command &cmd = commands.top(); if (!cmd.inField) { // If this is a range command, we're now implicitly inside the field of // this command -- we'll have to issue a field start command! if (cmd.hasRange) { cmd.inField = true; cmd.inRangeField = true; reader.resetPeek(); return true; } // This was not a range command, so obviously we're now inside within // a field of some command -- so unroll the commands stack until a // command with open field is reached while (!commands.top().inField) { commands.pop(); } } return false; } bool OsmlStreamParser::closeField() { // Try to end an open field of the current command -- if the current command // is not inside an open field, end this command and try to close the next // one for (int i = 0; i < 2 && commands.size() > 1; i++) { Command &cmd = commands.top(); if (!cmd.inRangeField) { if (cmd.inField) { cmd.inField = false; if (cmd.inDefaultField) { commands.pop(); } return true; } commands.pop(); } else { return false; } } return false; } OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data DataHandler handler; // Read tokens until the outer loop should be left Token token; while (tokenizer.peek(reader, token)) { const TokenTypeId type = token.type; // Special handling for Backslash and Text if (type == Tokens.Backslash) { // Before appending anything to the output data or starting a new // command, check whether FIELD_START has to be issued, as the // current command is a command with range if (checkIssueFieldStart()) { location = token.location; return State::FIELD_START; } // Check whether a command starts now, without advancing the peek // cursor char c; if (!reader.fetchPeek(c)) { logger.error("Trailing backslash at the end of the file.", token); return State::END; } // Try to parse a command if (Utils::isIdentifierStartCharacter(c)) { // Make sure to issue any data before it is to late if (checkIssueData(handler)) { return State::DATA; } // Parse the actual command State res = parseCommand(token.location.getStart()); switch (res) { case State::ERROR: throw LoggableException( "Last error was irrecoverable, ending parsing " "process"); case State::NONE: continue; default: return res; } } // This was not a special character, just append the given character // to the data buffer, use the escape character start as start // location and the peek offset as end location reader.peek(c); // Peek the previously fetched character handler.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); continue; } else if (type == TextToken) { // Check whether FIELD_START has to be issued before appending text if (checkIssueFieldStart()) { location = token.location; return State::FIELD_START; } // Append the text to the data handler handler.append(token.content, token.location.getStart(), token.location.getEnd()); reader.consumePeek(); continue; } // A non-text token was reached, make sure all pending data commands // have been issued if (checkIssueData(handler)) { return State::DATA; } // We will handle the token now, consume the peeked characters reader.consumePeek(); // Update the location to the current token location location = token.location; if (token.type == Tokens.LineComment) { parseLineComment(); } else if (token.type == Tokens.BlockCommentStart) { parseBlockComment(); } else if (token.type == Tokens.FieldStart) { Command &cmd = commands.top(); if (!cmd.inField) { cmd.inField = true; return State::FIELD_START; } logger.error( "Got field start token \"{\", but no command for which to " "start the field. Write \"\\{\" to insert this sequence as " "text.", token); } else if (token.type == Tokens.FieldEnd) { if (closeField()) { return State::FIELD_END; } logger.error( "Got field end token \"}\", but there is no field to end. " "Write \"\\}\" to insert this sequence as text.", token); } else if (token.type == Tokens.DefaultFieldStart) { // Try to start a default field the first time the token is reached Command &topCmd = commands.top(); if (!topCmd.inField) { topCmd.inField = true; topCmd.inDefaultField = true; return State::FIELD_START; } logger.error( "Got default field start token \"{!\", but no command for " "which to start the field. Write \"\\{!\" to insert this " "sequence as text", token); } else { logger.error("Unexpected token \"" + token.content + "\"", token); } } // Issue available data if (checkIssueData(handler)) { return State::DATA; } // Make sure all open commands and fields have been ended at the end of the // stream while (commands.size() > 1) { Command &cmd = commands.top(); if (cmd.inField || cmd.hasRange) { logger.error("Reached end of stream, but command \"" + cmd.name.asString() + "\" has not been ended", cmd.name); } commands.pop(); } location = SourceLocation{reader.getSourceId(), reader.getOffset()}; return State::END; } const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; } const Variant &OsmlStreamParser::getCommandArguments() const { return commands.top().arguments; } bool OsmlStreamParser::inDefaultField() const { return commands.top().inRangeField || commands.top().inDefaultField; } }