/*
Ousía
Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include "Number.hpp"
#include "VariantReader.hpp"
#include "Utils.hpp"
namespace ousia {
/* Error Messages */
// TODO: Invent common system for error messages which allows localization
// TODO: Possibly adapt the clang error logging system
// The message pointers are declared "const char *const" so that neither the
// text nor the pointer itself can be modified after initialisation.

// Reported when a character is encountered that does not fit the grammar
static const char *const ERR_UNEXPECTED_CHAR = "Unexpected character";
// Reported when the input ends (or a delimiter is hit) in the middle of a value
static const char *const ERR_UNEXPECTED_END = "Unexpected end of literal";
// Reported when a quoted string contains a raw line break
static const char *const ERR_UNTERMINATED = "Unterminated literal";
// Reported for an unknown character following a backslash
static const char *const ERR_INVALID_ESCAPE = "Invalid escape sequence";
// Reported when a number was parsed but is not an integer
static const char *const ERR_INVALID_INTEGER = "Invalid integer value";
/**
 * Reports the given error message on the logger (the reader is handed to the
 * logger alongside the message, presumably to provide the source position) and
 * builds a "failed" result pair.
 *
 * @param reader is forwarded to the logger together with the message.
 * @param logger is the logger on which the error is reported.
 * @param err is the error message.
 * @param res is the value that is moved into the second entry of the result.
 * @return a pair whose first entry is always false and whose second entry is
 * the given res value.
 */
template
static std::pair error(CharReader &reader, Logger &logger,
const std::string &err, T res)
{
logger.error(err, reader);
return std::make_pair(false, std::move(res));
}
static std::string unexpectedMsg(const char *expected, const char got)
{
std::stringstream ss;
ss << ERR_UNEXPECTED_CHAR << ": Expected " << expected << " but got \'"
<< got << "\'";
return ss.str();
}
/**
 * Assembles a message of the form: Invalid <invalidType> "<invalidValue>"
 *
 * @param invalidType names the kind of value that was expected.
 * @param invalidValue is the offending value, which is put in double quotes.
 * @return the assembled message.
 */
static std::string invalidMsg(const char *invalidType,
                              const std::string &invalidValue)
{
    std::string msg{"Invalid "};
    msg.append(invalidType).append(" \"").append(invalidValue).append("\"");
    return msg;
}
/**
 * Convenience wrapper around error() which reports an "unexpected character"
 * message built by unexpectedMsg().
 *
 * @param reader is forwarded to error().
 * @param logger is the logger on which the error is reported.
 * @param expected is a textual description of the expected character(s).
 * @param got is the character that was actually encountered.
 * @param res is the value placed in the second entry of the result.
 * @return the result of error(), i.e. a pair whose first entry is false.
 */
template
static std::pair unexpected(CharReader &reader, Logger &logger,
const char *expected, const char got,
T res)
{
return error(reader, logger, unexpectedMsg(expected, got), res);
}
/* State machine states */
// States shared by the hand-written state machines in this file. Note that
// the value 3 is not assigned to any state.

// Start state: nothing of the value has been read yet
static constexpr int STATE_INIT = 0;
// Currently inside a string/token
static constexpr int STATE_IN_STRING = 1;
// Inside an array/object, about to read the next element
static constexpr int STATE_IN_COMPLEX = 2;
// A backslash was read inside a string, the escaped character follows
static constexpr int STATE_ESCAPE = 4;
// Only whitespace has been read since the last relevant character
static constexpr int STATE_WHITESPACE = 5;
// An error occurred, skip input until the next comma
static constexpr int STATE_RESYNC = 6;
// A key/value pair is complete, a comma has to follow
static constexpr int STATE_EXPECT_COMMA = 7;
// An element was read which is either an array entry or an object key
static constexpr int STATE_HAS_KEY = 8;
/* Helper function for parsing arrays or objects */
// TODO: Refactor this to own class
/**
 * Selects which kind of result parseComplex is allowed to / has to produce.
 */
enum class ComplexMode {
    // Only plain elements are allowed, "key=value" pairs are an error
    ARRAY,
    // The result is always returned as a map
    OBJECT,
    // The result is an array unless a "key=value" pair was encountered
    BOTH
};
/**
 * Builds the map key that is used for an element without an explicit key:
 * a hash sign followed by the decimal element index (e.g. "#3").
 *
 * @param idx is the index of the element.
 * @return the generated key.
 */
static std::string idxKey(size_t idx)
{
    std::string key{"#"};
    key += std::to_string(idx);
    return key;
}
/**
 * Combines the elements collected by parseComplex into the final variant.
 *
 * @param objectResult holds all elements that were stored under a key. If a
 * map is returned, the array elements are inserted into this map first (the
 * map passed by the caller is modified).
 * @param arrayResult holds all elements that were collected without a key.
 * @param isArray is true if no "key=value" pair has been encountered.
 * @param mode is the requested result mode. An array is only returned if
 * isArray is set and the mode is not ComplexMode::OBJECT.
 * @return either an array variant or a map variant.
 */
static Variant parseComplexResult(Variant::mapType &objectResult,
                                  Variant::arrayType &arrayResult, bool isArray,
                                  ComplexMode mode)
{
    const bool asMap = !isArray || mode == ComplexMode::OBJECT;
    if (asMap) {
        // Store the array elements in the map under their "#<index>" key.
        // insert() does not overwrite entries which are already present.
        size_t pos = 0;
        while (pos < arrayResult.size()) {
            objectResult.insert(std::make_pair(idxKey(pos), arrayResult[pos]));
            pos++;
        }
        return Variant{objectResult};
    }

    // Plain array, nothing needs to be merged
    return Variant{arrayResult};
}
/**
 * Parses a comma separated list of elements and/or "key=value" pairs into an
 * array or a map, using a small state machine.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delim is the character that ends the list. If it is zero, the list
 * has to start with '[' and ends with ']'. If it is non-zero, parsing starts
 * directly with the first element and ends at the given character.
 * @param mode specifies whether "key=value" pairs are allowed
 * (ComplexMode::ARRAY forbids them) and which kind of variant is returned (see
 * parseComplexResult).
 * @return a pair of a success flag and the parsed variant. The flag is false
 * if any error was logged; the variant then contains whatever could be parsed.
 * Reaching the end of the input before the closing delimiter is reported as
 * ERR_UNEXPECTED_END.
 */
static std::pair parseComplex(CharReader &reader, Logger &logger,
char delim, ComplexMode mode)
{
// Result for either objects or arrays
Variant::mapType objectResult;
Variant::arrayType arrayResult;
// Auxiliary state variables
bool hadError = false;
bool isArray = true;
// Determine the start state and set the actual delimiter
int state = delim ? STATE_IN_COMPLEX : STATE_INIT;
delim = delim ? delim : ']';
// Current array element index
size_t idx = 0;
// Current key value
Variant key;
// Consume all whitespace
reader.consumePeek();
reader.consumeWhitespace();
// Iterate over the characters, use the parseGeneric function to read the
// pairs
char c;
while (reader.peek(c)) {
// Generically handle the end of the array
if (state != STATE_INIT && c == delim) {
reader.consumePeek();
// Add final keys to the result
if (state == STATE_HAS_KEY) {
// The last element was not followed by a comma, store it now
if (isArray) {
arrayResult.push_back(key);
} else {
objectResult.insert(std::make_pair(idxKey(idx), key));
}
key = nullptr;
}
return std::make_pair(
!hadError,
parseComplexResult(objectResult, arrayResult, isArray, mode));
} else if (Utils::isWhitespace(c)) {
// Whitespace between the tokens is skipped in every state
reader.consumePeek();
continue;
}
switch (state) {
case STATE_INIT:
// Without an explicit delimiter the list has to start with '['
if (c != '[') {
return error(reader, logger, ERR_UNEXPECTED_CHAR,
parseComplexResult(objectResult, arrayResult,
isArray, mode));
}
state = STATE_IN_COMPLEX;
reader.consumePeek();
break;
case STATE_IN_COMPLEX: {
// Try to read an element using the parseGeneric function
reader.resetPeek();
auto elem = VariantReader::parseGenericToken(
reader, logger, {',', '=', delim}, true);
// If the reader had no error, expect a comma, otherwise skip
// to the next comma in the stream
if (elem.first) {
key = elem.second;
state = STATE_HAS_KEY;
} else {
state = STATE_RESYNC;
hadError = true;
}
break;
}
case STATE_HAS_KEY: {
// When finding an equals sign, read the value corresponding to
// the key
if (c == '=') {
// Abort if only arrays are allowed
if (mode == ComplexMode::ARRAY) {
logger.error(unexpectedMsg("\",\"", c), reader);
hadError = true;
state = STATE_RESYNC;
break;
}
// Make sure the key is a valid identifier, if not, issue
// an error
std::string keyString = key.toString();
if (!Utils::isIdentifier(keyString)) {
logger.error(invalidMsg("identifier", keyString),
reader);
hadError = true;
}
// This no longer is an array
isArray = false;
// Consume the equals sign and parse the value
reader.consumePeek();
auto elem = VariantReader::parseGenericToken(
reader, logger, {',', delim}, true);
if (elem.first) {
// Note: the pair is stored even if the key was reported as
// invalid identifier above
objectResult.insert(
std::make_pair(keyString, elem.second));
idx++;
} else {
state = STATE_RESYNC;
hadError = true;
key = nullptr;
break;
}
state = STATE_EXPECT_COMMA;
} else if (c == ',') {
// Simply add the previously read value to the result
// array or the result object
if (isArray) {
arrayResult.push_back(key);
} else {
objectResult.insert(std::make_pair(idxKey(idx), key));
}
idx++;
state = STATE_IN_COMPLEX;
reader.consumePeek();
} else {
// Neither "=" nor ",": report what would have been valid in the
// current mode and skip to the next comma
if (mode == ComplexMode::ARRAY) {
logger.error(unexpectedMsg("\",\"", c), reader);
} else {
logger.error(unexpectedMsg("\",\" or \"=\"", c),
reader);
}
state = STATE_RESYNC;
hadError = true;
}
// The key has been processed (or discarded), reset it
key = nullptr;
break;
}
case STATE_EXPECT_COMMA:
// A "key=value" pair has to be followed by a comma (the closing
// delimiter is handled at the top of the loop)
if (c == ',') {
state = STATE_IN_COMPLEX;
} else {
logger.error(unexpectedMsg("\",\"", c), reader);
state = STATE_RESYNC;
hadError = true;
}
reader.consumePeek();
break;
case STATE_RESYNC:
// Just wait for another comma to arrive
if (c == ',') {
state = STATE_IN_COMPLEX;
}
reader.consumePeek();
break;
}
}
// The input ended before the closing delimiter was found
return error(reader, logger, ERR_UNEXPECTED_END,
parseComplexResult(objectResult, arrayResult, isArray, mode));
}
/* Class Reader */
static bool encodeUtf8(std::stringstream &res, CharReader &reader,
Logger &logger, int64_t v, bool latin1)
{
// Encode the unicode codepoint as UTF-8
uint32_t cp = static_cast(v);
if (latin1 && cp > 0xFF) {
logger.error("Not a valid ISO-8859-1 (Latin-1) character, skipping",
reader);
return false;
}
// Append the code point to the output stream
try {
utf8::append(cp, std::ostream_iterator{res});
return true;
}
catch (utf8::invalid_code_point ex) {
logger.error("Invalid Unicode codepoint, skipping", reader);
}
return false;
}
/**
 * Parses a string literal enclosed in single or double quotes and resolves
 * the escape sequences it contains.
 *
 * Supported escapes are \b \f \n \r \t \v \' \" \\, a backslash directly
 * followed by a line break (produces no output), \xXX (two hex digits,
 * Latin-1), \uXXXX (four hex digits) and a backslash followed by three octal
 * digits (Latin-1).
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delims is an optional set of delimiters. If the first character after
 * the leading whitespace is one of them, ERR_UNEXPECTED_END is reported
 * instead of an "unexpected character" error.
 * @return a pair of a success flag and the unescaped string content. The flag
 * is false if the literal does not start with a quote, contains a raw line
 * break, contains an invalid escape sequence or is not closed before the end
 * of the input. The string contains everything collected up to that point.
 */
std::pair VariantReader::parseString(
CharReader &reader, Logger &logger, const std::unordered_set *delims)
{
// Initialize the internal state
bool hadError = false;
int state = STATE_INIT;
char quote = 0;
std::stringstream res;
// Consume all whitespace
reader.consumePeek();
reader.consumeWhitespace();
// State machine which iterates over each character in the stream
// TODO: Combination of peeking and consumePeek is stupid as consumePeek is
// the default (read and putBack would obviously be better, yet the latter
// is not trivial to implement in the current CharReader).
char c;
while (reader.peek(c)) {
switch (state) {
case STATE_INIT:
// Remember which quote character opened the string, only the
// same character closes it again
if (c == '"' || c == '\'') {
quote = c;
state = STATE_IN_STRING;
break;
} else if (delims && delims->count(c)) {
return error(reader, logger, ERR_UNEXPECTED_END, res.str());
}
return unexpected(reader, logger, "\" or \'", c, res.str());
case STATE_IN_STRING:
if (c == quote) {
reader.consumePeek();
return std::make_pair(!hadError, res.str());
} else if (c == '\\') {
state = STATE_ESCAPE;
reader.consumePeek();
break;
} else if (c == '\n') {
// Raw line breaks are not allowed inside a string literal
return error(reader, logger, ERR_UNTERMINATED, res.str());
}
res << c;
reader.consumePeek();
break;
case STATE_ESCAPE:
// Handle all possible special escape characters
switch (c) {
case 'b':
res << '\b';
break;
case 'f':
res << '\f';
break;
case 'n':
res << '\n';
break;
case 'r':
res << '\r';
break;
case 't':
res << '\t';
break;
case 'v':
res << '\v';
break;
case '\'':
res << '\'';
break;
case '"':
res << '"';
break;
case '\\':
res << '\\';
break;
case '\n':
// Backslash followed by a line break: nothing is appended
break;
case 'x': {
// Parse Latin-1 sequence \xXX
Number n;
hadError = !(n.parseFixedLengthInteger(reader, 2, 16,
logger) &&
encodeUtf8(res, reader, logger,
n.intValue(), true)) ||
hadError;
break;
}
case 'u': {
// Parse Unicode sequence \uXXXX
Number n;
hadError = !(n.parseFixedLengthInteger(reader, 4, 16,
logger) &&
encodeUtf8(res, reader, logger,
n.intValue(), false)) ||
hadError;
break;
}
default:
if (Utils::isNumeric(c)) {
// Parse Latin-1 sequence \000
// The peeked digit is part of the number, so the peek is
// reset before the number is parsed
reader.resetPeek();
Number n;
hadError = !(n.parseFixedLengthInteger(reader, 3, 8,
logger) &&
encodeUtf8(res, reader, logger,
n.intValue(), true)) ||
hadError;
} else {
logger.error(ERR_INVALID_ESCAPE, reader);
hadError = true;
}
break;
}
// Switch back to the "normal" state
state = STATE_IN_STRING;
reader.consumePeek();
break;
}
}
// The input ended before the closing quote was found
return error(reader, logger, ERR_UNEXPECTED_END, res.str());
}
/**
 * Reads a single token, i.e. a sequence of non-whitespace characters that
 * ends at the first whitespace character, at a delimiter or at the end of the
 * input. Leading whitespace is skipped. The terminating character is not
 * consumed (the peek is reset before returning).
 *
 * @param reader is the character source.
 * @param logger is not used by this function.
 * @param delims is the set of characters that end the token.
 * @return a pair of a success flag and the token. The flag is true if at
 * least one character was copied into the token.
 */
std::pair VariantReader::parseToken(
CharReader &reader, Logger &logger, const std::unordered_set &delims)
{
std::stringstream res;
char c;
// Consume all whitespace
reader.consumePeek();
reader.consumeWhitespace();
// Copy all characters, skip whitespace at the end
// STATE_WHITESPACE: no token character seen yet, STATE_IN_STRING: inside
// the token
int state = STATE_WHITESPACE;
while (reader.peek(c)) {
bool whitespace = Utils::isWhitespace(c);
if (delims.count(c) || (state == STATE_IN_STRING && whitespace)) {
reader.resetPeek();
return std::make_pair(state == STATE_IN_STRING, res.str());
} else if (!whitespace) {
state = STATE_IN_STRING;
res << c;
}
reader.consumePeek();
}
return std::make_pair(state == STATE_IN_STRING, res.str());
}
/**
 * Reads all characters up to the next delimiter or the end of the input.
 * Leading whitespace is skipped, whitespace between other characters is kept
 * and trailing whitespace is not added to the result. The delimiter itself is
 * not consumed (the peek is reset before returning).
 *
 * @param reader is the character source.
 * @param logger is not used by this function.
 * @param delims is the set of characters that end the string.
 * @return a pair whose first entry is always true and whose second entry is
 * the extracted string (which may be empty).
 */
std::pair VariantReader::parseUnescapedString(
CharReader &reader, Logger &logger, const std::unordered_set &delims)
{
std::stringstream res;
std::stringstream buf;
char c;
// Consume all whitespace
reader.consumePeek();
reader.consumeWhitespace();
// Copy all characters, skip whitespace at the end
int state = STATE_IN_STRING;
while (reader.peek(c)) {
if (delims.count(c)) {
reader.resetPeek();
return std::make_pair(true, res.str());
} else if (Utils::isWhitespace(c)) {
// Do not add trailing whitespaces to the output buffer. We have
// a temporary store (buf) here which we only append to the output
// buffer (res) if another character follows.
state = STATE_WHITESPACE;
buf << c;
} else {
// If we just had a sequence of whitespace, append it to the output
// buffer and continue
if (state == STATE_WHITESPACE) {
res << buf.str();
// Empty the temporary store and reset its stream state flags
buf.str(std::string{});
buf.clear();
state = STATE_IN_STRING;
}
res << c;
}
reader.consumePeek();
}
return std::make_pair(true, res.str());
}
/**
 * Parses the literals "true" or "false".
 *
 * The token is read from a fork of the reader (and a fork of the logger). The
 * forks are only committed if the token is one of the two literals, so the
 * given reader is presumably left at the start of the token otherwise.
 *
 * @param reader is the character source.
 * @param logger is the logger whose fork is passed to parseToken.
 * @return a pair of a success flag and the parsed value. If the token is
 * neither "true" nor "false", the pair (false, false) is returned.
 */
std::pair VariantReader::parseBool(CharReader &reader,
Logger &logger)
{
// first we consume all whitespaces.
reader.consumePeek();
reader.consumeWhitespace();
// then we try to find the words "true" or "false".
bool val = false;
CharReaderFork readerFork = reader.fork();
LoggerFork loggerFork = logger.fork();
// An empty delimiter set is used, the token ends at whitespace or at the
// end of the input
auto res = parseToken(readerFork, loggerFork, std::unordered_set{});
if (res.first) {
bool valid = false;
if (res.second == "true") {
val = true;
valid = true;
} else if (res.second == "false") {
val = false;
valid = true;
}
if (valid) {
readerFork.commit();
loggerFork.commit();
return std::make_pair(true, val);
}
}
return std::make_pair(false, val);
}
/**
 * Parses a number and requires it to be an integer.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delims is the set of delimiters passed on to Number::parse.
 * @return a pair of a success flag and the integer value of the parsed
 * number. The flag is false if the number could not be parsed or if it is not
 * an integer (ERR_INVALID_INTEGER is logged in the latter case).
 */
std::pair VariantReader::parseInteger(
CharReader &reader, Logger &logger, const std::unordered_set &delims)
{
Number n;
if (n.parse(reader, logger, delims)) {
// Only succeed if the parsed number is an integer, otherwise this is an
// error
if (n.isInt()) {
return std::make_pair(true, n.intValue());
} else {
return error(reader, logger, ERR_INVALID_INTEGER, n.intValue());
}
}
// Number::parse failed; no additional error is logged here
return std::make_pair(false, n.intValue());
}
/**
 * Parses a number and returns it as double.
 *
 * @param reader is the character source.
 * @param logger is the logger passed on to Number::parse.
 * @param delims is the set of delimiters passed on to Number::parse.
 * @return a pair of the result of Number::parse and the double value of the
 * number.
 */
std::pair VariantReader::parseDouble(
CharReader &reader, Logger &logger, const std::unordered_set &delims)
{
Number n;
bool res = n.parse(reader, logger, delims);
return std::make_pair(res, n.doubleValue());
}
/**
 * Parses an array by calling parseComplex with ComplexMode::ARRAY, i.e.
 * "key=value" pairs are reported as errors.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delim is the end delimiter passed to parseComplex. If it is zero,
 * the array has to be enclosed in '[' and ']'.
 * @return a pair of the success flag of parseComplex and the parsed array.
 */
std::pair VariantReader::parseArray(
CharReader &reader, Logger &logger, char delim)
{
auto res = parseComplex(reader, logger, delim, ComplexMode::ARRAY);
return std::make_pair(res.first, res.second.asArray());
}
/**
 * Parses an object (map) by calling parseComplex with ComplexMode::OBJECT.
 * Elements without an explicit key are stored under a "#<index>" key.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delim is the end delimiter passed to parseComplex. If it is zero,
 * the object has to be enclosed in '[' and ']'.
 * @return a pair of the success flag of parseComplex and the parsed map.
 */
std::pair VariantReader::parseObject(CharReader &reader,
Logger &logger,
char delim)
{
auto res = parseComplex(reader, logger, delim, ComplexMode::OBJECT);
return std::make_pair(res.first, res.second.asMap());
}
static const std::unordered_set cardDelims{' ', ',', '}', '-'};
std::pair VariantReader::parseCardinality(
CharReader &reader, Logger &logger)
{
// first we consume all whitespaces.
reader.consumePeek();
reader.consumeWhitespace();
// then we expect curly braces.
char c;
if (!reader.read(c) || c != '{') {
return unexpected(reader, logger, "{", c, Variant::cardinalityType{});
}
Variant::cardinalityType card{};
reader.consumePeek();
reader.consumeWhitespace();
// which should in turn be followed by ranges.
while (reader.peek(c)) {
if (Utils::isNumeric(c)) {
// in case of a numeric character we want to read an integer.
reader.resetPeek();
Number n;
if (!n.parse(reader, logger, cardDelims) || !n.isInt() ||
n.intValue() < 0) {
return error(reader, logger, "Invalid number for cardinality!",
Variant::cardinalityType{});
}
unsigned int start = (unsigned int)n.intValue();
// if we have that we might either find a } or , making this a
// range or a - leading us to expect another integer.
reader.consumePeek();
reader.consumeWhitespace();
if (!reader.peek(c)) {
error(reader, logger, ERR_UNEXPECTED_END,
Variant::cardinalityType{});
}
switch (c) {
case '}':
case ',':
card.merge({start});
reader.resetPeek();
break;
case '-': {
// get another integer.
reader.consumePeek();
reader.consumeWhitespace();
if (!reader.peek(c)) {
error(reader, logger, ERR_UNEXPECTED_END,
Variant::cardinalityType{});
}
reader.resetPeek();
Number n2;
if (!n2.parse(reader, logger, cardDelims) || !n2.isInt() ||
n2.intValue() < 0) {
return error(reader, logger,
"Invalid number for cardinality!",
Variant::cardinalityType{});
}
unsigned int end = (unsigned int)n2.intValue();
if (end <= start) {
return error(reader, logger,
std::string("The start of the range (") +
std::to_string(start) +
") was bigger (or equal) to the end "
"of the range (" +
std::to_string(end) + ")!",
Variant::cardinalityType{});
}
card.merge({start, end});
break;
}
default:
return unexpected(reader, logger, "}, , or -", c,
Variant::cardinalityType{});
}
} else {
switch (c) {
case '*':
// in case of a Kleene star we can construct the
// cardinality right away.
card.merge(Variant::rangeType::typeRangeFrom(0));
break;
case '<':
case '>': {
// in case of an open range we expect a number.
reader.consumePeek();
reader.consumeWhitespace();
Number n;
if (!n.parse(reader, logger, cardDelims)) {
return error(reader, logger,
"Expected number in an open range "
"specifier!",
Variant::cardinalityType{});
}
if (!n.isInt() || n.intValue() < 0) {
return error(reader, logger,
"Invalid number for cardinality!",
Variant::cardinalityType{});
}
if (c == '<') {
card.merge(Variant::rangeType{
0, (unsigned int)n.intValue() - 1});
} else {
card.merge(Variant::rangeType::typeRangeFrom(
(unsigned int)n.intValue() + 1));
}
break;
}
default:
return unexpected(reader, logger,
"Unsigned integer, *, < or >", c,
Variant::cardinalityType{});
}
}
// after we have parsed a range, read all whitespaces.
reader.consumePeek();
reader.consumeWhitespace();
// ... and check if we are at the end.
if (!reader.read(c)) {
error(reader, logger, ERR_UNEXPECTED_END,
Variant::cardinalityType{});
}
switch (c) {
case '}':
return std::make_pair(true, card);
case ',':
reader.consumePeek();
reader.consumeWhitespace();
break;
default:
return unexpected(reader, logger, "} or ,", c,
Variant::cardinalityType{});
}
}
return error(reader, logger, ERR_UNEXPECTED_END,
Variant::cardinalityType{});
}
/**
 * Repeatedly calls parseGenericToken until a delimiter or the end of the
 * input is reached and collects the parsed tokens.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delims is the set of characters at which parsing stops. The
 * delimiter itself is not consumed (the peek is reset before returning).
 * @return a pair of a success flag and the result. If exactly one token was
 * read, the token itself is returned, otherwise an array of all tokens with
 * its location set to the parsed range. If no token was read,
 * ERR_UNEXPECTED_END is reported. The flag is false if any token failed to
 * parse.
 */
std::pair VariantReader::parseGeneric(
CharReader &reader, Logger &logger, const std::unordered_set &delims)
{
Variant::arrayType arr;
char c;
bool hadError = false;
// Skip all peeked characters
reader.consumePeek();
// Read the start offset
const SourceOffset start = reader.getOffset();
// Parse generic tokens until the end of the stream or the delimiter is
// reached
while (reader.peek(c) && !delims.count(c)) {
reader.resetPeek();
auto res = parseGenericToken(reader, logger, delims);
hadError = hadError || !res.first;
// The token is stored even if it could not be parsed successfully
arr.push_back(res.second);
}
reader.resetPeek();
// The resulting array should not be empty
if (arr.empty()) {
return error(reader, logger, ERR_UNEXPECTED_END, nullptr);
}
// If there only one element was extracted, return this element instead of
// an array
if (arr.size() == 1) {
return std::make_pair(!hadError, arr[0]);
} else {
Variant res{arr};
res.setLocation({reader.getSourceId(), start, reader.getOffset()});
return std::make_pair(!hadError, res);
}
}
/**
 * Parses a single value whose type is determined by its first character.
 * The following alternatives are tried in this order:
 *
 *   1. a quote starts a string literal (parseString),
 *   2. a digit or '-' starts a number; if the number cannot be parsed, the
 *      later alternatives are tried,
 *   3. '{' starts a cardinality; if it cannot be parsed, the later
 *      alternatives are tried,
 *   4. '[' starts an array or object (parseComplex),
 *   5. everything else is read as token or unescaped string. The texts
 *      "true", "false" and "null" become the corresponding primitive values,
 *      valid identifiers become "magic" strings and everything else becomes
 *      an ordinary string.
 *
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delims is the set of characters which end the token.
 * @param extractUnescapedStrings if true, alternative 5 uses
 * parseUnescapedString (may contain whitespace), otherwise parseToken.
 * @return a pair of a success flag and the parsed variant. If the input is at
 * its end or starts with a delimiter, ERR_UNEXPECTED_END is reported.
 */
std::pair VariantReader::parseGenericToken(
CharReader &reader, Logger &logger, const std::unordered_set &delims,
bool extractUnescapedStrings)
{
char c;
// Skip all whitespace characters, read a character and abort if at the end
reader.consumePeek();
reader.consumeWhitespace();
if (!reader.fetch(c) || delims.count(c)) {
return error(reader, logger, ERR_UNEXPECTED_END, nullptr);
}
// Fetch the start offset
const SourceOffset start = reader.getOffset();
// Parse a string if a quote is reached
if (c == '"' || c == '\'') {
auto res = parseString(reader, logger);
Variant v = Variant::fromString(res.second);
v.setLocation({reader.getSourceId(), start, reader.getOffset()});
return std::make_pair(res.first, v);
}
// Try to parse everything that looks like a number as number
if (Utils::isNumeric(c) || c == '-') {
// Try to parse the number
// Forks are used so that a failed attempt can be discarded and the
// input can be parsed as something else below
Number n;
CharReaderFork readerFork = reader.fork();
LoggerFork loggerFork = logger.fork();
if (n.parse(readerFork, loggerFork, delims)) {
Variant v;
if (n.isInt()) {
// Integers have to fit into the integer type of the variant
if (n.intValue() <
std::numeric_limits::min() ||
n.intValue() >
std::numeric_limits::max()) {
logger.error("Number exceeds type limits.", reader);
return std::make_pair(false, v);
}
v = Variant{static_cast(n.intValue())};
} else {
v = Variant{n.doubleValue()};
}
readerFork.commit();
loggerFork.commit();
v.setLocation({reader.getSourceId(), start, reader.getOffset()});
return std::make_pair(true, v);
}
}
// Try to parse a cardinality
if (c == '{') {
CharReaderFork readerFork = reader.fork();
LoggerFork loggerFork = logger.fork();
auto res = parseCardinality(readerFork, loggerFork);
if (res.first) {
readerFork.commit();
loggerFork.commit();
Variant v{res.second};
v.setLocation({reader.getSourceId(), start, reader.getOffset()});
return std::make_pair(true, v);
}
}
// Try to parse an object
if (c == '[') {
// A delimiter of zero makes parseComplex expect the enclosing '[' and ']'
auto res = parseComplex(reader, logger, 0, ComplexMode::BOTH);
res.second.setLocation(
{reader.getSourceId(), start, reader.getOffset()});
return res;
}
// Otherwise parse a single token
std::pair res;
if (extractUnescapedStrings) {
res = parseUnescapedString(reader, logger, delims);
} else {
res = parseToken(reader, logger, delims);
}
// Handling for special primitive values
bool isSpecial = false;
Variant v;
if (res.first) {
if (res.second == "true") {
v = Variant{true};
isSpecial = true;
} else if (res.second == "false") {
v = Variant{false};
isSpecial = true;
} else if (res.second == "null") {
v = Variant{nullptr};
isSpecial = true;
}
}
// Check whether the parsed string is a valid identifier -- if yes, flag it
// as "magic" string
if (!isSpecial) {
if (Utils::isIdentifier(res.second)) {
v.setMagic(res.second.c_str());
} else {
v = Variant::fromString(res.second);
}
}
v.setLocation({reader.getSourceId(), start, reader.getOffset()});
return std::make_pair(res.first, v);
}
/**
 * Tries to interpret the complete given string as a single generic token. If
 * this is not possible (the string is empty or the token does not span the
 * whole string), the string itself is returned as string variant.
 *
 * @param str is the string that should be parsed.
 * @param logger is the logger on which errors are reported. Messages are
 * collected in a fork which is only committed if the token result is used.
 * @param sourceId is the source id passed to the reader and used for the
 * location of the result.
 * @param offs is the offset passed to the reader and used as start of the
 * location of the result.
 * @return the result of parseGenericToken if the whole string was consumed,
 * otherwise a pair of true and a string variant containing str.
 */
std::pair VariantReader::parseGenericString(
const std::string &str, Logger &logger, SourceId sourceId, size_t offs)
{
// If the given string is empty, just return it as a string (there is no
// other type for which something empty would be valid)
// TODO: How to integrate this into parseGenericToken?
if (!str.empty()) {
CharReader reader{str, sourceId, offs};
LoggerFork loggerFork = logger.fork();
// Try to parse a single token
std::pair res = parseGenericToken(
reader, loggerFork, std::unordered_set{}, true);
// If the string actually consisted of a single token, return that
// token
if (reader.atEnd()) {
loggerFork.commit();
return res;
}
}
// Otherwise return the given string as a string, set the location of the
// string correctly
Variant v = Variant::fromString(str);
v.setLocation({sourceId, offs, offs + str.size()});
return std::make_pair(true, v);
}
/**
 * Parses a value of the given type by dispatching to the matching parse
 * function and wraps the result in a variant.
 *
 * @param type is the type of the value that should be parsed.
 * @param reader is the character source.
 * @param logger is the logger on which errors are reported.
 * @param delims is the set of delimiters. For arrays, maps and objects it is
 * only taken into account if it contains exactly one character, which is
 * then used as end delimiter. It is not passed to the bool and cardinality
 * parsers.
 * @return a pair of a success flag and the parsed variant. For types which
 * are not handled here, the pair (false, Variant{}) is returned.
 */
std::pair VariantReader::parseTyped(
VariantType type, CharReader &reader, Logger &logger,
const std::unordered_set &delims)
{
switch (type) {
case VariantType::BOOL: {
auto res = parseBool(reader, logger);
return std::make_pair(res.first, Variant{res.second});
}
case VariantType::INT: {
auto res = parseInteger(reader, logger, delims);
// The parsed value has to fit into the integer type of the variant
if (res.second < std::numeric_limits::min() ||
res.second > std::numeric_limits::max()) {
logger.error("Number exceeds type limits.", reader);
return std::make_pair(false, Variant{});
}
return std::make_pair(
res.first, Variant{static_cast(res.second)});
}
case VariantType::DOUBLE: {
auto res = parseDouble(reader, logger, delims);
return std::make_pair(res.first, Variant{res.second});
}
case VariantType::STRING: {
auto res = parseString(reader, logger, delims);
return std::make_pair(res.first, Variant::fromString(res.second));
}
case VariantType::ARRAY: {
// A delimiter of zero means that the array has to be enclosed in
// '[' and ']'
char delim = 0;
if (delims.size() == 1) {
delim = *delims.begin();
}
auto res = parseArray(reader, logger, delim);
return std::make_pair(res.first, Variant{res.second});
}
case VariantType::MAP:
case VariantType::OBJECT: {
// Same delimiter handling as for arrays
char delim = 0;
if (delims.size() == 1) {
delim = *delims.begin();
}
auto res = parseObject(reader, logger, delim);
return std::make_pair(res.first, Variant{res.second});
}
case VariantType::CARDINALITY: {
auto res = parseCardinality(reader, logger);
return std::make_pair(res.first, Variant{res.second});
}
default:
break;
}
return std::make_pair(false, Variant{});
}
/**
 * Parses a value of the given type from a string. The complete string has to
 * be consumed by the parser.
 *
 * @param type is the type of the value that should be parsed.
 * @param str is the string that should be parsed.
 * @param logger is the logger on which errors are reported. Messages of the
 * parser are collected in a fork which is only committed if the whole string
 * was consumed.
 * @param sourceId is the source id passed to the reader and used for the
 * error location.
 * @param offs is the offset passed to the reader and used as start of the
 * error location.
 * @return the result of the reader based parseTyped overload if the whole
 * string was consumed. Otherwise "Not all input could be processed" is logged
 * and the pair (false, Variant{}) is returned.
 */
std::pair VariantReader::parseTyped(VariantType type,
const std::string &str,
Logger &logger,
SourceId sourceId,
size_t offs)
{
// create a char reader and forward the method.
CharReader reader{str, sourceId, offs};
LoggerFork loggerFork = logger.fork();
// An empty delimiter set is passed on
std::pair res =
parseTyped(type, reader, loggerFork, std::unordered_set{});
// If all content could be parsed, commit the result.
if (reader.atEnd()) {
loggerFork.commit();
return res;
}
// otherwise do not.
logger.error("Not all input could be processed",
{sourceId, offs, offs + str.size()});
return std::make_pair(false, Variant{});
}
}