summaryrefslogtreecommitdiff
path: root/src/core/common/VariantReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/common/VariantReader.cpp')
-rw-r--r--src/core/common/VariantReader.cpp625
1 files changed, 625 insertions, 0 deletions
diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp
new file mode 100644
index 0000000..e611842
--- /dev/null
+++ b/src/core/common/VariantReader.cpp
@@ -0,0 +1,625 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include "VariantReader.hpp"
+#include "Utils.hpp"
+
+namespace ousia {
+
+// TODO: Better error messages (like "Expected 'x' but got 'y'")
+// TODO: Replace delims with single char delim where possible
+// TODO: Use custom return value instead of std::pair
+// TODO: Allow buffered char reader to "fork"
+// TODO: Rename CharReader to shorter CharReader
+// TODO: Implement context in CharReader (to allow error messages to extract the
+// current line)
+
+/*
+ * Error Messages
+ *
+ * Message texts handed to Logger::errorAt by the parsing code in this file.
+ * ERR_UNEXPECTED_END is used both when a literal ends in an invalid parser
+ * state and when the input ends (or a delimiter is hit) before a value was
+ * read.
+ */
+
+static const char *ERR_UNEXPECTED_CHAR = "Unexpected character";
+static const char *ERR_UNEXPECTED_END = "Unexpected literal end";
+static const char *ERR_UNTERMINATED = "Unterminated literal";
+static const char *ERR_INVALID_ESCAPE = "Invalid escape sequence";
+static const char *ERR_INVALID_INTEGER = "Invalid integer value";
+static const char *ERR_TOO_LARGE = "Value too large to represent";
+
+/* Class Number */
+
+/**
+ * Class used internally to represent a number (integer or double). The number
+ * is represented by its components (base value a, nominator n, denominator d,
+ * exponent e, sign s and exponent sign sE). The represented value is
+ * s * (a + n / d) * 10^(sE * e).
+ */
+class Number {
+private:
+	/**
+	 * Represents the part of the number: Base value a, nominator n, exponent e.
+	 */
+	enum class Part { A, N, E };
+
+	/**
+	 * State used in the parser state machine
+	 */
+	enum class State {
+		INIT,
+		HAS_MINUS,
+		LEADING_ZERO,
+		LEADING_POINT,
+		INT,
+		HEX,
+		POINT,
+		EXP_INIT,
+		EXP_HAS_MINUS,
+		EXP
+	};
+
+	/**
+	 * Returns the numeric value of the given ASCII character (returns 0 for
+	 * '0', 1 for '1', 10 for 'A' or 'a' and so on up to 24 for 'O' or 'o').
+	 *
+	 * @param c is the character for which the numeric value should be returned.
+	 * @return the numeric value the character represents or -1 if the
+	 * character does not represent a digit.
+	 */
+	static int charValue(char c)
+	{
+		if (c >= '0' && c <= '9') {
+			return c & 0x0F;
+		}
+		if ((c >= 'A' && c <= 'O') || (c >= 'a' && c <= 'o')) {
+			return (c & 0x0F) + 9;
+		}
+		return -1;
+	}
+
+	/**
+	 * Appends a digit to a non-negative value, i.e. computes
+	 * value = value * base + digit, but only if the result is representable
+	 * in T. The check is performed before the multiplication, so no signed
+	 * overflow and no silent narrowing can occur.
+	 *
+	 * @param value is the non-negative value that should be extended.
+	 * @param base is the base of the number system (larger than one).
+	 * @param digit is the digit to append, 0 <= digit < base.
+	 * @return true if the digit was appended, false if the result would not
+	 * fit into T (value is left untouched in this case).
+	 */
+	template <typename T>
+	static bool appendDigit(T &value, int base, int digit)
+	{
+		const T limit = std::numeric_limits<T>::max();
+		if (value > (limit - digit) / base) {
+			return false;
+		}
+		value = static_cast<T>(value * base + digit);
+		return true;
+	}
+
+	/**
+	 * Appends the value of the character c to the internal number
+	 * representation and reports any errors that might occur.
+	 *
+	 * @param c is the character that should be interpreted as digit.
+	 * @param base is the base in which c should be interpreted.
+	 * @param p selects the component (a, n/d or e) the digit is appended to.
+	 * @param reader is passed to the logger together with the error message.
+	 * @param logger receives ERR_UNEXPECTED_CHAR if c is no valid digit in
+	 * the given base and ERR_TOO_LARGE if the component cannot hold the
+	 * extended value.
+	 * @return true if the digit was appended, false if an error was logged.
+	 */
+	bool appendChar(char c, int base, Part p, CharReader &reader,
+	                Logger &logger)
+	{
+		// Check whether the given character is valid
+		const int v = charValue(c);
+		if (v < 0 || v >= base) {
+			logger.errorAt(ERR_UNEXPECTED_CHAR, reader);
+			return false;
+		}
+
+		// Append the digit to the specified part, refusing to overflow
+		bool fits = true;
+		switch (p) {
+			case Part::A:
+				fits = appendDigit(a, base, v);
+				break;
+			case Part::N:
+				// n is always smaller than d, so n * base + v fits whenever
+				// d * base does
+				if (d > std::numeric_limits<int64_t>::max() / base) {
+					fits = false;
+				} else {
+					n = n * base + v;
+					d = d * base;
+				}
+				break;
+			case Part::E:
+				fits = appendDigit(e, base, v);
+				break;
+		}
+
+		if (!fits) {
+			logger.errorAt(ERR_TOO_LARGE, reader);
+			return false;
+		}
+		return true;
+	}
+
+public:
+	/**
+	 * Sign and exponent sign.
+	 */
+	int8_t s, sE;
+
+	/**
+	 * Exponent
+	 */
+	int16_t e;
+
+	/**
+	 * Base value, nominator, denominator
+	 */
+	int64_t a, n, d;
+
+	/**
+	 * Constructor of the number class. Initializes the components to
+	 * represent a positive zero.
+	 */
+	Number() : s(1), sE(1), e(0), a(0), n(0), d(1) {}
+
+	/**
+	 * Returns the represented double value.
+	 */
+	double doubleValue() const
+	{
+		return s * (a + (static_cast<double>(n) / static_cast<double>(d))) *
+		       std::pow(10.0, static_cast<double>(sE * e));
+	}
+
+	/**
+	 * Returns the represented integer value. Only a lossless operation, if the
+	 * number is an integer (as can be checked via the isInt method), otherwise
+	 * the exponent and the fractional value will be truncated.
+	 */
+	int64_t intValue() const { return s * a; }
+
+	/**
+	 * Returns true, if the number is an integer (has no fractional or
+	 * exponential part).
+	 */
+	bool isInt() const { return (n == 0) && (d == 1) && (e == 0); }
+
+	/**
+	 * Tries to parse the number from the given stream and logs any errors to
+	 * the given logger instance. Numbers are terminated by one of the given
+	 * delimiters.
+	 */
+	bool parse(CharReader &reader, Logger &logger,
+	           const std::unordered_set<char> &delims);
+};
+
+/**
+ * Reads a number from the reader using a small state machine. Leading
+ * whitespace is skipped, reading stops (without consuming) at the first
+ * whitespace or delimiter character. Supported are an optional leading minus,
+ * decimal integers, hexadecimal integers with "0x"/"0X" prefix, a fractional
+ * part and a decimal exponent with optional minus. The components of this
+ * instance are not reset by this function, digits are appended to whatever
+ * the instance currently holds.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives an error message if the literal is malformed.
+ * @param delims is the set of characters that terminate the number.
+ * @return true if a complete number was read, false if an error was logged.
+ * A hexadecimal prefix without any hexadecimal digit is an error.
+ */
+bool Number::parse(CharReader &reader, Logger &logger,
+                   const std::unordered_set<char> &delims)
+{
+	State state = State::INIT;
+	// Set as soon as at least one digit followed the "0x" prefix
+	bool hasHexDigits = false;
+	char c;
+
+	// Consume the first whitespace characters
+	reader.consumeWhitespace();
+
+	// Iterate over the FSM to extract numbers
+	while (reader.peek(c)) {
+		// Abort, once a delimiter or whitespace is reached
+		if (Utils::isWhitespace(c) || delims.count(c)) {
+			reader.resetPeek();
+			break;
+		}
+
+		// The character is not a whitespace character and not a delimiter
+		switch (state) {
+			case State::INIT:
+			case State::HAS_MINUS:
+				switch (c) {
+					case '-':
+						// Do not allow multiple minus signs
+						if (state == State::HAS_MINUS) {
+							logger.errorAt(ERR_UNEXPECTED_CHAR, reader);
+							return false;
+						}
+						state = State::HAS_MINUS;
+						s = -1;
+						break;
+					case '0':
+						// Remember a leading zero for the detection of "0x"
+						state = State::LEADING_ZERO;
+						break;
+					case '.':
+						// Remember a leading point as ".eXXX" is invalid
+						state = State::LEADING_POINT;
+						break;
+					default:
+						state = State::INT;
+						if (!appendChar(c, 10, Part::A, reader, logger)) {
+							return false;
+						}
+						break;
+				}
+				break;
+			case State::LEADING_ZERO:
+				if (c == 'x' || c == 'X') {
+					state = State::HEX;
+					break;
+				}
+				// fallthrough
+			case State::INT:
+				switch (c) {
+					case '.':
+						state = State::POINT;
+						break;
+					case 'e':
+					case 'E':
+						state = State::EXP_INIT;
+						break;
+					default:
+						state = State::INT;
+						if (!appendChar(c, 10, Part::A, reader, logger)) {
+							return false;
+						}
+						break;
+				}
+				break;
+			case State::HEX:
+				if (!appendChar(c, 16, Part::A, reader, logger)) {
+					return false;
+				}
+				hasHexDigits = true;
+				break;
+			case State::LEADING_POINT:
+			case State::POINT:
+				switch (c) {
+					case 'e':
+					case 'E':
+						// An exponent directly after a leading point (".e1")
+						// is not a number
+						if (state == State::LEADING_POINT) {
+							logger.errorAt(ERR_UNEXPECTED_CHAR, reader);
+							return false;
+						}
+						state = State::EXP_INIT;
+						break;
+					default:
+						state = State::POINT;
+						if (!appendChar(c, 10, Part::N, reader, logger)) {
+							return false;
+						}
+						break;
+				}
+				break;
+			case State::EXP_HAS_MINUS:
+			case State::EXP_INIT:
+				if (c == '-') {
+					// Do not allow multiple minus signs in the exponent
+					if (state == State::EXP_HAS_MINUS) {
+						logger.errorAt(ERR_UNEXPECTED_CHAR, reader);
+						return false;
+					}
+					state = State::EXP_HAS_MINUS;
+					sE = -1;
+				} else {
+					state = State::EXP;
+					if (!appendChar(c, 10, Part::E, reader, logger)) {
+						return false;
+					}
+				}
+				break;
+			case State::EXP:
+				if (!appendChar(c, 10, Part::E, reader, logger)) {
+					return false;
+				}
+				break;
+		}
+		reader.consumePeek();
+	}
+
+	// States in which ending is valid. A hexadecimal literal additionally
+	// needs at least one digit after its prefix ("0x" alone is no number).
+	// Log an error in all other cases.
+	const bool validEnd = state == State::LEADING_ZERO ||
+	                      state == State::INT || state == State::POINT ||
+	                      state == State::EXP ||
+	                      (state == State::HEX && hasHexDigits);
+	if (validEnd) {
+		return true;
+	}
+	logger.errorAt(ERR_UNEXPECTED_END, reader);
+	return false;
+}
+
+
+/*
+ * Class Reader
+ *
+ * State identifiers shared by the state machines of the VariantReader parse
+ * functions in this file: parseString uses STATE_INIT, STATE_IN_STRING and
+ * STATE_ESCAPE; parseArray uses STATE_INIT, STATE_IN_ARRAY,
+ * STATE_EXPECT_COMMA and STATE_RESYNC; parseUnescapedString uses
+ * STATE_IN_STRING and STATE_WHITESPACE.
+ */
+
+static const int STATE_INIT = 0;
+static const int STATE_IN_STRING = 1;
+static const int STATE_IN_ARRAY = 2;
+static const int STATE_EXPECT_COMMA = 3;
+static const int STATE_ESCAPE = 4;
+static const int STATE_WHITESPACE = 5;
+static const int STATE_RESYNC = 6;
+
+/**
+ * Helper used by the parse functions to report a failure: Logs the given
+ * message at the position of the reader and builds the "failed" result pair.
+ *
+ * @param reader is passed to the logger together with the message.
+ * @param logger is the logger the message is sent to.
+ * @param err is the error message.
+ * @param res is the value that should be returned alongside the failure flag.
+ * @return a pair with first set to false and second set to res.
+ */
+template <class T>
+static std::pair<bool, T> error(CharReader &reader, Logger &logger,
+                                const char *err, T res)
+{
+	logger.errorAt(err, reader);
+	std::pair<bool, T> failure(false, std::move(res));
+	return failure;
+}
+
+/**
+ * Parses a string literal enclosed in single or double quotes. Leading
+ * whitespace is skipped. Inside the literal a backslash starts an escape
+ * sequence; an unescaped newline inside the literal is an error.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives the error messages.
+ * @param delims is an optional set of delimiters. It is only consulted to
+ * choose the error message if the first character is no quote.
+ * @return a pair of the success flag and the characters read so far.
+ */
+std::pair<bool, std::string> VariantReader::parseString(
+    CharReader &reader, Logger &logger,
+    const std::unordered_set<char> *delims)
+{
+	// Escape sequences which map to exactly one character: {code, replacement}
+	static const char SIMPLE_ESCAPES[][2] = {
+	    {'b', '\b'}, {'f', '\f'}, {'n', '\n'},  {'r', '\r'}, {'t', '\t'},
+	    {'v', '\v'}, {'\'', '\''}, {'"', '"'}, {'\\', '\\'}};
+
+	std::string text;
+	char c;
+
+	// Consume all whitespace
+	reader.consumeWhitespace();
+
+	// The literal has to start with one of the two quote characters
+	if (!reader.peek(c)) {
+		return error(reader, logger, ERR_UNEXPECTED_END, text);
+	}
+	if (c != '"' && c != '\'') {
+		const bool atDelimiter = delims && delims->count(c);
+		return error(reader, logger,
+		             atDelimiter ? ERR_UNEXPECTED_END : ERR_UNEXPECTED_CHAR,
+		             text);
+	}
+	const char quote = c;
+
+	// Read the content of the literal. The opening quote is still only peeked
+	// and gets consumed together with the first character that follows.
+	// TODO: Combination of peeking and consumePeek is stupid as consumePeek is
+	// the default (read and putBack would obviously be better, yet the latter
+	// is not trivial to implement in the current CharReader).
+	bool escaped = false;
+	while (reader.peek(c)) {
+		if (escaped) {
+			// Look for a direct single character replacement
+			bool replaced = false;
+			for (const auto &esc : SIMPLE_ESCAPES) {
+				if (esc[0] == c) {
+					text += esc[1];
+					replaced = true;
+					break;
+				}
+			}
+
+			if (!replaced) {
+				// An escaped newline produces no output.
+				// TODO: Parse Latin-1 sequence hex XX ('x')
+				// TODO: Parse 16-Bit unicode character hex XXXX ('u')
+				// TODO: Parse octal 000 sequence (numeric character)
+				const bool skipped = c == '\n' || c == 'x' || c == 'u' ||
+				                     Utils::isNumeric(c);
+				if (!skipped) {
+					logger.errorAt(ERR_INVALID_ESCAPE, reader);
+				}
+			}
+
+			// Continue with the "normal" content
+			escaped = false;
+			reader.consumePeek();
+			continue;
+		}
+
+		if (c == quote) {
+			// Closing quote, the literal is complete
+			reader.consumePeek();
+			return std::make_pair(true, text);
+		}
+		if (c == '\\') {
+			escaped = true;
+			reader.consumePeek();
+			continue;
+		}
+		if (c == '\n') {
+			return error(reader, logger, ERR_UNTERMINATED, text);
+		}
+		text += c;
+		reader.consumePeek();
+	}
+	return error(reader, logger, ERR_UNEXPECTED_END, text);
+}
+
+/**
+ * Parses a comma separated list of values into an array. Each element is read
+ * using parseGeneric.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives the error messages.
+ * @param delim is the character that ends the array. If zero is given, the
+ * array has to start with '[' and ends with ']'. Otherwise the elements are
+ * expected to start right away.
+ * @return a pair of the success flag and the elements read. If an element
+ * could not be parsed, the value returned by parseGeneric is still stored,
+ * the flag is set to false and parsing resumes after the next comma.
+ */
+std::pair<bool, Variant::arrayType> VariantReader::parseArray(
+    CharReader &reader, Logger &logger, char delim)
+{
+	Variant::arrayType res;
+	bool hadError = false;
+	int state = delim ? STATE_IN_ARRAY : STATE_INIT;
+	delim = delim ? delim : ']';
+	char c;
+
+	// Consume all whitespace
+	reader.consumeWhitespace();
+
+	// Iterate over the characters, use the parseGeneric function to read the
+	// elements
+	while (reader.peek(c)) {
+		// Generically handle the end of the array
+		if (state != STATE_INIT && c == delim) {
+			reader.consumePeek();
+			return std::make_pair(!hadError, std::move(res));
+		}
+
+		switch (state) {
+			case STATE_INIT:
+				if (c != '[') {
+					return error(reader, logger, ERR_UNEXPECTED_CHAR, res);
+				}
+				state = STATE_IN_ARRAY;
+				reader.consumePeek();
+				break;
+			case STATE_IN_ARRAY: {
+				// Skip whitespace in front of an element here. Otherwise
+				// whitespace directly followed by the closing delimiter
+				// (as in "[ ]" or "[1, ]") would be handed to parseGeneric,
+				// which reports an error once it hits the delimiter.
+				if (Utils::isWhitespace(c)) {
+					reader.consumePeek();
+					break;
+				}
+
+				// Try to read an element using the parseGeneric function
+				reader.resetPeek();
+				auto elem = parseGeneric(reader, logger, {',', delim});
+				res.push_back(elem.second);
+
+				// If the reader had no error, expect a comma, otherwise skip
+				// to the next comma in the stream
+				if (elem.first) {
+					state = STATE_EXPECT_COMMA;
+				} else {
+					state = STATE_RESYNC;
+					hadError = true;
+				}
+				break;
+			}
+			case STATE_EXPECT_COMMA:
+				// Skip whitespace, anything else but a comma is an error
+				if (c == ',') {
+					state = STATE_IN_ARRAY;
+				} else if (!Utils::isWhitespace(c)) {
+					hadError = true;
+					state = STATE_RESYNC;
+					logger.errorAt(ERR_UNEXPECTED_CHAR, reader);
+				}
+				reader.consumePeek();
+				break;
+			case STATE_RESYNC:
+				// Just wait for another comma to arrive
+				if (c == ',') {
+					state = STATE_IN_ARRAY;
+				}
+				reader.consumePeek();
+				break;
+		}
+	}
+	return error(reader, logger, ERR_UNEXPECTED_END, res);
+}
+
+/**
+ * Reads all characters up to (but not including) the first delimiter or up to
+ * the end of the input. Whitespace at the beginning and at the end is not
+ * part of the result, whitespace between other characters is preserved.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger is not used by this function.
+ * @param delims is the set of characters that terminate the string.
+ * @return a pair of the success flag (always true) and the string read.
+ */
+std::pair<bool, std::string> VariantReader::parseUnescapedString(
+    CharReader &reader, Logger &logger,
+    const std::unordered_set<char> &delims)
+{
+	std::string text;
+	// Whitespace seen since the last non-whitespace character. Only becomes
+	// part of the result if another non-whitespace character follows.
+	std::string pendingWhitespace;
+	char c;
+
+	// Consume all whitespace
+	reader.consumeWhitespace();
+
+	for (;;) {
+		if (!reader.peek(c)) {
+			break;
+		}
+		if (delims.count(c) != 0) {
+			// Leave the delimiter in the stream
+			reader.resetPeek();
+			break;
+		}
+
+		if (Utils::isWhitespace(c)) {
+			pendingWhitespace += c;
+		} else {
+			text += pendingWhitespace;
+			pendingWhitespace.clear();
+			text += c;
+		}
+		reader.consumePeek();
+	}
+	return std::make_pair(true, text);
+}
+
+/**
+ * Parses an integer. A number with a fractional or exponential part is
+ * reported as ERR_INVALID_INTEGER.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives the error messages.
+ * @param delims is the set of characters that terminate the number.
+ * @return a pair of the success flag and the integer value of the number
+ * (which is also returned if the flag is false).
+ */
+std::pair<bool, int64_t> VariantReader::parseInteger(
+    CharReader &reader, Logger &logger,
+    const std::unordered_set<char> &delims)
+{
+	Number number;
+
+	// Errors while reading the number were already logged by the parser
+	if (!number.parse(reader, logger, delims)) {
+		return std::make_pair(false, number.intValue());
+	}
+
+	// A successfully parsed number that is no integer is an error here
+	if (!number.isInt()) {
+		return error(reader, logger, ERR_INVALID_INTEGER, number.intValue());
+	}
+	return std::make_pair(true, number.intValue());
+}
+
+/**
+ * Parses a number and returns it as double.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives the error messages.
+ * @param delims is the set of characters that terminate the number.
+ * @return a pair of the success flag and the double value of the number
+ * (which is also returned if the flag is false).
+ */
+std::pair<bool, double> VariantReader::parseDouble(
+    CharReader &reader, Logger &logger,
+    const std::unordered_set<char> &delims)
+{
+	Number number;
+	const bool ok = number.parse(reader, logger, delims);
+	const double value = number.doubleValue();
+	return std::make_pair(ok, value);
+}
+
+/**
+ * Parses a value of unknown type. Depending on the first non-whitespace
+ * character the value is read as quoted string, as number or as unescaped
+ * string. The unescaped strings "true", "false" and "null" are converted to
+ * the corresponding primitive values.
+ *
+ * @param reader is the reader the characters are read from.
+ * @param logger receives the error messages. Errors logged by a failed
+ * attempt to read a number are not withdrawn if the value is subsequently
+ * read as unescaped string.
+ * @param delims is the set of characters that terminate the value.
+ * @return a pair of the success flag and the value that was read. If the
+ * input ends or a delimiter is found before any value, the flag is false and
+ * the value is constructed from nullptr.
+ */
+std::pair<bool, Variant> VariantReader::parseGeneric(
+    CharReader &reader, Logger &logger,
+    const std::unordered_set<char> &delims)
+{
+	char c;
+
+	// Skip all whitespace characters, there has to be at least one character
+	// left which is not a delimiter
+	reader.consumeWhitespace();
+	if (!reader.peek(c) || delims.count(c)) {
+		return error(reader, logger, ERR_UNEXPECTED_END, nullptr);
+	}
+
+	// Parse a string if a quote is reached
+	const bool quoted = (c == '"') || (c == '\'');
+	if (quoted) {
+		auto str = parseString(reader, logger);
+		return std::make_pair(str.first, str.second.c_str());
+	}
+
+	// TODO: Parse struct descriptor (if c == '[')
+
+	// Try to parse everything that looks like a number as number
+	if (Utils::isNumeric(c) || c == '-') {
+		Number number;
+
+		// Work on a fork of the reader, the reader itself is only advanced
+		// once the fork is committed
+		CharReaderFork fork = reader.fork();
+
+		// TODO: Fork logger
+
+		if (number.parse(fork, logger, delims)) {
+			fork.commit();
+			if (!number.isInt()) {
+				return std::make_pair(true, number.doubleValue());
+			}
+			return std::make_pair(
+			    true,
+			    Variant{static_cast<Variant::intType>(number.intValue())});
+		}
+	}
+
+	// Parse an unescaped string in any other case
+	auto raw = parseUnescapedString(reader, logger, delims);
+
+	// Handling for special primitive values
+	if (raw.first) {
+		const std::string &word = raw.second;
+		if (word == "true") {
+			return std::make_pair(true, Variant{true});
+		}
+		if (word == "false") {
+			return std::make_pair(true, Variant{false});
+		}
+		if (word == "null") {
+			return std::make_pair(true, Variant{nullptr});
+		}
+	}
+	return std::make_pair(raw.first, raw.second.c_str());
+}
+}
+