diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-12-06 02:02:08 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2014-12-06 02:02:08 +0100 |
commit | b92ea632417f1736ae4c9429448cb88165b58570 (patch) | |
tree | 7d13a4edd7fe8c895946ce29e19be7e48d16d539 | |
parent | c5daa927663cb0eafa56495bded0d80b21d94e4d (diff) |
reimplemented number parsing (previous implementation was suboptimal, as it was not securely capable of detecting wheter a number is an integer or an double value
-rw-r--r-- | src/core/variant/Reader.cpp | 402 | ||||
-rw-r--r-- | src/core/variant/Reader.hpp | 6 | ||||
-rw-r--r-- | test/core/variant/ReaderTest.cpp | 134 |
3 files changed, 419 insertions, 123 deletions
diff --git a/src/core/variant/Reader.cpp b/src/core/variant/Reader.cpp index b189988..9215e31 100644 --- a/src/core/variant/Reader.cpp +++ b/src/core/variant/Reader.cpp @@ -18,7 +18,7 @@ #include <iostream> -#include <cassert> +#include <cmath> #include <sstream> #include <core/Utils.hpp> @@ -28,44 +28,282 @@ namespace ousia { namespace variant { -static const char *ERR_UNEXPECTED_CHARACTER = "Unexpected character"; -static const char *ERR_UNEXPECTED_END = "Unexpected end"; +/* Error Messages */ + +static const char *ERR_UNEXPECTED_CHAR = "Unexpected character"; +static const char *ERR_UNEXPECTED_END = "Unexpected literal end"; static const char *ERR_UNTERMINATED = "Unterminated literal"; static const char *ERR_INVALID_ESCAPE = "Invalid escape sequence"; -static const char *ERR_INVALID_INTEGER = "Sequence is not a valid integer"; -static const char *ERR_INVALID_DOUBLE = "Sequence is not a valid number"; +static const char *ERR_INVALID_INTEGER = "Invalid integer value"; +static const char *ERR_TOO_LARGE = "Value too large to represent"; -static const int STATE_INIT = 0; -static const int STATE_IN_STRING = 1; -static const int STATE_ESCAPE = 2; -static const int STATE_WHITESPACE = 3; +/* Class Number */ /** - * Used internally to extract the regexp [0-9xXe.-]* from the given buffered - * char reader. - * - * @param reader is the buffered char reader from which the sequence should be - * extracted. + * Class used internally to represent a number (integer or double). The number + * is represented by its components (base value a, nominator n, denominator d, + * exponent e, sign s and exponent sign sE). */ -static std::pair<bool, std::string> extractNumberSequence( - BufferedCharReader &reader) -{ - bool isInteger = true; - char c; - std::stringstream res; - while (reader.peek(&c)) { - isInteger = isInteger && !(c == '.' || c == 'e' || c == 'E'); - if (Utils::isHexadecimal(c) || c == '.' || c == '-' || c == 'e' || - c == 'E' || c == 'x' || c == 'X') { +class Number { +private: + /** + * Reprsents the part of the number: Base value a, nominator n, exponent e. + */ + enum class Part { A, N, E }; + + /** + * State used in the parser state machine + */ + enum class State { + INIT, + HAS_MINUS, + LEADING_ZERO, + LEADING_POINT, + INT, + HEX, + POINT, + EXP_INIT, + EXP_HAS_MINUS, + EXP + }; + + /** + * Returns the numeric value of the given ASCII character (returns 0 for + * '0', 1 for '1', 10 for 'A' and so on). + * + * @param c is the character for which the numeric value should be returned. + * @return the numeric value the character represents. + */ + static int charValue(char c) + { + if (c >= '0' && c <= '9') { + return c & 0x0F; + } + if ((c >= 'A' && c <= 'O') || (c >= 'a' && c <= 'o')) { + return (c & 0x0F) + 9; + } + return -1; + } + + /** + * Appends the value of the character c to the internal number + * representation and reports any errors that might occur. + */ + bool appendChar(char c, int base, Part p, BufferedCharReader &reader, + Logger &logger) + { + // Check whether the given character is valid + int v = charValue(c); + if (v < 0 || v >= base) { + logger.errorAt(ERR_UNEXPECTED_CHAR, reader); + return false; + } + + // Append the number to the specified part + switch (p) { + case Part::A: + a = a * base + v; + break; + case Part::N: + n = n * base + v; + d = d * base; + break; + case Part::E: + e = e * base + v; + break; + } + + // Check for any overflows + if (a < 0 || n < 0 || d < 0 || e < 0) { + logger.errorAt(ERR_TOO_LARGE, reader); + return false; + } + return true; + } + +public: + /** + * Sign and exponent sign. + */ + int8_t s, sE; + + /** + * Exponent + */ + int16_t e; + + /** + * Base value, nominator, denominator + */ + int64_t a, n, d; + + /** + * Constructor of the number class. + */ + Number() : s(1), sE(1), e(0), a(0), n(0), d(1) {} + + /** + * Returns the represented double value. + */ + double doubleValue() + { + return s * (a + ((double)n / (double)d)) * pow(10.0, (double)(sE * e)); + } + + /** + * Returns the represented integer value. Only a lossless operation, if the + * number is an integer (as can be checked via the isInt method), otherwise + * the exponent and the fractional value will be truncated. + */ + int64_t intValue() { return s * a; } + + /** + * Returns true, if the number is an integer (has no fractional or + * exponential part). + */ + bool isInt() { return (n == 0) && (d == 1) && (e == 0); } + + /** + * Tries to parse the number from the given stream and loggs any errors to + * the given logger instance. Numbers are terminated by one of the given + * delimiters. + */ + bool parse(BufferedCharReader &reader, Logger &logger, + const std::unordered_set<char> &delims) + { + State state = State::INIT; + char c; + + // Consume the first whitespace characters + reader.consumeWhitespace(); + + // Iterate over the FSM to extract numbers + while (reader.peek(&c)) { + // Abort, once a delimiter or whitespace is reached + if (Utils::isWhitespace(c) || delims.count(c)) { + reader.resetPeek(); + break; + } + + // The character is not a whitespace character and not a delimiter + switch (state) { + case State::INIT: + case State::HAS_MINUS: + switch (c) { + case '-': + // Do not allow multiple minus signs + if (state == State::HAS_MINUS) { + logger.errorAt(ERR_UNEXPECTED_CHAR, reader); + return false; + } + state = State::HAS_MINUS; + s = -1; + break; + case '0': + // Remember a leading zero for the detection of "0x" + state = State::LEADING_ZERO; + break; + case '.': + // Remember a leading point as ".eXXX" is invalid + state = State::LEADING_POINT; + break; + default: + state = State::INT; + if (!appendChar(c, 10, Part::A, reader, logger)) { + return false; + } + break; + } + break; + case State::LEADING_ZERO: + if (c == 'x' || c == 'X') { + state = State::HEX; + break; + } + // fallthrough + case State::INT: + switch (c) { + case '.': + state = State::POINT; + break; + case 'e': + case 'E': + state = State::EXP_INIT; + break; + default: + state = State::INT; + if (!appendChar(c, 10, Part::A, reader, logger)) { + return false; + } + break; + } + break; + case State::HEX: + if (!appendChar(c, 16, Part::A, reader, logger)) { + return false; + } + break; + case State::LEADING_POINT: + case State::POINT: + switch (c) { + case 'e': + case 'E': + if (state == State::LEADING_POINT) { + logger.errorAt(ERR_UNEXPECTED_CHAR, reader); + return false; + } + state = State::EXP_INIT; + break; + default: + state = State::POINT; + if (!appendChar(c, 10, Part::N, reader, logger)) { + return false; + } + break; + } + break; + case State::EXP_HAS_MINUS: + case State::EXP_INIT: + if (c == '-') { + if (state == State::EXP_HAS_MINUS) { + logger.errorAt(ERR_UNEXPECTED_CHAR, reader); + return false; + } + state = State::EXP_HAS_MINUS; + sE = -1; + } else { + state = State::EXP; + if (!appendChar(c, 10, Part::E, reader, logger)) { + return false; + } + } + break; + case State::EXP: + if (!appendChar(c, 10, Part::E, reader, logger)) { + return false; + } + break; + } reader.consumePeek(); - res << c; - } else { - reader.resetPeek(); - break; } + + // States in which ending is valid, in other states, log an error + if (state == State::LEADING_ZERO || state == State::HEX || + state == State::INT || state == State::POINT || + state == State::EXP) { + return true; + } + logger.errorAt(ERR_UNEXPECTED_END, reader); + return false; } - return std::make_pair(isInteger, res.str()); -} +}; + +/* Class Reader */ + +static const int STATE_INIT = 0; +static const int STATE_IN_STRING = 1; +static const int STATE_ESCAPE = 2; +static const int STATE_WHITESPACE = 3; template <class T> static std::pair<bool, T> error(BufferedCharReader &reader, Logger &logger, @@ -102,8 +340,7 @@ std::pair<bool, std::string> Reader::parseString( } else if (delims && delims->count(c)) { return error(reader, logger, ERR_UNEXPECTED_END, res.str()); } - return error(reader, logger, ERR_UNEXPECTED_CHARACTER, - res.str()); + return error(reader, logger, ERR_UNEXPECTED_CHAR, res.str()); case STATE_IN_STRING: if (c == quote) { reader.consumePeek(); @@ -211,77 +448,30 @@ std::pair<bool, std::string> Reader::parseUnescapedString( return std::make_pair(true, res.str()); } -static std::pair<bool, int64_t> parseExtractedInteger( - BufferedCharReader &reader, Logger &logger, const std::string &val) +std::pair<bool, int64_t> Reader::parseInteger( + BufferedCharReader &reader, Logger &logger, + const std::unordered_set<char> &delims) { - try { - size_t idx = 0; - bool valid = false; - int64_t res = 0; - - std::string prefix = val.substr(0, 2); - if (prefix == "0x" || prefix == "0X") { - // If the value starts with 0x or 0X parse a hexadecimal value - std::string hex = val.substr(2); - res = std::stoll(hex, &idx, 16); - valid = idx == hex.length(); + Number n; + if (n.parse(reader, logger, delims)) { + // Only succeed if the parsed number is an integer, otherwise this is an + // error + if (n.isInt()) { + return std::make_pair(true, n.intValue()); } else { - res = std::stoll(val, &idx, 10); - valid = idx == val.length(); - } - - if (!valid) { - return error(reader, logger, ERR_INVALID_INTEGER, 0L); - } - - return std::make_pair(valid, res); - } - catch (std::invalid_argument ex) { - return error(reader, logger, ERR_INVALID_INTEGER, 0L); - } -} - -static std::pair<bool, double> parseExtractedDouble(BufferedCharReader &reader, - Logger &logger, - const std::string &val) -{ - try { - size_t idx = 0; - double res = std::stod(val, &idx); - if (idx != val.length()) { - return error(reader, logger, ERR_INVALID_DOUBLE, 0.0); + return error(reader, logger, ERR_INVALID_INTEGER, n.intValue()); } - return std::make_pair(true, res); - } - catch (std::invalid_argument ex) { - return error(reader, logger, ERR_INVALID_DOUBLE, 0.0); } + return std::make_pair(false, n.intValue()); } -std::pair<bool, int64_t> Reader::parseInteger(BufferedCharReader &reader, - Logger &logger) -{ - // Skip all whitespace characters - reader.consumeWhitespace(); - - // Extract a number sequence, make sure it is an integer - auto num = extractNumberSequence(reader); - if (!num.first || num.second.empty()) { - return error(reader, logger, ERR_INVALID_INTEGER, 0L); - } - - return parseExtractedInteger(reader, logger, num.second); -} - -std::pair<bool, double> Reader::parseDouble(BufferedCharReader &reader, - Logger &logger) +std::pair<bool, double> Reader::parseDouble( + BufferedCharReader &reader, Logger &logger, + const std::unordered_set<char> &delims) { - // Skip all whitespace characters - reader.consumeWhitespace(); - - // Extract a number sequence, parse it as double - auto num = extractNumberSequence(reader); - return parseExtractedDouble(reader, logger, num.second); + Number n; + bool res = n.parse(reader, logger, delims); + return std::make_pair(res, n.doubleValue()); } std::pair<bool, Variant> Reader::parseGeneric( @@ -308,16 +498,20 @@ std::pair<bool, Variant> Reader::parseGeneric( // TODO: Parse struct descriptor } - if (Utils::isNumeric(c)) { + // Try to parse a number if a character in [0-9-] is reached + if (Utils::isNumeric(c) || c == '-') { reader.resetPeek(); - auto num = extractNumberSequence(reader); - if (num.first) { - auto res = parseExtractedInteger(reader, logger, num.second); - return std::make_pair( - res.first, - Variant{static_cast<Variant::intType>(res.second)}); + Number n; + if (n.parse(reader, logger, delims)) { + if (n.isInt()) { + return std::make_pair( + true, + Variant{static_cast<Variant::intType>(n.intValue())}); + } else { + return std::make_pair(true, n.doubleValue()); + } } else { - return parseExtractedDouble(reader, logger, num.second); + return std::make_pair(false, n.doubleValue()); } } diff --git a/src/core/variant/Reader.hpp b/src/core/variant/Reader.hpp index a0c6640..9de06bc 100644 --- a/src/core/variant/Reader.hpp +++ b/src/core/variant/Reader.hpp @@ -126,7 +126,8 @@ public: * integer. */ static std::pair<bool, int64_t> parseInteger( - BufferedCharReader &reader, Logger &logger); + BufferedCharReader &reader, Logger &logger, + const std::unordered_set<char> &delims); /** * Parses an double from the given buffered char reader instance until one @@ -138,7 +139,8 @@ public: * integer. */ static std::pair<bool, double> parseDouble( - BufferedCharReader &reader, Logger &logger); + BufferedCharReader &reader, Logger &logger, + const std::unordered_set<char> &delims); /** * Tries to parse the most specific item from the given stream until one of diff --git a/test/core/variant/ReaderTest.cpp b/test/core/variant/ReaderTest.cpp index c33f80d..3156042 100644 --- a/test/core/variant/ReaderTest.cpp +++ b/test/core/variant/ReaderTest.cpp @@ -24,10 +24,10 @@ namespace ousia { namespace variant { +Logger logger; + TEST(Reader, readString) { - TerminalLogger logger(std::cerr, true); - // Simple, double quoted string { BufferedCharReader reader("\"hello world\""); @@ -63,8 +63,6 @@ TEST(Reader, readString) TEST(Reader, parseUnescapedString) { - TerminalLogger logger(std::cerr, true); - // Simple case { BufferedCharReader reader("hello world;"); @@ -100,41 +98,143 @@ TEST(Reader, parseUnescapedString) TEST(Reader, parseInteger) { - TerminalLogger logger(std::cerr, true); + // Valid integers + { + BufferedCharReader reader("0 "); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(0, res.second); + } { - BufferedCharReader reader("42"); - auto res = Reader::parseInteger(reader, logger); + BufferedCharReader reader("42 "); + auto res = Reader::parseInteger(reader, logger, {}); ASSERT_TRUE(res.first); ASSERT_EQ(42, res.second); } { BufferedCharReader reader("-42"); - auto res = Reader::parseInteger(reader, logger); + auto res = Reader::parseInteger(reader, logger, {}); ASSERT_TRUE(res.first); ASSERT_EQ(-42, res.second); } { - BufferedCharReader reader("0x42"); - auto res = Reader::parseInteger(reader, logger); + BufferedCharReader reader(" -0x4A2 "); + auto res = Reader::parseInteger(reader, logger, {}); ASSERT_TRUE(res.first); - ASSERT_EQ(0x42, res.second); + ASSERT_EQ(-0x4A2, res.second); } -/* { - BufferedCharReader reader("0Xaffe"); - auto res = Reader::parseInteger(reader, logger); + { + BufferedCharReader reader(" 0Xaffe"); + auto res = Reader::parseInteger(reader, logger, {}); ASSERT_TRUE(res.first); ASSERT_EQ(0xAFFE, res.second); - }*/ + } + + { + BufferedCharReader reader("0x7FFFFFFFFFFFFFFF"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(0x7FFFFFFFFFFFFFFFL, res.second); + } + + { + BufferedCharReader reader("-0x7FFFFFFFFFFFFFFF"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(-0x7FFFFFFFFFFFFFFFL, res.second); + } + + // Invalid integers + { + BufferedCharReader reader("-"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_FALSE(res.first); + } + + { + BufferedCharReader reader("0a"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_FALSE(res.first); + } + + { + BufferedCharReader reader("-0xag"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_FALSE(res.first); + } + + { + BufferedCharReader reader("0x8000000000000000"); + auto res = Reader::parseInteger(reader, logger, {}); + ASSERT_FALSE(res.first); + } } -TEST(Reader, parseGeneric) +TEST(Reader, parseDouble) { - TerminalLogger logger(std::cerr, true); + // Valid doubles + { + BufferedCharReader reader("1.25"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(1.25, res.second); + } + + { + BufferedCharReader reader(".25"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(.25, res.second); + } + + { + BufferedCharReader reader(".25e1"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(2.5, res.second); + } + + { + BufferedCharReader reader("-2.5e-1"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(-0.25, res.second); + } + + { + BufferedCharReader reader("-50e-2"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_TRUE(res.first); + ASSERT_EQ(-0.5, res.second); + } + + { + BufferedCharReader reader("-50.e-2"); + auto res = Reader::parseDouble(reader, logger, {'.'}); + ASSERT_TRUE(res.first); + ASSERT_EQ(-50, res.second); + } + // Invalid doubles + { + BufferedCharReader reader(".e1"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_FALSE(res.first); + } + + { + BufferedCharReader reader("0e100000"); + auto res = Reader::parseDouble(reader, logger, {}); + ASSERT_FALSE(res.first); + } +} + +TEST(Reader, parseGeneric) +{ // Simple case, unescaped string { BufferedCharReader reader("hello world"); |