diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-01-11 00:55:55 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-01-11 00:55:55 +0100 |
commit | b7d0f6517d19b6f8c544d6fa88a5f17b3c591a03 (patch) | |
tree | aa0c18c7af52210eafc4cc23bf561984ac6977b5 /src/core/common | |
parent | dd185959b9fff68008d02979fefde602d177ce75 (diff) |
Refactored number class out of VariantReader and added documentation
Diffstat (limited to 'src/core/common')
-rw-r--r-- | src/core/common/Number.cpp | 259 | ||||
-rw-r--r-- | src/core/common/Number.hpp | 171 | ||||
-rw-r--r-- | src/core/common/VariantReader.cpp | 297 |
3 files changed, 437 insertions, 290 deletions
diff --git a/src/core/common/Number.cpp b/src/core/common/Number.cpp new file mode 100644 index 0000000..12f64ae --- /dev/null +++ b/src/core/common/Number.cpp @@ -0,0 +1,259 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <cmath> +#include <sstream> + +#include "CharReader.hpp" +#include "Logger.hpp" +#include "Number.hpp" +#include "Utils.hpp" + +namespace ousia { + +// TODO: Invent common system for error messages which allows localization +// TODO: Possibly adapt the clang error logging system + +static const char *ERR_UNEXPECTED_CHAR = "Unexpected character"; +static const char *ERR_UNEXPECTED_END = "Unexpected end of number literal"; +static const char *ERR_TOO_LARGE = "Value too large to represent"; + +static std::string unexpectedMsg(const char *expected, const char got) +{ + std::stringstream ss; + ss << ERR_UNEXPECTED_CHAR << ": Expected " << expected << " but got \'" + << got << "\'"; + return ss.str(); +} + +/* Class Number */ + +/** + * Returns the numeric value of the given ASCII character (returns 0 for + * '0', 1 for '1', 10 for 'A' and so on). + * + * @param c is the character for which the numeric value should be returned. + * @return the numeric value the character represents. 
+ */ +static int charValue(char c) +{ + if (c >= '0' && c <= '9') { + return c & 0x0F; + } + if ((c >= 'A' && c <= 'O') || (c >= 'a' && c <= 'o')) { + return (c & 0x0F) + 9; + } + return -1; +} + +double Number::doubleValue() +{ + return s * (a + ((double)n / (double)d)) * pow(10.0, (double)(sE * e)); +} + +int64_t Number::intValue() { return s * a; } + +bool Number::appendChar(char c, int base, Part p, CharReader &reader, + Logger &logger) +{ + // Check whether the given character is valid + int v = charValue(c); + if (v < 0 || v >= base) { + logger.error(unexpectedMsg("digit", c), reader); + return false; + } + + // Append the number to the specified part + switch (p) { + case Part::A: + a = a * base + v; + break; + case Part::N: + n = n * base + v; + d = d * base; + break; + case Part::E: + e = e * base + v; + break; + } + + // Check for any overflows + if (a < 0 || n < 0 || d < 0 || e < 0) { + logger.error(ERR_TOO_LARGE, reader); + return false; + } + return true; +} + +bool Number::parse(CharReader &reader, Logger &logger, + const std::unordered_set<char> &delims) +{ + State state = State::INIT; + char c; + + // Consume the first whitespace characters + reader.consumeWhitespace(); + + // Iterate over the FSM to extract numbers + while (reader.peek(c)) { + // Abort, once a delimiter or whitespace is reached + if (Utils::isWhitespace(c) || delims.count(c)) { + reader.resetPeek(); + break; + } + + // The character is not a whitespace character and not a delimiter + switch (state) { + case State::INIT: + case State::HAS_MINUS: + switch (c) { + case '-': + // Do not allow multiple minus signs + if (state == State::HAS_MINUS) { + logger.error(unexpectedMsg("digit", c), reader); + return false; + } + state = State::HAS_MINUS; + s = -1; + break; + case '0': + // Remember a leading zero for the detection of "0x" + state = State::LEADING_ZERO; + break; + case '.': + // Remember a leading point as ".eXXX" is invalid + state = State::LEADING_POINT; + validInteger = false; 
+ break; + default: + state = State::INT; + if (!appendChar(c, 10, Part::A, reader, logger)) { + return false; + } + break; + } + break; + case State::LEADING_ZERO: + if (c == 'x' || c == 'X') { + state = State::HEX; + break; + } + // fallthrough + case State::INT: + switch (c) { + case '.': + state = State::POINT; + validInteger = false; + break; + case 'e': + case 'E': + state = State::EXP_INIT; + break; + default: + state = State::INT; + if (!appendChar(c, 10, Part::A, reader, logger)) { + return false; + } + break; + } + break; + case State::HEX: + if (!appendChar(c, 16, Part::A, reader, logger)) { + return false; + } + break; + case State::LEADING_POINT: + case State::POINT: + switch (c) { + case 'e': + case 'E': + if (state == State::LEADING_POINT) { + logger.error(unexpectedMsg("digit", c), reader); + return false; + } + state = State::EXP_INIT; + break; + default: + state = State::POINT; + if (!appendChar(c, 10, Part::N, reader, logger)) { + return false; + } + break; + } + break; + case State::EXP_HAS_MINUS: + case State::EXP_INIT: + if (c == '-') { + if (state == State::EXP_HAS_MINUS) { + logger.error(unexpectedMsg("digit", c), reader); + return false; + } + state = State::EXP_HAS_MINUS; + sE = -1; + } else { + state = State::EXP; + if (!appendChar(c, 10, Part::E, reader, logger)) { + return false; + } + } + break; + case State::EXP: + if (!appendChar(c, 10, Part::E, reader, logger)) { + return false; + } + break; + } + reader.consumePeek(); + } + + // States in which ending is valid. 
Log an error in other states + if (state == State::LEADING_ZERO || state == State::HEX || + state == State::INT || state == State::POINT || state == State::EXP) { + return true; + } + logger.error(ERR_UNEXPECTED_END, reader); + return false; +} + +bool Number::parse(const std::string &str, Logger &logger) +{ + // Create a char reader instance with the given string and call the actual + // parse function + CharReader reader(str); + return parse(reader, logger); +} + +bool Number::parseFixedLenInt(CharReader &reader, int len, int base, Logger &logger) +{ + char c; + reader.consumePeek(); + for (int i = 0; i < len; i++) { + if (!reader.peek(c)) { + logger.error("Unexpected end of escape sequence", reader); + return false; + } + if (!appendChar(c, base, Number::Part::A, reader, logger)) { + return false; + } + reader.consumePeek(); + } + return true; +} + +} + diff --git a/src/core/common/Number.hpp b/src/core/common/Number.hpp new file mode 100644 index 0000000..89a689e --- /dev/null +++ b/src/core/common/Number.hpp @@ -0,0 +1,171 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Number.hpp + * + * Contains the Number class responsible for parsing integers and doubles of + * various bases. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#include <cstdint> +#include <string> +#include <unordered_set> + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; + +/* Class Number */ + +/** + * Class used internally to represent a number (integer or double). The number + * is represented by its components (base value a, numerator n, denominator d, + * exponent e, sign s and exponent sign sE). + */ +class Number { +private: + /** + * State used in the parser state machine + */ + enum class State { + INIT, + HAS_MINUS, + LEADING_ZERO, + LEADING_POINT, + INT, + HEX, + POINT, + EXP_INIT, + EXP_HAS_MINUS, + EXP + }; + + /** + * Represents the part of the number: Base value a, numerator n, exponent e. + */ + enum class Part { A, N, E }; + + /** + * Sign of the number and the exponent. + */ + int8_t s, sE; + + /** + * Exponent. + */ + int16_t e; + + /** + * Base value, numerator, denominator + */ + int64_t a, n, d; + + /** + * Variable specifying whether the parsed number actually was an integer. + */ + bool validInteger; + + /** + * Appends the value of the character c to the internal number + * representation and reports any errors that might occur. + * + * @param c is the character that should be appended. + * @param base is the current base. + * @param p is the current number part. + * @param reader is the char reader which points at the current reading + * position. + */ + bool appendChar(char c, int base, Part p, CharReader &reader, + Logger &logger); + +public: + /** + * Constructor of the number class. + */ + Number() : s(1), sE(1), e(0), a(0), n(0), d(1), validInteger(true) {} + + /** + * Returns the represented double value. + * + * @return the double value the number is currently representing. + */ + double doubleValue(); + + /** + * Returns the represented integer value. 
This is only lossless if the + * number is an integer (as can be checked via the isInt method); otherwise + * the exponent and the fractional value will be truncated. + * + * @return the integer value (ignoring any exponent) + */ + int64_t intValue(); + + /** + * Returns true if the number was a valid integer. + * + * @return true if the number is an integer, false otherwise. + */ + bool isInt() { return validInteger; } + + /** + * Tries to parse the number from the given stream and logs any errors to + * the given logger instance. Numbers are terminated by one of the given + * delimiters. + * + * @param reader is the char reader from which the number should be read. + * @param logger is the logger instance to which error messages should be + * written. + * @param delims is a set of characters at which parsing should stop. The + * reader is positioned at the delimiter. + * @return true if parsing was successful, false otherwise. + */ + bool parse( + CharReader &reader, Logger &logger, + const std::unordered_set<char> &delims = std::unordered_set<char>{}); + + /** + * Tries to parse the number from the given string and logs any errors to + * the given logger instance. + * + * @param str is the string from which the number should be read. + * @param logger is the logger instance to which error messages should be + * written. + * @return true if parsing was successful, false otherwise. + */ + bool parse(const std::string &str, Logger &logger); + + /** + * Parses a number with a fixed length and the given base. + * + * @param reader is a reference to the char reader from which the number + * should be read. + * @param len is the length of the integer sequence. + * @param base is the base of the number. + * @param logger is the logger instance to which error messages should be + * written. 
+ */ + bool parseFixedLenInt(CharReader &reader, int len, int base, + Logger &logger); +}; +} + diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp index d48e5cc..904713e 100644 --- a/src/core/common/VariantReader.cpp +++ b/src/core/common/VariantReader.cpp @@ -23,21 +23,22 @@ #include <utf8.h> +#include "Number.hpp" #include "VariantReader.hpp" #include "Utils.hpp" namespace ousia { -// TODO: Use custom return value instead of std::pair - /* Error Messages */ +// TODO: Invent common system for error messages which allows localization +// TODO: Possibly adapt the clang error logging system + static const char *ERR_UNEXPECTED_CHAR = "Unexpected character"; static const char *ERR_UNEXPECTED_END = "Unexpected end of literal"; static const char *ERR_UNTERMINATED = "Unterminated literal"; static const char *ERR_INVALID_ESCAPE = "Invalid escape sequence"; static const char *ERR_INVALID_INTEGER = "Invalid integer value"; -static const char *ERR_TOO_LARGE = "Value too large to represent"; template <class T> static std::pair<bool, T> error(CharReader &reader, Logger &logger, @@ -71,290 +72,6 @@ static std::pair<bool, T> unexpected(CharReader &reader, Logger &logger, return error(reader, logger, unexpectedMsg(expected, got), res); } -/* Class Number */ - -/** - * Class used internally to represent a number (integer or double). The number - * is represented by its components (base value a, nominator n, denominator d, - * exponent e, sign s and exponent sign sE). - */ -class Number { -private: - /** - * State used in the parser state machine - */ - enum class State { - INIT, - HAS_MINUS, - LEADING_ZERO, - LEADING_POINT, - INT, - HEX, - POINT, - EXP_INIT, - EXP_HAS_MINUS, - EXP - }; - - /** - * Returns the numeric value of the given ASCII character (returns 0 for - * '0', 1 for '1', 10 for 'A' and so on). - * - * @param c is the character for which the numeric value should be returned. - * @return the numeric value the character represents. 
- */ - static int charValue(char c) - { - if (c >= '0' && c <= '9') { - return c & 0x0F; - } - if ((c >= 'A' && c <= 'O') || (c >= 'a' && c <= 'o')) { - return (c & 0x0F) + 9; - } - return -1; - } - -public: - /** - * Reprsents the part of the number: Base value a, nominator n, exponent e. - */ - enum class Part { A, N, E }; - - /** - * Sign and exponent sign. - */ - int8_t s, sE; - - /** - * Exponent - */ - int16_t e; - - /** - * Base value, nominator, denominator - */ - int64_t a, n, d; - - /** - * Constructor of the number class. - */ - Number() : s(1), sE(1), e(0), a(0), n(0), d(1) {} - - /** - * Returns the represented double value. - */ - double doubleValue() - { - return s * (a + ((double)n / (double)d)) * pow(10.0, (double)(sE * e)); - } - - /** - * Returns the represented integer value. Only a lossless operation, if the - * number is an integer (as can be checked via the isInt method), otherwise - * the exponent and the fractional value will be truncated. - */ - int64_t intValue() { return s * a; } - - /** - * Returns true, if the number is an integer (has no fractional or - * exponential part). - */ - bool isInt() { return (n == 0) && (d == 1) && (e == 0); } - - /** - * Appends the value of the character c to the internal number - * representation and reports any errors that might occur. 
- */ - bool appendChar(char c, int base, Part p, CharReader &reader, - Logger &logger) - { - // Check whether the given character is valid - int v = charValue(c); - if (v < 0 || v >= base) { - logger.error(unexpectedMsg("digit", c), reader); - return false; - } - - // Append the number to the specified part - switch (p) { - case Part::A: - a = a * base + v; - break; - case Part::N: - n = n * base + v; - d = d * base; - break; - case Part::E: - e = e * base + v; - break; - } - - // Check for any overflows - if (a < 0 || n < 0 || d < 0 || e < 0) { - logger.error(ERR_TOO_LARGE, reader); - return false; - } - return true; - } - - /** - * Tries to parse the number from the given stream and loggs any errors to - * the given logger instance. Numbers are terminated by one of the given - * delimiters. - */ - bool parse(CharReader &reader, Logger &logger, - const std::unordered_set<char> &delims); - - bool parseFixedLenInt(CharReader &reader, Logger &logger, int base, - int len); -}; - -bool Number::parse(CharReader &reader, Logger &logger, - const std::unordered_set<char> &delims) -{ - State state = State::INIT; - char c; - - // Consume the first whitespace characters - reader.consumeWhitespace(); - - // Iterate over the FSM to extract numbers - while (reader.peek(c)) { - // Abort, once a delimiter or whitespace is reached - if (Utils::isWhitespace(c) || delims.count(c)) { - reader.resetPeek(); - break; - } - - // The character is not a whitespace character and not a delimiter - switch (state) { - case State::INIT: - case State::HAS_MINUS: - switch (c) { - case '-': - // Do not allow multiple minus signs - if (state == State::HAS_MINUS) { - logger.error(unexpectedMsg("digit", c), reader); - return false; - } - state = State::HAS_MINUS; - s = -1; - break; - case '0': - // Remember a leading zero for the detection of "0x" - state = State::LEADING_ZERO; - break; - case '.': - // Remember a leading point as ".eXXX" is invalid - state = State::LEADING_POINT; - break; - default: 
- state = State::INT; - if (!appendChar(c, 10, Part::A, reader, logger)) { - return false; - } - break; - } - break; - case State::LEADING_ZERO: - if (c == 'x' || c == 'X') { - state = State::HEX; - break; - } - // fallthrough - case State::INT: - switch (c) { - case '.': - state = State::POINT; - break; - case 'e': - case 'E': - state = State::EXP_INIT; - break; - default: - state = State::INT; - if (!appendChar(c, 10, Part::A, reader, logger)) { - return false; - } - break; - } - break; - case State::HEX: - if (!appendChar(c, 16, Part::A, reader, logger)) { - return false; - } - break; - case State::LEADING_POINT: - case State::POINT: - switch (c) { - case 'e': - case 'E': - if (state == State::LEADING_POINT) { - logger.error(unexpectedMsg("digit", c), reader); - return false; - } - state = State::EXP_INIT; - break; - default: - state = State::POINT; - if (!appendChar(c, 10, Part::N, reader, logger)) { - return false; - } - break; - } - break; - case State::EXP_HAS_MINUS: - case State::EXP_INIT: - if (c == '-') { - if (state == State::EXP_HAS_MINUS) { - logger.error(unexpectedMsg("digit", c), reader); - return false; - } - state = State::EXP_HAS_MINUS; - sE = -1; - } else { - state = State::EXP; - if (!appendChar(c, 10, Part::E, reader, logger)) { - return false; - } - } - break; - case State::EXP: - if (!appendChar(c, 10, Part::E, reader, logger)) { - return false; - } - break; - } - reader.consumePeek(); - } - - // States in which ending is valid. 
Log an error in other states - if (state == State::LEADING_ZERO || state == State::HEX || - state == State::INT || state == State::POINT || state == State::EXP) { - return true; - } - logger.error(ERR_UNEXPECTED_END, reader); - return false; -} - -bool Number::parseFixedLenInt(CharReader &reader, Logger &logger, int base, - int len) -{ - char c; - reader.consumePeek(); - for (int i = 0; i < len; i++) { - if (!reader.peek(c)) { - logger.error("Unexpected end of escape sequence", reader); - return false; - } - if (!appendChar(c, base, Number::Part::A, reader, logger)) { - return false; - } - reader.consumePeek(); - } - return true; -} - /* State machine states */ static const int STATE_INIT = 0; @@ -658,7 +375,7 @@ std::pair<bool, std::string> VariantReader::parseString( // Parse Latin-1 sequence \xXX Number n; hadError = - !(n.parseFixedLenInt(reader, logger, 16, 2) && + !(n.parseFixedLenInt(reader, 2, 16, logger) && encodeUtf8(res, reader, logger, n.intValue(), true)) || hadError; @@ -668,7 +385,7 @@ std::pair<bool, std::string> VariantReader::parseString( // Parse Unicode sequence \uXXXX Number n; hadError = - !(n.parseFixedLenInt(reader, logger, 16, 4) && + !(n.parseFixedLenInt(reader, 4, 16, logger) && encodeUtf8(res, reader, logger, n.intValue(), false)) || hadError; @@ -680,7 +397,7 @@ std::pair<bool, std::string> VariantReader::parseString( reader.resetPeek(); Number n; hadError = - !(n.parseFixedLenInt(reader, logger, 8, 3) && + !(n.parseFixedLenInt(reader, 3, 8, logger) && encodeUtf8(res, reader, logger, n.intValue(), true)) || hadError; |