summaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-01-11 00:55:55 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-01-11 00:55:55 +0100
commitb7d0f6517d19b6f8c544d6fa88a5f17b3c591a03 (patch)
treeaa0c18c7af52210eafc4cc23bf561984ac6977b5 /src/core
parentdd185959b9fff68008d02979fefde602d177ce75 (diff)
Refactored number class out of VariantReader and added documentation
Diffstat (limited to 'src/core')
-rw-r--r--src/core/common/Number.cpp259
-rw-r--r--src/core/common/Number.hpp171
-rw-r--r--src/core/common/VariantReader.cpp297
3 files changed, 437 insertions, 290 deletions
diff --git a/src/core/common/Number.cpp b/src/core/common/Number.cpp
new file mode 100644
index 0000000..12f64ae
--- /dev/null
+++ b/src/core/common/Number.cpp
@@ -0,0 +1,259 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cmath>
+#include <sstream>
+
+#include "CharReader.hpp"
+#include "Logger.hpp"
+#include "Number.hpp"
+#include "Utils.hpp"
+
+namespace ousia {
+
+// TODO: Invent common system for error messages which allows localization
+// TODO: Possibly adapt the clang error logging system
+
+// Message fragments used by the number parser. Both the characters and the
+// pointers themselves are constant, so the messages cannot be redirected by
+// accident.
+static const char *const ERR_UNEXPECTED_CHAR = "Unexpected character";
+static const char *const ERR_UNEXPECTED_END = "Unexpected end of number literal";
+static const char *const ERR_TOO_LARGE = "Value too large to represent";
+
+/**
+ * Builds the message that is logged whenever an unexpected character is read.
+ *
+ * @param expected describes what should have been read instead.
+ * @param got is the character that was actually read.
+ * @return the unexpected character message, followed by the expectation and
+ * the quoted character.
+ */
+static std::string unexpectedMsg(const char *expected, const char got)
+{
+    std::string msg(ERR_UNEXPECTED_CHAR);
+    msg += ": Expected ";
+    msg += expected;
+    msg += " but got \'";
+    msg += got;
+    msg += "\'";
+    return msg;
+}
+
+/* Class Number */
+
+/**
+ * Returns the numeric value of the given ASCII character: 0 to 9 for the
+ * digits '0' to '9' and 10 to 35 for the letters 'A' to 'Z' (upper or lower
+ * case), which is sufficient for all bases up to 36.
+ *
+ * @param c is the character for which the numeric value should be returned.
+ * @return the numeric value the character represents or -1 if the character
+ * is neither a digit nor a letter.
+ */
+static int charValue(char c)
+{
+    if (c >= '0' && c <= '9') {
+        return c - '0';
+    }
+    // Plain subtraction instead of masking the low nibble: the mask wraps
+    // around at 'P' and therefore could not represent digits above 24.
+    if (c >= 'A' && c <= 'Z') {
+        return c - 'A' + 10;
+    }
+    if (c >= 'a' && c <= 'z') {
+        return c - 'a' + 10;
+    }
+    return -1;
+}
+
+/**
+ * Combines sign, base value, fraction and decimal exponent into a double.
+ *
+ * @return s * (a + n / d) * 10^(sE * e)
+ */
+double Number::doubleValue()
+{
+    const double fraction = static_cast<double>(n) / static_cast<double>(d);
+    const double magnitude = a + fraction;
+    const double scale = pow(10.0, static_cast<double>(sE * e));
+    return s * magnitude * scale;
+}
+
+/**
+ * Returns the signed base value. The fractional part and the exponent are
+ * not taken into account.
+ *
+ * @return s * a
+ */
+int64_t Number::intValue()
+{
+    const int64_t sign = s;
+    return sign * a;
+}
+
+/**
+ * Appends the digit represented by the character c to one part of the number.
+ *
+ * @param c is the character that should be appended.
+ * @param base is the base in which c is interpreted.
+ * @param p selects the part (base value, numerator or exponent) that is
+ * extended by the digit.
+ * @param reader is handed to the logger together with any error message.
+ * @param logger is the logger instance to which errors are written.
+ * @return true if the digit was appended, false if c is not a valid digit in
+ * the given base or if the extended value could no longer be represented. In
+ * both error cases the number is left unchanged.
+ */
+bool Number::appendChar(char c, int base, Part p, CharReader &reader,
+                        Logger &logger)
+{
+    // Check whether the given character is valid. Passing this check implies
+    // 0 <= v < base and thus base >= 1, so the divisions below are safe.
+    const int v = charValue(c);
+    if (v < 0 || v >= base) {
+        logger.error(unexpectedMsg("digit", c), reader);
+        return false;
+    }
+
+    // Signed overflow is undefined behaviour and a wrapped result is not
+    // guaranteed to be negative (the same holds for narrowing into the 16 bit
+    // exponent). Test whether the new value fits *before* computing it
+    // instead of inspecting the sign afterwards.
+    bool fits = false;
+    switch (p) {
+        case Part::A:
+            fits = a <= (INT64_MAX - v) / base;
+            if (fits) {
+                a = a * base + v;
+            }
+            break;
+        case Part::N:
+            fits = n <= (INT64_MAX - v) / base && d <= INT64_MAX / base;
+            if (fits) {
+                n = n * base + v;
+                d = d * base;
+            }
+            break;
+        case Part::E:
+            fits = e <= (INT16_MAX - v) / base;
+            if (fits) {
+                e = static_cast<int16_t>(e * base + v);
+            }
+            break;
+    }
+
+    if (!fits) {
+        logger.error(ERR_TOO_LARGE, reader);
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Parses a number literal from the given reader with a small state machine.
+ * Accepted forms are an optional minus sign followed by a decimal integer, a
+ * hexadecimal integer introduced by "0x"/"0X", or a decimal number with an
+ * optional fractional part and an optional (possibly negative) exponent.
+ *
+ * The number is marked as non-integer as soon as a decimal point or an
+ * exponent is read, so that isInt() never reports a literal such as "1e5" as
+ * an integer.
+ *
+ * @param reader is the char reader from which the number should be read.
+ * @param logger is the logger instance to which error messages are written.
+ * @param delims is a set of characters at which parsing should stop.
+ * @return true if the input ended in a state that forms a complete number,
+ * false if an invalid character was read or the literal ended prematurely.
+ */
+bool Number::parse(CharReader &reader, Logger &logger,
+                   const std::unordered_set<char> &delims)
+{
+    State state = State::INIT;
+    char c;
+
+    // Consume the first whitespace characters
+    reader.consumeWhitespace();
+
+    // Iterate over the FSM to extract numbers
+    while (reader.peek(c)) {
+        // Abort, once a delimiter or whitespace is reached
+        if (Utils::isWhitespace(c) || delims.count(c)) {
+            reader.resetPeek();
+            break;
+        }
+
+        // The character is not a whitespace character and not a delimiter
+        switch (state) {
+            case State::INIT:
+            case State::HAS_MINUS:
+                switch (c) {
+                    case '-':
+                        // Do not allow multiple minus signs
+                        if (state == State::HAS_MINUS) {
+                            logger.error(unexpectedMsg("digit", c), reader);
+                            return false;
+                        }
+                        state = State::HAS_MINUS;
+                        s = -1;
+                        break;
+                    case '0':
+                        // Remember a leading zero for the detection of "0x"
+                        state = State::LEADING_ZERO;
+                        break;
+                    case '.':
+                        // Remember a leading point as ".eXXX" is invalid
+                        state = State::LEADING_POINT;
+                        validInteger = false;
+                        break;
+                    default:
+                        state = State::INT;
+                        if (!appendChar(c, 10, Part::A, reader, logger)) {
+                            return false;
+                        }
+                        break;
+                }
+                break;
+            case State::LEADING_ZERO:
+                if (c == 'x' || c == 'X') {
+                    state = State::HEX;
+                    break;
+                }
+                // fallthrough
+            case State::INT:
+                switch (c) {
+                    case '.':
+                        state = State::POINT;
+                        validInteger = false;
+                        break;
+                    case 'e':
+                    case 'E':
+                        // A literal with an exponent is not an integer:
+                        // intValue() ignores the exponent entirely
+                        state = State::EXP_INIT;
+                        validInteger = false;
+                        break;
+                    default:
+                        state = State::INT;
+                        if (!appendChar(c, 10, Part::A, reader, logger)) {
+                            return false;
+                        }
+                        break;
+                }
+                break;
+            case State::HEX:
+                if (!appendChar(c, 16, Part::A, reader, logger)) {
+                    return false;
+                }
+                break;
+            case State::LEADING_POINT:
+            case State::POINT:
+                switch (c) {
+                    case 'e':
+                    case 'E':
+                        if (state == State::LEADING_POINT) {
+                            logger.error(unexpectedMsg("digit", c), reader);
+                            return false;
+                        }
+                        state = State::EXP_INIT;
+                        validInteger = false;
+                        break;
+                    default:
+                        state = State::POINT;
+                        if (!appendChar(c, 10, Part::N, reader, logger)) {
+                            return false;
+                        }
+                        break;
+                }
+                break;
+            case State::EXP_HAS_MINUS:
+            case State::EXP_INIT:
+                if (c == '-') {
+                    // Do not allow multiple minus signs in the exponent
+                    if (state == State::EXP_HAS_MINUS) {
+                        logger.error(unexpectedMsg("digit", c), reader);
+                        return false;
+                    }
+                    state = State::EXP_HAS_MINUS;
+                    sE = -1;
+                } else {
+                    state = State::EXP;
+                    if (!appendChar(c, 10, Part::E, reader, logger)) {
+                        return false;
+                    }
+                }
+                break;
+            case State::EXP:
+                if (!appendChar(c, 10, Part::E, reader, logger)) {
+                    return false;
+                }
+                break;
+        }
+        reader.consumePeek();
+    }
+
+    // States in which ending is valid. Log an error in other states.
+    // NOTE(review): State::HEX is entered directly after "0x", so a bare "0x"
+    // without any hexadecimal digit is accepted as zero -- confirm whether
+    // this is intended.
+    if (state == State::LEADING_ZERO || state == State::HEX ||
+        state == State::INT || state == State::POINT || state == State::EXP) {
+        return true;
+    }
+    logger.error(ERR_UNEXPECTED_END, reader);
+    return false;
+}
+
+/**
+ * Convenience overload which reads the number from a string: the string is
+ * wrapped in a CharReader and handed to the reader based parse function.
+ *
+ * @param str is the string from which the number should be read.
+ * @param logger is the logger instance to which error messages are written.
+ * @return the result of the reader based parse function.
+ */
+bool Number::parse(const std::string &str, Logger &logger)
+{
+    CharReader stringReader(str);
+    const bool ok = parse(stringReader, logger);
+    return ok;
+}
+
+/**
+ * Reads exactly len digits of the given base from the reader and appends
+ * them to the base value of the number.
+ *
+ * @param reader is the char reader from which the digits are read.
+ * @param len is the number of digits that should be read.
+ * @param base is the base in which the digits are interpreted.
+ * @param logger is the logger instance to which error messages are written.
+ * @return true if all digits were read, false if the input ended early or a
+ * character could not be appended.
+ */
+bool Number::parseFixedLenInt(CharReader &reader, int len, int base,
+                              Logger &logger)
+{
+    // consumePeek() is called once up front and once more after every digit
+    // that was accepted
+    reader.consumePeek();
+
+    int remaining = len;
+    while (remaining > 0) {
+        char digit;
+        if (!reader.peek(digit)) {
+            logger.error("Unexpected end of escape sequence", reader);
+            return false;
+        }
+        if (!appendChar(digit, base, Part::A, reader, logger)) {
+            return false;
+        }
+        reader.consumePeek();
+        --remaining;
+    }
+    return true;
+}
+
+}
+
diff --git a/src/core/common/Number.hpp b/src/core/common/Number.hpp
new file mode 100644
index 0000000..89a689e
--- /dev/null
+++ b/src/core/common/Number.hpp
@@ -0,0 +1,171 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Number.hpp
+ *
+ * Contains the Number class responsible for parsing integers and doubles of
+ * various bases.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef OUSIA_NUMBER_HPP_
+#define OUSIA_NUMBER_HPP_
+
+#include <cstdint>
+#include <string>
+#include <unordered_set>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class Logger;
+
+/* Class Number */
+
+/**
+ * Class used internally to represent a number (integer or double). The number
+ * is represented by its components (base value a, numerator n, denominator d,
+ * exponent e, sign s and exponent sign sE).
+ */
+class Number {
+private:
+    /**
+     * State used in the parser state machine
+     */
+    enum class State {
+        INIT,
+        HAS_MINUS,
+        LEADING_ZERO,
+        LEADING_POINT,
+        INT,
+        HEX,
+        POINT,
+        EXP_INIT,
+        EXP_HAS_MINUS,
+        EXP
+    };
+
+    /**
+     * Represents the part of the number: Base value a, numerator n,
+     * exponent e.
+     */
+    enum class Part { A, N, E };
+
+    /**
+     * Sign of the number and the exponent.
+     */
+    int8_t s, sE;
+
+    /**
+     * Exponent.
+     */
+    int16_t e;
+
+    /**
+     * Base value, numerator, denominator
+     */
+    int64_t a, n, d;
+
+    /**
+     * Variable specifying whether the parsed number actually was an integer.
+     */
+    bool validInteger;
+
+    /**
+     * Appends the value of the character c to the internal number
+     * representation and reports any errors that might occur.
+     *
+     * @param c is the character that should be appended.
+     * @param base is the current base.
+     * @param p is the current number part.
+     * @param reader is the char reader which points at the current reading
+     * position.
+     * @param logger is the logger instance to which error messages should be
+     * written.
+     * @return true if the character was appended, false if an error was
+     * reported.
+     */
+    bool appendChar(char c, int base, Part p, CharReader &reader,
+                    Logger &logger);
+
+public:
+    /**
+     * Constructor of the number class. Initialises the number to a positive
+     * integer zero (denominator one, no exponent).
+     */
+    Number() : s(1), sE(1), e(0), a(0), n(0), d(1), validInteger(true) {}
+
+    /**
+     * Returns the represented double value.
+     *
+     * @return the double value the number is currently representing.
+     */
+    double doubleValue();
+
+    /**
+     * Returns the represented integer value. Only a lossless operation, if the
+     * number is an integer (as can be checked via the isInt method), otherwise
+     * the exponent and the fractional value will be truncated.
+     *
+     * @return the integer value (ignoring any exponent)
+     */
+    int64_t intValue();
+
+    /**
+     * Returns true, if the number was a valid integer.
+     *
+     * @return true if the number is an integer, false otherwise.
+     */
+    bool isInt() const { return validInteger; }
+
+    /**
+     * Tries to parse the number from the given stream and logs any errors to
+     * the given logger instance. Numbers are terminated by one of the given
+     * delimiters.
+     *
+     * @param reader is the char reader from which the number should be read.
+     * @param logger is the logger instance to which error messages should be
+     * written.
+     * @param delims is a set of characters at which parsing should stop. The
+     * reader is positioned at the delimiter.
+     * @return true if parsing was successful, false otherwise.
+     */
+    bool parse(
+        CharReader &reader, Logger &logger,
+        const std::unordered_set<char> &delims = std::unordered_set<char>{});
+
+    /**
+     * Tries to parse the number from the given string and logs any errors to
+     * the given logger instance.
+     *
+     * @param str is the string from which the number should be read.
+     * @param logger is the logger instance to which error messages should be
+     * written.
+     * @return true if parsing was successful, false otherwise.
+     */
+    bool parse(const std::string &str, Logger &logger);
+
+    /**
+     * Parses a number with a fixed length and the given base.
+     *
+     * @param reader is a reference at the char reader from which the number
+     * should be read.
+     * @param len is the length of the integer sequence.
+     * @param base is the base of the number.
+     * @param logger is the logger instance to which error messages should be
+     * written.
+     * @return true if parsing was successful, false otherwise.
+     */
+    bool parseFixedLenInt(CharReader &reader, int len, int base,
+                          Logger &logger);
+};
+}
+
+#endif /* OUSIA_NUMBER_HPP_ */
+
diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp
index d48e5cc..904713e 100644
--- a/src/core/common/VariantReader.cpp
+++ b/src/core/common/VariantReader.cpp
@@ -23,21 +23,22 @@
#include <utf8.h>
+#include "Number.hpp"
#include "VariantReader.hpp"
#include "Utils.hpp"
namespace ousia {
-// TODO: Use custom return value instead of std::pair
-
/* Error Messages */
+// TODO: Invent common system for error messages which allows localization
+// TODO: Possibly adapt the clang error logging system
+
static const char *ERR_UNEXPECTED_CHAR = "Unexpected character";
static const char *ERR_UNEXPECTED_END = "Unexpected end of literal";
static const char *ERR_UNTERMINATED = "Unterminated literal";
static const char *ERR_INVALID_ESCAPE = "Invalid escape sequence";
static const char *ERR_INVALID_INTEGER = "Invalid integer value";
-static const char *ERR_TOO_LARGE = "Value too large to represent";
template <class T>
static std::pair<bool, T> error(CharReader &reader, Logger &logger,
@@ -71,290 +72,6 @@ static std::pair<bool, T> unexpected(CharReader &reader, Logger &logger,
return error(reader, logger, unexpectedMsg(expected, got), res);
}
-/* Class Number */
-
-/**
- * Class used internally to represent a number (integer or double). The number
- * is represented by its components (base value a, nominator n, denominator d,
- * exponent e, sign s and exponent sign sE).
- */
-class Number {
-private:
- /**
- * State used in the parser state machine
- */
- enum class State {
- INIT,
- HAS_MINUS,
- LEADING_ZERO,
- LEADING_POINT,
- INT,
- HEX,
- POINT,
- EXP_INIT,
- EXP_HAS_MINUS,
- EXP
- };
-
- /**
- * Returns the numeric value of the given ASCII character (returns 0 for
- * '0', 1 for '1', 10 for 'A' and so on).
- *
- * @param c is the character for which the numeric value should be returned.
- * @return the numeric value the character represents.
- */
- static int charValue(char c)
- {
- if (c >= '0' && c <= '9') {
- return c & 0x0F;
- }
- if ((c >= 'A' && c <= 'O') || (c >= 'a' && c <= 'o')) {
- return (c & 0x0F) + 9;
- }
- return -1;
- }
-
-public:
- /**
- * Reprsents the part of the number: Base value a, nominator n, exponent e.
- */
- enum class Part { A, N, E };
-
- /**
- * Sign and exponent sign.
- */
- int8_t s, sE;
-
- /**
- * Exponent
- */
- int16_t e;
-
- /**
- * Base value, nominator, denominator
- */
- int64_t a, n, d;
-
- /**
- * Constructor of the number class.
- */
- Number() : s(1), sE(1), e(0), a(0), n(0), d(1) {}
-
- /**
- * Returns the represented double value.
- */
- double doubleValue()
- {
- return s * (a + ((double)n / (double)d)) * pow(10.0, (double)(sE * e));
- }
-
- /**
- * Returns the represented integer value. Only a lossless operation, if the
- * number is an integer (as can be checked via the isInt method), otherwise
- * the exponent and the fractional value will be truncated.
- */
- int64_t intValue() { return s * a; }
-
- /**
- * Returns true, if the number is an integer (has no fractional or
- * exponential part).
- */
- bool isInt() { return (n == 0) && (d == 1) && (e == 0); }
-
- /**
- * Appends the value of the character c to the internal number
- * representation and reports any errors that might occur.
- */
- bool appendChar(char c, int base, Part p, CharReader &reader,
- Logger &logger)
- {
- // Check whether the given character is valid
- int v = charValue(c);
- if (v < 0 || v >= base) {
- logger.error(unexpectedMsg("digit", c), reader);
- return false;
- }
-
- // Append the number to the specified part
- switch (p) {
- case Part::A:
- a = a * base + v;
- break;
- case Part::N:
- n = n * base + v;
- d = d * base;
- break;
- case Part::E:
- e = e * base + v;
- break;
- }
-
- // Check for any overflows
- if (a < 0 || n < 0 || d < 0 || e < 0) {
- logger.error(ERR_TOO_LARGE, reader);
- return false;
- }
- return true;
- }
-
- /**
- * Tries to parse the number from the given stream and loggs any errors to
- * the given logger instance. Numbers are terminated by one of the given
- * delimiters.
- */
- bool parse(CharReader &reader, Logger &logger,
- const std::unordered_set<char> &delims);
-
- bool parseFixedLenInt(CharReader &reader, Logger &logger, int base,
- int len);
-};
-
-bool Number::parse(CharReader &reader, Logger &logger,
- const std::unordered_set<char> &delims)
-{
- State state = State::INIT;
- char c;
-
- // Consume the first whitespace characters
- reader.consumeWhitespace();
-
- // Iterate over the FSM to extract numbers
- while (reader.peek(c)) {
- // Abort, once a delimiter or whitespace is reached
- if (Utils::isWhitespace(c) || delims.count(c)) {
- reader.resetPeek();
- break;
- }
-
- // The character is not a whitespace character and not a delimiter
- switch (state) {
- case State::INIT:
- case State::HAS_MINUS:
- switch (c) {
- case '-':
- // Do not allow multiple minus signs
- if (state == State::HAS_MINUS) {
- logger.error(unexpectedMsg("digit", c), reader);
- return false;
- }
- state = State::HAS_MINUS;
- s = -1;
- break;
- case '0':
- // Remember a leading zero for the detection of "0x"
- state = State::LEADING_ZERO;
- break;
- case '.':
- // Remember a leading point as ".eXXX" is invalid
- state = State::LEADING_POINT;
- break;
- default:
- state = State::INT;
- if (!appendChar(c, 10, Part::A, reader, logger)) {
- return false;
- }
- break;
- }
- break;
- case State::LEADING_ZERO:
- if (c == 'x' || c == 'X') {
- state = State::HEX;
- break;
- }
- // fallthrough
- case State::INT:
- switch (c) {
- case '.':
- state = State::POINT;
- break;
- case 'e':
- case 'E':
- state = State::EXP_INIT;
- break;
- default:
- state = State::INT;
- if (!appendChar(c, 10, Part::A, reader, logger)) {
- return false;
- }
- break;
- }
- break;
- case State::HEX:
- if (!appendChar(c, 16, Part::A, reader, logger)) {
- return false;
- }
- break;
- case State::LEADING_POINT:
- case State::POINT:
- switch (c) {
- case 'e':
- case 'E':
- if (state == State::LEADING_POINT) {
- logger.error(unexpectedMsg("digit", c), reader);
- return false;
- }
- state = State::EXP_INIT;
- break;
- default:
- state = State::POINT;
- if (!appendChar(c, 10, Part::N, reader, logger)) {
- return false;
- }
- break;
- }
- break;
- case State::EXP_HAS_MINUS:
- case State::EXP_INIT:
- if (c == '-') {
- if (state == State::EXP_HAS_MINUS) {
- logger.error(unexpectedMsg("digit", c), reader);
- return false;
- }
- state = State::EXP_HAS_MINUS;
- sE = -1;
- } else {
- state = State::EXP;
- if (!appendChar(c, 10, Part::E, reader, logger)) {
- return false;
- }
- }
- break;
- case State::EXP:
- if (!appendChar(c, 10, Part::E, reader, logger)) {
- return false;
- }
- break;
- }
- reader.consumePeek();
- }
-
- // States in which ending is valid. Log an error in other states
- if (state == State::LEADING_ZERO || state == State::HEX ||
- state == State::INT || state == State::POINT || state == State::EXP) {
- return true;
- }
- logger.error(ERR_UNEXPECTED_END, reader);
- return false;
-}
-
-bool Number::parseFixedLenInt(CharReader &reader, Logger &logger, int base,
- int len)
-{
- char c;
- reader.consumePeek();
- for (int i = 0; i < len; i++) {
- if (!reader.peek(c)) {
- logger.error("Unexpected end of escape sequence", reader);
- return false;
- }
- if (!appendChar(c, base, Number::Part::A, reader, logger)) {
- return false;
- }
- reader.consumePeek();
- }
- return true;
-}
-
/* State machine states */
static const int STATE_INIT = 0;
@@ -658,7 +375,7 @@ std::pair<bool, std::string> VariantReader::parseString(
// Parse Latin-1 sequence \xXX
Number n;
hadError =
- !(n.parseFixedLenInt(reader, logger, 16, 2) &&
+ !(n.parseFixedLenInt(reader, 2, 16, logger) &&
encodeUtf8(res, reader, logger, n.intValue(),
true)) ||
hadError;
@@ -668,7 +385,7 @@ std::pair<bool, std::string> VariantReader::parseString(
// Parse Unicode sequence \uXXXX
Number n;
hadError =
- !(n.parseFixedLenInt(reader, logger, 16, 4) &&
+ !(n.parseFixedLenInt(reader, 4, 16, logger) &&
encodeUtf8(res, reader, logger, n.intValue(),
false)) ||
hadError;
@@ -680,7 +397,7 @@ std::pair<bool, std::string> VariantReader::parseString(
reader.resetPeek();
Number n;
hadError =
- !(n.parseFixedLenInt(reader, logger, 8, 3) &&
+ !(n.parseFixedLenInt(reader, 3, 8, logger) &&
encodeUtf8(res, reader, logger, n.intValue(),
true)) ||
hadError;