summaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2014-12-20 15:49:22 +0100
committerAndreas Stöckel <andreas@somweyr.de>2014-12-20 15:49:22 +0100
commit87233da76c01ebead18a26f01ffb4e20dffe3214 (patch)
treef7fd66178555b794e3c27071c845a239fc46d4cb /src/core
parent55b860964cb752b920d415c4332da85a97abd087 (diff)
implemented reading escaped unicode characters as in javascript
Diffstat (limited to 'src/core')
-rw-r--r--src/core/common/VariantReader.cpp157
1 files changed, 112 insertions, 45 deletions
diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp
index cc25eac..ccc14f8 100644
--- a/src/core/common/VariantReader.cpp
+++ b/src/core/common/VariantReader.cpp
@@ -21,6 +21,8 @@
#include <cmath>
#include <sstream>
+#include <utf8.h>
+
#include "VariantReader.hpp"
#include "Utils.hpp"
@@ -79,11 +81,6 @@ static std::pair<bool, T> unexpected(CharReader &reader, Logger &logger,
class Number {
private:
/**
- * Reprsents the part of the number: Base value a, nominator n, exponent e.
- */
- enum class Part { A, N, E };
-
- /**
* State used in the parser state machine
*/
enum class State {
@@ -117,43 +114,12 @@ private:
return -1;
}
+public:
/**
- * Appends the value of the character c to the internal number
- * representation and reports any errors that might occur.
+ * Reprsents the part of the number: Base value a, nominator n, exponent e.
*/
- bool appendChar(char c, int base, Part p, CharReader &reader,
- Logger &logger)
- {
- // Check whether the given character is valid
- int v = charValue(c);
- if (v < 0 || v >= base) {
- logger.error(unexpectedMsg("digit", c), reader);
- return false;
- }
-
- // Append the number to the specified part
- switch (p) {
- case Part::A:
- a = a * base + v;
- break;
- case Part::N:
- n = n * base + v;
- d = d * base;
- break;
- case Part::E:
- e = e * base + v;
- break;
- }
-
- // Check for any overflows
- if (a < 0 || n < 0 || d < 0 || e < 0) {
- logger.error(ERR_TOO_LARGE, reader);
- return false;
- }
- return true;
- }
+ enum class Part { A, N, E };
-public:
/**
* Sign and exponent sign.
*/
@@ -196,12 +162,51 @@ public:
bool isInt() { return (n == 0) && (d == 1) && (e == 0); }
/**
+ * Appends the value of the character c to the internal number
+ * representation and reports any errors that might occur.
+ */
+ bool appendChar(char c, int base, Part p, CharReader &reader,
+ Logger &logger)
+ {
+ // Check whether the given character is valid
+ int v = charValue(c);
+ if (v < 0 || v >= base) {
+ logger.error(unexpectedMsg("digit", c), reader);
+ return false;
+ }
+
+ // Append the number to the specified part
+ switch (p) {
+ case Part::A:
+ a = a * base + v;
+ break;
+ case Part::N:
+ n = n * base + v;
+ d = d * base;
+ break;
+ case Part::E:
+ e = e * base + v;
+ break;
+ }
+
+ // Check for any overflows
+ if (a < 0 || n < 0 || d < 0 || e < 0) {
+ logger.error(ERR_TOO_LARGE, reader);
+ return false;
+ }
+ return true;
+ }
+
+ /**
* Tries to parse the number from the given stream and loggs any errors to
* the given logger instance. Numbers are terminated by one of the given
* delimiters.
*/
bool parse(CharReader &reader, Logger &logger,
const std::unordered_set<char> &delims);
+
+ bool parseFixedLenInt(CharReader &reader, Logger &logger, int base,
+ int len);
};
bool Number::parse(CharReader &reader, Logger &logger,
@@ -332,6 +337,24 @@ bool Number::parse(CharReader &reader, Logger &logger,
return false;
}
+bool Number::parseFixedLenInt(CharReader &reader, Logger &logger, int base,
+ int len)
+{
+ char c;
+ reader.consumePeek();
+ for (int i = 0; i < len; i++) {
+ if (!reader.peek(c)) {
+ logger.error("Unexpected end of escape sequence", reader);
+ return false;
+ }
+ if (!appendChar(c, base, Number::Part::A, reader, logger)) {
+ return false;
+ }
+ reader.consumePeek();
+ }
+ return true;
+}
+
/* State machine states */
static const int STATE_INIT = 0;
@@ -535,10 +558,32 @@ static std::pair<bool, Variant> parseComplex(CharReader &reader, Logger &logger,
/* Class Reader */
+static bool encodeUtf8(std::stringstream &res, CharReader &reader,
+ Logger &logger, int64_t v, bool latin1)
+{
+ // Encode the unicode codepoint as UTF-8
+ uint32_t cp = static_cast<uint32_t>(v);
+ if (latin1 && cp > 0xFF) {
+ logger.error("Not a valid ISO-8859-1 (Latin-1) character, skipping", reader);
+ return false;
+ }
+
+ // Append the code point to the output stream
+ try {
+ utf8::append(cp, std::ostream_iterator<uint8_t>{res});
+ return true;
+ }
+ catch (utf8::invalid_code_point ex) {
+ logger.error("Invalid Unicode codepoint, skipping", reader);
+ }
+ return false;
+}
+
std::pair<bool, std::string> VariantReader::parseString(
CharReader &reader, Logger &logger, const std::unordered_set<char> *delims)
{
// Initialize the internal state
+ bool hadError = false;
int state = STATE_INIT;
char quote = 0;
std::stringstream res;
@@ -565,7 +610,7 @@ std::pair<bool, std::string> VariantReader::parseString(
case STATE_IN_STRING:
if (c == quote) {
reader.consumePeek();
- return std::make_pair(true, res.str());
+ return std::make_pair(!hadError, res.str());
} else if (c == '\\') {
state = STATE_ESCAPE;
reader.consumePeek();
@@ -608,17 +653,39 @@ std::pair<bool, std::string> VariantReader::parseString(
break;
case '\n':
break;
- case 'x':
- // TODO: Parse Latin-1 sequence hex XX
+ case 'x': {
+ // Parse Latin-1 sequence \xXX
+ Number n;
+ hadError =
+ !(n.parseFixedLenInt(reader, logger, 16, 2) &&
+ encodeUtf8(res, reader, logger, n.intValue(),
+ true)) ||
+ hadError;
break;
- case 'u':
- // TODO: Parse 16-Bit unicode character hex XXXX
+ }
+ case 'u': {
+ // Parse Unicode sequence \uXXXX
+ Number n;
+ hadError =
+ !(n.parseFixedLenInt(reader, logger, 16, 4) &&
+ encodeUtf8(res, reader, logger, n.intValue(),
+ false)) ||
+ hadError;
break;
+ }
default:
if (Utils::isNumeric(c)) {
- // TODO: Parse octal 000 sequence
+ // Parse Latin-1 sequence \000
+ reader.resetPeek();
+ Number n;
+ hadError =
+ !(n.parseFixedLenInt(reader, logger, 8, 3) &&
+ encodeUtf8(res, reader, logger, n.intValue(),
+ true)) ||
+ hadError;
} else {
logger.error(ERR_INVALID_ESCAPE, reader);
+ hadError = true;
}
break;
}