summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/formats/osdm/DynamicTokenizer.cpp251
-rw-r--r--src/formats/osdm/DynamicTokenizer.hpp23
2 files changed, 45 insertions, 229 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp
index f2cfcd1..1fac25a 100644
--- a/src/formats/osdm/DynamicTokenizer.cpp
+++ b/src/formats/osdm/DynamicTokenizer.cpp
@@ -22,6 +22,7 @@
#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
#include "DynamicTokenizer.hpp"
@@ -102,8 +103,8 @@ public:
* @param textLength is the text buffer length of the previous text token.
* @param textEnd is the current end location of the previous text token.
*/
- TokenLookup(const TokenTrie::Node *node, size_t start,
- size_t textLength, size_t textEnd)
+ TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+ size_t textEnd)
: node(node), start(start), textLength(textLength), textEnd(textEnd)
{
}
@@ -155,192 +156,29 @@ public:
}
};
-/* Internal class TextHandlerBase */
-
/**
- * Base class used for those classes that may be used as TextHandler in the
- * DynamicTokenizer::next function.
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param handler is the WhitespaceHandler containing the collected data.
+ * @param match is the token match whose token receives the extracted text.
+ * @param sourceId is the source id of the underlying file.
*/
-class TextHandlerBase {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- TextHandlerBase() : textStart(0), textEnd(0) {}
-
- /**
- * Transforms the given token into a text token containing the extracted
- * text.
- *
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
- void buildTextToken(TokenMatch &match, SourceId sourceId)
- {
- if (match.hasMatch()) {
- match.token.content =
- std::string{textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, textStart, match.textEnd};
- } else {
- match.token.content = std::string{textBuf.data(), textBuf.size()};
- match.token.location = SourceLocation{sourceId, textStart, textEnd};
- }
- match.token.type = TextToken;
- }
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-};
-
-/* Internal class PreservingTextHandler */
-
-/**
- * The PreservingTextHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/* Internal class TrimmingTextHandler */
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/* Internal class CollapsingTextHandler */
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
+static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
+ SourceId sourceId)
+{
+ if (match.hasMatch()) {
+ match.token.content =
+ std::string{handler.textBuf.data(), match.textLength};
+ match.token.location =
+ SourceLocation{sourceId, handler.textStart, match.textEnd};
+ } else {
+ match.token.content = handler.toString();
+ match.token.location =
+ SourceLocation{sourceId, handler.textStart, handler.textEnd};
}
-};
+ match.token.type = TextToken;
+}
}
/* Class DynamicTokenizer */
@@ -409,9 +247,8 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
}
// If we found text, emit that text
- if (textHandler.hasText() &&
- (!match.hasMatch() || match.textLength > 0)) {
- textHandler.buildTextToken(match, sourceId);
+ if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
+ buildTextToken(textHandler, match, sourceId);
}
// Move the read/peek cursor to the end of the token, abort if an error
@@ -436,28 +273,28 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
return match.hasMatch();
}
-bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
+bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
{
switch (whitespaceMode) {
case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, true>(reader, token);
+ return next<PreservingWhitespaceHandler, true>(reader, token);
case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, true>(reader, token);
+ return next<TrimmingWhitespaceHandler, true>(reader, token);
case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, true>(reader, token);
+ return next<CollapsingWhitespaceHandler, true>(reader, token);
}
return false;
}
-bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
+bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
{
switch (whitespaceMode) {
case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, false>(reader, token);
+ return next<PreservingWhitespaceHandler, false>(reader, token);
case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, false>(reader, token);
+ return next<TrimmingWhitespaceHandler, false>(reader, token);
case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, false>(reader, token);
+ return next<CollapsingWhitespaceHandler, false>(reader, token);
}
return false;
}
@@ -493,7 +330,7 @@ TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
// Try to register the token in the trie -- if this fails, remove it
// from the tokens list
if (!trie.registerToken(token, type)) {
- tokens[type] = std::string();
+ tokens[type] = std::string{};
nextTokenTypeId = type;
return EmptyToken;
}
@@ -528,17 +365,17 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
/* Explicitly instantiate all possible instantiations of the "next" member
function */
-template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+template bool DynamicTokenizer::next<PreservingWhitespaceHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingWhitespaceHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingWhitespaceHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingWhitespaceHandler, true>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingWhitespaceHandler, true>(
CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+template bool DynamicTokenizer::next<CollapsingWhitespaceHandler, true>(
CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<PreservingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
}
diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp
index 0cac2e8..3e5aeb3 100644
--- a/src/formats/osdm/DynamicTokenizer.hpp
+++ b/src/formats/osdm/DynamicTokenizer.hpp
@@ -33,6 +33,7 @@
#include <vector>
#include <core/common/Location.hpp>
+#include <core/common/Whitespace.hpp>
#include "TokenTrie.hpp"
@@ -96,28 +97,6 @@ struct DynamicToken {
};
/**
- * Enum specifying the whitespace handling of the DynamicTokenizer class when
- * reading non-token text.
- */
-enum class WhitespaceMode {
- /**
- * Preserves all whitespaces as they are found in the source file.
- */
- PRESERVE,
-
- /**
- * Trims whitespace at the beginning and the end of the found text.
- */
- TRIM,
-
- /**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
- COLLAPSE
-};
-
-/**
* The DynamicTokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
* to modify the handling of whitespace characters. Note that the