diff options
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.cpp | 251 | ||||
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.hpp | 23 |
2 files changed, 45 insertions, 229 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp index f2cfcd1..1fac25a 100644 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ b/src/formats/osdm/DynamicTokenizer.cpp @@ -22,6 +22,7 @@ #include <core/common/CharReader.hpp> #include <core/common/Exceptions.hpp> #include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> #include "DynamicTokenizer.hpp" @@ -102,8 +103,8 @@ public: * @param textLength is the text buffer length of the previous text token. * @param textEnd is the current end location of the previous text token. */ - TokenLookup(const TokenTrie::Node *node, size_t start, - size_t textLength, size_t textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) : node(node), start(start), textLength(textLength), textEnd(textEnd) { } @@ -155,192 +156,29 @@ public: } }; -/* Internal class TextHandlerBase */ - /** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. */ -class TextHandlerBase { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - TextHandlerBase() : textStart(0), textEnd(0) {} - - /** - * Transforms the given token into a text token containing the extracted - * text. - * - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ - void buildTextToken(TokenMatch &match, SourceId sourceId) - { - if (match.hasMatch()) { - match.token.content = - std::string{textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, textStart, match.textEnd}; - } else { - match.token.content = std::string{textBuf.data(), textBuf.size()}; - match.token.location = SourceLocation{sourceId, textStart, textEnd}; - } - match.token.type = TextToken; - } - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; } -}; + match.token.type = TextToken; +} } /* Class DynamicTokenizer */ @@ -409,9 +247,8 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) } // If we found text, emit that text - if (textHandler.hasText() && - (!match.hasMatch() || match.textLength > 0)) { - textHandler.buildTextToken(match, sourceId); + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); } // Move the read/peek cursor to the end of the token, abort if an error @@ -436,28 +273,28 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) return match.hasMatch(); } -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, true>(reader, token); + return next<PreservingWhitespaceHandler, true>(reader, token); case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, true>(reader, token); + return next<TrimmingWhitespaceHandler, true>(reader, token); case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, true>(reader, token); + return next<CollapsingWhitespaceHandler, true>(reader, token); } return false; } -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, false>(reader, token); + return next<PreservingWhitespaceHandler, false>(reader, token); case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, false>(reader, token); + return next<TrimmingWhitespaceHandler, false>(reader, token); case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, false>(reader, token); + return next<CollapsingWhitespaceHandler, false>(reader, token); } return false; } @@ -493,7 +330,7 @@ TokenTypeId DynamicTokenizer::registerToken(const std::string &token) // Try to register the token in the trie -- if this fails, remove it // from the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string(); + tokens[type] = std::string{}; nextTokenTypeId = type; return EmptyToken; } @@ -528,17 +365,17 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool DynamicTokenizer::next<PreservingTextHandler, false>( +template bool DynamicTokenizer::next<PreservingWhitespaceHandler, false>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingWhitespaceHandler, false>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingWhitespaceHandler, false>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<PreservingWhitespaceHandler, true>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingWhitespaceHandler, true>( CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, false>( +template bool DynamicTokenizer::next<CollapsingWhitespaceHandler, true>( CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, false>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<PreservingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, true>( - CharReader &reader,DynamicToken &token); } diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp index 0cac2e8..3e5aeb3 100644 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ b/src/formats/osdm/DynamicTokenizer.hpp @@ -33,6 +33,7 @@ #include <vector> #include <core/common/Location.hpp> +#include <core/common/Whitespace.hpp> #include "TokenTrie.hpp" @@ -96,28 +97,6 @@ struct DynamicToken { }; /** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { - /** - * Preserves all whitespaces as they are found in the source file. - */ - PRESERVE, - - /** - * Trims whitespace at the beginning and the end of the found text. - */ - TRIM, - - /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ - COLLAPSE -}; - -/** * The DynamicTokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and * to modify the handling of whitespace characters. Note that the |