From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sat, 14 Feb 2015 23:47:11 +0100
Subject: Moved DynamicTokenizer and TokenTrie to parser/utils

---
 src/core/parser/utils/Tokenizer.cpp | 381 ++++++++++++++++++++++++++++++++++++
 1 file changed, 381 insertions(+)
 create mode 100644 src/core/parser/utils/Tokenizer.cpp

(limited to 'src/core/parser/utils/Tokenizer.cpp')

diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
new file mode 100644
index 0000000..1fac25a
--- /dev/null
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -0,0 +1,381 @@
+/*
+    Ousía
+    Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "DynamicTokenizer.hpp"
+
+namespace ousia {
+
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+    /**
+     * Token that was matched.
+     */
+    DynamicToken token;
+
+    /**
+     * Current length of the data within the text handler. The text buffer needs
+     * to be trimmed to this length if this token matches.
+     */
+    size_t textLength;
+
+    /**
+     * End location of the current text handler. This location needs to be used
+     * for the text token that is emitted before the actual token.
+     */
+    size_t textEnd;
+
+    /**
+     * Constructor of the TokenMatch class.
+     */
+    TokenMatch() : textLength(0), textEnd(0) {}
+
+    /**
+     * Returns true if this TokenMatch instance actually represents a match.
+     */
+    bool hasMatch() { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+    /**
+     * Current node within the token trie.
+     */
+    TokenTrie::Node const *node;
+
+    /**
+     * Start offset within the source file.
+     */
+    size_t start;
+
+    /**
+     * Current length of the data within the text handler. The text buffer needs
+     * to be trimmed to this length if this token matches.
+     */
+    size_t textLength;
+
+    /**
+     * End location of the current text handler. This location needs to be used
+     * for the text token that is emitted before the actual token.
+     */
+    size_t textEnd;
+
+public:
+    /**
+     * Constructor of the TokenLookup class.
+     *
+     * @param node is the current node.
+     * @param start is the start position.
+     * @param textLength is the text buffer length of the previous text token.
+     * @param textEnd is the current end location of the previous text token.
+     */
+    TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+                size_t textEnd)
+        : node(node), start(start), textLength(textLength), textEnd(textEnd)
+    {
+    }
+
+    /**
+     * Tries to extend the current path in the token trie with the given
+     * character. If a complete token is matched, stores this match in the
+     * tokens list (in case it is longer than any previous token).
+     *
+     * @param c is the character that should be appended to the current prefix.
+     * @param lookups is a list to which new TokenLookup instances are added --
+     * which could potentially be expanded in the next iteration.
+     * @param match is the DynamicToken instance to which the matching token
+     * should be written.
+     * @param tokens is a reference to the internal token list of the
+     * DynamicTokenizer.
+     * @param end is the end byte offset of the current character.
+     * @param sourceId is the source id of this file.
+     */
+    void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+                 const std::vector<std::string> &tokens, SourceOffset end,
+                 SourceId sourceId)
+    {
+        // Check whether we can continue the current token path with the given
+        // character without visiting an already visited node
+        auto it = node->children.find(c);
+        if (it == node->children.end()) {
+            return;
+        }
+
+        // Check whether the new node represents a complete token and whether
+        // it is longer than the current token. If yes, replace the current
+        // token.
+        node = it->second.get();
+        if (node->type != EmptyToken) {
+            const std::string &str = tokens[node->type];
+            size_t len = str.size();
+            if (len > match.token.content.size()) {
+                match.token =
+                    DynamicToken{node->type, str, {sourceId, start, end}};
+                match.textLength = textLength;
+                match.textEnd = textEnd;
+            }
+        }
+
+        // If this state can possibly be advanced, store it in the states list.
+        if (!node->children.empty()) {
+            lookups.emplace_back(*this);
+        }
+    }
+};
+
+/**
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param handler is the WhitespaceHandler containing the collected data.
+ * @param match is the token match to which the text should be written.
+ * @param sourceId is the source id of the underlying file.
+ */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class DynamicTokenizer */ + +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector lookups; + std::vector nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case 
WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +} + -- cgit v1.2.3 From 9f9e51974e782c4eb6f393ca3d4c3382df093bf1 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:58:55 +0100 Subject: Moved Tokenizer to core/parser/utils and adapted name --- src/core/parser/utils/Tokenizer.cpp | 56 ++++++++++++++++++------------------- src/core/parser/utils/Tokenizer.hpp | 34 +++++++++++----------- 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'src/core/parser/utils/Tokenizer.cpp') diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 1fac25a..3c8177d 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -24,7 +24,7 @@ #include #include -#include "DynamicTokenizer.hpp" +#include "Tokenizer.hpp" namespace ousia { @@ -39,7 +39,7 @@ struct TokenMatch { /** * Token that was matched. */ - DynamicToken token; + Token token; /** * Current length of the data within the text handler. The text buffer needs @@ -117,10 +117,10 @@ public: * @param c is the character that should be appended to the current prefix. 
      * @param lookups is a list to which new TokenLookup instances are added --
      * which could potentially be expanded in the next iteration.
-     * @param match is the DynamicToken instance to which the matching token
+     * @param match is the Token instance to which the matching token
      * should be written.
      * @param tokens is a reference to the internal token list of the
-     * DynamicTokenizer.
+     * Tokenizer.
      * @param end is the end byte offset of the current character.
      * @param sourceId is the source id of this file.
@@ -143,7 +143,7 @@
             size_t len = str.size();
             if (len > match.token.content.size()) {
                 match.token =
-                    DynamicToken{node->type, str, {sourceId, start, end}};
+                    Token{node->type, str, {sourceId, start, end}};
                 match.textLength = textLength;
                 match.textEnd = textEnd;
             }
@@ -181,15 +181,15 @@ static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
     }
 }
 
-/* Class DynamicTokenizer */
+/* Class Tokenizer */
 
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
     : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
 {
 }
 
 template <bool read, class TextHandler>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
+bool Tokenizer::next(CharReader &reader, Token &token)
 {
     // If we're in the read mode, reset the char reader peek position to the
     // current read position
@@ -268,12 +268,12 @@
         }
         token = match.token;
     } else {
-        token = DynamicToken{};
+        token = Token{};
     }
     return match.hasMatch();
 }
 
-bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
+bool Tokenizer::read(CharReader &reader, Token &token)
 {
     switch (whitespaceMode) {
         case WhitespaceMode::PRESERVE:
@@ -286,7 +286,7 @@
     return false;
 }
 
-bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
+bool Tokenizer::peek(CharReader &reader, Token &token)
 {
     switch (whitespaceMode) {
         case WhitespaceMode::PRESERVE:
@@ -299,7 +299,7 @@
     return false;
 }
 
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+TokenTypeId Tokenizer::registerToken(const std::string &token)
 {
     // Abort if an empty token should be registered
     if (token.empty()) {
@@ -337,7 +337,7 @@
     return type;
 }
 
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+bool Tokenizer::unregisterToken(TokenTypeId type)
 {
     // Unregister the token from the trie, abort if an invalid type is given
     if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
@@ -348,7 +348,7 @@
     return false;
 }
 
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+std::string Tokenizer::getTokenString(TokenTypeId type)
 {
     if (type < tokens.size()) {
         return tokens[type];
@@ -356,26 +356,26 @@
     return std::string{};
 }
 
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
 {
     whitespaceMode = mode;
 }
 
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
 
 /* Explicitly instantiate all possible instantiations of the "next" member
    function */
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next(
-    CharReader &reader, DynamicToken &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next(
+    CharReader &reader, Token &token);
 }
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 3e5aeb3..6b4e116 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -17,7 +17,7 @@
 */
 
 /**
- * @file DynamicTokenizer.hpp
+ * @file Tokenizer.hpp
 *
 * Tokenizer that can be reconfigured at runtime, used for parsing the plain
 * text format.
@@ -43,9 +43,9 @@ namespace ousia {
 class CharReader;
 
 /**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
+ * The Token structure describes a token discovered by the Tokenizer.
 */
-struct DynamicToken {
+struct Token {
     /**
     * Id of the type of this token.
     */
@@ -64,28 +64,28 @@
     /**
     * Default constructor.
     */
-    DynamicToken() : type(EmptyToken) {}
+    Token() : type(EmptyToken) {}
 
     /**
-    * Constructor of the DynamicToken struct.
+    * Constructor of the Token struct.
     *
     * @param type represents the token type.
     * @param content is the string content that has been extracted.
     * @param location is the location of the extracted string content in the
     * source file.
     */
-    DynamicToken(TokenTypeId type, const std::string &content,
+    Token(TokenTypeId type, const std::string &content,
              SourceLocation location)
         : type(type), content(content), location(location)
     {
     }
 
     /**
-    * Constructor of the DynamicToken struct, only initializes the token type.
+    * Constructor of the Token struct, only initializes the token type.
     *
     * @param type is the id corresponding to the type of the token.
     */
-    DynamicToken(TokenTypeId type) : type(type) {}
+    Token(TokenTypeId type) : type(type) {}
 
     /**
     * The getLocation function allows the tokens to be directly passed as
@@ -97,13 +97,13 @@
 /**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * The Tokenizer is used to extract tokens and chunks of text from a
 * CharReader. It allows tokens to be registered and unregistered while
 * parsing and the handling of whitespace characters to be modified. Note
- * that the DynamicTokenizer always tries to extract the longest possible
+ * that the Tokenizer always tries to extract the longest possible
 * token from the input.
 */
-class DynamicTokenizer {
+class Tokenizer {
 private:
     /**
     * Internally used token trie. This object holds all registered tokens.
     */
@@ -140,15 +140,15 @@ private:
     * @return false if the end of the stream has been reached, true otherwise.
     */
     template <bool read, class TextHandler>
-    bool next(CharReader &reader, DynamicToken &token);
+    bool next(CharReader &reader, Token &token);
 
 public:
     /**
-    * Constructor of the DynamicTokenizer class.
+    * Constructor of the Tokenizer class.
     *
     * @param whitespaceMode specifies how whitespace should be handled.
*/ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer at a @@ -201,7 +201,7 @@ public: /** * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. + * Token instance. * * @param reader is the CharReader instance from which the data should be * read. @@ -210,7 +210,7 @@ public: * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(CharReader &reader, DynamicToken &token); + bool read(CharReader &reader, Token &token); /** * The peek method does not advance the read position of the char reader, @@ -223,7 +223,7 @@ public: * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, DynamicToken &token); + bool peek(CharReader &reader, Token &token); }; } -- cgit v1.2.3
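
Taken together, the two patches above leave a small public API behind: registerToken/unregisterToken manage the token set, read/peek pull Token instances off a CharReader, and the WhitespaceMode selects how the text between tokens is treated. The following is a minimal usage sketch based only on the header shown above -- the helper function, the CharReader argument and the registered token strings ("*", "--") are hypothetical examples, and error handling is elided.

// Sketch: driving the renamed Tokenizer. Hypothetical example, not taken
// from the patches; assumes `reader` is already positioned on a document.
#include "Tokenizer.hpp"

using namespace ousia;

void tokenizeExample(CharReader &reader)
{
    // COLLAPSE mirrors the default constructor argument from the header
    Tokenizer tokenizer{WhitespaceMode::COLLAPSE};

    // Register the character sequences that should become tokens;
    // registerToken returns EmptyToken on failure (e.g. an empty string)
    TokenTypeId star = tokenizer.registerToken("*");
    TokenTypeId dash = tokenizer.registerToken("--");

    // read() advances the read cursor; peek() would leave it in place.
    // Unmatched characters arrive as TextToken chunks between matches.
    Token token;
    while (tokenizer.read(reader, token)) {
        if (token.type == TextToken) {
            // token.content holds the (collapsed) text between tokens
        } else if (token.type == star || token.type == dash) {
            // token.content holds the matched sequence and token.location
            // its position; per the class documentation the tokenizer
            // always prefers the longest registered match
        }
    }

    // Tokens may also be unregistered again while parsing
    tokenizer.unregisterToken(dash);
}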