diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:07:43 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:07:43 +0100 |
commit | 2d4508837b7885c962f815c062f98803917eca71 (patch) | |
tree | f957147a9b3d667d8ead3922e95d67262614eb17 /src/core/parser | |
parent | cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (diff) |
Adapted old Tokenizer infrastructure to new Tokens.hpp
Diffstat (limited to 'src/core/parser')
-rw-r--r-- | src/core/parser/utils/TokenTrie.cpp | 20 | ||||
-rw-r--r-- | src/core/parser/utils/TokenTrie.hpp | 23 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 38 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 73 |
4 files changed, 44 insertions, 110 deletions
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 4a0430b..80cc945 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(EmptyToken) {} +TokenTrie::Node::Node() : type(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept + TokenId type) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -48,7 +48,7 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { + if (node->type != Tokens::Empty) { return false; } @@ -78,22 +78,22 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && + if ((node->type != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; } } - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { + // If the node type is already Tokens::Empty, we cannot do anything here + if (node->type == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. Set the - // type to EmptyToken instead + // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = EmptyToken; + node->type = Tokens::Empty; return true; } @@ -102,14 +102,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept return true; } -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +TokenId TokenTrie::hasToken(const std::string &token) const noexcept { Node const *node = &root; for (size_t i = 0; i < token.size(); i++) { const char c = token[i]; auto it = node->children.find(c); if (it == node->children.end()) { - return EmptyToken; + return Tokens::Empty; } node = it->second.get(); } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index 36c2ffa..b2d1539 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,22 +33,9 @@ #include <limits> #include <unordered_map> -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. - */ -constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); +#include "Token.hpp" -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; +namespace ousia { /** * The Tokenizer internally uses a TokenTrie to be efficiently able to identify @@ -91,7 +78,7 @@ public: * Reference at the corresponding token descriptor. Set to nullptr if * no token is attached to this node. */ - TokenTypeId type; + TokenId type; /** * Default constructor, initializes the descriptor with nullptr. @@ -115,7 +102,7 @@ public: * @param type is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. */ - bool registerToken(const std::string &token, TokenTypeId type) noexcept; + bool registerToken(const std::string &token, TokenId type) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was @@ -134,7 +121,7 @@ public: * @return the attached token descriptor or nullptr if the given token is * not found. */ - TokenTypeId hasToken(const std::string &token) const noexcept; + TokenId hasToken(const std::string &token) const noexcept; /** * Returns a reference at the root node to be used for traversing the token diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 3c8177d..2e0ac13 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -61,7 +61,7 @@ struct TokenMatch { /** * Returns true if this TokenMatch instance actually represents a match. */ - bool hasMatch() { return token.type != EmptyToken; } + bool hasMatch() { return token.id != Tokens::Empty; } }; /* Internal class TokenLookup */ @@ -138,7 +138,7 @@ public: // Check whether the new node represents a complete token a whether it // is longer than the current token. If yes, replace the current token. node = it->second.get(); - if (node->type != EmptyToken) { + if (node->type != Tokens::Empty) { const std::string &str = tokens[node->type]; size_t len = str.size(); if (len > match.token.content.size()) { @@ -157,14 +157,14 @@ public: }; /** - * Transforms the given token into a text token containing the extracted + * Transforms the given token into a data token containing the extracted * text. * * @param handler is the WhitespaceHandler containing the collected data. * @param token is the output token to which the text should be written. * @param sourceId is the source id of the underlying file. */ -static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, +static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, SourceId sourceId) { if (match.hasMatch()) { @@ -177,14 +177,14 @@ static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, match.token.location = SourceLocation{sourceId, handler.textStart, handler.textEnd}; } - match.token.type = TextToken; + match.token.id = Tokens::Data; } } /* Class Tokenizer */ Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) + : whitespaceMode(whitespaceMode), nextTokenId(0) { } @@ -248,7 +248,7 @@ bool Tokenizer::next(CharReader &reader, Token &token) // If we found text, emit that text if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildTextToken(textHandler, match, sourceId); + buildDataToken(textHandler, match, sourceId); } // Move the read/peek cursor to the end of the token, abort if an error @@ -299,16 +299,16 @@ bool Tokenizer::peek(CharReader &reader, Token &token) return false; } -TokenTypeId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token) { // Abort if an empty token should be registered if (token.empty()) { - return EmptyToken; + return Tokens::Empty; } // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + TokenId type = Tokens::Empty; + for (size_t i = nextTokenId; i < tokens.size(); i++) { if (tokens[i].empty()) { tokens[i] = token; type = i; @@ -318,37 +318,37 @@ TokenTypeId Tokenizer::registerToken(const std::string &token) // No existing slot was found, add a new one -- make sure we do not // override the special token type handles - if (type == EmptyToken) { + if (type == Tokens::Empty) { type = tokens.size(); - if (type == TextToken || type == EmptyToken) { + if (type == Tokens::Data || type == Tokens::Empty) { throw OusiaException{"Token type ids depleted!"}; } tokens.emplace_back(token); } - nextTokenTypeId = type + 1; + nextTokenId = type + 1; // Try to register the token in the trie -- if this fails, remove it // from the tokens list if (!trie.registerToken(token, type)) { tokens[type] = std::string{}; - nextTokenTypeId = type; - return EmptyToken; + nextTokenId = type; + return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenTypeId type) +bool Tokenizer::unregisterToken(TokenId type) { // Unregister the token from the trie, abort if an invalid type is given if (type < tokens.size() && trie.unregisterToken(tokens[type])) { tokens[type] = std::string{}; - nextTokenTypeId = type; + nextTokenId = type; return true; } return false; } -std::string Tokenizer::getTokenString(TokenTypeId type) +std::string Tokenizer::getTokenString(TokenId type) { if (type < tokens.size()) { return tokens[type]; diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index 6b4e116..f21c6a3 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -35,6 +35,7 @@ #include <core/common/Location.hpp> #include <core/common/Whitespace.hpp> +#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { @@ -43,60 +44,6 @@ namespace ousia { class CharReader; /** - * The Token structure describes a token discovered by the Tokenizer. - */ -struct Token { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - Token() : type(EmptyToken) {} - - /** - * Constructor of the Token struct. - * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - Token(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the Token struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - Token(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** * The Tokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and * to modify the handling of whitespace characters. Note that the @@ -123,7 +70,7 @@ private: /** * Next index in the tokens list where to search for a new token id. */ - size_t nextTokenTypeId; + size_t nextTokenId; /** * Templated function used internally to read the current token. The @@ -158,31 +105,31 @@ public: * @return a unique identifier for the registered token or EmptyToken if * an error occured. */ - TokenTypeId registerToken(const std::string &token); + TokenId registerToken(const std::string &token); /** - * Unregisters the token belonging to the given TokenTypeId. + * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The - *TokenTypeId + *TokenId * must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. * because the given TokenDescriptor was already unregistered). */ - bool unregisterToken(TokenTypeId type); + bool unregisterToken(TokenId type); /** - * Returns the token that was registered under the given TokenTypeId id or + * Returns the token that was registered under the given TokenId id or *an - * empty string if an invalid TokenTypeId id is given. + * empty string if an invalid TokenId id is given. * - * @param type is the TokenTypeId id for which the corresponding token + * @param type is the TokenId id for which the corresponding token *string * should be returned. * @return the registered token string or an empty string if the given type * was invalid. */ - std::string getTokenString(TokenTypeId type); + std::string getTokenString(TokenId type); /** * Sets the whitespace mode. |