summary | refs | log | tree | commit | diff
path: root/src/core
diff options
context:
space:
mode:
author Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-22 23:07:43 +0100
committer Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-22 23:07:43 +0100
commit 2d4508837b7885c962f815c062f98803917eca71 (patch)
tree f957147a9b3d667d8ead3922e95d67262614eb17 /src/core
parent cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (diff)
Adapted old Tokenizer infrastructure to new Tokens.hpp
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/parser/utils/TokenTrie.cpp 20
-rw-r--r-- src/core/parser/utils/TokenTrie.hpp 23
-rw-r--r-- src/core/parser/utils/Tokenizer.cpp 38
-rw-r--r-- src/core/parser/utils/Tokenizer.hpp 73
4 files changed, 44 insertions, 110 deletions
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 4a0430b..80cc945 100644
--- a/src/core/parser/utils/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -22,12 +22,12 @@ namespace ousia {
/* Class DynamicTokenTree::Node */
-TokenTrie::Node::Node() : type(EmptyToken) {}
+TokenTrie::Node::Node() : type(Tokens::Empty) {}
/* Class DynamicTokenTree */
bool TokenTrie::registerToken(const std::string &token,
- TokenTypeId type) noexcept
+ TokenId type) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -48,7 +48,7 @@ bool TokenTrie::registerToken(const std::string &token,
}
// If the resulting node already has a type set, we're screwed.
- if (node->type != EmptyToken) {
+ if (node->type != Tokens::Empty) {
return false;
}
@@ -78,22 +78,22 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
// Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->type != EmptyToken || node->children.size() > 1) &&
+ if ((node->type != Tokens::Empty || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
}
}
- // If the node type is already EmptyToken, we cannot do anything here
- if (node->type == EmptyToken) {
+ // If the node type is already Tokens::Empty, we cannot do anything here
+ if (node->type == Tokens::Empty) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
- // type to EmptyToken instead
+ // type to Tokens::Empty instead
if (!node->children.empty()) {
- node->type = EmptyToken;
+ node->type = Tokens::Empty;
return true;
}
@@ -102,14 +102,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
return true;
}
-TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
+TokenId TokenTrie::hasToken(const std::string &token) const noexcept
{
Node const *node = &root;
for (size_t i = 0; i < token.size(); i++) {
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- return EmptyToken;
+ return Tokens::Empty;
}
node = it->second.get();
}
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index 36c2ffa..b2d1539 100644
--- a/src/core/parser/utils/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -33,22 +33,9 @@
#include <limits>
#include <unordered_map>
-namespace ousia {
-
-/**
- * The TokenTypeId is used to give each token type a unique id.
- */
-using TokenTypeId = uint32_t;
-
-/**
- * Token which is not a token.
- */
-constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+#include "Token.hpp"
-/**
- * Token which represents a text token.
- */
-constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+namespace ousia {
/**
* The Tokenizer internally uses a TokenTrie to be efficiently able to identify
@@ -91,7 +78,7 @@ public:
* Reference at the corresponding token descriptor. Set to nullptr if
* no token is attached to this node.
*/
- TokenTypeId type;
+ TokenId type;
/**
* Default constructor, initializes the descriptor with nullptr.
@@ -115,7 +102,7 @@ public:
* @param type is the descriptor that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token, TokenTypeId type) noexcept;
+ bool registerToken(const std::string &token, TokenId type) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
@@ -134,7 +121,7 @@ public:
* @return the attached token descriptor or nullptr if the given token is
* not found.
*/
- TokenTypeId hasToken(const std::string &token) const noexcept;
+ TokenId hasToken(const std::string &token) const noexcept;
/**
* Returns a reference at the root node to be used for traversing the token
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 3c8177d..2e0ac13 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -61,7 +61,7 @@ struct TokenMatch {
/**
* Returns true if this TokenMatch instance actually represents a match.
*/
- bool hasMatch() { return token.type != EmptyToken; }
+ bool hasMatch() { return token.id != Tokens::Empty; }
};
/* Internal class TokenLookup */
@@ -138,7 +138,7 @@ public:
// Check whether the new node represents a complete token and whether it
// is longer than the current token. If yes, replace the current token.
node = it->second.get();
- if (node->type != EmptyToken) {
+ if (node->type != Tokens::Empty) {
const std::string &str = tokens[node->type];
size_t len = str.size();
if (len > match.token.content.size()) {
@@ -157,14 +157,14 @@ public:
};
/**
- * Transforms the given token into a text token containing the extracted
+ * Transforms the given token into a data token containing the extracted
* text.
*
* @param handler is the WhitespaceHandler containing the collected data.
* @param token is the output token to which the text should be written.
* @param sourceId is the source id of the underlying file.
*/
-static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
+static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
SourceId sourceId)
{
if (match.hasMatch()) {
@@ -177,14 +177,14 @@ static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
match.token.location =
SourceLocation{sourceId, handler.textStart, handler.textEnd};
}
- match.token.type = TextToken;
+ match.token.id = Tokens::Data;
}
}
/* Class Tokenizer */
Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
+ : whitespaceMode(whitespaceMode), nextTokenId(0)
{
}
@@ -248,7 +248,7 @@ bool Tokenizer::next(CharReader &reader, Token &token)
// If we found text, emit that text
if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
- buildTextToken(textHandler, match, sourceId);
+ buildDataToken(textHandler, match, sourceId);
}
// Move the read/peek cursor to the end of the token, abort if an error
@@ -299,16 +299,16 @@ bool Tokenizer::peek(CharReader &reader, Token &token)
return false;
}
-TokenTypeId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token)
{
// Abort if an empty token should be registered
if (token.empty()) {
- return EmptyToken;
+ return Tokens::Empty;
}
// Search for a new slot in the tokens list
- TokenTypeId type = EmptyToken;
- for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+ TokenId type = Tokens::Empty;
+ for (size_t i = nextTokenId; i < tokens.size(); i++) {
if (tokens[i].empty()) {
tokens[i] = token;
type = i;
@@ -318,37 +318,37 @@ TokenTypeId Tokenizer::registerToken(const std::string &token)
// No existing slot was found, add a new one -- make sure we do not
// override the special token type handles
- if (type == EmptyToken) {
+ if (type == Tokens::Empty) {
type = tokens.size();
- if (type == TextToken || type == EmptyToken) {
+ if (type == Tokens::Data || type == Tokens::Empty) {
throw OusiaException{"Token type ids depleted!"};
}
tokens.emplace_back(token);
}
- nextTokenTypeId = type + 1;
+ nextTokenId = type + 1;
// Try to register the token in the trie -- if this fails, remove it
// from the tokens list
if (!trie.registerToken(token, type)) {
tokens[type] = std::string{};
- nextTokenTypeId = type;
- return EmptyToken;
+ nextTokenId = type;
+ return Tokens::Empty;
}
return type;
}
-bool Tokenizer::unregisterToken(TokenTypeId type)
+bool Tokenizer::unregisterToken(TokenId type)
{
// Unregister the token from the trie, abort if an invalid type is given
if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
tokens[type] = std::string{};
- nextTokenTypeId = type;
+ nextTokenId = type;
return true;
}
return false;
}
-std::string Tokenizer::getTokenString(TokenTypeId type)
+std::string Tokenizer::getTokenString(TokenId type)
{
if (type < tokens.size()) {
return tokens[type];
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 6b4e116..f21c6a3 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -35,6 +35,7 @@
#include <core/common/Location.hpp>
#include <core/common/Whitespace.hpp>
+#include "Token.hpp"
#include "TokenTrie.hpp"
namespace ousia {
@@ -43,60 +44,6 @@ namespace ousia {
class CharReader;
/**
- * The Token structure describes a token discovered by the Tokenizer.
- */
-struct Token {
- /**
- * Id of the type of this token.
- */
- TokenTypeId type;
-
- /**
- * String that was matched.
- */
- std::string content;
-
- /**
- * Location from which the string was extracted.
- */
- SourceLocation location;
-
- /**
- * Default constructor.
- */
- Token() : type(EmptyToken) {}
-
- /**
- * Constructor of the Token struct.
- *
- * @param id represents the token type.
- * @param content is the string content that has been extracted.
- * @param location is the location of the extracted string content in the
- * source file.
- */
- Token(TokenTypeId type, const std::string &content,
- SourceLocation location)
- : type(type), content(content), location(location)
- {
- }
-
- /**
- * Constructor of the Token struct, only initializes the token type
- *
- * @param type is the id corresponding to the type of the token.
- */
- Token(TokenTypeId type) : type(type) {}
-
- /**
- * The getLocation function allows the tokens to be directly passed as
- * parameter to Logger or LoggableException instances.
- *
- * @return a reference at the location field
- */
- const SourceLocation &getLocation() const { return location; }
-};
-
-/**
* The Tokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
* to modify the handling of whitespace characters. Note that the
@@ -123,7 +70,7 @@ private:
/**
* Next index in the tokens list where to search for a new token id.
*/
- size_t nextTokenTypeId;
+ size_t nextTokenId;
/**
* Templated function used internally to read the current token. The
@@ -158,31 +105,31 @@ public:
* @return a unique identifier for the registered token or EmptyToken if
* an error occurred.
*/
- TokenTypeId registerToken(const std::string &token);
+ TokenId registerToken(const std::string &token);
/**
- * Unregisters the token belonging to the given TokenTypeId.
+ * Unregisters the token belonging to the given TokenId.
*
* @param type is the token type that should be unregistered. The
- *TokenTypeId
+ *TokenId
* must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
* because the given TokenDescriptor was already unregistered).
*/
- bool unregisterToken(TokenTypeId type);
+ bool unregisterToken(TokenId type);
/**
- * Returns the token that was registered under the given TokenTypeId id or
+ * Returns the token that was registered under the given TokenId id or
*an
- * empty string if an invalid TokenTypeId id is given.
+ * empty string if an invalid TokenId id is given.
*
- * @param type is the TokenTypeId id for which the corresponding token
+ * @param type is the TokenId id for which the corresponding token
*string
* should be returned.
* @return the registered token string or an empty string if the given type
* was invalid.
*/
- std::string getTokenString(TokenTypeId type);
+ std::string getTokenString(TokenId type);
/**
* Sets the whitespace mode.