Adapted old Tokenizer infrastructure to new Token.hpp

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-22 23:07:43 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-02-22 23:07:43 +0100
commit: 2d4508837b7885c962f815c062f98803917eca71 (patch)
tree: f957147a9b3d667d8ead3922e95d67262614eb17 /src/core
parent: cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (diff)
4 files changed, 44 insertions, 110 deletions
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 4a0430b..80cc945 100644
--- a/src/core/parser/utils/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -22,12 +22,12 @@ namespace ousia {
 
 /* Class DynamicTokenTree::Node */
 
-TokenTrie::Node::Node() : type(EmptyToken) {}
+TokenTrie::Node::Node() : type(Tokens::Empty) {}
 
 /* Class DynamicTokenTree */
 
 bool TokenTrie::registerToken(const std::string &token,
-                              TokenTypeId type) noexcept
+                              TokenId type) noexcept
 {
 	// Abort if the token is empty -- this would taint the root node
 	if (token.empty()) {
@@ -48,7 +48,7 @@ bool TokenTrie::registerToken(const std::string &token,
 	}
 
 	// If the resulting node already has a type set, we're screwed.
-	if (node->type != EmptyToken) {
+	if (node->type != Tokens::Empty) {
 		return false;
 	}
 
@@ -78,22 +78,22 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
 
 		// Reset the subtree handler if this node has another type
 		node = it->second.get();
-		if ((node->type != EmptyToken || node->children.size() > 1) &&
+		if ((node->type != Tokens::Empty || node->children.size() > 1) &&
 		    (i + 1 != token.size())) {
 			subtreeRoot = node;
 			subtreeKey = token[i + 1];
 		}
 	}
 
-	// If the node type is already EmptyToken, we cannot do anything here
-	if (node->type == EmptyToken) {
+	// If the node type is already Tokens::Empty, we cannot do anything here
+	if (node->type == Tokens::Empty) {
 		return false;
 	}
 
 	// If the target node has children, we cannot delete the subtree. Set the
-	// type to EmptyToken instead
+	// type to Tokens::Empty instead
 	if (!node->children.empty()) {
-		node->type = EmptyToken;
+		node->type = Tokens::Empty;
 		return true;
 	}
 
@@ -102,14 +102,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
 	return true;
 }
 
-TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
+TokenId TokenTrie::hasToken(const std::string &token) const noexcept
 {
 	Node const *node = &root;
 	for (size_t i = 0; i < token.size(); i++) {
 		const char c = token[i];
 		auto it = node->children.find(c);
 		if (it == node->children.end()) {
-			return EmptyToken;
+			return Tokens::Empty;
 		}
 		node = it->second.get();
 	}
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index 36c2ffa..b2d1539 100644
--- a/src/core/parser/utils/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -33,22 +33,9 @@
 #include <limits>
 #include <unordered_map>
 
-namespace ousia {
-
-/**
- * The TokenTypeId is used to give each token type a unique id.
- */
-using TokenTypeId = uint32_t;
-
-/**
- * Token which is not a token.
- */
-constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+#include "Token.hpp"
 
-/**
- * Token which represents a text token.
- */
-constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+namespace ousia {
 
 /**
  * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
@@ -91,7 +78,7 @@ public:
 		 * Reference at the corresponding token descriptor. Set to nullptr if
 		 * no token is attached to this node.
 		 */
-		TokenTypeId type;
+		TokenId type;
 
 		/**
 		 * Default constructor, initializes the descriptor with nullptr.
@@ -115,7 +102,7 @@ public:
 	 * @param type is the descriptor that should be set for this token.
 	 * @return true if the operation is successful, false otherwise.
 	 */
-	bool registerToken(const std::string &token, TokenTypeId type) noexcept;
+	bool registerToken(const std::string &token, TokenId type) noexcept;
 
 	/**
 	 * Unregisters the token from the token tree. Returns true if the token was
@@ -134,7 +121,7 @@ public:
 	 * @return the attached token descriptor or nullptr if the given token is
 	 * not found.
 	 */
-	TokenTypeId hasToken(const std::string &token) const noexcept;
+	TokenId hasToken(const std::string &token) const noexcept;
 
 	/**
 	 * Returns a reference at the root node to be used for traversing the token
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 3c8177d..2e0ac13 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -61,7 +61,7 @@ struct TokenMatch {
 	/**
 	 * Returns true if this TokenMatch instance actually represents a match.
 	 */
-	bool hasMatch() { return token.type != EmptyToken; }
+	bool hasMatch() { return token.id != Tokens::Empty; }
 };
 
 /* Internal class TokenLookup */
@@ -138,7 +138,7 @@ public:
 		// Check whether the new node represents a complete token a whether it
 		// is longer than the current token. If yes, replace the current token.
 		node = it->second.get();
-		if (node->type != EmptyToken) {
+		if (node->type != Tokens::Empty) {
 			const std::string &str = tokens[node->type];
 			size_t len = str.size();
 			if (len > match.token.content.size()) {
@@ -157,14 +157,14 @@ public:
 };
 
 /**
- * Transforms the given token into a text token containing the extracted
+ * Transforms the given token into a data token containing the extracted
  * text.
  *
  * @param handler is the WhitespaceHandler containing the collected data.
  * @param token is the output token to which the text should be written.
  * @param sourceId is the source id of the underlying file.
  */
-static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
+static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
                            SourceId sourceId)
 {
 	if (match.hasMatch()) {
@@ -177,14 +177,14 @@ static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
 		match.token.location =
 		    SourceLocation{sourceId, handler.textStart, handler.textEnd};
 	}
-	match.token.type = TextToken;
+	match.token.id = Tokens::Data;
 }
 }
 
 /* Class Tokenizer */
 
 Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
-    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
+    : whitespaceMode(whitespaceMode), nextTokenId(0)
 {
 }
 
@@ -248,7 +248,7 @@ bool Tokenizer::next(CharReader &reader, Token &token)
 
 	// If we found text, emit that text
 	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
-		buildTextToken(textHandler, match, sourceId);
+		buildDataToken(textHandler, match, sourceId);
 	}
 
 	// Move the read/peek cursor to the end of the token, abort if an error
@@ -299,16 +299,16 @@ bool Tokenizer::peek(CharReader &reader, Token &token)
 	return false;
 }
 
-TokenTypeId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token)
 {
 	// Abort if an empty token should be registered
 	if (token.empty()) {
-		return EmptyToken;
+		return Tokens::Empty;
 	}
 
 	// Search for a new slot in the tokens list
-	TokenTypeId type = EmptyToken;
-	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+	TokenId type = Tokens::Empty;
+	for (size_t i = nextTokenId; i < tokens.size(); i++) {
 		if (tokens[i].empty()) {
 			tokens[i] = token;
 			type = i;
@@ -318,37 +318,37 @@ TokenTypeId Tokenizer::registerToken(const std::string &token)
 
 	// No existing slot was found, add a new one -- make sure we do not
 	// override the special token type handles
-	if (type == EmptyToken) {
+	if (type == Tokens::Empty) {
 		type = tokens.size();
-		if (type == TextToken || type == EmptyToken) {
+		if (type == Tokens::Data || type == Tokens::Empty) {
 			throw OusiaException{"Token type ids depleted!"};
 		}
 		tokens.emplace_back(token);
 	}
-	nextTokenTypeId = type + 1;
+	nextTokenId = type + 1;
 
 	// Try to register the token in the trie -- if this fails, remove it
 	// from the tokens list
 	if (!trie.registerToken(token, type)) {
 		tokens[type] = std::string{};
-		nextTokenTypeId = type;
-		return EmptyToken;
+		nextTokenId = type;
+		return Tokens::Empty;
 	}
 	return type;
 }
 
-bool Tokenizer::unregisterToken(TokenTypeId type)
+bool Tokenizer::unregisterToken(TokenId type)
 {
 	// Unregister the token from the trie, abort if an invalid type is given
 	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
 		tokens[type] = std::string{};
-		nextTokenTypeId = type;
+		nextTokenId = type;
 		return true;
 	}
 	return false;
 }
 
-std::string Tokenizer::getTokenString(TokenTypeId type)
+std::string Tokenizer::getTokenString(TokenId type)
 {
 	if (type < tokens.size()) {
 		return tokens[type];
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 6b4e116..f21c6a3 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -35,6 +35,7 @@
 #include <core/common/Location.hpp>
 #include <core/common/Whitespace.hpp>
 
+#include "Token.hpp"
 #include "TokenTrie.hpp"
 
 namespace ousia {
@@ -43,60 +44,6 @@ namespace ousia {
 class CharReader;
 
 /**
- * The Token structure describes a token discovered by the Tokenizer.
- */
-struct Token {
-	/**
-	 * Id of the type of this token.
-	 */
-	TokenTypeId type;
-
-	/**
-	 * String that was matched.
-	 */
-	std::string content;
-
-	/**
-	 * Location from which the string was extracted.
-	 */
-	SourceLocation location;
-
-	/**
-	 * Default constructor.
-	 */
-	Token() : type(EmptyToken) {}
-
-	/**
-	 * Constructor of the Token struct.
-	 *
-	 * @param id represents the token type.
-	 * @param content is the string content that has been extracted.
-	 * @param location is the location of the extracted string content in the
-	 * source file.
-	 */
-	Token(TokenTypeId type, const std::string &content,
-	             SourceLocation location)
-	    : type(type), content(content), location(location)
-	{
-	}
-
-	/**
-	 * Constructor of the Token struct, only initializes the token type
-	 *
-	 * @param type is the id corresponding to the type of the token.
-	 */
-	Token(TokenTypeId type) : type(type) {}
-
-	/**
-	 * The getLocation function allows the tokens to be directly passed as
-	 * parameter to Logger or LoggableException instances.
-	 *
-	 * @return a reference at the location field
-	 */
-	const SourceLocation &getLocation() const { return location; }
-};
-
-/**
  * The Tokenizer is used to extract tokens and chunks of text from a
  * CharReader. It allows to register and unregister tokens while parsing and
  * to modify the handling of whitespace characters. Note that the
@@ -123,7 +70,7 @@ private:
 	/**
 	 * Next index in the tokens list where to search for a new token id.
 	 */
-	size_t nextTokenTypeId;
+	size_t nextTokenId;
 
 	/**
 	 * Templated function used internally to read the current token. The
@@ -158,31 +105,31 @@ public:
 	 * @return a unique identifier for the registered token or EmptyToken if
 	 * an error occured.
 	 */
-	TokenTypeId registerToken(const std::string &token);
+	TokenId registerToken(const std::string &token);
 
 	/**
-	 * Unregisters the token belonging to the given TokenTypeId.
+	 * Unregisters the token belonging to the given TokenId.
 	 *
 	 * @param type is the token type that should be unregistered. The
-	 *TokenTypeId
+	 * TokenId
 	 * must have been returned by registerToken.
 	 * @return true if the operation was successful, false otherwise (e.g.
 	 * because the given TokenDescriptor was already unregistered).
 	 */
-	bool unregisterToken(TokenTypeId type);
+	bool unregisterToken(TokenId type);
 
 	/**
-	 * Returns the token that was registered under the given TokenTypeId id or
+	 * Returns the token that was registered under the given TokenId or
 	 *an
-	 * empty string if an invalid TokenTypeId id is given.
+	 * empty string if an invalid TokenId is given.
 	 *
-	 * @param type is the TokenTypeId id for which the corresponding token
+	 * @param type is the TokenId for which the corresponding token
 	 *string
 	 * should be returned.
 	 * @return the registered token string or an empty string if the given type
 	 * was invalid.
 	 */
-	std::string getTokenString(TokenTypeId type);
+	std::string getTokenString(TokenId type);
 
 	/**
 	 * Sets the whitespace mode.
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-22 23:07:43 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-02-22 23:07:43 +0100
commit	2d4508837b7885c962f815c062f98803917eca71 (patch)
tree	f957147a9b3d667d8ead3922e95d67262614eb17 /src/core
parent	cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (diff)