Storing type and name in the HandlerData once again, using a Token

Conflicts: application/src/core/parser/stack/Callbacks.hpp
author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-03-03 15:08:18 +0100
committer: Andreas Stöckel <andreas@somweyr.de> 2015-03-03 15:08:18 +0100
commit: 466ff991bcfad76d78100193aacbfaf74d542b26 (patch)
tree: dafdb41ec766e83c6e37a8b9865e6ef454ff4def /src/core/parser/utils/Tokenizer.hpp
parent: b5cdca0331117ad3834b61eadd94ab3fcb6d2fba (diff)
parent: fb8d4cdf01909b61e4e5d0806ec6de178ff0058c (diff)
1 files changed, 84 insertions, 58 deletions
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
 /**
  * @file Tokenizer.hpp
  *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
  *
  * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
  */
@@ -28,44 +28,80 @@
 #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
 #define _OUSIA_DYNAMIC_TOKENIZER_HPP_
 
-#include <set>
+#include <cstdint>
 #include <string>
 #include <vector>
 
 #include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
 
-#include "Token.hpp"
 #include "TokenTrie.hpp"
 
 namespace ousia {
 
 // Forward declarations
 class CharReader;
+class TokenizedData;
 
 /**
  * The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows registering and unregistering tokens while parsing.
+ * Note that the Tokenizer always tries to extract the longest possible token
+ * from the CharReader. Tokens can be registered as primary or non-primary
+ * tokens. If a token is registered as a primary token, it is returned as a
+ * single Token instance if it occurs. In the non-primary case the token is
+ * returned as part of a segmented TokenizedData instance.
  */
 class Tokenizer {
-private:
+public:
 	/**
-	 * Internally used token trie. This object holds all registered tokens.
+	 * Internally used structure describing a registered token.
 	 */
-	TokenTrie trie;
+	struct TokenDescriptor {
+		/**
+		 * String representation of the token; empty for an invalid descriptor.
+		 */
+		std::string string;
+
+		/**
+		 * Set to true if this token was registered as a primary token.
+		 */
+		bool primary;
+
+		/**
+		 * Constructor of the TokenDescriptor class.
+		 *
+		 * @param string is the string representation of the registered token.
+		 * @param primary specifies whether the token is a primary token that
+		 * should be returned as a single token, or a secondary token, that
+		 * should be returned as part of TokenizedData.
+		 */
+		TokenDescriptor(const std::string &string, bool primary)
+		    : string(string), primary(primary)
+		{
+		}
+
+		/**
+		 * Default constructor, yields an invalid (empty, non-primary) descriptor.
+		 */
+		TokenDescriptor() : primary(false) {}
+
+		/**
+		 * Returns true if the token string is non-empty; usable on const refs.
+		 */
+		bool valid() const { return !string.empty(); }
+	};
 
+private:
 	/**
-	 * Flag defining whether whitespaces should be preserved or not.
+	 * Internally used token trie. This object holds all registered tokens.
 	 */
-	WhitespaceMode whitespaceMode;
+	TokenTrie trie;
 
 	/**
 	 * Vector containing all registered token types.
 	 */
-	std::vector<std::string> tokens;
+	std::vector<TokenDescriptor> tokens;
 
 	/**
 	 * Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:
 
 	/**
 	 * Templated function used internally to read the current token. The
-	 * function is templated in order to force code generation for all six
-	 * combiations of whitespace modes and reading/peeking.
+	 * function is templated in order to force optimized code generation for
+	 * both reading and peeking.
 	 *
-	 * @tparam TextHandler is the type to be used for the textHandler instance.
-	 * @tparam read specifies whether the function should start from and advance
-	 * the read pointer of the char reader.
+	 * @tparam read specifies whether the method should read the token or just
+	 * peek.
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is the token structure into which the token information
 	 * should be written.
+	 * @param data is a reference at the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return false if the end of the stream has been reached, true otherwise.
 	 */
-	template <typename TextHandler, bool read>
-	bool next(CharReader &reader, Token &token);
+	template <bool read>
+	bool next(CharReader &reader, Token &token, TokenizedData &data);
 
 public:
 	/**
 	 * Constructor of the Tokenizer class.
-	 *
-	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	Tokenizer();
 
 	/**
-	 * Registers the given string as a token. Returns a const pointer at a
-	 * TokenDescriptor that will be used to reference the newly created token.
+	 * Registers the given string as a token. Returns a unique identifier
+	 * describing the registered token.
 	 *
 	 * @param token is the token string that should be registered.
-	 * @return a unique identifier for the registered token or EmptyToken if
+	 * @param primary specifies whether the token is a primary token -- if true,
+	 * the token will be returned as a single, standalone token. Otherwise the
+	 * token will be returned as part of a "TokenizedData" structure.
+	 * @return a unique identifier for the registered token or Tokens::Empty if
 	 * an error occured.
 	 */
-	TokenId registerToken(const std::string &token);
+	TokenId registerToken(const std::string &token, bool primary = true);
 
 	/**
 	 * Unregisters the token belonging to the given TokenId.
 	 *
 	 * @param type is the token type that should be unregistered. The
-	 *TokenId
-	 * must have been returned by registerToken.
+	 * TokenId must have been returned by registerToken.
 	 * @return true if the operation was successful, false otherwise (e.g.
-	 * because the given TokenDescriptor was already unregistered).
+	 * because the token with the given TokenId was already unregistered).
 	 */
-	bool unregisterToken(TokenId type);
+	bool unregisterToken(TokenId id);
 
 	/**
 	 * Returns the token that was registered under the given TokenId id or
-	 *an
-	 * empty string if an invalid TokenId id is given.
+	 * an invalid TokenDescriptor if an invalid TokenId id is given.
 	 *
-	 * @param type is the TokenId id for which the corresponding token
-	 *string
+	 * @param id is the TokenId for which the corresponding TokenDescriptor
 	 * should be returned.
-	 * @return the registered token string or an empty string if the given type
-	 * was invalid.
-	 */
-	std::string getTokenString(TokenId type);
-
-	/**
-	 * Sets the whitespace mode.
-	 *
-	 * @param whitespaceMode defines how whitespace should be treated in text
-	 * tokens.
-	 */
-	void setWhitespaceMode(WhitespaceMode mode);
-
-	/**
-	 * Returns the current value of the whitespace mode.
-	 *
-	 * @return the whitespace mode.
+	 * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+	 * the given TokenId is invalid.
 	 */
-	WhitespaceMode getWhitespaceMode();
+	const TokenDescriptor& lookupToken(TokenId id) const;
 
 	/**
 	 * Reads a new token from the CharReader and stores it in the given
-	 * Token instance.
+	 * Token instance. If the token has the id Tokens::Data, the associated
+	 * data is appended to the TokenizedData instance passed via the "data"
+	 * parameter.
 	 *
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is a reference at the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference at the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool read(CharReader &reader, Token &token);
+	bool read(CharReader &reader, Token &token, TokenizedData &data);
 
 	/**
 	 * The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
 	 * read.
 	 * @param token is a reference at the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference at the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool peek(CharReader &reader, Token &token);
+	bool peek(CharReader &reader, Token &token, TokenizedData &data);
 };
 }
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-03-03 15:08:18 +0100
committer	Andreas Stöckel <andreas@somweyr.de>	2015-03-03 15:08:18 +0100
commit	466ff991bcfad76d78100193aacbfaf74d542b26 (patch)
tree	dafdb41ec766e83c6e37a8b9865e6ef454ff4def /src/core/parser/utils/Tokenizer.hpp
parent	b5cdca0331117ad3834b61eadd94ab3fcb6d2fba (diff)
parent	fb8d4cdf01909b61e4e5d0806ec6de178ff0058c (diff)