From 4854509f8add1e2ff167623fb0e8d4216d9d6023 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>
Date: Sun, 8 Feb 2015 17:54:27 +0100
Subject: Implemented DynamicTokenizer and unit tests

---
 src/plugins/plain/DynamicTokenizer.hpp | 154 ++++++++++++++++++++++-----------
 1 file changed, 105 insertions(+), 49 deletions(-)

(limited to 'src/plugins/plain/DynamicTokenizer.hpp')
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index f7fef13..760bebf 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -28,34 +28,63 @@
 #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
 #define _OUSIA_DYNAMIC_TOKENIZER_HPP_
 
+#include <set>
+#include <string>
+#include <vector>
+
 #include <core/common/Location.hpp>
 
+#include "TokenTrie.hpp"
+
 namespace ousia {
 
 // Forward declarations
 class CharReader;
-class TokenDescriptor;
 
 /**
  * The DynamicToken structure describes a token discovered by the Tokenizer.
  */
 struct DynamicToken {
 	/**
-	 * Pointer pointing at the TokenDescriptor instance this token corresponds
-	 * to. May be one of the special TokenDescriptors defined as static members
-	 * of the DynamicTokenizer class.
+	 * Id of the type of this token.
 	 */
-	TokenDescriptor const *descriptor;
+	TokenTypeId type;
 
 	/**
 	 * String that was matched.
 	 */
-	std::string str;
+	std::string content;
 
 	/**
 	 * Location from which the string was extracted.
 	 */
 	SourceLocation location;
+
+	/**
+	 * Default constructor.
+	 */
+	DynamicToken() : type(EmptyToken) {}
+
+	/**
+	 * Constructor of the DynamicToken struct.
+	 *
+	 * @param id represents the token type.
+	 * @param content is the string content that has been extracted.
+	 * @param location is the location of the extracted string content in the
+	 * source file.
+	 */
+	DynamicToken(TokenTypeId type, const std::string &content,
+	             SourceLocation location)
+	    : type(type), content(content), location(location)
+	{
+	}
+
+	/**
+	 * Constructor of the DynamicToken struct, only initializes the token type
+	 *
+	 * @param type is the id corresponding to the type of the token.
+	 */
+	DynamicToken(TokenTypeId type) : type(type) {}
 };
 
 /**
@@ -64,43 +93,70 @@ struct DynamicToken {
  */
 enum class WhitespaceMode {
 	/**
-	 * Preserves all whitespaces as they are found in the source file.
-	 */
+     * Preserves all whitespaces as they are found in the source file.
+     */
 	PRESERVE,
 
 	/**
-	 * Trims whitespace at the beginning and the end of the found text.
-	 */
+     * Trims whitespace at the beginning and the end of the found text.
+     */
 	TRIM,
 
 	/**
-	 * Whitespaces are trimmed and collapsed, multiple whitespace characters
-	 * are replaced by a single space character.
-	 */
+     * Whitespaces are trimmed and collapsed, multiple whitespace characters
+     * are replaced by a single space character.
+     */
 	COLLAPSE
 };
 
 /**
  * The DynamicTokenizer is used to extract tokens and chunks of text from a
  * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters.
+ * to modify the handling of whitespace characters. Note that the
+ * DynamicTokenizer always tries to extract the longest possible token from the
+ * tokenizer.
  */
 class DynamicTokenizer {
 private:
 	/**
-	 * Reference at the char reader.
+	 * CharReader instance from which the tokens should be read.
 	 */
 	CharReader &reader;
 
+	/**
+	 * Internally used token trie. This object holds all registered tokens.
+	 */
+	TokenTrie trie;
+
 	/**
 	 * Flag defining whether whitespaces should be preserved or not.
 	 */
 	WhitespaceMode whitespaceMode;
 
 	/**
-	 * Vector containing all registered token descriptors.
+	 * Vector containing all registered token types.
+	 */
+	std::vector<std::string> tokens;
+
+	/**
+	 * Next index in the tokens list where to search for a new token id.
 	 */
-	std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+	size_t nextTokenTypeId;
+
+	/**
+	 * Templated function used internally to read the current token. The
+	 * function is templated in order to force code generation for all six
+	 * combiations of whitespace modes and reading/peeking.
+	 *
+	 * @tparam TextHandler is the type to be used for the textHandler instance.
+	 * @tparam read specifies whether the function should start from and advance
+	 * the read pointer of the char reader.
+	 * @param token is the token structure into which the token information
+	 * should be written.
+	 * @return false if the end of the stream has been reached, true otherwise.
+	 */
+	template <typename TextHandler, bool read>
+	bool next(DynamicToken &token);
 
 public:
 	/**
@@ -108,43 +164,44 @@ public:
 	 *
 	 * @param reader is the CharReader that should be used for reading the
 	 * tokens.
-	 * @param preserveWhitespaces should be set to true if all whitespaces
-	 * should be preserved (for preformated environments).
-	 */
-	DynamicTokenizer(CharReader &reader)
-	    : reader(reader),
-	      preserveWhitespaces(preserveWhitespaces),
-	      location(reader.getSourceId()),
-	      empty(true),
-	      hasWhitespace(false)
-	{
-	}
-
-	/**
-	 * Destructor of the DynamicTokenizer class.
+	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	~DynamicTokenizer();
+	DynamicTokenizer(CharReader &reader,
+	                 WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Registers the given string as a token. Returns a const pointer at a
 	 * TokenDescriptor that will be used to reference the newly created token.
 	 *
 	 * @param token is the token string that should be registered.
-	 * @return a pointer at a TokenDescriptor which is representative for the
-	 * newly registered token. Returns nullptr if a token with this string
-	 * was already registered.
+	 * @return a unique identifier for the registered token or EmptyToken if
+	 * an error occured.
 	 */
-	const TokenDescriptor* registerToken(const std::string &token);
+	TokenTypeId registerToken(const std::string &token);
 
 	/**
-	 * Unregisters the token belonging to the given TokenDescriptor.
+	 * Unregisters the token belonging to the given TokenTypeId.
 	 *
-	 * @param descr is a TokenDescriptor that was previously returned by
-	 * registerToken.
+	 * @param type is the token type that should be unregistered. The
+	 *TokenTypeId
+	 * must have been returned by registerToken.
 	 * @return true if the operation was successful, false otherwise (e.g.
 	 * because the given TokenDescriptor was already unregistered).
 	 */
-	bool unregisterToken(const TokenDescriptor *descr);
+	bool unregisterToken(TokenTypeId type);
+
+	/**
+	 * Returns the token that was registered under the given TokenTypeId id or
+	 *an
+	 * empty string if an invalid TokenTypeId id is given.
+	 *
+	 * @param type is the TokenTypeId id for which the corresponding token
+	 *string
+	 * should be returned.
+	 * @return the registered token string or an empty string if the given type
+	 * was invalid.
+	 */
+	std::string getTokenString(TokenTypeId type);
 
 	/**
 	 * Sets the whitespace mode.
@@ -173,17 +230,16 @@ public:
 	bool read(DynamicToken &token);
 
 	/**
-	 * TokenDescriptor representing an empty token.
-	 */
-	static const *TokenDescriptor Empty;
-
-	/**
-	 * TokenDescriptor representing generic text.
+	 * The peek method does not advance the read position of the char reader,
+	 * but reads the next token from the current char reader peek position.
+	 *
+	 * @param token is a reference at the token instance into which the Token
+	 * information should be written.
+	 * @return true if a token could be read, false if the end of the stream
+	 * has been reached.
 	 */
-	static const *TokenDescriptor Text;
-
+	bool peek(DynamicToken &token);
 };
-
 }
 
 #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
-- 
cgit v1.2.3