diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-03-03 15:08:18 +0100 |
---|---|---|
committer | Andreas Stöckel <andreas@somweyr.de> | 2015-03-03 15:08:18 +0100 |
commit | 466ff991bcfad76d78100193aacbfaf74d542b26 (patch) | |
tree | dafdb41ec766e83c6e37a8b9865e6ef454ff4def /src/core/parser/utils/Tokenizer.hpp | |
parent | b5cdca0331117ad3834b61eadd94ab3fcb6d2fba (diff) | |
parent | fb8d4cdf01909b61e4e5d0806ec6de178ff0058c (diff) |
Storing type and name in the HandlerData once again, using a Token
Conflicts:
application/src/core/parser/stack/Callbacks.hpp
Diffstat (limited to 'src/core/parser/utils/Tokenizer.hpp')
-rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 142 |
1 files changed, 84 insertions, 58 deletions
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -28,44 +28,80 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include <set> +#include <cstdint> #include <string> #include <vector> #include <core/common/Location.hpp> -#include <core/common/Whitespace.hpp> +#include <core/common/Token.hpp> -#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; +class TokenizedData; /** * The Tokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * Tokenizer always tries to extract the longest possible token from the - * tokenizer. + * CharReader. It allows to register and unregister tokens while parsing. Note + * that the Tokenizer always tries to extract the longest possible token from + * the tokenizer. Tokens can be registered as primary or non-primary token. If + * a Token is registered as a primary token, it is returned as a single Token + * instance if it occurs. In the non-primary case the token is returned as part + * of a segmented TokenizedData instance. */ class Tokenizer { -private: +public: /** - * Internally used token trie. This object holds all registered tokens. + * Internally used structure describing a registered token. */ - TokenTrie trie; + struct TokenDescriptor { + /** + * String describing the token. + */ + std::string string; + + /** + * Set to true if this token is primary. + */ + bool primary; + + /** + * Constructor of the TokenDescriptor class. + * + * @param string is the string representation of the registered token. + * @param primary specifies whether the token is a primary token that + * should be returned as a single token, or a secondary token, that + * should be returned as part of TokenizedData. + */ + TokenDescriptor(const std::string &string, bool primary) + : string(string), primary(primary) + { + } + + /** + * Default constructor. + */ + TokenDescriptor() : primary(false) {} + + /** + * Returns true if the TokenDescriptor represents a valid token. + */ + bool valid() { return !string.empty(); } + }; +private: /** - * Flag defining whether whitespaces should be preserved or not. + * Internally used token trie. This object holds all registered tokens. */ - WhitespaceMode whitespaceMode; + TokenTrie trie; /** * Vector containing all registered token types. */ - std::vector<std::string> tokens; + std::vector<TokenDescriptor> tokens; /** * Next index in the tokens list where to search for a new token id. @@ -74,90 +110,78 @@ private: /** * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. + * function is templated in order to force optimized code generation for + * both reading and peeking. * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. + * @tparam read specifies whether the method should read the token or just + * peek. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return false if the end of the stream has been reached, true otherwise. */ - template <typename TextHandler, bool read> - bool next(CharReader &reader, Token &token); + template <bool read> + bool next(CharReader &reader, Token &token, TokenizedData &data); public: /** * Constructor of the Tokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. */ - Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(); /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. + * Registers the given string as a token. Returns a unique identifier + * describing the registered token. * * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if + * @param primary specifies whether the token is a primary token -- if true, + * the token will be returned as a single, standalone token. Otherwise the + * token will be returned as part of a "TokenizedData" structure. + * @return a unique identifier for the registered token or Tokens::Empty if * an error occured. */ - TokenId registerToken(const std::string &token); + TokenId registerToken(const std::string &token, bool primary = true); /** * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The - *TokenId - * must have been returned by registerToken. + * TokenId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). + * because the token with the given TokenId was already unregistered). */ - bool unregisterToken(TokenId type); + bool unregisterToken(TokenId id); /** * Returns the token that was registered under the given TokenId id or - *an - * empty string if an invalid TokenId id is given. + * an empty string if an invalid TokenId id is given. * - * @param type is the TokenId id for which the corresponding token - *string + * @param id is the TokenId for which the corresponding TokenDescriptor * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. + * @return the registered TokenDescriptor or an invalid TokenDescriptor if + * the given TokenId is invalid. */ - WhitespaceMode getWhitespaceMode(); + const TokenDescriptor& lookupToken(TokenId id) const; /** * Reads a new token from the CharReader and stores it in the given - * Token instance. + * Token instance. If the token has the id Tokens::Data, use the "getData" + * method to fetch a reference at the underlying TokenizedData instance + * storing the data. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } |