/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /** * @file Tokenizer.hpp * * Tokenizer that can be reconfigured at runtime used for parsing the plain * text format. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ #include #include #include #include #include #include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; /** * The Tokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and * to modify the handling of whitespace characters. Note that the * Tokenizer always tries to extract the longest possible token from the * tokenizer. */ class Tokenizer { private: /** * Internally used token trie. This object holds all registered tokens. */ TokenTrie trie; /** * Flag defining whether whitespaces should be preserved or not. */ WhitespaceMode whitespaceMode; /** * Vector containing all registered token types. */ std::vector tokens; /** * Next index in the tokens list where to search for a new token id. */ size_t nextTokenId; /** * Templated function used internally to read the current token. The * function is templated in order to force code generation for all six * combiations of whitespace modes and reading/peeking. * * @tparam TextHandler is the type to be used for the textHandler instance. * @tparam read specifies whether the function should start from and advance * the read pointer of the char reader. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. * @return false if the end of the stream has been reached, true otherwise. */ template bool next(CharReader &reader, Token &token); public: /** * Constructor of the Tokenizer class. * * @param whitespaceMode specifies how whitespace should be handled. */ Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer at a * TokenDescriptor that will be used to reference the newly created token. * * @param token is the token string that should be registered. * @return a unique identifier for the registered token or EmptyToken if * an error occured. */ TokenId registerToken(const std::string &token); /** * Unregisters the token belonging to the given TokenId. * * @param type is the token type that should be unregistered. The *TokenId * must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. * because the given TokenDescriptor was already unregistered). */ bool unregisterToken(TokenId type); /** * Returns the token that was registered under the given TokenId id or *an * empty string if an invalid TokenId id is given. * * @param type is the TokenId id for which the corresponding token *string * should be returned. * @return the registered token string or an empty string if the given type * was invalid. */ std::string getTokenString(TokenId type); /** * Sets the whitespace mode. * * @param whitespaceMode defines how whitespace should be treated in text * tokens. */ void setWhitespaceMode(WhitespaceMode mode); /** * Returns the current value of the whitespace mode. * * @return the whitespace mode. */ WhitespaceMode getWhitespaceMode(); /** * Reads a new token from the CharReader and stores it in the given * Token instance. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. */ bool read(CharReader &reader, Token &token); /** * The peek method does not advance the read position of the char reader, * but reads the next token from the current char reader peek position. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. */ bool peek(CharReader &reader, Token &token); }; } #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */