/*
    Ousía
    Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file Tokenizer.hpp
 *
 * Tokenizer that can be reconfigured at runtime and is used for parsing the
 * plain text format.
 *
 * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
 */

#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_

#include <cstddef>
#include <string>
#include <vector>

#include "TokenTrie.hpp"

namespace ousia {

// Forward declarations
class CharReader;
class TokenizedData;

/**
 * The Tokenizer is used to extract tokens and chunks of text from a
 * CharReader. It allows tokens to be registered and unregistered while
 * parsing. Note that the Tokenizer always tries to extract the longest
 * possible token from the input. Tokens can be registered as primary or
 * non-primary tokens. If a token is registered as a primary token, it is
 * returned as a single Token instance whenever it occurs. In the non-primary
 * case the token is returned as part of a segmented TokenizedData instance.
 */
class Tokenizer {
public:
	/**
	 * Internally used structure describing a registered token.
	 */
	struct TokenDescriptor {
		/**
		 * String describing the token.
		 */
		std::string string;

		/**
		 * Set to true if this token is primary.
		 */
		bool primary;

		/**
		 * Constructor of the TokenDescriptor class.
		 *
		 * @param string is the string representation of the registered token.
		 * @param primary specifies whether the token is a primary token that
		 * should be returned as a single token, or a secondary token that
		 * should be returned as part of TokenizedData.
		 */
		TokenDescriptor(const std::string &string, bool primary)
		    : string(string), primary(primary)
		{
		}

		/**
		 * Default constructor.
		 */
		TokenDescriptor() : primary(false) {}

		/**
		 * Returns true if the TokenDescriptor represents a valid token.
		 */
		bool valid() { return !string.empty(); }
	};

private:
	/**
	 * Internally used token trie. This object holds all registered tokens.
	 */
	TokenTrie trie;

	/**
	 * Vector containing the descriptors of all registered tokens.
	 */
	std::vector<TokenDescriptor> tokens;

	/**
	 * Next index in the tokens list at which to search for a new token id.
	 */
	size_t nextTokenId;

	/**
	 * Templated function used internally to read the current token. The
	 * function is templated in order to force optimized code generation for
	 * both reading and peeking.
	 *
	 * @tparam read specifies whether the method should read the token or just
	 * peek.
	 * @param reader is the CharReader instance from which the data should be
	 * read.
	 * @param token is the token structure into which the token information
	 * should be written.
	 * @param data is a reference to the TokenizedData instance to which the
	 * token information should be appended.
	 * @return false if the end of the stream has been reached, true otherwise.
	 */
	template <bool read>
	bool next(CharReader &reader, Token &token, TokenizedData &data) const;

public:
	/**
	 * Constructor of the Tokenizer class.
	 */
	Tokenizer();

	/**
	 * Registers the given string as a token. Returns a unique identifier
	 * describing the registered token.
	 *
	 * @param token is the token string that should be registered.
	 * @param primary specifies whether the token is a primary token. If true,
	 * the token will be returned as a single, standalone token. Otherwise the
	 * token will be returned as part of a "TokenizedData" structure.
	 * @return a unique identifier for the registered token or Tokens::Empty if
	 * an error occurred.
	 */
	TokenId registerToken(const std::string &token, bool primary = true);

	/**
	 * Unregisters the token belonging to the given TokenId.
	 *
	 * @param id is the TokenId of the token that should be unregistered. The
	 * TokenId must have been returned by registerToken.
	 * @return true if the operation was successful, false otherwise (e.g.
	 * because the token with the given TokenId was already unregistered).
	 */
	bool unregisterToken(TokenId id);

	/**
	 * Returns the TokenDescriptor that was registered under the given TokenId
	 * or an invalid TokenDescriptor if an invalid TokenId is given.
	 *
	 * @param id is the TokenId for which the corresponding TokenDescriptor
	 * should be returned.
	 * @return the registered TokenDescriptor or an invalid TokenDescriptor if
	 * the given TokenId is invalid.
	 */
	const TokenDescriptor &lookupToken(TokenId id) const;

	/**
	 * Reads a new token from the CharReader and stores it in the given
	 * Token instance. If the token has the id Tokens::Data, use the "getData"
	 * method to fetch a reference to the underlying TokenizedData instance
	 * storing the data.
	 *
	 * @param reader is the CharReader instance from which the data should be
	 * read.
	 * @param token is a reference to the Token instance into which the token
	 * information should be written.
	 * @param data is a reference to the TokenizedData instance to which the
	 * token information should be appended.
	 * @return true if a token could be read, false if the end of the stream
	 * has been reached.
	 */
	bool read(CharReader &reader, Token &token, TokenizedData &data) const;

	/**
	 * The peek method does not advance the read position of the char reader,
	 * but reads the next token from the current char reader peek position.
	 *
	 * @param reader is the CharReader instance from which the data should be
	 * read.
	 * @param token is a reference to the Token instance into which the token
	 * information should be written.
	 * @param data is a reference to the TokenizedData instance to which the
	 * token information should be appended.
	 * @return true if a token could be read, false if the end of the stream
	 * has been reached.
	 */
	bool peek(CharReader &reader, Token &token, TokenizedData &data) const;
};
}

#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
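
/*
 * Usage sketch (not part of the original header). It illustrates the intended
 * interplay of registerToken(), read() and TokenizedData as described in the
 * documentation above. The CharReader constructor, the Token member "id" and
 * the TokenizedData default constructor are assumptions and may differ from
 * the actual API.
 *
 *     Tokenizer tokenizer;
 *
 *     // "**" is registered as a primary token and is therefore returned as a
 *     // single, standalone Token instance
 *     TokenId star = tokenizer.registerToken("**");
 *
 *     // Non-primary tokens are reported as segments inside TokenizedData
 *     TokenId indent = tokenizer.registerToken("\t", false);
 *
 *     CharReader reader{"Hello **world**"};  // assumed constructor
 *     Token token;
 *     TokenizedData data;                    // assumed default constructor
 *     while (tokenizer.read(reader, token, data)) {
 *         if (token.id == star) {
 *             // handle the primary "**" token
 *         } else if (token.id == Tokens::Data) {
 *             // plain text (and non-primary tokens) are stored in "data"
 *         }
 *     }
 */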