diff options
Diffstat (limited to 'src/plugins/plain/DynamicTokenizer.hpp')
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 160 |
1 files changed, 107 insertions, 53 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index f7fef13..0b4dd39 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -28,34 +28,63 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#include <set> +#include <string> +#include <vector> + #include <core/common/Location.hpp> +#include "TokenTrie.hpp" + namespace ousia { // Forward declarations class CharReader; -class TokenDescriptor; /** * The DynamicToken structure describes a token discovered by the Tokenizer. */ struct DynamicToken { /** - * Pointer pointing at the TokenDescriptor instance this token corresponds - * to. May be one of the special TokenDescriptors defined as static members - * of the DynamicTokenizer class. + * Id of the type of this token. */ - TokenDescriptor const *descriptor; + TokenTypeId type; /** * String that was matched. */ - std::string str; + std::string content; /** * Location from which the string was extracted. */ SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} }; /** @@ -64,33 +93,35 @@ struct DynamicToken { */ enum class WhitespaceMode { /** - * Preserves all whitespaces as they are found in the source file. - */ + * Preserves all whitespaces as they are found in the source file. + */ PRESERVE, /** - * Trims whitespace at the beginning and the end of the found text. - */ + * Trims whitespace at the beginning and the end of the found text. + */ TRIM, /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ COLLAPSE }; /** * The DynamicTokenizer is used to extract tokens and chunks of text from a * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer. */ class DynamicTokenizer { private: /** - * Reference at the char reader. + * Internally used token trie. This object holds all registered tokens. */ - CharReader &reader; + TokenTrie trie; /** * Flag defining whether whitespaces should be preserved or not. @@ -98,53 +129,73 @@ private: WhitespaceMode whitespaceMode; /** - * Vector containing all registered token descriptors. + * Vector containing all registered token types. */ - std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + std::vector<std::string> tokens; -public: /** - * Constructor of the DynamicTokenizer class. + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. * - * @param reader is the CharReader that should be used for reading the - * tokens. - * @param preserveWhitespaces should be set to true if all whitespaces - * should be preserved (for preformated environments). - */ - DynamicTokenizer(CharReader &reader) - : reader(reader), - preserveWhitespaces(preserveWhitespaces), - location(reader.getSourceId()), - empty(true), - hasWhitespace(false) - { - } + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(CharReader &reader, DynamicToken &token); +public: /** - * Destructor of the DynamicTokenizer class. + * Constructor of the DynamicTokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. */ - ~DynamicTokenizer(); + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); /** * Registers the given string as a token. Returns a const pointer at a * TokenDescriptor that will be used to reference the newly created token. * * @param token is the token string that should be registered. - * @return a pointer at a TokenDescriptor which is representative for the - * newly registered token. Returns nullptr if a token with this string - * was already registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. */ - const TokenDescriptor* registerToken(const std::string &token); + TokenTypeId registerToken(const std::string &token); /** - * Unregisters the token belonging to the given TokenDescriptor. + * Unregisters the token belonging to the given TokenTypeId. * - * @param descr is a TokenDescriptor that was previously returned by - * registerToken. + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. * because the given TokenDescriptor was already unregistered). */ - bool unregisterToken(const TokenDescriptor *descr); + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); /** * Sets the whitespace mode. @@ -165,25 +216,28 @@ public: * Reads a new token from the CharReader and stores it in the given * DynamicToken instance. * + * @param reader is the CharReader instance from which the data should be + * read. * @param token is a reference at the token instance into which the Token * information should be written. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(DynamicToken &token); + bool read(CharReader &reader, DynamicToken &token); /** - * TokenDescriptor representing an empty token. - */ - static const *TokenDescriptor Empty; - - /** - * TokenDescriptor representing generic text. + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. */ - static const *TokenDescriptor Text; - + bool peek(CharReader &reader, DynamicToken &token); }; - } #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ |