/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file Tokenizer.hpp
*
* Tokenizer that can be reconfigured at runtime. It is used for parsing the
* plain text format.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
#include
#include
#include
#include
#include
#include "TokenTrie.hpp"
namespace ousia {
// Forward declarations
// CharReader is only used by reference in this header (Tokenizer::read,
// Tokenizer::peek and Tokenizer::next), so its full definition is not needed
// here.
class CharReader;
/**
* The Token structure describes a token discovered by the Tokenizer.
*/
struct Token {
    /**
     * Identifier of the token type this token belongs to.
     */
    TokenTypeId type;

    /**
     * The string that was matched for this token.
     */
    std::string content;

    /**
     * Source location from which the content was extracted.
     */
    SourceLocation location;

    /**
     * Default constructor. Sets the type to EmptyToken; content and location
     * are left default-initialized.
     */
    Token() : type(EmptyToken) {}

    /**
     * Creates a token that only carries a type. Content and location are left
     * default-initialized.
     *
     * @param type is the id corresponding to the type of the token.
     */
    Token(TokenTypeId type) : type(type) {}

    /**
     * Creates a fully specified token.
     *
     * @param type is the id of the token type.
     * @param content is the string content that has been extracted.
     * @param location is the location of the extracted string content in the
     * source file.
     */
    Token(TokenTypeId type, const std::string &content, SourceLocation location)
        : type(type), content(content), location(location)
    {
    }

    /**
     * Accessor for the location field. Its presence allows tokens to be
     * passed directly as parameter to Logger or LoggableException instances.
     *
     * @return a const reference to the location field.
     */
    const SourceLocation &getLocation() const
    {
        return this->location;
    }
};
/**
* The Tokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows tokens to be registered and unregistered while parsing
* and the handling of whitespace characters to be modified. Note that the
* Tokenizer always tries to extract the longest possible token from the
* input.
*/
class Tokenizer {
private:
    /**
     * Internally used token trie. This object holds all registered tokens.
     */
    TokenTrie trie;

    /**
     * Whitespace mode currently in effect (see setWhitespaceMode and
     * getWhitespaceMode).
     */
    WhitespaceMode whitespaceMode;

    /**
     * Vector containing the strings of all registered token types.
     */
    std::vector<std::string> tokens;

    /**
     * Next index in the tokens list at which to search for a new token id.
     */
    size_t nextTokenTypeId;

    /**
     * Templated function used internally to read the current token. The
     * function is templated in order to force code generation for all six
     * combinations of whitespace modes and reading/peeking.
     *
     * @tparam TextHandler is the type to be used for the textHandler instance.
     * @tparam read specifies whether the function should start from and advance
     * the read pointer of the char reader.
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is the token structure into which the token information
     * should be written.
     * @return false if the end of the stream has been reached, true otherwise.
     */
    template <typename TextHandler, bool read>
    bool next(CharReader &reader, Token &token);

public:
    /**
     * Constructor of the Tokenizer class.
     *
     * @param whitespaceMode specifies how whitespace should be handled.
     * Defaults to WhitespaceMode::COLLAPSE.
     */
    Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);

    /**
     * Registers the given string as a token. Returns the TokenTypeId that is
     * used to reference the newly created token.
     *
     * @param token is the token string that should be registered.
     * @return a unique identifier for the registered token or EmptyToken if
     * an error occurred.
     */
    TokenTypeId registerToken(const std::string &token);

    /**
     * Unregisters the token belonging to the given TokenTypeId.
     *
     * @param type is the token type that should be unregistered. The
     * TokenTypeId must have been returned by registerToken.
     * @return true if the operation was successful, false otherwise (e.g.
     * because the given token type was already unregistered).
     */
    bool unregisterToken(TokenTypeId type);

    /**
     * Returns the token string that was registered under the given
     * TokenTypeId, or an empty string if an invalid TokenTypeId is given.
     *
     * @param type is the TokenTypeId for which the corresponding token string
     * should be returned.
     * @return the registered token string or an empty string if the given type
     * was invalid.
     */
    std::string getTokenString(TokenTypeId type);

    /**
     * Sets the whitespace mode.
     *
     * @param mode defines how whitespace should be treated in text tokens.
     */
    void setWhitespaceMode(WhitespaceMode mode);

    /**
     * Returns the current value of the whitespace mode.
     *
     * @return the whitespace mode.
     */
    WhitespaceMode getWhitespaceMode();

    /**
     * Reads a new token from the CharReader and stores it in the given
     * Token instance.
     *
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is a reference to the token instance into which the Token
     * information should be written.
     * @return true if a token could be read, false if the end of the stream
     * has been reached.
     */
    bool read(CharReader &reader, Token &token);

    /**
     * The peek method does not advance the read position of the char reader,
     * but reads the next token from the current char reader peek position.
     *
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is a reference to the token instance into which the Token
     * information should be written.
     * @return true if a token could be read, false if the end of the stream
     * has been reached.
     */
    bool peek(CharReader &reader, Token &token);
};
}
#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */