Diffstat (limited to 'src/core/parser/utils')
 -rw-r--r--  src/core/parser/utils/TokenTrie.cpp | 119
 -rw-r--r--  src/core/parser/utils/TokenTrie.hpp | 150
 -rw-r--r--  src/core/parser/utils/Tokenizer.cpp | 381
 -rw-r--r--  src/core/parser/utils/Tokenizer.hpp | 231
4 files changed, 881 insertions(+), 0 deletions(-)
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
new file mode 100644
index 0000000..4a0430b
--- /dev/null
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -0,0 +1,119 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+/* Class TokenTrie::Node */
+
+TokenTrie::Node::Node() : type(EmptyToken) {}
+
+/* Class TokenTrie */
+
+bool TokenTrie::registerToken(const std::string &token,
+                              TokenTypeId type) noexcept
+{
+    // Abort if the token is empty -- this would taint the root node
+    if (token.empty()) {
+        return false;
+    }
+
+    // Iterate over each character in the given string and insert them as
+    // (new) nodes
+    Node *node = &root;
+    for (size_t i = 0; i < token.size(); i++) {
+        // Insert a new node if this one does not exist
+        const char c = token[i];
+        auto it = node->children.find(c);
+        if (it == node->children.end()) {
+            it = node->children.emplace(c, std::make_shared<Node>()).first;
+        }
+        node = it->second.get();
+    }
+
+    // If the resulting node already has a type set, the token is already
+    // registered -- abort
+    if (node->type != EmptyToken) {
+        return false;
+    }
+
+    // Otherwise just set the type to the given type
+    node->type = type;
+    return true;
+}
+
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
+{
+    // We cannot remove empty tokens as we need to access the first character
+    // upfront
+    if (token.empty()) {
+        return false;
+    }
+
+    // First pass -- find the topmost node from which the path belonging to
+    // this token may safely be deleted
+    Node *subtreeRoot = &root;
+    char subtreeKey = token[0];
+    Node *node = &root;
+    for (size_t i = 0; i < token.size(); i++) {
+        // Go to the next node, abort if the tree ends unexpectedly
+        auto it = node->children.find(token[i]);
+        if (it == node->children.end()) {
+            return false;
+        }
+
+        // Reset the subtree anchor if this node is shared with another token
+        node = it->second.get();
+        if ((node->type != EmptyToken || node->children.size() > 1) &&
+            (i + 1 != token.size())) {
+            subtreeRoot = node;
+            subtreeKey = token[i + 1];
+        }
+    }
+
+    // If the node type is already EmptyToken, no token ends here and there
+    // is nothing to unregister
+    if (node->type == EmptyToken) {
+        return false;
+    }
+
+    // If the target node has children, we cannot delete the subtree. Set the
+    // type to EmptyToken instead
+    if (!node->children.empty()) {
+        node->type = EmptyToken;
+        return true;
+    }
+
+    // If we end up here, we can safely delete the complete subtree
+    subtreeRoot->children.erase(subtreeKey);
+    return true;
+}
+
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
+{
+    Node const *node = &root;
+    for (size_t i = 0; i < token.size(); i++) {
+        const char c = token[i];
+        auto it = node->children.find(c);
+        if (it == node->children.end()) {
+            return EmptyToken;
+        }
+        node = it->second.get();
+    }
+    return node->type;
+}
+}
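
The register/unregister semantics above compose as follows -- a minimal
sketch, not part of this commit, assuming only the TokenTrie.hpp header
shown below (the token type ids 1 and 2 are arbitrary):

    #include <cassert>
    #include "TokenTrie.hpp"

    using namespace ousia;

    int main()
    {
        TokenTrie trie;
        assert(trie.registerToken("<<", 1));
        assert(!trie.registerToken("<<", 2));  // already registered -> false
        assert(trie.registerToken("<", 2));    // prefixes may be tokens, too
        assert(trie.hasToken("<<") == 1);
        assert(trie.hasToken("<=") == EmptyToken);
        assert(trie.unregisterToken("<"));     // node is kept, type cleared
        assert(trie.hasToken("<") == EmptyToken);
        assert(trie.hasToken("<<") == 1);      // "<<" remains registered
        return 0;
    }
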
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
new file mode 100644
index 0000000..36c2ffa
--- /dev/null
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -0,0 +1,150 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file TokenTrie.hpp
+ *
+ * Class representing a token trie that can be updated dynamically.
+ *
+ * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace ousia {
+
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
+
+/**
+ * Token type id reserved for marking non-tokens.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Token type id which represents a text token.
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to efficiently identify the
+ * longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
+ *
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two
+ * and three. Then the token trie would look like this:
+ *
+ * \code{*.txt}
+ *        ~ (0)
+ *       /     \
+ *      a (2)  b (0)
+ *      |      |
+ *      a (0)  a (0)
+ *      |      |
+ *      b (1)  c (3)
+ * \endcode
+ *
+ * Where the number indicates the corresponding token type id and zero marks
+ * nodes at which no token ends.
+ */
+class TokenTrie {
+public:
+    /**
+     * Structure used to build the node tree.
+     */
+    struct Node {
+        /**
+         * Type used for the child map.
+         */
+        using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
+
+        /**
+         * Map from single characters to the corresponding child nodes.
+         */
+        ChildMap children;
+
+        /**
+         * Id of the token type that ends at this node. Set to EmptyToken if
+         * no token is attached to this node.
+         */
+        TokenTypeId type;
+
+        /**
+         * Default constructor, initializes the type field with EmptyToken.
+         */
+        Node();
+    };
+
+private:
+    /**
+     * Root node of the internal token trie.
+     */
+    Node root;
+
+public:
+    /**
+     * Registers a token containing the given string. Returns false if the
+     * token already exists, true otherwise.
+     *
+     * @param token is the character sequence that should be registered as
+     * token.
+     * @param type is the token type id that should be set for this token.
+     * @return true if the operation is successful, false otherwise.
+     */
+    bool registerToken(const std::string &token, TokenTypeId type) noexcept;
+
+    /**
+     * Unregisters the token from the token trie. Returns true if the token
+     * was unregistered successfully, false otherwise.
+     *
+     * @param token is the character sequence that should be unregistered.
+     * @return true if the operation was successful, false otherwise.
+     */
+    bool unregisterToken(const std::string &token) noexcept;
+
+    /**
+     * Checks whether the given token exists within the TokenTrie. This
+     * function is mostly thought for debugging and unit testing.
+     *
+     * @param token is the character sequence that should be searched.
+     * @return the attached token type id or EmptyToken if the given token is
+     * not found.
+     */
+    TokenTypeId hasToken(const std::string &token) const noexcept;
+
+    /**
+     * Returns a pointer to the root node to be used for traversing the token
+     * trie.
+     *
+     * @return a pointer to the root node.
+     */
+    const Node *getRoot() const noexcept { return &root; }
+};
+}
+
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
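
getRoot() exists so that callers such as the Tokenizer can drive the
longest-match walk themselves. A simplified sketch of that walk over an
in-memory string (an assumption made for illustration; the real Tokenizer
below streams characters from a CharReader instead):

    #include <string>
    #include "TokenTrie.hpp"

    using namespace ousia;

    // Returns the type of the longest registered token that is a prefix of
    // "input", or EmptyToken if no registered token matches.
    static TokenTypeId longestMatch(const TokenTrie &trie,
                                    const std::string &input)
    {
        const TokenTrie::Node *node = trie.getRoot();
        TokenTypeId best = EmptyToken;
        for (char c : input) {
            auto it = node->children.find(c);
            if (it == node->children.end()) {
                break;  // the path cannot be extended any further
            }
            node = it->second.get();
            if (node->type != EmptyToken) {
                best = node->type;  // remember the longest match so far
            }
        }
        return best;
    }
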
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
new file mode 100644
index 0000000..3c8177d
--- /dev/null
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -0,0 +1,381 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <memory>
+#include <vector>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Exceptions.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
+
+#include "Tokenizer.hpp"
+
+namespace ousia {
+
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+    /**
+     * Token that was matched.
+     */
+    Token token;
+
+    /**
+     * Current length of the data within the text handler. The text buffer
+     * needs to be trimmed to this length if this token matches.
+     */
+    size_t textLength;
+
+    /**
+     * End location of the current text handler. This location needs to be
+     * used for the text token that is emitted before the actual token.
+     */
+    size_t textEnd;
+
+    /**
+     * Constructor of the TokenMatch class.
+     */
+    TokenMatch() : textLength(0), textEnd(0) {}
+
+    /**
+     * Returns true if this TokenMatch instance actually represents a match.
+     */
+    bool hasMatch() const { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+    /**
+     * Current node within the token trie.
+     */
+    TokenTrie::Node const *node;
+
+    /**
+     * Start offset within the source file.
+     */
+    size_t start;
+
+    /**
+     * Current length of the data within the text handler. The text buffer
+     * needs to be trimmed to this length if this token matches.
+     */
+    size_t textLength;
+
+    /**
+     * End location of the current text handler. This location needs to be
+     * used for the text token that is emitted before the actual token.
+     */
+    size_t textEnd;
+
+public:
+    /**
+     * Constructor of the TokenLookup class.
+     *
+     * @param node is the current node.
+     * @param start is the start position.
+     * @param textLength is the text buffer length of the previous text token.
+     * @param textEnd is the current end location of the previous text token.
+     */
+    TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+                size_t textEnd)
+        : node(node), start(start), textLength(textLength), textEnd(textEnd)
+    {
+    }
+
+    /**
+     * Tries to extend the current path in the token trie with the given
+     * character. If a complete token is matched, stores this match in the
+     * given TokenMatch instance (in case it is longer than any previous
+     * match).
+     *
+     * @param c is the character that should be appended to the current
+     * prefix.
+     * @param lookups is a list to which new TokenLookup instances are added
+     * -- which could potentially be expanded in the next iteration.
+     * @param match is the TokenMatch instance to which the matching token
+     * should be written.
+     * @param tokens is a reference at the internal token list of the
+     * Tokenizer.
+     * @param end is the end byte offset of the current character.
+     * @param sourceId is the source id of this file.
+     */
+    void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+                 const std::vector<std::string> &tokens, SourceOffset end,
+                 SourceId sourceId)
+    {
+        // Check whether the current token path can be continued with the
+        // given character, abort if not
+        auto it = node->children.find(c);
+        if (it == node->children.end()) {
+            return;
+        }
+
+        // Check whether the new node represents a complete token and whether
+        // it is longer than the current token. If yes, replace the current
+        // token.
+        node = it->second.get();
+        if (node->type != EmptyToken) {
+            const std::string &str = tokens[node->type];
+            size_t len = str.size();
+            if (len > match.token.content.size()) {
+                match.token = Token{node->type, str, {sourceId, start, end}};
+                match.textLength = textLength;
+                match.textEnd = textEnd;
+            }
+        }
+
+        // If this state can possibly be advanced, store it in the states
+        // list.
+        if (!node->children.empty()) {
+            lookups.emplace_back(*this);
+        }
+    }
+};
+
+/**
+ * Transforms the given token match into a text token containing the
+ * extracted text.
+ *
+ * @param handler is the WhitespaceHandler containing the collected data.
+ * @param match is the token match into whose token the text should be
+ * written.
+ * @param sourceId is the source id of the underlying file.
+ */
+static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
+                           SourceId sourceId)
+{
+    if (match.hasMatch()) {
+        match.token.content =
+            std::string{handler.textBuf.data(), match.textLength};
+        match.token.location =
+            SourceLocation{sourceId, handler.textStart, match.textEnd};
+    } else {
+        match.token.content = handler.toString();
+        match.token.location =
+            SourceLocation{sourceId, handler.textStart, handler.textEnd};
+    }
+    match.token.type = TextToken;
+}
+}
+
+/* Class Tokenizer */
+
+Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
+    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
+{
+}
+
+template <typename TextHandler, bool read>
+bool Tokenizer::next(CharReader &reader, Token &token)
+{
+    // If we're in the read mode, reset the char reader peek position to the
+    // current read position
+    if (read) {
+        reader.resetPeek();
+    }
+
+    // Prepare the lookups in the token trie
+    const TokenTrie::Node *root = trie.getRoot();
+    TokenMatch match;
+    std::vector<TokenLookup> lookups;
+    std::vector<TokenLookup> nextLookups;
+
+    // Instantiate the text handler
+    TextHandler textHandler;
+
+    // Peek characters from the reader and try to advance the current token
+    // trie cursor
+    char c;
+    size_t charStart = reader.getPeekOffset();
+    const SourceId sourceId = reader.getSourceId();
+    while (reader.peek(c)) {
+        const size_t charEnd = reader.getPeekOffset();
+        const size_t textLength = textHandler.textBuf.size();
+        const size_t textEnd = textHandler.textEnd;
+
+        // If we do not have a match yet, start a new lookup from the root
+        if (!match.hasMatch()) {
+            TokenLookup{root, charStart, textLength, textEnd}.advance(
+                c, nextLookups, match, tokens, charEnd, sourceId);
+        }
+
+        // Try to advance all other lookups with the new character
+        for (TokenLookup &lookup : lookups) {
+            lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+        }
+
+        // We have found a token and there are no more states to advance or
+        // the text handler has found something -- abort to return the new
+        // token
+        if (match.hasMatch()) {
+            if ((nextLookups.empty() || textHandler.hasText())) {
+                break;
+            }
+        } else {
+            // Record all incoming characters
+            textHandler.append(c, charStart, charEnd);
+        }
+
+        // Swap the lookups and the nextLookups list
+        lookups = std::move(nextLookups);
+        nextLookups.clear();
+
+        // Advance the offset
+        charStart = charEnd;
+    }
+
+    // If we found text, emit that text
+    if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
+        buildTextToken(textHandler, match, sourceId);
+    }
+
+    // Move the read/peek cursor to the end of the token, abort if an error
+    // happens while doing so
+    if (match.hasMatch()) {
+        // Make sure we have a valid location
+        if (match.token.location.getEnd() == InvalidSourceOffset) {
+            throw OusiaException{"Token end position offset out of range"};
+        }
+
+        // Seek to the end of the current token
+        const size_t end = match.token.location.getEnd();
+        if (read) {
+            reader.seek(end);
+        } else {
+            reader.seekPeekCursor(end);
+        }
+        token = match.token;
+    } else {
+        token = Token{};
+    }
+    return match.hasMatch();
+}
+
+bool Tokenizer::read(CharReader &reader, Token &token)
+{
+    switch (whitespaceMode) {
+        case WhitespaceMode::PRESERVE:
+            return next<PreservingWhitespaceHandler, true>(reader, token);
+        case WhitespaceMode::TRIM:
+            return next<TrimmingWhitespaceHandler, true>(reader, token);
+        case WhitespaceMode::COLLAPSE:
+            return next<CollapsingWhitespaceHandler, true>(reader, token);
+    }
+    return false;
+}
+
+bool Tokenizer::peek(CharReader &reader, Token &token)
+{
+    switch (whitespaceMode) {
+        case WhitespaceMode::PRESERVE:
+            return next<PreservingWhitespaceHandler, false>(reader, token);
+        case WhitespaceMode::TRIM:
+            return next<TrimmingWhitespaceHandler, false>(reader, token);
+        case WhitespaceMode::COLLAPSE:
+            return next<CollapsingWhitespaceHandler, false>(reader, token);
+    }
+    return false;
+}
+
+TokenTypeId Tokenizer::registerToken(const std::string &token)
+{
+    // Abort if an empty token should be registered
+    if (token.empty()) {
+        return EmptyToken;
+    }
+
+    // Search for a free slot in the tokens list
+    TokenTypeId type = EmptyToken;
+    for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+        if (tokens[i].empty()) {
+            tokens[i] = token;
+            type = i;
+            break;
+        }
+    }
+
+    // No free slot was found, add a new one -- make sure we do not
+    // overwrite the special token type handles
+    if (type == EmptyToken) {
+        type = tokens.size();
+        if (type == TextToken || type == EmptyToken) {
+            throw OusiaException{"Token type ids depleted!"};
+        }
+        tokens.emplace_back(token);
+    }
+    nextTokenTypeId = type + 1;
+
+    // Try to register the token in the trie -- if this fails, remove it
+    // from the tokens list
+    if (!trie.registerToken(token, type)) {
+        tokens[type] = std::string{};
+        nextTokenTypeId = type;
+        return EmptyToken;
+    }
+    return type;
+}
+
+bool Tokenizer::unregisterToken(TokenTypeId type)
+{
+    // Unregister the token from the trie, abort if an invalid type is given
+    if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+        tokens[type] = std::string{};
+        nextTokenTypeId = type;
+        return true;
+    }
+    return false;
+}
+
+std::string Tokenizer::getTokenString(TokenTypeId type)
+{
+    if (type < tokens.size()) {
+        return tokens[type];
+    }
+    return std::string{};
+}
+
+void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+{
+    whitespaceMode = mode;
+}
+
+WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
+
+/* Explicitly instantiate all possible instantiations of the "next" member
+   function */
+template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
+    CharReader &reader, Token &token);
+template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
+    CharReader &reader, Token &token);
+}
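
The match-replacement logic in advance() means that the longest registered
token always wins. A hedged sketch of the resulting behaviour (assuming, as
in the unit tests, that a CharReader can be constructed from a string):

    #include <cassert>

    #include <core/common/CharReader.hpp>
    #include "Tokenizer.hpp"

    using namespace ousia;

    void longestMatchDemo()
    {
        Tokenizer tokenizer;
        tokenizer.registerToken("-");
        TokenTypeId arrow = tokenizer.registerToken("->");

        CharReader reader{"a->b"};      // assumed string constructor
        Token token;

        tokenizer.read(reader, token);  // text token, content == "a"
        tokenizer.read(reader, token);  // the longer token "->" wins...
        assert(token.type == arrow);    // ...rather than "-" twice
        tokenizer.read(reader, token);  // text token, content == "b"
    }

Also worth illustrating is how registerToken() recycles ids freed by
unregisterToken() via the nextTokenTypeId hint:

    Tokenizer tokenizer;
    TokenTypeId a = tokenizer.registerToken("{");  // returns 0
    TokenTypeId b = tokenizer.registerToken("}");  // returns 1
    tokenizer.unregisterToken(a);                  // frees slot 0
    TokenTypeId c = tokenizer.registerToken("[");  // reuses slot 0, c == a
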
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
new file mode 100644
index 0000000..6b4e116
--- /dev/null
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -0,0 +1,231 @@
+/*
+    Ousía
+    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Tokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime, used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <core/common/Location.hpp>
+#include <core/common/Whitespace.hpp>
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
+/**
+ * The Token structure describes a token discovered by the Tokenizer.
+ */
+struct Token {
+    /**
+     * Id of the type of this token.
+     */
+    TokenTypeId type;
+
+    /**
+     * String that was matched.
+     */
+    std::string content;
+
+    /**
+     * Location from which the string was extracted.
+     */
+    SourceLocation location;
+
+    /**
+     * Default constructor.
+     */
+    Token() : type(EmptyToken) {}
+
+    /**
+     * Constructor of the Token struct.
+     *
+     * @param type is the id corresponding to the type of the token.
+     * @param content is the string content that has been extracted.
+     * @param location is the location of the extracted string content in the
+     * source file.
+     */
+    Token(TokenTypeId type, const std::string &content,
+          SourceLocation location)
+        : type(type), content(content), location(location)
+    {
+    }
+
+    /**
+     * Constructor of the Token struct, only initializes the token type.
+     *
+     * @param type is the id corresponding to the type of the token.
+     */
+    Token(TokenTypeId type) : type(type) {}
+
+    /**
+     * The getLocation function allows the tokens to be directly passed as
+     * parameter to Logger or LoggableException instances.
+     *
+     * @return a reference to the location field.
+     */
+    const SourceLocation &getLocation() const { return location; }
+};
+
+/**
+ * The Tokenizer is used to extract tokens and chunks of text from a
+ * CharReader. It allows tokens to be registered and unregistered while
+ * parsing and the handling of whitespace characters to be modified. Note
+ * that the Tokenizer always tries to extract the longest possible token
+ * from the input.
+ */
+class Tokenizer {
+private:
+    /**
+     * Internally used token trie. This object holds all registered tokens.
+     */
+    TokenTrie trie;
+
+    /**
+     * Flag defining whether whitespaces should be preserved or not.
+     */
+    WhitespaceMode whitespaceMode;
+
+    /**
+     * Vector containing all registered token strings.
+     */
+    std::vector<std::string> tokens;
+
+    /**
+     * Next index in the tokens list at which to search for a free token id.
+     */
+    size_t nextTokenTypeId;
+
+    /**
+     * Templated function used internally to read the current token. The
+     * function is templated in order to force code generation for all six
+     * combinations of whitespace modes and reading/peeking.
+     *
+     * @tparam TextHandler is the type to be used for the textHandler
+     * instance.
+     * @tparam read specifies whether the function should start from and
+     * advance the read pointer of the char reader.
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is the token structure into which the token information
+     * should be written.
+     * @return false if the end of the stream has been reached, true
+     * otherwise.
+     */
+    template <typename TextHandler, bool read>
+    bool next(CharReader &reader, Token &token);
+
+public:
+    /**
+     * Constructor of the Tokenizer class.
+     *
+     * @param whitespaceMode specifies how whitespace should be handled.
+     */
+    Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+
+    /**
+     * Registers the given string as a token. Returns a TokenTypeId that can
+     * be used to reference the newly created token.
+     *
+     * @param token is the token string that should be registered.
+     * @return a unique identifier for the registered token or EmptyToken if
+     * an error occurred.
+     */
+    TokenTypeId registerToken(const std::string &token);
+
+    /**
+     * Unregisters the token belonging to the given TokenTypeId.
+     *
+     * @param type is the token type that should be unregistered. The
+     * TokenTypeId must have been returned by registerToken.
+     * @return true if the operation was successful, false otherwise (e.g.
+     * because the given token type was already unregistered).
+     */
+    bool unregisterToken(TokenTypeId type);
+
+    /**
+     * Returns the token string that was registered under the given
+     * TokenTypeId or an empty string if an invalid TokenTypeId is given.
+     *
+     * @param type is the TokenTypeId for which the corresponding token
+     * string should be returned.
+     * @return the registered token string or an empty string if the given
+     * type was invalid.
+     */
+    std::string getTokenString(TokenTypeId type);
+
+    /**
+     * Sets the whitespace mode.
+     *
+     * @param mode defines how whitespace should be treated in text tokens.
+     */
+    void setWhitespaceMode(WhitespaceMode mode);
+
+    /**
+     * Returns the current value of the whitespace mode.
+     *
+     * @return the whitespace mode.
+     */
+    WhitespaceMode getWhitespaceMode();
+
+    /**
+     * Reads a new token from the CharReader and stores it in the given
+     * Token instance.
+     *
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is a reference to the token instance into which the Token
+     * information should be written.
+     * @return true if a token could be read, false if the end of the stream
+     * has been reached.
+     */
+    bool read(CharReader &reader, Token &token);
+
+    /**
+     * The peek method does not advance the read position of the char reader,
+     * but reads the next token from the current char reader peek position.
+     *
+     * @param reader is the CharReader instance from which the data should be
+     * read.
+     * @param token is a reference to the token instance into which the Token
+     * information should be written.
+     * @return true if a token could be read, false if the end of the stream
+     * has been reached.
+     */
+    bool peek(CharReader &reader, Token &token);
+};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
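
Finally, the public interface in one end-to-end sketch (again assuming a
string-backed CharReader; the "//" token and the input are made up for
illustration -- with WhitespaceMode::COLLAPSE the text content arrives with
its whitespace collapsed):

    #include <cassert>
    #include <iostream>

    #include <core/common/CharReader.hpp>
    #include "Tokenizer.hpp"

    using namespace ousia;

    int main()
    {
        Tokenizer tokenizer{WhitespaceMode::COLLAPSE};
        TokenTypeId comment = tokenizer.registerToken("//");

        CharReader reader{"code // trailing comment"};
        Token token;

        // peek() looks at the next token without consuming it ...
        tokenizer.peek(reader, token);
        assert(token.type == TextToken && token.content == "code");

        // ... while read() advances the read cursor token by token
        while (tokenizer.read(reader, token)) {
            if (token.type == comment) {
                std::cout << "comment token ending at offset "
                          << token.location.getEnd() << std::endl;
            } else {
                std::cout << "text: " << token.content << std::endl;
            }
        }
        return 0;
    }
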
