diff options
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-06 02:24:07 +0100 | 
|---|---|---|
| committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-06 02:24:07 +0100 | 
| commit | 0376fdb483464ded73a5c1a8bba97b196af23b6d (patch) | |
| tree | 43d2d5a8092ab4bcf6a991c8eee95414ccfc9f10 /src/plugins/plain/DynamicTokenizer.hpp | |
| parent | 9a153303908e9511526f916cc4771a91df6635ae (diff) | |
Continue writing parser for plain document format
Diffstat (limited to 'src/plugins/plain/DynamicTokenizer.hpp')
| -rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 190 | 
1 files changed, 190 insertions, 0 deletions
| diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp new file mode 100644 index 0000000..f7fef13 --- /dev/null +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -0,0 +1,190 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <core/common/Location.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class TokenDescriptor; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { +	/** +	 * Pointer pointing at the TokenDescriptor instance this token corresponds +	 * to. May be one of the special TokenDescriptors defined as static members +	 * of the DynamicTokenizer class. +	 */ +	TokenDescriptor const *descriptor; + +	/** +	 * String that was matched. +	 */ +	std::string str; + +	/** +	 * Location from which the string was extracted. +	 */ +	SourceLocation location; +}; + +/** + * Enum specifying the whitespace handling of the DynamicTokenizer class when + * reading non-token text. + */ +enum class WhitespaceMode { +	/** +	 * Preserves all whitespaces as they are found in the source file. +	 */ +	PRESERVE, + +	/** +	 * Trims whitespace at the beginning and the end of the found text. +	 */ +	TRIM, + +	/** +	 * Whitespaces are trimmed and collapsed, multiple whitespace characters +	 * are replaced by a single space character. +	 */ +	COLLAPSE +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. + */ +class DynamicTokenizer { +private: +	/** +	 * Reference at the char reader. +	 */ +	CharReader &reader; + +	/** +	 * Flag defining whether whitespaces should be preserved or not. +	 */ +	WhitespaceMode whitespaceMode; + +	/** +	 * Vector containing all registered token descriptors. +	 */ +	std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + +public: +	/** +	 * Constructor of the DynamicTokenizer class. +	 * +	 * @param reader is the CharReader that should be used for reading the +	 * tokens. +	 * @param preserveWhitespaces should be set to true if all whitespaces +	 * should be preserved (for preformated environments). +	 */ +	DynamicTokenizer(CharReader &reader) +	    : reader(reader), +	      preserveWhitespaces(preserveWhitespaces), +	      location(reader.getSourceId()), +	      empty(true), +	      hasWhitespace(false) +	{ +	} + +	/** +	 * Destructor of the DynamicTokenizer class. +	 */ +	~DynamicTokenizer(); + +	/** +	 * Registers the given string as a token. Returns a const pointer at a +	 * TokenDescriptor that will be used to reference the newly created token. +	 * +	 * @param token is the token string that should be registered. +	 * @return a pointer at a TokenDescriptor which is representative for the +	 * newly registered token. Returns nullptr if a token with this string +	 * was already registered. +	 */ +	const TokenDescriptor* registerToken(const std::string &token); + +	/** +	 * Unregisters the token belonging to the given TokenDescriptor. +	 * +	 * @param descr is a TokenDescriptor that was previously returned by +	 * registerToken. +	 * @return true if the operation was successful, false otherwise (e.g. +	 * because the given TokenDescriptor was already unregistered). +	 */ +	bool unregisterToken(const TokenDescriptor *descr); + +	/** +	 * Sets the whitespace mode. +	 * +	 * @param whitespaceMode defines how whitespace should be treated in text +	 * tokens. +	 */ +	void setWhitespaceMode(WhitespaceMode mode); + +	/** +	 * Returns the current value of the whitespace mode. +	 * +	 * @return the whitespace mode. +	 */ +	WhitespaceMode getWhitespaceMode(); + +	/** +	 * Reads a new token from the CharReader and stores it in the given +	 * DynamicToken instance. +	 * +	 * @param token is a reference at the token instance into which the Token +	 * information should be written. +	 * @return true if a token could be read, false if the end of the stream +	 * has been reached. +	 */ +	bool read(DynamicToken &token); + +	/** +	 * TokenDescriptor representing an empty token. +	 */ +	static const *TokenDescriptor Empty; + +	/** +	 * TokenDescriptor representing generic text. +	 */ +	static const *TokenDescriptor Text; + +}; + +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + | 
