diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-06 02:24:07 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-06 02:24:07 +0100 |
commit | 0376fdb483464ded73a5c1a8bba97b196af23b6d (patch) | |
tree | 43d2d5a8092ab4bcf6a991c8eee95414ccfc9f10 /src/plugins/plain/DynamicTokenizer.hpp | |
parent | 9a153303908e9511526f916cc4771a91df6635ae (diff) |
Continue writing parser for plain document format
Diffstat (limited to 'src/plugins/plain/DynamicTokenizer.hpp')
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 190 |
1 files changed, 190 insertions, 0 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp new file mode 100644 index 0000000..f7fef13 --- /dev/null +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -0,0 +1,190 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <core/common/Location.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class TokenDescriptor; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Pointer pointing at the TokenDescriptor instance this token corresponds + * to. May be one of the special TokenDescriptors defined as static members + * of the DynamicTokenizer class. + */ + TokenDescriptor const *descriptor; + + /** + * String that was matched. + */ + std::string str; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; +}; + +/** + * Enum specifying the whitespace handling of the DynamicTokenizer class when + * reading non-token text. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. + */ +class DynamicTokenizer { +private: + /** + * Reference at the char reader. + */ + CharReader &reader; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token descriptors. + */ + std::vector<std::unique_ptr<TokenDescriptor>> descriptors; + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param reader is the CharReader that should be used for reading the + * tokens. + * @param preserveWhitespaces should be set to true if all whitespaces + * should be preserved (for preformated environments). + */ + DynamicTokenizer(CharReader &reader) + : reader(reader), + preserveWhitespaces(preserveWhitespaces), + location(reader.getSourceId()), + empty(true), + hasWhitespace(false) + { + } + + /** + * Destructor of the DynamicTokenizer class. + */ + ~DynamicTokenizer(); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a pointer at a TokenDescriptor which is representative for the + * newly registered token. Returns nullptr if a token with this string + * was already registered. + */ + const TokenDescriptor* registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenDescriptor. + * + * @param descr is a TokenDescriptor that was previously returned by + * registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(const TokenDescriptor *descr); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. + * + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(DynamicToken &token); + + /** + * TokenDescriptor representing an empty token. + */ + static const *TokenDescriptor Empty; + + /** + * TokenDescriptor representing generic text. + */ + static const *TokenDescriptor Text; + +}; + +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + |