diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:06:54 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:06:54 +0100 |
commit | cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (patch) | |
tree | b75bb61ce1467fca50133c2dd5c6bcdc551dc0e8 /src/core/parser/utils/TokenizedData.hpp | |
parent | 6ace07685cbaa81338ec5e68487054dcbf9da969 (diff) |
Implemented TokenizedData, a facility to store data with tokens where tokens can be dynamically enabled and the whitespace mode specified at the moment the tokens are read
Diffstat (limited to 'src/core/parser/utils/TokenizedData.hpp')
-rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp new file mode 100644 index 0000000..38125c4 --- /dev/null +++ b/src/core/parser/utils/TokenizedData.hpp @@ -0,0 +1,189 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenizedData.hpp + * + * The TokenizedData class defined in this file stores string data extracted + * from a document including user defined tokens. Tokens can be dynamically + * enabled and disabled. And the data up to the next enabled token can be + * returned. Additionally, the data provided by the TokenizedData class is + * processed according to a whitespace mode that can be dynamically updated. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKENIZED_DATA_HPP_ +#define _OUSIA_TOKENIZED_DATA_HPP_ + +#include <cstdint> +#include <memory> +#include <unordered_set> + +#include <core/common/Location.hpp> +#include <core/common/Whitespace.hpp> + +#include "Token.hpp" + +namespace ousia { + +// Forward declaration +class TokenizedDataImpl; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * As users are capable of defining their own tokens and these are only valid + * in certain scopes TokenizedData allows to divide the stored data into chunks + * separated by tokens. + */ +class TokenizedData { +private: + /** + * Shared pointer pointing at the internal data. This data is shared when + * copying TokenizedData instances, which corresponds to forking a + * TokenizedData instance. + */ + std::shared_ptr<TokenizedDataImpl> impl; + + /** + * Contains all currently enabled token ids. + */ + std::unordered_set<TokenId> tokens; + + /** + * Position from which the last element was read from the internal buffer. + * This information is not shared with the other instances of TokenizedData + * pointing at the same location. + */ + size_t cursor; + +public: + /** + * Default constructor, creates a new instance of TokenizedData, sets the + * internal SourceId to the InvalidSourceId constant. + */ + TokenizedData(); + + /** + * Creates a new instance of TokenizedData, takes a SourceId. + * + * @param sourceId is the source identifier that should be used for + * constructing the location when returning tokens. + */ + TokenizedData(SourceId sourceId); + + /** + * Destructor. Needs to be defined explicitly for freeing a shared pointer + * of the incomplete TokenizedDataImpl type. + */ + ~TokenizedData(); + + /** + * Appends a complete string to the internal character buffer. Note that the + * start and end positions for each character in the given data string will + * be interpolated and may thus be incorrect (e.g. when multi-character + * linebreaks or multi-character characters (not handled now) are read). + * + * @param data is the string that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(const std::string &data, SourceOffset offsStart = 0); + + /** + * Appends a single character to the internal character buffer. + * + * @param c is the character that should be appended to the buffer. + * @param start is the start offset in bytes in the input file. + * @param end is the end offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + + /** + * Stores a token ending at the last character of the current buffer. + * + * @param id is the id of the token for which the mark should be stored. + * @param len is the length of the token. + */ + void mark(TokenId id, TokenLength len); + + /** + * Stores a token at the given position. + * + * @param id is the if of the token for which the mark should be stored. + * @param bufStart is the start position in the internal buffer. Use the + * values returned by append to calculate the start position. + * @param len is the length of the token. + */ + void mark(TokenId id, size_t bufStart, TokenLength len); + + /** + * Enables a single token id. Enabled tokens will no longer be returned as + * text. Instead, when querying for the next token, TokenizedData will + * return them as token and not as part of a Text token. + * + * @param id is the TokenId of the token that should be enabled. + */ + void enableToken(TokenId id) { tokens.insert(id); } + + /** + * Enables a set of token ids. Enabled tokens will no longer be returned as + * text. Instead, when querying for the next token, TokenizedData will + * return them as token and not as part of a Text token. + * + * @param ids is the TokenId of the token that should be enabled. + */ + void enableToken(const std::unordered_set<TokenId> &ids) + { + tokens.insert(ids.begin(), ids.end()); + } + + /** + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + + /** + * Stores the next text token in the given token reference, returns true if + * the operation was successful (there was indeed a text token), false if + * the next token is not a text token or there were no more tokens. + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + |