/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /** * @file TokenizedData.hpp * * The TokenizedData class defined in this file stores string data extracted * from a document including user defined tokens. Tokens can be dynamically * enabled and disabled. And the data up to the next enabled token can be * returned. Additionally, the data provided by the TokenizedData class is * processed according to a whitespace mode that can be dynamically updated. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_TOKENIZED_DATA_HPP_ #define _OUSIA_TOKENIZED_DATA_HPP_ #include #include #include #include #include #include "Token.hpp" namespace ousia { // Forward declaration class TokenizedDataImpl; /** * The TokenizedData class stores data extracted from a user defined document. * As users are capable of defining their own tokens and these are only valid * in certain scopes TokenizedData allows to divide the stored data into chunks * separated by tokens. */ class TokenizedData { private: /** * Shared pointer pointing at the internal data. This data is shared when * copying TokenizedData instances, which corresponds to forking a * TokenizedData instance. */ std::shared_ptr impl; /** * Contains all currently enabled token ids. */ std::unordered_set tokens; /** * Position from which the last element was read from the internal buffer. * This information is not shared with the other instances of TokenizedData * pointing at the same location. */ size_t cursor; public: /** * Default constructor, creates a new instance of TokenizedData, sets the * internal SourceId to the InvalidSourceId constant. */ TokenizedData(); /** * Creates a new instance of TokenizedData, takes a SourceId. * * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ TokenizedData(SourceId sourceId); /** * Destructor. Needs to be defined explicitly for freeing a shared pointer * of the incomplete TokenizedDataImpl type. */ ~TokenizedData(); /** * Appends a complete string to the internal character buffer. Note that the * start and end positions for each character in the given data string will * be interpolated and may thus be incorrect (e.g. when multi-character * linebreaks or multi-character characters (not handled now) are read). * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ size_t append(const std::string &data, SourceOffset offsStart = 0); /** * Appends a single character to the internal character buffer. * * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); /** * Stores a token ending at the last character of the current buffer. * * @param id is the id of the token for which the mark should be stored. * @param len is the length of the token. */ void mark(TokenId id, TokenLength len); /** * Stores a token at the given position. * * @param id is the if of the token for which the mark should be stored. * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. */ void mark(TokenId id, size_t bufStart, TokenLength len); /** * Enables a single token id. Enabled tokens will no longer be returned as * text. Instead, when querying for the next token, TokenizedData will * return them as token and not as part of a Text token. * * @param id is the TokenId of the token that should be enabled. */ void enableToken(TokenId id) { tokens.insert(id); } /** * Enables a set of token ids. Enabled tokens will no longer be returned as * text. Instead, when querying for the next token, TokenizedData will * return them as token and not as part of a Text token. * * @param ids is the TokenId of the token that should be enabled. */ void enableToken(const std::unordered_set &ids) { tokens.insert(ids.begin(), ids.end()); } /** * Stores the next token in the given token reference, returns true if the * operation was successful, false if there are no more tokens. * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Stores the next text token in the given token reference, returns true if * the operation was successful (there was indeed a text token), false if * the next token is not a text token or there were no more tokens. * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); }; } #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */