Diffstat (limited to 'src/core/parser/utils/TokenizedData.hpp')
-rw-r--r--  src/core/parser/utils/TokenizedData.hpp | 214
1 file changed, 176 insertions, 38 deletions
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..85b80ae 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -36,42 +36,29 @@
 #include <unordered_set>
 
 #include <core/common/Location.hpp>
+#include <core/common/Variant.hpp>
 #include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
 
 namespace ousia {
 
 // Forward declaration
 class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
 
 /**
  * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * The data stored in TokenizedData is accessed via the TokenizedDataReader.
  */
 class TokenizedData {
 private:
	/**
-	 * Shared pointer pointing at the internal data. This data is shared when
-	 * copying TokenizedData instances, which corresponds to forking a
-	 * TokenizedData instance.
+	 * Shared pointer pointing at the internal data. This data is shared with
+	 * all the TokenizedDataReader instances.
	 */
 	std::shared_ptr<TokenizedDataImpl> impl;
 
-	/**
-	 * Contains all currently enabled token ids.
-	 */
-	std::unordered_set<TokenId> tokens;
-
-	/**
-	 * Position from which the last element was read from the internal buffer.
-	 * This information is not shared with the other instances of TokenizedData
-	 * pointing at the same location.
-	 */
-	size_t cursor;
-
 public:
 	/**
 	 * Default constructor, creates a new instance of TokenizedData, sets the
@@ -136,25 +123,121 @@ public:
 	void mark(TokenId id, size_t bufStart, TokenLength len);
 
 	/**
-	 * Enables a single token id. Enabled tokens will no longer be returned as
-	 * text. Instead, when querying for the next token, TokenizedData will
-	 * return them as token and not as part of a Text token.
+	 * Resets the TokenizedData instance to the state it had when it was
+	 * constructed.
+	 */
+	void clear();
+
+	/**
+	 * Trims the length of the TokenizedData instance to the given length. Note
+	 * that this function does not remove any token matches for performance
+	 * reasons; it merely renders them inaccessible. Appending new data after
+	 * calling trim will make the token marks accessible again. Thus this method
+	 * should be the last function called to modify the data buffer and the
+	 * token marks.
 	 *
-	 * @param id is the TokenId of the token that should be enabled.
+	 * @param length is the number of characters to which the TokenizedData
+	 * instance should be trimmed.
 	 */
-	void enableToken(TokenId id) { tokens.insert(id); }
+	void trim(size_t length);
 
 	/**
-	 * Enables a set of token ids. Enabled tokens will no longer be returned as
-	 * text. Instead, when querying for the next token, TokenizedData will
-	 * return them as token and not as part of a Text token.
+	 * Returns the number of characters currently represented by this
+	 * TokenizedData instance.
+	 */
+	size_t size() const;
+
+	/**
+	 * Returns true if the TokenizedData instance is empty, false otherwise.
 	 *
-	 * @param ids is the TokenId of the token that should be enabled.
+	 * @return true if no data is stored inside the TokenizedData instance.
 	 */
-	void enableToken(const std::unordered_set<TokenId> &ids)
-	{
-		tokens.insert(ids.begin(), ids.end());
-	}
+	bool empty() const;
+
+	/**
+	 * Returns the location of the entire TokenizedData instance.
+	 *
+	 * @return the location of the entire data represented by this instance.
+	 */
+	SourceLocation getLocation() const;
+
+	/**
+	 * Returns a TokenizedDataReader instance that can be used to access the
+	 * data.
+	 *
+	 * @return a new TokenizedDataReader instance pointing at the beginning of
+	 * the internal buffer.
+	 */
+	TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read access to the data stored in a
+ * TokenizedData instance.
+ */
+class TokenizedDataReader {
+private:
+	friend TokenizedData;
+
+	/**
+	 * Shared pointer pointing at the internal data. This data is shared with
+	 * all the TokenizedDataReader instances.
+	 */
+	std::shared_ptr<const TokenizedDataImpl> impl;
+
+	/**
+	 * Position from which the last element was read from the internal buffer.
+	 */
+	size_t readCursor;
+
+	/**
+	 * Position from which the last element was peeked from the internal buffer.
+	 */
+	size_t peekCursor;
+
+	/**
+	 * Private constructor of TokenizedDataReader, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader.
+	 *
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
+	 */
+	TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl,
+	                    size_t readCursor, size_t peekCursor);
+
+public:
+	/**
+	 * Returns a new TokenizedDataReaderFork from which tokens and text can be
+	 * read without advancing this reader instance.
+	 */
+	TokenizedDataReaderFork fork();
+
+	/**
+	 * Returns true if this TokenizedData instance is at the end.
+	 *
+	 * @return true if the end of the TokenizedData instance has been reached.
+	 */
+	bool atEnd() const;
+
+	/**
+	 * Stores the next token in the given token reference, returns true if the
+	 * operation was successful, false if there are no more tokens. Advances the
+	 * internal read cursor.
+	 *
+	 * @param token is an output parameter into which the read token will be
+	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
+	 * @param mode is the whitespace mode that should be used when a text token
+	 * is returned.
+	 * @return true if the operation was successful and there is a next token,
+	 * false if there are no more tokens.
+	 */
+	bool read(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Stores the next token in the given token reference, returns true if the
@@ -162,12 +245,26 @@ public:
 	 *
 	 * @param token is an output parameter into which the read token will be
 	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
-	bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+
+	/**
+	 * Consumes the peeked tokens, the read cursor will now be at the position
+	 * of the peek cursor.
+	 */
+	void consumePeek() { readCursor = peekCursor; }
+
+	/**
+	 * Resets the peek cursor to the position of the read cursor.
+	 */
+	void resetPeek() { peekCursor = readCursor; }
 
 	/**
 	 * Stores the next text token in the given token reference, returns true if
@@ -178,12 +275,53 @@ public:
 	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
-	 * @return true if the operation was successful and there is a next token,
-	 * false if there are no more tokens.
+	 * @return a string variant with the data if there is any data or a nullptr
+	 * variant if there is no text.
 	 */
-	bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 };
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+	friend TokenizedDataReader;
+
+	/**
+	 * Reference pointing at the parent TokenizedDataReader to which changes may
+	 * be committed.
+	 */
+	TokenizedDataReader &parent;
+
+	/**
+	 * Private constructor of TokenizedDataReaderFork, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader and a reference to the parent TokenizedDataReader.
+	 *
+	 * @param parent is the TokenizedDataReader instance to which the current
+	 * read/peek progress may be committed.
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
+	 */
+	TokenizedDataReaderFork(TokenizedDataReader &parent,
+	                        std::shared_ptr<TokenizedDataImpl> impl,
+	                        size_t readCursor, size_t peekCursor)
+	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+	{
+	}
+
+public:
+	/**
+	 * Commits the read/peek progress to the underlying parent.
+	 */
+	void commit() { parent = *this; }
+};
 }
 
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
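
For orientation, here is a rough usage sketch of the reader API introduced by this patch. It is not part of the commit: how the TokenizedData buffer gets filled and which tokens are enabled (enabledTokens) are assumptions, only the calls reader(), read(), peek(), consumePeek(), resetPeek(), fork() and commit() come from the declarations above.

// Sketch only -- assumes a TokenizedData instance that was filled elsewhere
// and a TokenSet describing the currently enabled tokens.
#include <core/parser/utils/TokenizedData.hpp>

using namespace ousia;

void consumeAll(TokenizedData &data, const TokenSet &enabledTokens)
{
	// All cursor state lives in the reader; the TokenizedData instance
	// itself is never advanced.
	TokenizedDataReader reader = data.reader();

	Token token;
	// read() advances the read cursor until no more tokens are available.
	while (reader.read(token, enabledTokens, WhitespaceMode::COLLAPSE)) {
		// ... handle token ...
	}
}

void speculate(TokenizedDataReader &reader, const TokenSet &enabledTokens)
{
	Token token;

	// peek() only moves the peek cursor; afterwards the caller either keeps
	// the progress or rolls it back.
	if (reader.peek(token, enabledTokens)) {
		reader.consumePeek();  // read cursor jumps to the peek cursor
	} else {
		reader.resetPeek();    // peek cursor jumps back to the read cursor
	}

	// A fork behaves like peeking across call boundaries: it owns its own
	// cursors and commit() copies them back into the parent reader.
	TokenizedDataReaderFork fork = reader.fork();
	if (fork.read(token, enabledTokens)) {
		fork.commit();
	}
}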