summaryrefslogtreecommitdiff
path: root/src/core/parser/utils/TokenizedData.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/parser/utils/TokenizedData.hpp')
-rw-r--r--src/core/parser/utils/TokenizedData.hpp214
1 files changed, 176 insertions, 38 deletions
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..85b80ae 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -36,42 +36,29 @@
#include <unordered_set>
#include <core/common/Location.hpp>
+#include <core/common/Variant.hpp>
#include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
/**
* The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * The data stored in TokenizedData
*/
class TokenizedData {
private:
/**
- * Shared pointer pointing at the internal data. This data is shared when
- * copying TokenizedData instances, which corresponds to forking a
- * TokenizedData instance.
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
*/
std::shared_ptr<TokenizedDataImpl> impl;
- /**
- * Contains all currently enabled token ids.
- */
- std::unordered_set<TokenId> tokens;
-
- /**
- * Position from which the last element was read from the internal buffer.
- * This information is not shared with the other instances of TokenizedData
- * pointing at the same location.
- */
- size_t cursor;
-
public:
/**
* Default constructor, creates a new instance of TokenizedData, sets the
@@ -136,25 +123,121 @@ public:
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
- * Enables a single token id. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Resets the TokenizedData instance to the state it had when it was
+ * constructred.
+ */
+ void clear();
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length. Note
+ * that this function does not remove any token matches for performance
+ * reasons, it merely renders them incaccessible. Appending new data after
+ * calling trim will make the token marks accessible again. Thus this method
+ * should be the last function called to modify the data buffer and the
+ * token marks.
*
- * @param id is the TokenId of the token that should be enabled.
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
*/
- void enableToken(TokenId id) { tokens.insert(id); }
+ void trim(size_t length);
/**
- * Enables a set of token ids. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Returns the number of characters currently represented by this
+ * TokenizedData instance.
+ */
+ size_t size() const;
+
+ /**
+ * Returns true if the TokenizedData instance is empty, false otherwise.
*
- * @param ids is the TokenId of the token that should be enabled.
+ * @return true if not data is stored inside the TokenizedData instance.
*/
- void enableToken(const std::unordered_set<TokenId> &ids)
- {
- tokens.insert(ids.begin(), ids.end());
- }
+ bool empty() const;
+
+ /**
+ * Returns the location of the entire TokenizedData instance.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const;
+
+ /**
+ * Returns a TokenizedDataReader instance that can be used to access the
+ * data.
+ *
+ * @return a new TokenizedDataReader instance pointing at the beginning of
+ * the internal buffer.
+ */
+ TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader
+ */
+class TokenizedDataReader {
+private:
+ friend TokenizedData;
+
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
+ */
+ std::shared_ptr<const TokenizedDataImpl> impl;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ */
+ size_t readCursor;
+
+ /**
+ * Position from which the last element was peeked from the internal buffer.
+ */
+ size_t peekCursor;
+
+ /**
+ * Private constructor of TokenizedDataReader, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader.
+ *
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl,
+ size_t readCursor, size_t peekCursor);
+
+public:
+ /**
+ * Returns a new TokenizedDataReaderFork from which tokens and text can be
+ * read without advancing this reader instance.
+ */
+ TokenizedDataReaderFork fork();
+
+ /**
+ * Returns true if this TokenizedData instance is at the end.
+ *
+ * @return true if the end of the TokenizedData instance has been reached.
+ */
+ bool atEnd() const;
+
+ /**
+ * Stores the next token in the given token reference, returns true if the
+ * operation was successful, false if there are no more tokens. Advances the
+ * internal cursor and re
+ *
+ * @param token is an output parameter into which the read token will be
+ * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifers, representing the currently
+ * enabled tokens.
+ * @param mode is the whitespace mode that should be used when a text token
+ * is returned.
+ * @return true if the operation was successful and there is a next token,
+ * false if there are no more tokens.
+ */
+ bool read(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::COLLAPSE);
/**
* Stores the next token in the given token reference, returns true if the
@@ -162,12 +245,26 @@ public:
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+
+ /**
+ * Consumes the peeked tokens, the read cursor will now be at the position
+ * of the peek cursor.
+ */
+ void consumePeek() { readCursor = peekCursor; }
+
+ /**
+ * Resets the peek cursor to the position of the read cursor.
+ */
+ void resetPeek() { peekCursor = readCursor; }
/**
* Stores the next text token in the given token reference, returns true if
@@ -178,12 +275,53 @@ public:
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
- * @return true if the operation was successful and there is a next token,
- * false if there are no more tokens.
+ * @return a string variant with the data if there is any data or a nullptr
+ * variant if there is no text.
*/
- bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
};
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+ friend TokenizedDataReader;
+
+ /**
+ * Reference pointing at the parent TokenizedDataReader to which changes may
+ * be commited.
+ */
+ TokenizedDataReader &parent;
+
+ /**
+ * Private constructor of TokenizedDataReaderFork, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader and a reference at the parent TokenizedDataReader.
+ *
+ * @param parent is the TokenizedDataReader instance to which the current
+ * read/peek progress may be commited.
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReaderFork(TokenizedDataReader &parent,
+ std::shared_ptr<TokenizedDataImpl> impl,
+ size_t readCursor, size_t peekCursor)
+ : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+ {
+ }
+
+public:
+ /**
+ * Commits the read/peek progress to the underlying parent.
+ */
+ void commit() { parent = *this; }
+}
}
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */