Diffstat (limited to 'src/core/parser/utils/TokenizedData.hpp')
-rw-r--r--  src/core/parser/utils/TokenizedData.hpp | 214
1 file changed, 176 insertions, 38 deletions
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..85b80ae 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -36,42 +36,29 @@
 #include <unordered_set>
 
 #include <core/common/Location.hpp>
+#include <core/common/Variant.hpp>
 #include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
 
 namespace ousia {
 
 // Forward declaration
 class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
 
 /**
  * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * The data stored in TokenizedData is accessed via the TokenizedDataReader.
  */
 class TokenizedData {
 private:
	/**
-	 * Shared pointer pointing at the internal data. This data is shared when
-	 * copying TokenizedData instances, which corresponds to forking a
-	 * TokenizedData instance.
+	 * Shared pointer pointing at the internal data. This data is shared with
+	 * all the TokenizedDataReader instances.
	 */
 	std::shared_ptr<TokenizedDataImpl> impl;
 
-	/**
-	 * Contains all currently enabled token ids.
-	 */
-	std::unordered_set<TokenId> tokens;
-
-	/**
-	 * Position from which the last element was read from the internal buffer.
-	 * This information is not shared with the other instances of TokenizedData
-	 * pointing at the same location.
-	 */
-	size_t cursor;
-
 public:
 	/**
 	 * Default constructor, creates a new instance of TokenizedData, sets the
@@ -136,25 +123,121 @@ public:
 	void mark(TokenId id, size_t bufStart, TokenLength len);
 
 	/**
-	 * Enables a single token id. Enabled tokens will no longer be returned as
-	 * text. Instead, when querying for the next token, TokenizedData will
-	 * return them as token and not as part of a Text token.
+	 * Resets the TokenizedData instance to the state it had when it was
+	 * constructed.
+	 */
+	void clear();
+
+	/**
+	 * Trims the length of the TokenizedData instance to the given length. Note
+	 * that this function does not remove any token matches for performance
+	 * reasons; it merely renders them inaccessible. Appending new data after
+	 * calling trim will make the token marks accessible again. Thus this method
+	 * should be the last function called to modify the data buffer and the
+	 * token marks.
 	 *
-	 * @param id is the TokenId of the token that should be enabled.
+	 * @param length is the number of characters to which the TokenizedData
+	 * instance should be trimmed.
 	 */
-	void enableToken(TokenId id) { tokens.insert(id); }
+	void trim(size_t length);
 
 	/**
-	 * Enables a set of token ids. Enabled tokens will no longer be returned as
-	 * text. Instead, when querying for the next token, TokenizedData will
-	 * return them as token and not as part of a Text token.
+	 * Returns the number of characters currently represented by this
+	 * TokenizedData instance.
+	 */
+	size_t size() const;
+
+	/**
+	 * Returns true if the TokenizedData instance is empty, false otherwise.
 	 *
-	 * @param ids is the TokenId of the token that should be enabled.
+	 * @return true if no data is stored inside the TokenizedData instance.
 	 */
-	void enableToken(const std::unordered_set<TokenId> &ids)
-	{
-		tokens.insert(ids.begin(), ids.end());
-	}
+	bool empty() const;
+
+	/**
+	 * Returns the location of the entire TokenizedData instance.
+	 *
+	 * @return the location of the entire data represented by this instance.
+	 */
+	SourceLocation getLocation() const;
+
+	/**
+	 * Returns a TokenizedDataReader instance that can be used to access the
+	 * data.
+	 *
+	 * @return a new TokenizedDataReader instance pointing at the beginning of
+	 * the internal buffer.
+	 */
+	TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read access to the data stored in a
+ * TokenizedData instance.
+ */
+class TokenizedDataReader {
+private:
+	friend TokenizedData;
+
+	/**
+	 * Shared pointer pointing at the internal data. This data is shared with
+	 * all the TokenizedDataReader instances.
+	 */
+	std::shared_ptr<const TokenizedDataImpl> impl;
+
+	/**
+	 * Position from which the last element was read from the internal buffer.
+	 */
+	size_t readCursor;
+
+	/**
+	 * Position from which the last element was peeked from the internal buffer.
+	 */
+	size_t peekCursor;
+
+	/**
+	 * Private constructor of TokenizedDataReader, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader.
+	 *
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
+	 */
+	TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl,
+	                    size_t readCursor, size_t peekCursor);
+
+public:
+	/**
+	 * Returns a new TokenizedDataReaderFork from which tokens and text can be
+	 * read without advancing this reader instance.
+	 */
+	TokenizedDataReaderFork fork();
+
+	/**
+	 * Returns true if this TokenizedData instance is at the end.
+	 *
+	 * @return true if the end of the TokenizedData instance has been reached.
+	 */
+	bool atEnd() const;
+
+	/**
+	 * Stores the next token in the given token reference, returns true if the
+	 * operation was successful, false if there are no more tokens. Advances the
+	 * internal read cursor.
+	 *
+	 * @param token is an output parameter into which the read token will be
+	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
+	 * @param mode is the whitespace mode that should be used when a text token
+	 * is returned.
+	 * @return true if the operation was successful and there is a next token,
+	 * false if there are no more tokens.
+	 */
+	bool read(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 
 	/**
 	 * Stores the next token in the given token reference, returns true if the
@@ -162,12 +245,26 @@ public:
 	 *
 	 * @param token is an output parameter into which the read token will be
 	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+	 * @param tokens is the set of token identifiers, representing the currently
+	 * enabled tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
 	 * @return true if the operation was successful and there is a next token,
 	 * false if there are no more tokens.
 	 */
-	bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+
+	/**
+	 * Consumes the peeked tokens, the read cursor will now be at the position
+	 * of the peek cursor.
+	 */
+	void consumePeek() { readCursor = peekCursor; }
+
+	/**
+	 * Resets the peek cursor to the position of the read cursor.
+	 */
+	void resetPeek() { peekCursor = readCursor; }
 
 	/**
 	 * Stores the next text token in the given token reference, returns true if
@@ -178,12 +275,53 @@ public:
 	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
 	 * @param mode is the whitespace mode that should be used when a text token
 	 * is returned.
-	 * @return true if the operation was successful and there is a next token,
-	 * false if there are no more tokens.
+	 * @return a string variant with the data if there is any data or a nullptr
+	 * variant if there is no text.
 	 */
-	bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 };
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+	friend TokenizedDataReader;
+
+	/**
+	 * Reference pointing at the parent TokenizedDataReader to which changes may
+	 * be committed.
+	 */
+	TokenizedDataReader &parent;
+
+	/**
+	 * Private constructor of TokenizedDataReaderFork, taking a reference to the
+	 * internal TokenizedDataImpl structure storing the data that is accessed by
+	 * the reader and a reference to the parent TokenizedDataReader.
+	 *
+	 * @param parent is the TokenizedDataReader instance to which the current
+	 * read/peek progress may be committed.
+	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
+	 * @param readCursor is the cursor position from which tokens and text are
+	 * read.
+	 * @param peekCursor is the cursor position from which tokens and text are
+	 * peeked.
+	 */
+	TokenizedDataReaderFork(TokenizedDataReader &parent,
+	                        std::shared_ptr<TokenizedDataImpl> impl,
+	                        size_t readCursor, size_t peekCursor)
+	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+	{
+	}
+
+public:
+	/**
+	 * Commits the read/peek progress to the underlying parent.
+	 */
+	void commit() { parent = *this; }
+};
 }
 
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
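
For orientation, here is a rough usage sketch of the reader API introduced by this patch. It is not part of the commit: how the TokenizedData buffer gets filled and which tokens are enabled (enabledTokens) are assumptions, only the calls reader(), read(), peek(), consumePeek(), resetPeek(), fork() and commit() come from the declarations above.

// Sketch only -- assumes a TokenizedData instance that was filled elsewhere
// and a TokenSet describing the currently enabled tokens.
#include <core/parser/utils/TokenizedData.hpp>

using namespace ousia;

void consumeAll(TokenizedData &data, const TokenSet &enabledTokens)
{
	// All cursor state lives in the reader; the TokenizedData instance
	// itself is never advanced.
	TokenizedDataReader reader = data.reader();

	Token token;
	// read() advances the read cursor until no more tokens are available.
	while (reader.read(token, enabledTokens, WhitespaceMode::COLLAPSE)) {
		// ... handle token ...
	}
}

void speculate(TokenizedDataReader &reader, const TokenSet &enabledTokens)
{
	Token token;

	// peek() only moves the peek cursor; afterwards the caller either keeps
	// the progress or rolls it back.
	if (reader.peek(token, enabledTokens)) {
		reader.consumePeek();  // read cursor jumps to the peek cursor
	} else {
		reader.resetPeek();    // peek cursor jumps back to the read cursor
	}

	// A fork behaves like peeking across call boundaries: it owns its own
	// cursors and commit() copies them back into the parent reader.
	TokenizedDataReaderFork fork = reader.fork();
	if (fork.read(token, enabledTokens)) {
		fork.commit();
	}
}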