summaryrefslogtreecommitdiff
path: root/src/core/parser/utils/TokenizedData.hpp
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-03-03 15:08:18 +0100
committerAndreas Stöckel <andreas@somweyr.de>2015-03-03 15:08:18 +0100
commit466ff991bcfad76d78100193aacbfaf74d542b26 (patch)
treedafdb41ec766e83c6e37a8b9865e6ef454ff4def /src/core/parser/utils/TokenizedData.hpp
parentb5cdca0331117ad3834b61eadd94ab3fcb6d2fba (diff)
parentfb8d4cdf01909b61e4e5d0806ec6de178ff0058c (diff)
Storing type and name in the HandlerData once again, using a Token
Conflicts: application/src/core/parser/stack/Callbacks.hpp
Diffstat (limited to 'src/core/parser/utils/TokenizedData.hpp')
-rw-r--r--src/core/parser/utils/TokenizedData.hpp246
1 files changed, 206 insertions, 40 deletions
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..bc937f2 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -37,40 +37,48 @@
#include <core/common/Location.hpp>
#include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
/**
- * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
*/
-class TokenizedData {
-private:
+struct TokenizedDataCursor {
/**
- * Shared pointer pointing at the internal data. This data is shared when
- * copying TokenizedData instances, which corresponds to forking a
- * TokenizedData instance.
+ * Position within the byte buffer.
*/
- std::shared_ptr<TokenizedDataImpl> impl;
+ size_t bufPos;
/**
- * Contains all currently enabled token ids.
+ * Position within the token mark buffer.
*/
- std::unordered_set<TokenId> tokens;
+ size_t markPos;
/**
- * Position from which the last element was read from the internal buffer.
- * This information is not shared with the other instances of TokenizedData
- * pointing at the same location.
+ * Default constructor. The resulting cursor points at the beginning of the
+ * stream.
+ */
+ TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
+ * The TokenizedData class stores data extracted from a user defined document.
+ * The data stored in TokenizedData can be accessed via a TokenizedDataReader.
+ */
+class TokenizedData {
+private:
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
*/
- size_t cursor;
+ std::shared_ptr<TokenizedDataImpl> impl;
public:
/**
@@ -88,6 +96,18 @@ public:
TokenizedData(SourceId sourceId);
/**
+ * Creates a new instance of TokenizedData, takes a SourceId and an initial
+ * string buffer.
+ *
+ * @param data is the string that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @param sourceId is the source identifier that should be used for
+ * constructing the location when returning tokens.
+ */
+ TokenizedData(const std::string &data, SourceOffset offsStart = 0,
+ SourceId sourceId = InvalidSourceId);
+
+ /**
* Destructor. Needs to be defined explicitly for freeing a shared pointer
* of the incomplete TokenizedDataImpl type.
*/
@@ -101,10 +121,13 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart = 0);
+ size_t append(const std::string &data, SourceOffset offsStart = 0,
+ bool protect = false);
/**
* Appends a single character to the internal character buffer.
@@ -112,10 +135,13 @@ public:
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect = false);
/**
* Stores a token ending at the last character of the current buffer.
@@ -136,54 +162,194 @@ public:
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
- * Enables a single token id. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Resets the TokenizedData instance to the state it had when it was
+ * constructed.
+ */
+ void clear();
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length. Note
+ * that this function does not remove any token matches for performance
+ * reasons, it merely renders them inaccessible. Appending new data after
+ * calling trim will make the token marks accessible again. Thus this method
+ * should be the last function called to modify the data buffer and the
+ * token marks.
*
- * @param id is the TokenId of the token that should be enabled.
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length);
+
+ /**
+ * Returns the number of characters currently represented by this
+ * TokenizedData instance.
*/
- void enableToken(TokenId id) { tokens.insert(id); }
+ size_t size() const;
/**
- * Enables a set of token ids. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Returns true if the TokenizedData instance is empty, false otherwise.
*
- * @param ids is the TokenId of the token that should be enabled.
+ * @return true if no data is stored inside the TokenizedData instance.
*/
- void enableToken(const std::unordered_set<TokenId> &ids)
- {
- tokens.insert(ids.begin(), ids.end());
- }
+ bool empty() const;
+
+ /**
+ * Returns the location of the entire TokenizedData instance.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const;
+
+ /**
+ * Returns a TokenizedDataReader instance that can be used to access the
+ * data.
+ *
+ * @return a new TokenizedDataReader instance pointing at the beginning of
+ * the internal buffer.
+ */
+ TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class provides read access to the data stored in a TokenizedData instance.
+ */
+class TokenizedDataReader {
+private:
+ friend TokenizedData;
+
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
+ */
+ std::shared_ptr<const TokenizedDataImpl> impl;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ */
+ TokenizedDataCursor readCursor;
+
+ /**
+ * Position from which the last element was peeked from the internal buffer.
+ */
+ TokenizedDataCursor peekCursor;
+
+protected:
+ /**
+ * Protected constructor of TokenizedDataReader, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader.
+ *
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor);
+
+public:
+ /**
+ * Returns a new TokenizedDataReaderFork from which tokens and text can be
+ * read without advancing this reader instance.
+ */
+ TokenizedDataReaderFork fork();
+
+ /**
+ * Returns true if this TokenizedData instance is at the end.
+ *
+ * @return true if the end of the TokenizedData instance has been reached.
+ */
+ bool atEnd() const;
/**
* Stores the next token in the given token reference, returns true if the
- * operation was successful, false if there are no more tokens.
+ * operation was successful, false if there are no more tokens. Advances the
+ * internal read cursor and resets the peek cursor.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool read(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
- * Stores the next text token in the given token reference, returns true if
- * the operation was successful (there was indeed a text token), false if
- * the next token is not a text token or there were no more tokens.
+ * Stores the next token in the given token reference, returns true if the
+ * operation was successful, false if there are no more tokens.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
+
+ /**
+ * Consumes the peeked tokens, the read cursor will now be at the position
+ * of the peek cursor.
+ */
+ void consumePeek() { readCursor = peekCursor; }
+
+ /**
+ * Resets the peek cursor to the position of the read cursor.
+ */
+ void resetPeek() { peekCursor = readCursor; }
+};
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader instance.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+ friend TokenizedDataReader;
+
+ /**
+ * Reference pointing at the parent TokenizedDataReader to which changes may
+ * be committed.
+ */
+ TokenizedDataReader &parent;
+
+ /**
+ * Private constructor of TokenizedDataReaderFork, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader and a reference at the parent TokenizedDataReader.
+ *
+ * @param parent is the TokenizedDataReader instance to which the current
+ * read/peek progress may be committed.
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReaderFork(TokenizedDataReader &parent,
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+ {
+ }
+
+public:
+ /**
+ * Commits the read/peek progress to the underlying parent.
+ */
+ void commit() { parent = *this; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */