4 files changed, 251 insertions, 114 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index 0ec56af..aeefa26 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -48,6 +48,17 @@ struct TokenMark {
 	TokenLength len;
 
 	/**
+	 * Specifies whether the token is special or not.
+	 */
+	bool special;
+
+	/**
+	 * Maximum token length.
+	 */
+	static constexpr TokenLength MaxTokenLength =
+	    std::numeric_limits<TokenLength>::max();
+
+	/**
 	 * Constructor of the TokenMark structure, initializes all members with the
 	 * given values.
 	 *
@@ -55,9 +66,10 @@ struct TokenMark {
 	 * @param bufStart is the start position of the TokenMark in the internal
 	 * character buffer.
 	 * @param len is the length of the token.
+	 * @param special modifies the sort order, special tokens are preferred.
 	 */
-	TokenMark(TokenId id, size_t bufStart, TokenLength len)
-	    : bufStart(bufStart), id(id), len(len)
+	TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+	    : bufStart(bufStart), id(id), len(len), special(special)
 	{
 	}
 
@@ -72,7 +84,8 @@ struct TokenMark {
 	TokenMark(size_t bufStart)
 	    : bufStart(bufStart),
 	      id(Tokens::Empty),
-	      len(std::numeric_limits<TokenLength>::max())
+	      len(MaxTokenLength),
+	      special(true)
 	{
 	}
 
@@ -86,8 +99,22 @@ struct TokenMark {
 	 */
 	friend bool operator<(const TokenMark &m1, const TokenMark &m2)
 	{
-		return (m1.bufStart < m2.bufStart) ||
-		       (m1.bufStart == m2.bufStart && m1.len > m2.len);
+		// Prefer the mark with the smaller bufStart
+		if (m1.bufStart < m2.bufStart) {
+			return true;
+		}
+
+		// Special handling for marks with the same bufStart
+		if (m1.bufStart == m2.bufStart) {
+			// If exactly one of the two marks is special, the special mark
+			// is sorted first (i.e. return true if m1 is the special one)
+			if (m1.special != m2.special) {
+				return m1.special;
+			}
+			// Otherwise prefer longer marks
+			return m1.len > m2.len;
+		}
+		return false;
 	}
 };
 }
@@ -110,6 +137,11 @@ private:
 	std::vector<char> buf;
 
 	/**
+	 * Bit vector storing the "protected" flag of each character in "buf".
+	 */
+	std::vector<bool> protectedChars;
+
+	/**
 	 * Vector storing all the character offsets efficiently.
 	 */
 	SourceOffsetVector offsets;
@@ -120,6 +152,26 @@ private:
 	mutable std::vector<TokenMark> marks;
 
 	/**
+	 * Position of the first linebreak in a sequence of linebreaks.
+	 */
+	size_t firstLinebreak;
+
+	/**
+	 * Current indentation level.
+	 */
+	uint16_t currentIndentation;
+
+	/**
+	 * Last indentation level.
+	 */
+	uint16_t lastIndentation;
+
+	/**
+	 * Number of linebreaks without any content between them.
+	 */
+	uint16_t numLinebreaks;
+
+	/**
 	 * Flag indicating whether the internal "marks" vector is sorted.
 	 */
 	mutable bool sorted;
@@ -132,7 +184,7 @@ public:
 	 * @param sourceId is the source identifier that should be used for
 	 * constructing the location when returning tokens.
 	 */
-	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }
 
 	/**
 	 * Appends a complete string to the internal character buffer and extends
@@ -140,25 +192,22 @@ public:
 	 *
 	 * @param data is the string that should be appended to the buffer.
 	 * @param offsStart is the start offset in bytes in the input file.
+	 * @param protect if set to true, the appended characters will not be
+	 * affected by whitespace handling, they will be returned as is.
 	 * @return the current size of the internal byte buffer. The returned value
 	 * is intended to be used for the "mark" function.
 	 */
-	size_t append(const std::string &data, SourceOffset offsStart)
-	{  // Append the data to the internal buffer
-		buf.insert(buf.end(), data.begin(), data.end());
-
-		// Extend the text regions, interpolate the source position (this may
-		// yield incorrect results)
-		const size_t size = buf.size();
+	size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+	{
 		for (size_t i = 0; i < data.size(); i++) {
 			if (offsStart != InvalidSourceOffset) {
-				offsets.storeOffset(offsStart + i, offsStart + i + 1);
+				append(data[i], offsStart + i, offsStart + i + 1, protect);
 			} else {
-				offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset);
+				append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+				       protect);
 			}
 		}
-
-		return size;
+		return size();
 	}
 
 	/**
@@ -168,16 +217,86 @@ public:
 	 * @param c is the character that should be appended to the buffer.
 	 * @param offsStart is the start offset in bytes in the input file.
 	 * @param offsEnd is the end offset in bytes in the input file.
+	 * @param protect if set to true, the appended character will not be
+	 * affected by whitespace handling, it will be returned as is.
 	 * @return the current size of the internal byte buffer. The returned value
 	 * is intended to be used for the "mark" function.
 	 */
-	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+	              bool protect)
 	{
 		// Add the character to the list and store the location of the character
 		// in the source file
 		buf.push_back(c);
+		protectedChars.push_back(protect);
 		offsets.storeOffset(offsStart, offsEnd);
-		return buf.size();
+
+		// Insert special tokens
+		const size_t size = buf.size();
+		const bool isWhitespace = Utils::isWhitespace(c);
+		const bool isLinebreak = Utils::isLinebreak(c);
+
+		// Handle linebreaks
+		if (isLinebreak) {
+			// Mark linebreaks as linebreak
+			mark(Tokens::Newline, size - 1, 1, false);
+
+			// A new sequence of linebreaks starts at the current character
+			if (numLinebreaks == 0) {
+				firstLinebreak = size - 1;
+			}
+
+			// Reset the indentation
+			currentIndentation = 0;
+
+			// Increment the number of linebreaks
+			numLinebreaks++;
+
+			const size_t markStart = firstLinebreak;
+			const size_t markLength = size - firstLinebreak;
+
+			// Issue two consecutive linebreaks as paragraph token
+			if (numLinebreaks == 2) {
+				mark(Tokens::Paragraph, markStart, markLength, false);
+			}
+
+			// Issue three or more consecutive linebreaks as section token
+			if (numLinebreaks >= 3) {
+				mark(Tokens::Section, markStart, markLength, false);
+			}
+		} else if (isWhitespace) {
+			// Count the whitespace characters at the beginning of the line
+			if (numLinebreaks > 0) {
+				// Implement the UNIX/Python rule for tabs: Tabs extend to the
+				// next multiple of eight.
+				if (c == '\t') {
+					currentIndentation = (currentIndentation + 8) & ~7;
+				} else {
+					currentIndentation++;
+				}
+			}
+		}
+
+		// Issue indent and unindent tokens
+		if (!isWhitespace && numLinebreaks > 0) {
+			// Issue a larger indentation than that in the previous line as
+			// "Indent" token
+			if (currentIndentation > lastIndentation) {
+				mark(Tokens::Indent, size - 1, 0, true);
+			}
+
+			// Issue a smaller indentation than that in the previous line as
+			// "Dedent" token
+			if (currentIndentation < lastIndentation) {
+				mark(Tokens::Dedent, size - 1, 0, true);
+			}
+
+			// Reset the internal state machine
+			lastIndentation = currentIndentation;
+			numLinebreaks = 0;
+		}
+
+		return size;
 	}
 
 	/**
@@ -187,11 +306,12 @@ public:
 	 * @param bufStart is the start position in the internal buffer. Use the
 	 * values returned by append to calculate the start position.
 	 * @param len is the length of the token.
+	 * @param special tags the mark as "special", preferring it when sorting
 	 */
-	void mark(TokenId id, size_t bufStart, TokenLength len)
+	void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
 	{
 		// Push the new instance back onto the list
-		marks.emplace_back(id, bufStart, len);
+		marks.emplace_back(id, bufStart, len, special);
 
 		// Update the sorted flag as soon as more than one element is in the
 		// list
@@ -215,9 +335,13 @@ public:
 	 * @return true if a token was returned, false if no more tokens are
 	 * available.
 	 */
-	bool next(Token &token, WhitespaceMode mode,
-	          const std::unordered_set<TokenId> &tokens, size_t &cursor) const
+	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+	          TokenizedDataCursor &cursor) const
 	{
+		// Some variables for convenient access
+		size_t &bufPos = cursor.bufPos;
+		size_t &markPos = cursor.markPos;
+
 		// Sort the "marks" vector if it has not been sorted yet.
 		if (!sorted) {
 			std::sort(marks.begin(), marks.end());
@@ -226,8 +350,8 @@ public:
 
 		// Fetch the next larger TokenMark instance, make sure the token is in
 		// the "enabled" list and within the buffer range
-		auto it =
-		    std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
+		auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+		                           TokenMark(bufPos));
 		while (it != marks.end() && (tokens.count(it->id) == 0 ||
 		                             it->bufStart + it->len > buf.size())) {
 			it++;
@@ -240,15 +364,15 @@ public:
 		// Depending on the whitespace mode, fetch all the data between the
 		// cursor position and the calculated end position and return a token
 		// containing that data.
-		if (cursor < end && cursor < buf.size()) {
+		if (bufPos < end && bufPos < buf.size()) {
 			switch (mode) {
 				case WhitespaceMode::PRESERVE: {
 					token = Token(
-					    Tokens::Data, std::string(&buf[cursor], end - cursor),
+					    Tokens::Data, std::string(&buf[bufPos], end - bufPos),
 					    SourceLocation(sourceId,
-					                   offsets.loadOffset(cursor).first,
+					                   offsets.loadOffset(bufPos).first,
 					                   offsets.loadOffset(end).first));
-					cursor = end;
+					bufPos = end;
 					return true;
 				}
 				case WhitespaceMode::TRIM:
@@ -258,30 +382,35 @@ public:
 					size_t stringStart;
 					size_t stringEnd;
 					std::string content;
+					const char *cBuf = &buf[bufPos];
+					auto filter = [cBuf, bufPos, this](size_t i) -> bool {
+						return Utils::isWhitespace(cBuf[i]) &&
+						       !protectedChars[bufPos + i];
+					};  // "i" is relative to bufPos, the flags are absolute
 					if (mode == WhitespaceMode::TRIM) {
-						content = Utils::trim(&buf[cursor], end - cursor,
-						                      stringStart, stringEnd);
+						content = Utils::trim(cBuf, end - bufPos, stringStart,
+						                      stringEnd, filter);
 					} else {
-						content = Utils::collapse(&buf[cursor], end - cursor,
-						                          stringStart, stringEnd);
+						content = Utils::collapse(
+						    cBuf, end - bufPos, stringStart, stringEnd, filter);
 					}
 
 					// If the resulting string is empty (only whitespaces),
 					// abort
 					if (content.empty()) {
-						cursor = end;
+						bufPos = end;
 						break;
 					}
 
 					// Calculate the absolute positions and return the token
-					stringStart += cursor;
-					stringEnd += cursor;
+					stringStart += bufPos;
+					stringEnd += bufPos;
 					token = Token(
 					    Tokens::Data, content,
 					    SourceLocation(sourceId,
 					                   offsets.loadOffset(stringStart).first,
 					                   offsets.loadOffset(stringEnd).first));
-					cursor = end;
+					bufPos = end;
 					return true;
 				}
 			}
@@ -290,14 +419,18 @@ public:
 		// If start equals end, we're currently directly at a token
 		// instance. Return this token and advance the cursor to the end of
 		// the token.
-		if (cursor == end && it != marks.end()) {
+		if (bufPos == end && it != marks.end()) {
 			const size_t tokenStart = it->bufStart;
 			const size_t tokenEnd = it->bufStart + it->len;
 			token = Token(
 			    it->id, std::string(&buf[tokenStart], it->len),
 			    SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
 			                   offsets.loadOffset(tokenEnd).first));
-			cursor = tokenEnd;
+
+			// Update the cursor, consume the token by incrementing the marks
+			// pos counter
+			bufPos = tokenEnd;
+			markPos = it - marks.begin() + 1;
 			return true;
 		}
 
@@ -314,8 +447,12 @@ public:
 	void clear()
 	{
 		buf.clear();
-		marks.clear();
+		protectedChars.clear();
 		offsets.clear();
+		marks.clear();
+		firstLinebreak = 0;  // The assumed leading linebreak sits at offset 0
+		currentIndentation = lastIndentation = 0;
+		numLinebreaks = 1;  // Assume the stream starts with a linebreak
 		sorted = true;
 	}
 
@@ -367,39 +504,35 @@ public:
 TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
 
 TokenizedData::TokenizedData(SourceId sourceId)
-    : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+    : impl(std::make_shared<TokenizedDataImpl>(sourceId))
 {
 }
 
 TokenizedData::~TokenizedData() {}
 
-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+                             bool protect)
 {
-	return impl->append(data, offsStart);
+	return impl->append(data, offsStart, protect);
 }
 
 size_t TokenizedData::append(char c, SourceOffset offsStart,
-                             SourceOffset offsEnd)
+                             SourceOffset offsEnd, bool protect)
 {
-	return impl->append(c, offsStart, offsEnd);
+	return impl->append(c, offsStart, offsEnd, protect);
 }
 
 void TokenizedData::mark(TokenId id, TokenLength len)
 {
-	impl->mark(id, impl->size() - len, len);
+	impl->mark(id, impl->size() - len, len, false);
 }
 
 void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
 {
-	impl->mark(id, bufStart, len);
+	impl->mark(id, bufStart, len, false);
 }
 
-void TokenizedData::clear()
-{
-	impl->clear();
-	tokens.clear();
-	cursor = 0;
-}
+void TokenizedData::clear() { impl->clear(); }
 
 void TokenizedData::trim(size_t length) { impl->trim(length); }
 
@@ -412,49 +545,42 @@ SourceLocation TokenizedData::getLocation() const
 	return impl->getLocation();
 }
 
-TokenizedDataReader reader() const
+TokenizedDataReader TokenizedData::reader() const
 {
-	return TokenizedDataReader(impl, std::unordered_set<TokenId>{}, 0, 0);
+	return TokenizedDataReader(impl, TokenizedDataCursor(),
+	                           TokenizedDataCursor());
 }
 
 /* Class TokenizedDataReader */
 
+TokenizedDataReader::TokenizedDataReader(
+    std::shared_ptr<const TokenizedDataImpl> impl,
+    const TokenizedDataCursor &readCursor,
+    const TokenizedDataCursor &peekCursor)
+    : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
 TokenizedDataReaderFork TokenizedDataReader::fork()
 {
-	return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor);
+	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
 }
 
-bool TokenizedDataReader::atEnd() const { return readCursor >= size(); }
+bool TokenizedDataReader::atEnd() const
+{
+	return readCursor.bufPos >= impl->size();
+}
 
-bool TokenizedData::read(Token &token, const TokenSet &tokens,
-                         WhitespaceMode mode)
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+                               WhitespaceMode mode)
 {
 	peekCursor = readCursor;
 	return impl->next(token, mode, tokens, readCursor);
 }
 
-bool TokenizedData::peek(Token &token, const TokenSet &tokens,
-                         WhitespaceMode mode)
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+                               WhitespaceMode mode)
 {
 	return impl->next(token, mode, tokens, peekCursor);
 }
-
-Variant TokenizedData::text(WhitespaceMode mode)
-{
-	// Copy the current cursor position to not update the actual cursor position
-	// if the operation was not successful
-	size_t cursorCopy = cursor;
-	Token token;
-	if (!impl->next(token, mode, tokens, cursorCopy) ||
-	    token.id != Tokens::Data) {
-		return Variant{nullptr};
-	}
-
-	// There is indeed a text token, update the internal cursor position and
-	// return the token as variant.
-	cursor = cursorCopy;
-	Variant res = Variant::fromString(token.content);
-	res.setLocation(token.getLocation());
-	return res;
-}
 }
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 85b80ae..b72ca02 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -36,7 +36,6 @@
 #include <unordered_set>
 
 #include <core/common/Location.hpp>
-#include <core/common/Variant.hpp>
 #include <core/common/Whitespace.hpp>
 #include <core/common/Token.hpp>
 
@@ -48,6 +47,28 @@ class TokenizedDataReader;
 class TokenizedDataReaderFork;
 
 /**
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
+ */
+struct TokenizedDataCursor {
+	/**
+	 * Position within the byte buffer.
+	 */
+	size_t bufPos;
+
+	/**
+	 * Index into the sorted token mark list where the next search starts.
+	 */
+	size_t markPos;
+
+	/**
+	 * Default constructor. The resulting cursor points at the beginning of the
+	 * stream.
+	 */
+	TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
  * The TokenizedData class stores data extracted from a user defined document.
  * The data stored in TokenizedData
  */
@@ -88,10 +109,13 @@ public:
 	 *
 	 * @param data is the string that should be appended to the buffer.
 	 * @param offsStart is the start offset in bytes in the input file.
+	 * @param protect if set to true, the appended characters will not be
+	 * affected by whitespace handling, they will be returned as is.
 	 * @return the current size of the internal byte buffer. The returned value
 	 * is intended to be used for the "mark" function.
 	 */
-	size_t append(const std::string &data, SourceOffset offsStart = 0);
+	size_t append(const std::string &data, SourceOffset offsStart = 0,
+	              bool protect = false);
 
 	/**
 	 * Appends a single character to the internal character buffer.
@@ -99,10 +123,13 @@ public:
 	 * @param c is the character that should be appended to the buffer.
 	 * @param start is the start offset in bytes in the input file.
 	 * @param end is the end offset in bytes in the input file.
+	 * @param protect if set to true, the appended character will not be
+	 * affected by whitespace handling, it will be returned as is.
 	 * @return the current size of the internal byte buffer. The returned value
 	 * is intended to be used for the "mark" function.
 	 */
-	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+	              bool protect = false);
 
 	/**
 	 * Stores a token ending at the last character of the current buffer.
@@ -187,15 +214,16 @@ private:
 	/**
 	 * Position from which the last element was read from the internal buffer.
 	 */
-	size_t readCursor;
+	TokenizedDataCursor readCursor;
 
 	/**
 	 * Position from which the last element was peeked from the internal buffer.
 	 */
-	size_t peekCursor;
+	TokenizedDataCursor peekCursor;
 
+protected:
 	/**
-	 * Private constructor of TokenizedDataReader, taking a reference to the
+	 * Protected constructor of TokenizedDataReader, taking a reference to the
 	 * internal TokenizedDataImpl structure storing the data that is accessed by
 	 * the reader.
 	 *
@@ -205,8 +233,9 @@ private:
 	 * @param peekCursor is the cursor position from which tokens and text are
 	 * peeked.
 	 */
-	TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl,
-	                    size_t readCursor, size_t peekCursor);
+	TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+	                    const TokenizedDataCursor &readCursor,
+	                    const TokenizedDataCursor &peekCursor);
 
 public:
 	/**
@@ -237,7 +266,7 @@ public:
 	 * false if there are no more tokens.
 	 */
 	bool read(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	          WhitespaceMode mode = WhitespaceMode::TRIM);
 
 	/**
 	 * Stores the next token in the given token reference, returns true if the
@@ -253,7 +282,7 @@ public:
 	 * false if there are no more tokens.
 	 */
 	bool peek(Token &token, const TokenSet &tokens = TokenSet{},
-	          WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+	          WhitespaceMode mode = WhitespaceMode::TRIM);
 
 	/**
 	 * Consumes the peeked tokens, the read cursor will now be at the position
@@ -265,20 +294,6 @@ public:
 	 * Resets the peek cursor to the position of the read cursor.
 	 */
 	void resetPeek() { peekCursor = readCursor; }
-
-	/**
-	 * Stores the next text token in the given token reference, returns true if
-	 * the operation was successful (there was indeed a text token), false if
-	 * the next token is not a text token or there were no more tokens.
-	 *
-	 * @param token is an output parameter into which the read token will be
-	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens.
-	 * @param mode is the whitespace mode that should be used when a text token
-	 * is returned.
-	 * @return a string variant with the data if there is any data or a nullptr
-	 * variant if there is no text.
-	 */
-	Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
 };
 
 /**
@@ -309,8 +324,9 @@ private:
 	 * peeked.
 	 */
 	TokenizedDataReaderFork(TokenizedDataReader &parent,
-	                        std::shared_ptr<TokenizedDataImpl> impl,
-	                        size_t readCursor, size_t peekCursor)
+	                        std::shared_ptr<const TokenizedDataImpl> impl,
+	                        const TokenizedDataCursor &readCursor,
+	                        const TokenizedDataCursor &peekCursor)
 	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
 	{
 	}
@@ -320,7 +336,7 @@ public:
 	 * Commits the read/peek progress to the underlying parent.
 	 */
 	void commit() { parent = *this; }
-}
+};
 }
 
 #endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 51787cd..e78b0f4 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -156,6 +156,7 @@ public:
 		return res;
 	}
 };
+
 }
 
 /* Class Tokenizer */
@@ -229,12 +230,6 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		} else {
 			// Record all incomming characters
 			data.append(c, charStart, charEnd);
-
-			// Special token processing
-			// TODO: Build a special state machine for this in another class
-			if (c == '\n') {
-				data.mark(Tokens::Newline, 1);
-			}
 		}
 
 		// Swap the lookups and the nextLookups list
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 2ddb9c9..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -28,7 +28,7 @@
 #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
 #define _OUSIA_DYNAMIC_TOKENIZER_HPP_
 
-#include <set>
+#include <cstdint>
 #include <string>
 #include <vector>