diff options
Diffstat (limited to 'src/core/parser/utils')
| -rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 286 | ||||
| -rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 70 | ||||
| -rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 7 | ||||
| -rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 2 | 
4 files changed, 251 insertions, 114 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index 0ec56af..aeefa26 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -48,6 +48,17 @@ struct TokenMark {  	TokenLength len;  	/** +	 * Specifies whether the token is special or not. +	 */ +	bool special; + +	/** +	 * Maximum token length. +	 */ +	static constexpr TokenLength MaxTokenLength = +	    std::numeric_limits<TokenLength>::max(); + +	/**  	 * Constructor of the TokenMark structure, initializes all members with the  	 * given values.  	 * @@ -55,9 +66,10 @@ struct TokenMark {  	 * @param bufStart is the start position of the TokenMark in the internal  	 * character buffer.  	 * @param len is the length of the token. +	 * @param special modifies the sort order, special tokens are preferred.  	 */ -	TokenMark(TokenId id, size_t bufStart, TokenLength len) -	    : bufStart(bufStart), id(id), len(len) +	TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) +	    : bufStart(bufStart), id(id), len(len), special(special)  	{  	} @@ -72,7 +84,8 @@ struct TokenMark {  	TokenMark(size_t bufStart)  	    : bufStart(bufStart),  	      id(Tokens::Empty), -	      len(std::numeric_limits<TokenLength>::max()) +	      len(MaxTokenLength), +	      special(true)  	{  	} @@ -86,8 +99,22 @@ struct TokenMark {  	 */  	friend bool operator<(const TokenMark &m1, const TokenMark &m2)  	{ -		return (m1.bufStart < m2.bufStart) || -		       (m1.bufStart == m2.bufStart && m1.len > m2.len); +		// Prefer the mark with the smaller bufStart +		if (m1.bufStart < m2.bufStart) { +			return true; +		} + +		// Special handling for marks with the same bufStart +		if (m1.bufStart == m2.bufStart) { +			// If exactly one of the two marks is special, return true if this +			// one is special +			if (m1.special != m2.special) { +				return m1.special; +			} +			// Otherwise prefer longer marks +			return m1.len > m2.len; +		} +		return 
false;  	}  };  } @@ -110,6 +137,11 @@ private:  	std::vector<char> buf;  	/** +	 * Bitset storing the "protected" flag of the character data. +	 */ +	std::vector<bool> protectedChars; + +	/**  	 * Vector storing all the character offsets efficiently.  	 */  	SourceOffsetVector offsets; @@ -120,6 +152,26 @@ private:  	mutable std::vector<TokenMark> marks;  	/** +	 * Position of the first linebreak in a sequence of linebreaks. +	 */ +	size_t firstLinebreak; + +	/** +	 * Current indentation level. +	 */ +	uint16_t currentIndentation; + +	/** +	 * Last indentation level. +	 */ +	uint16_t lastIndentation; + +	/** +	 * Number of linebreaks without any content between them. +	 */ +	uint16_t numLinebreaks; + +	/**  	 * Flag indicating whether the internal "marks" vector is sorted.  	 */  	mutable bool sorted; @@ -132,7 +184,7 @@ public:  	 * @param sourceId is the source identifier that should be used for  	 * constructing the location when returning tokens.  	 */ -	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} +	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }  	/**  	 * Appends a complete string to the internal character buffer and extends @@ -140,25 +192,22 @@ public:  	 *  	 * @param data is the string that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file. +	 * @param protect if set to true, the appended characters will not be +	 * affected by whitespace handling, they will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 
*/ -	size_t append(const std::string &data, SourceOffset offsStart) -	{  // Append the data to the internal buffer -		buf.insert(buf.end(), data.begin(), data.end()); - -		// Extend the text regions, interpolate the source position (this may -		// yield incorrect results) -		const size_t size = buf.size(); +	size_t append(const std::string &data, SourceOffset offsStart, bool protect) +	{  		for (size_t i = 0; i < data.size(); i++) {  			if (offsStart != InvalidSourceOffset) { -				offsets.storeOffset(offsStart + i, offsStart + i + 1); +				append(data[i], offsStart + i, offsStart + i + 1, protect);  			} else { -				offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); +				append(data[i], InvalidSourceOffset, InvalidSourceOffset, +				       protect);  			}  		} - -		return size; +		return size();  	}  	/** @@ -168,16 +217,86 @@ public:  	 * @param c is the character that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file.  	 * @param offsEnd is the end offset in bytes in the input file. +	 * @param protect if set to true, the appended character will not be +	 * affected by whitespace handling, it will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 
*/ -	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) +	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, +	              bool protect)  	{  		// Add the character to the list and store the location of the character  		// in the source file  		buf.push_back(c); +		protectedChars.push_back(protect);  		offsets.storeOffset(offsStart, offsEnd); -		return buf.size(); + +		// Insert special tokens +		const size_t size = buf.size(); +		const bool isWhitespace = Utils::isWhitespace(c); +		const bool isLinebreak = Utils::isLinebreak(c); + +		// Handle linebreaks +		if (isLinebreak) { +			// Mark linebreaks as linebreak +			mark(Tokens::Newline, size - 1, 1, false); + +			// The linebreak sequence started at the previous character +			if (numLinebreaks == 0) { +				firstLinebreak = size - 1; +			} + +			// Reset the indentation +			currentIndentation = 0; + +			// Increment the number of linebreaks +			numLinebreaks++; + +			const size_t markStart = firstLinebreak; +			const size_t markLength = size - firstLinebreak; + +			// Issue two consecutive linebreaks as paragraph token +			if (numLinebreaks == 2) { +				mark(Tokens::Paragraph, markStart, markLength, false); +			} + +			// Issue three or more consecutive linebreaks as section token +			if (numLinebreaks >= 3) { +				mark(Tokens::Section, markStart, markLength, false); +			} +		} else if (isWhitespace) { +			// Count the whitespace characters at the beginning of the line +			if (numLinebreaks > 0) { +				// Implement the UNIX/Python rule for tabs: Tabs extend to the +				// next multiple of eight. 
+				if (c == '\t') { +					currentIndentation = (currentIndentation + 8) & ~7; +				} else { +					currentIndentation++; +				} +			} +		} + +		// Issue indent and unindent tokens +		if (!isWhitespace && numLinebreaks > 0) { +			// Issue a larger indentation than that in the previous line as +			// "Indent" token +			if (currentIndentation > lastIndentation) { +				mark(Tokens::Indent, size - 1, 0, true); +			} + +			// Issue a smaller indentation than that in the previous line as +			// "Dedent" token +			if (currentIndentation < lastIndentation) { +				mark(Tokens::Dedent, size - 1, 0, true); +			} + +			// Reset the internal state machine +			lastIndentation = currentIndentation; +			numLinebreaks = 0; +		} + +		return size;  	}  	/** @@ -187,11 +306,12 @@ public:  	 * @param bufStart is the start position in the internal buffer. Use the  	 * values returned by append to calculate the start position.  	 * @param len is the length of the token. +	 * @param special tags the mark as "special", preferring it in the sort order  	 */ -	void mark(TokenId id, size_t bufStart, TokenLength len) +	void mark(TokenId id, size_t bufStart, TokenLength len, bool special)  	{  		// Push the new instance back onto the list -		marks.emplace_back(id, bufStart, len); +		marks.emplace_back(id, bufStart, len, special);  		// Update the sorted flag as soon as more than one element is in the  		// list @@ -215,9 +335,13 @@ public:  	 * @return true if a token was returned, false if no more tokens are  	 * available.  	 */ -	bool next(Token &token, WhitespaceMode mode, -	          const std::unordered_set<TokenId> &tokens, size_t &cursor) const +	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, +	          TokenizedDataCursor &cursor) const  	{ +		// Some variables for convenient access +		size_t &bufPos = cursor.bufPos; +		size_t &markPos = cursor.markPos; +  		// Sort the "marks" vector if it has not been sorted yet.  		
if (!sorted) {  			std::sort(marks.begin(), marks.end()); @@ -226,8 +350,8 @@ public:  		// Fetch the next larger TokenMark instance, make sure the token is in  		// the "enabled" list and within the buffer range -		auto it = -		    std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); +		auto it = std::lower_bound(marks.begin() + markPos, marks.end(), +		                           TokenMark(bufPos));  		while (it != marks.end() && (tokens.count(it->id) == 0 ||  		                             it->bufStart + it->len > buf.size())) {  			it++; @@ -240,15 +364,15 @@ public:  		// Depending on the whitespace mode, fetch all the data between the  		// cursor position and the calculated end position and return a token  		// containing that data. -		if (cursor < end && cursor < buf.size()) { +		if (bufPos < end && bufPos < buf.size()) {  			switch (mode) {  				case WhitespaceMode::PRESERVE: {  					token = Token( -					    Tokens::Data, std::string(&buf[cursor], end - cursor), +					    Tokens::Data, std::string(&buf[bufPos], end - bufPos),  					    SourceLocation(sourceId, -					                   offsets.loadOffset(cursor).first, +					                   offsets.loadOffset(bufPos).first,  					                   offsets.loadOffset(end).first)); -					cursor = end; +					bufPos = end;  					return true;  				}  				case WhitespaceMode::TRIM: @@ -258,30 +382,35 @@ public:  					size_t stringStart;  					size_t stringEnd;  					std::string content; +					const char *cBuf = &buf[bufPos]; +					auto filter = [cBuf, this](size_t i) -> bool { +						return Utils::isWhitespace(cBuf[i]) && +						       !protectedChars[i]; +					};  					if (mode == WhitespaceMode::TRIM) { -						content = Utils::trim(&buf[cursor], end - cursor, -						                      stringStart, stringEnd); +						content = Utils::trim(cBuf, end - bufPos, stringStart, +						                      stringEnd, filter);  					} else { -						content = Utils::collapse(&buf[cursor], end - cursor, 
-						                          stringStart, stringEnd); +						content = Utils::collapse( +						    cBuf, end - bufPos, stringStart, stringEnd, filter);  					}  					// If the resulting string is empty (only whitespaces),  					// abort  					if (content.empty()) { -						cursor = end; +						bufPos = end;  						break;  					}  					// Calculate the absolute positions and return the token -					stringStart += cursor; -					stringEnd += cursor; +					stringStart += bufPos; +					stringEnd += bufPos;  					token = Token(  					    Tokens::Data, content,  					    SourceLocation(sourceId,  					                   offsets.loadOffset(stringStart).first,  					                   offsets.loadOffset(stringEnd).first)); -					cursor = end; +					bufPos = end;  					return true;  				}  			} @@ -290,14 +419,18 @@ public:  		// If start equals end, we're currently directly at a token  		// instance. Return this token and advance the cursor to the end of  		// the token. -		if (cursor == end && it != marks.end()) { +		if (bufPos == end && it != marks.end()) {  			const size_t tokenStart = it->bufStart;  			const size_t tokenEnd = it->bufStart + it->len;  			token = Token(  			    it->id, std::string(&buf[tokenStart], it->len),  			    SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,  			                   offsets.loadOffset(tokenEnd).first)); -			cursor = tokenEnd; + +			// Update the cursor, consume the token by incrementing the marks +			// pos counter +			bufPos = tokenEnd; +			markPos = it - marks.begin() + 1;  			return true;  		} @@ -314,8 +447,12 @@ public:  	void clear()  	{  		buf.clear(); -		marks.clear(); +		protectedChars.clear();  		offsets.clear(); +		marks.clear(); +		currentIndentation = 0; +		lastIndentation = 0; +		numLinebreaks = 1;  // Assume the stream starts with a linebreak  		sorted = true;  	} @@ -367,39 +504,35 @@ public:  TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}  TokenizedData::TokenizedData(SourceId 
sourceId) -    : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) +    : impl(std::make_shared<TokenizedDataImpl>(sourceId))  {  }  TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, +                             bool protect)  { -	return impl->append(data, offsStart); +	return impl->append(data, offsStart, protect);  }  size_t TokenizedData::append(char c, SourceOffset offsStart, -                             SourceOffset offsEnd) +                             SourceOffset offsEnd, bool protect)  { -	return impl->append(c, offsStart, offsEnd); +	return impl->append(c, offsStart, offsEnd, protect);  }  void TokenizedData::mark(TokenId id, TokenLength len)  { -	impl->mark(id, impl->size() - len, len); +	impl->mark(id, impl->size() - len, len, false);  }  void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)  { -	impl->mark(id, bufStart, len); +	impl->mark(id, bufStart, len, false);  } -void TokenizedData::clear() -{ -	impl->clear(); -	tokens.clear(); -	cursor = 0; -} +void TokenizedData::clear() { impl->clear(); }  void TokenizedData::trim(size_t length) { impl->trim(length); } @@ -412,49 +545,42 @@ SourceLocation TokenizedData::getLocation() const  	return impl->getLocation();  } -TokenizedDataReader reader() const +TokenizedDataReader TokenizedData::reader() const  { -	return TokenizedDataReader(impl, std::unordered_set<TokenId>{}, 0, 0); +	return TokenizedDataReader(impl, TokenizedDataCursor(), +	                           TokenizedDataCursor());  }  /* Class TokenizedDataReader */ +TokenizedDataReader::TokenizedDataReader( +    std::shared_ptr<const TokenizedDataImpl> impl, +    const TokenizedDataCursor &readCursor, +    const TokenizedDataCursor &peekCursor) +    : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} +  TokenizedDataReaderFork TokenizedDataReader::fork()  { -	
return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor); +	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);  } -bool TokenizedDataReader::atEnd() const { return readCursor >= size(); } +bool TokenizedDataReader::atEnd() const +{ +	return readCursor.bufPos >= impl->size(); +} -bool TokenizedData::read(Token &token, const TokenSet &tokens, -                         WhitespaceMode mode) +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode)  {  	peekCursor = readCursor;  	return impl->next(token, mode, tokens, readCursor);  } -bool TokenizedData::peek(Token &token, const TokenSet &tokens, -                         WhitespaceMode mode) +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode)  {  	return impl->next(token, mode, tokens, peekCursor);  } - -Variant TokenizedData::text(WhitespaceMode mode) -{ -	// Copy the current cursor position to not update the actual cursor position -	// if the operation was not successful -	size_t cursorCopy = cursor; -	Token token; -	if (!impl->next(token, mode, tokens, cursorCopy) || -	    token.id != Tokens::Data) { -		return Variant{nullptr}; -	} - -	// There is indeed a text token, update the internal cursor position and -	// return the token as variant. 
-	cursor = cursorCopy; -	Variant res = Variant::fromString(token.content); -	res.setLocation(token.getLocation()); -	return res; -}  } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 85b80ae..b72ca02 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,7 +36,6 @@  #include <unordered_set>  #include <core/common/Location.hpp> -#include <core/common/Variant.hpp>  #include <core/common/Whitespace.hpp>  #include <core/common/Token.hpp> @@ -48,6 +47,28 @@ class TokenizedDataReader;  class TokenizedDataReaderFork;  /** + * Internally used structure representing a cursor within the TokenizedData + * stream. + */ +struct TokenizedDataCursor { +	/** +	 * Position within the byte buffer. +	 */ +	size_t bufPos; + +	/** +	 * Position within the token mark buffer. +	 */ +	size_t markPos; + +	/** +	 * Default constructor. The resulting cursor points at the beginning of the +	 * stream. +	 */ +	TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/**   * The TokenizedData class stores data extracted from a user defined document.   * The data stored in TokenizedData   */ @@ -88,10 +109,13 @@ public:  	 *  	 * @param data is the string that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file. +	 * @param protect if set to true, the appended characters will not be +	 * affected by whitespace handling, they will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 */ -	size_t append(const std::string &data, SourceOffset offsStart = 0); +	size_t append(const std::string &data, SourceOffset offsStart = 0, +	              bool protect = false);  	/**  	 * Appends a single character to the internal character buffer. @@ -99,10 +123,13 @@ public:  	 * @param c is the character that should be appended to the buffer.  	 
* @param start is the start offset in bytes in the input file.  	 * @param end is the end offset in bytes in the input file. +	 * @param protect if set to true, the appended character will not be +	 * affected by whitespace handling, it will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 */ -	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); +	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, +	              bool protect = false);  	/**  	 * Stores a token ending at the last character of the current buffer. @@ -187,15 +214,16 @@ private:  	/**  	 * Position from which the last element was read from the internal buffer.  	 */ -	size_t readCursor; +	TokenizedDataCursor readCursor;  	/**  	 * Position from which the last element was peeked from the internal buffer.  	 */ -	size_t peekCursor; +	TokenizedDataCursor peekCursor; +protected:  	/** -	 * Private constructor of TokenizedDataReader, taking a reference to the +	 * Protected constructor of TokenizedDataReader, taking a reference to the  	 * internal TokenizedDataImpl structure storing the data that is accessed by  	 * the reader.  	 * @@ -205,8 +233,9 @@ private:  	 * @param peekCursor is the cursor position from which tokens and text are  	 * peeked.  	 */ -	TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl, -	                    size_t readCursor, size_t peekCursor); +	TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl, +	                    const TokenizedDataCursor &readCursor, +	                    const TokenizedDataCursor &peekCursor);  public:  	/** @@ -237,7 +266,7 @@ public:  	 * false if there are no more tokens.  	 
*/  	bool read(Token &token, const TokenSet &tokens = TokenSet{}, -	          WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	          WhitespaceMode mode = WhitespaceMode::TRIM);  	/**  	 * Stores the next token in the given token reference, returns true if the @@ -253,7 +282,7 @@ public:  	 * false if there are no more tokens.  	 */  	bool peek(Token &token, const TokenSet &tokens = TokenSet{}, -	          WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	          WhitespaceMode mode = WhitespaceMode::TRIM);  	/**  	 * Consumes the peeked tokens, the read cursor will now be at the position @@ -265,20 +294,6 @@ public:  	 * Resets the peek cursor to the position of the read cursor.  	 */  	void resetPeek() { peekCursor = readCursor; } - -	/** -	 * Stores the next text token in the given token reference, returns true if -	 * the operation was successful (there was indeed a text token), false if -	 * the next token is not a text token or there were no more tokens. -	 * -	 * @param token is an output parameter into which the read token will be -	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens. -	 * @param mode is the whitespace mode that should be used when a text token -	 * is returned. -	 * @return a string variant with the data if there is any data or a nullptr -	 * variant if there is no text. -	 */ -	Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE);  };  /** @@ -309,8 +324,9 @@ private:  	 * peeked.  	 
*/  	TokenizedDataReaderFork(TokenizedDataReader &parent, -	                        std::shared_ptr<TokenizedDataImpl> impl, -	                        size_t readCursor, size_t peekCursor) +	                        std::shared_ptr<const TokenizedDataImpl> impl, +	                        const TokenizedDataCursor &readCursor, +	                        const TokenizedDataCursor &peekCursor)  	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)  	{  	} @@ -320,7 +336,7 @@ public:  	 * Commits the read/peek progress to the underlying parent.  	 */  	void commit() { parent = *this; } -} +};  }  #endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 51787cd..e78b0f4 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -156,6 +156,7 @@ public:  		return res;  	}  }; +  }  /* Class Tokenizer */ @@ -229,12 +230,6 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  		} else {  			// Record all incoming characters  			data.append(c, charStart, charEnd); - -			// Special token processing -			// TODO: Build a special state machine for this in another class -			if (c == '\n') { -				data.mark(Tokens::Newline, 1); -			}  		}  		// Swap the lookups and the nextLookups list diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index 2ddb9c9..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -28,7 +28,7 @@  #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_  #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include <set> +#include <cstdint>  #include <string>  #include <vector>  | 
