| field | value | date |
|---|---|---|
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-03-03 15:08:18 +0100 |
| committer | Andreas Stöckel <andreas@somweyr.de> | 2015-03-03 15:08:18 +0100 |
| commit | 466ff991bcfad76d78100193aacbfaf74d542b26 (patch) | |
| tree | dafdb41ec766e83c6e37a8b9865e6ef454ff4def /src/core/parser/utils | |
| parent | b5cdca0331117ad3834b61eadd94ab3fcb6d2fba (diff) | |
| parent | fb8d4cdf01909b61e4e5d0806ec6de178ff0058c (diff) | |
Storing type and name in the HandlerData once again, using a Token
Conflicts:
	application/src/core/parser/stack/Callbacks.hpp
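One of the changes merged below reworks how SourceOffsetVector stores discontinuities in the character offsets: a gap is now recorded in a separate map and the corresponding length slot is set to MAX_LEN, instead of stretching the length of the previous character. The following minimal sketch illustrates the intended behaviour; it is inferred from the SourceOffsetVector.hpp hunks in the diff below, assumes a default-constructed, empty vector, and uses purely illustrative offsets.

```cpp
#include <cassert>

#include "SourceOffsetVector.hpp"  // src/core/parser/utils

using namespace ousia;

int main()
{
	SourceOffsetVector offs;

	// Two contiguous characters: [0, 1) and [1, 2)
	offs.storeOffset(0, 1);
	offs.storeOffset(1, 2);

	// Discontinuity: the next character starts at offset 10. The pair
	// {10, 12} is kept in the internal "gaps" map and the length slot is
	// set to MAX_LEN instead of inflating the previous character's length.
	offs.storeOffset(10, 12);

	SourceOffsetVector::OffsPair p = offs.loadOffset(2);
	assert(p.first == 10 && p.second == 12);

	// Trimming back to two characters makes the gap entry inaccessible again
	offs.trim(2);
	assert(offs.size() == 2);
	return 0;
}
```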
Diffstat (limited to 'src/core/parser/utils')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | src/core/parser/utils/SourceOffsetVector.hpp | 89 |
| -rw-r--r-- | src/core/parser/utils/Token.cpp | 24 |
| -rw-r--r-- | src/core/parser/utils/Token.hpp | 142 |
| -rw-r--r-- | src/core/parser/utils/TokenTrie.cpp | 16 |
| -rw-r--r-- | src/core/parser/utils/TokenTrie.hpp | 11 |
| -rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 361 |
| -rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 246 |
| -rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 276 |
| -rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 142 |
9 files changed, 796 insertions, 511 deletions
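Most of the churn in the diff below replaces the whitespace-handler based Tokenizer API with one that writes character data into a TokenizedData buffer and distinguishes primary tokens (returned as standalone Token instances) from non-primary tokens (returned as marks inside the TokenizedData stream). The following rough usage sketch is built only from the signatures introduced in this patch; the token strings, the include paths and the assumption that TokenSet can be brace-initialized with a TokenId are illustrative, not verbatim project code.

```cpp
#include <core/common/CharReader.hpp>
#include <core/common/Token.hpp>

#include "TokenizedData.hpp"  // src/core/parser/utils
#include "Tokenizer.hpp"

using namespace ousia;

void tokenizeExample(CharReader &reader)
{
	Tokenizer tokenizer;

	// Primary tokens are returned by Tokenizer::read() as standalone tokens
	TokenId command = tokenizer.registerToken("\\begin", true);

	// Non-primary tokens are only marked inside the TokenizedData stream
	TokenId emphasis = tokenizer.registerToken("*", false);

	Token token;
	TokenizedData data(reader.getSourceId());
	while (tokenizer.read(reader, token, data)) {
		// Text and non-primary tokens are consumed through a reader cursor
		TokenizedDataReader dataReader = data.reader();
		Token t;
		while (dataReader.read(t, TokenSet{emphasis}, WhitespaceMode::TRIM)) {
			// t.id is either Tokens::Data or the "emphasis" token id
		}

		if (token.id == command) {
			// Handle the primary token here
		}
		data.clear();
	}
}
```

TokenizedDataReader also declares peek()/consumePeek() and fork()/commit() for lookahead, mirroring the read/peek split of the CharReader.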
diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..f322a88 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -33,6 +33,7 @@  #include <limits>  #include <vector>  #include <utility> +#include <unordered_map>  #include <core/common/Location.hpp> @@ -43,6 +44,9 @@ namespace ousia {   * a delta compression.   */  class SourceOffsetVector { +public: +	using OffsPair = std::pair<SourceOffset, SourceOffset>; +  private:  	/**  	 * Type used for representing the length of a character. @@ -82,9 +86,12 @@ private:  	std::vector<SourceOffset> offsets;  	/** +	 * Map used to store discontinuities in the character offsets. +	 */ +	std::unordered_map<size_t, OffsPair> gaps; + +	/**  	 * Last position given as "end" position in the storeOffset() method. -	 * Used to adapt the length of the previous element in case start and end -	 * positions do not match.  	 */  	SourceOffset lastEnd; @@ -105,19 +112,22 @@ public:  		// Make sure (end - start) is smaller than MAX_LEN  		assert(end - start < MAX_LEN); -		// Adapt the length of the previous character in case there is a gap -		if (!lens.empty() && start > lastEnd) { -			lens.back() += start - lastEnd; -		} -		lastEnd = end; -  		// Store an absolute offset every OFFSET_INTERVAL elements  		if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) {  			offsets.push_back(start);  		} -		// Store the length -		lens.push_back(end - start); +		// Adapt the length of the previous character in case there is a gap +		if (!lens.empty() && start > lastEnd) { +			// There is a discontinuity, store the given offsets in the "gaps" +			// map +			gaps[lens.size()] = OffsPair(start, end); +			lens.push_back(MAX_LEN); +		} else { +			// Store the length +			lens.push_back(end - start); +		} +		lastEnd = end;  	}  	/** @@ -127,14 +137,13 @@ public:  	 * read.  	 * @return a pair containing start and end source offset.  	 */ -	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) +	OffsPair loadOffset(size_t idx) const  	{  		// Special treatment for the last character  		const size_t count = lens.size();  		if (idx > 0 && idx == count) {  			auto offs = loadOffset(count - 1); -			return std::pair<SourceOffset, SourceOffset>(offs.second, -			                                             offs.second); +			return OffsPair(offs.second, offs.second);  		}  		// Calculate the start index in the lens vector and in the offsets @@ -146,18 +155,66 @@ public:  		assert(idx < count);  		assert(offsetIdx < offsets.size()); +		// If the length of the last character is MAX_LEN, the position is +		// stored in the "gaps" list +		if (lens[idx] == MAX_LEN) { +			auto it = gaps.find(idx); +			assert(it != gaps.end()); +			return it->second; +		} +  		// Sum over the length starting with the start offset  		SourceOffset start = offsets[offsetIdx];  		for (size_t i = sumStartIdx; i < idx; i++) { -			start += lens[i]; +			if (lens[i] == MAX_LEN) { +				auto it = gaps.find(i); +				assert(it != gaps.end()); +				start = it->second.first; +			} else { +				start += lens[i]; +			}  		} -		return std::pair<SourceOffset, SourceOffset>(start, start + lens[idx]); +		return OffsPair(start, start + lens[idx]);  	}  	/**  	 * Returns the number of characters for which offsets are stored.  	 */ -	size_t size() { return lens.size(); } +	size_t size() const { return lens.size(); } + +	/** +	 * Trims the length of the TokenizedData instance to the given length. 
+	 * Removes all token matches that lie within the trimmed region. +	 * +	 * @param length is the number of characters to which the TokenizedData +	 * instance should be trimmed. +	 */ +	void trim(size_t length) +	{ +		if (length < size()) { +			lens.resize(length); +			if (length > 0) { +				offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); +				lastEnd = loadOffset(length - 1).second; +			} else { +				offsets.clear(); +				gaps.clear(); +				lastEnd = 0; +			} +		} +	} + +	/** +	 * Resets the SourceOffsetVector to the state it had when it was +	 * constructed. +	 */ +	void clear() +	{ +		lens.clear(); +		offsets.clear(); +		gaps.clear(); +		lastEnd = 0; +	}  };  } diff --git a/src/core/parser/utils/Token.cpp b/src/core/parser/utils/Token.cpp deleted file mode 100644 index 8bcdbb5..0000000 --- a/src/core/parser/utils/Token.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "Token.hpp" - -namespace ousia { -// Stub to make sure Tokens.hpp is valid -} - diff --git a/src/core/parser/utils/Token.hpp b/src/core/parser/utils/Token.hpp deleted file mode 100644 index f907450..0000000 --- a/src/core/parser/utils/Token.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file Token.hpp - * - * Definition of the TokenId id and constants for some special tokens. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_HPP_ -#define _OUSIA_TOKEN_HPP_ - -#include <cstdint> -#include <limits> -#include <string> - -#include <core/common/Location.hpp> - -namespace ousia { - -/** - * The TokenId is used to give each token id a unique id. - */ -using TokenId = uint32_t; - -/** - * Type used for storing token lengths. - */ -using TokenLength = uint16_t; - -/** - * Namespace containing constants for TokenId instances with special meaning. - */ -namespace Tokens { -/** - * Token which is not a token. - */ -constexpr TokenId Empty = std::numeric_limits<TokenId>::max(); - -/** - * Token which represents data (represented as TokenizedData). 
- */ -constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1; - -/** - * Token which represents a newline token. - */ -constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2; - -/** - * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. - */ -constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3; - -/** - * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. - */ -constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4; - -/** - * Maximum token id to be used. Tokens allocated for users should not surpass - * this value. - */ -constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255; -} - -/** - * The Token structure describes a token discovered by the Tokenizer or read - * from the TokenizedData struct. - */ -struct Token { -	/** -	 * Id of the id of this token. -	 */ -	TokenId id; - -	/** -	 * String that was matched. -	 */ -	std::string content; - -	/** -	 * Location from which the string was extracted. -	 */ -	SourceLocation location; - -	/** -	 * Default constructor. -	 */ -	Token() : id(Tokens::Empty) {} - -	/** -	 * Constructor of the Token struct. -	 * -	 * @param id represents the token id. -	 * @param content is the string content that has been extracted. -	 * @param location is the location of the extracted string content in the -	 * source file. -	 */ -	Token(TokenId id, const std::string &content, SourceLocation location) -	    : id(id), content(content), location(location) -	{ -	} - -	/** -	 * Constructor of the Token struct, only initializes the token id -	 * -	 * @param id is the id corresponding to the id of the token. -	 */ -	Token(TokenId id) : id(id) {} - -	/** -	 * The getLocation function allows the tokens to be directly passed as -	 * parameter to Logger or LoggableException instances. -	 * -	 * @return a reference at the location field -	 */ -	const SourceLocation &getLocation() const { return location; } -}; -} - -#endif /* _OUSIA_TOKENS_HPP_ */ - diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia {  /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {}  /* Class DynamicTokenTree */  bool TokenTrie::registerToken(const std::string &token, -                              TokenId type) noexcept +                              TokenId id) noexcept  {  	// Abort if the token is empty -- this would taint the root node  	if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token,  	}  	// If the resulting node already has a type set, we're screwed. -	if (node->type != Tokens::Empty) { +	if (node->id != Tokens::Empty) {  		return false;  	}  	// Otherwise just set the type to the given type. 
-	node->type = type; +	node->id = id;  	return true;  } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept  		// Reset the subtree handler if this node has another type  		node = it->second.get(); -		if ((node->type != Tokens::Empty || node->children.size() > 1) && +		if ((node->id != Tokens::Empty || node->children.size() > 1) &&  		    (i + 1 != token.size())) {  			subtreeRoot = node;  			subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept  	}  	// If the node type is already Tokens::Empty, we cannot do anything here -	if (node->type == Tokens::Empty) { +	if (node->id == Tokens::Empty) {  		return false;  	}  	// If the target node has children, we cannot delete the subtree. Set the  	// type to Tokens::Empty instead  	if (!node->children.empty()) { -		node->type = Tokens::Empty; +		node->id = Tokens::Empty;  		return true;  	} @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept  		}  		node = it->second.get();  	} -	return node->type; +	return node->id;  }  } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@  #include <limits>  #include <unordered_map> -#include "Token.hpp" +#include <core/common/Token.hpp>  namespace ousia { @@ -75,10 +75,9 @@ public:  		ChildMap children;  		/** -		 * Reference at the corresponding token descriptor. Set to nullptr if -		 * no token is attached to this node. +		 * Id of the token represented by this node.  		 */ -		TokenId type; +		TokenId id;  		/**  		 * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public:  	 *  	 * @param token is the character sequence that should be registered as  	 * token. -	 * @param type is the descriptor that should be set for this token. +	 * @param id is the descriptor that should be set for this token.  	 * @return true if the operation is successful, false otherwise.  	 */ -	bool registerToken(const std::string &token, TokenId type) noexcept; +	bool registerToken(const std::string &token, TokenId id) noexcept;  	/**  	 * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..d8a8b37 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -26,6 +26,11 @@  #include "TokenizedData.hpp"  namespace ousia { +/** + * Maximum token length. + */ +constexpr TokenLength MaxTokenLength = std::numeric_limits<TokenLength>::max(); +  namespace {  /**   * Structure used to represent the position of a token in the internal @@ -48,6 +53,11 @@ struct TokenMark {  	TokenLength len;  	/** +	 * Specifies whether the token is special or not. +	 */ +	bool special; + +	/**  	 * Constructor of the TokenMark structure, initializes all members with the  	 * given values.  	 * @@ -55,9 +65,10 @@ struct TokenMark {  	 * @param bufStart is the start position of the TokenMark in the internal  	 * character buffer.  	 * @param len is the length of the token. +	 * @param special modifies the sort order, special tokens are prefered.  	 
*/ -	TokenMark(TokenId id, size_t bufStart, TokenLength len) -	    : bufStart(bufStart), id(id), len(len) +	TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) +	    : bufStart(bufStart), id(id), len(len), special(special)  	{  	} @@ -72,7 +83,8 @@ struct TokenMark {  	TokenMark(size_t bufStart)  	    : bufStart(bufStart),  	      id(Tokens::Empty), -	      len(std::numeric_limits<TokenLength>::max()) +	      len(MaxTokenLength), +	      special(true)  	{  	} @@ -86,8 +98,22 @@ struct TokenMark {  	 */  	friend bool operator<(const TokenMark &m1, const TokenMark &m2)  	{ -		return (m1.bufStart < m2.bufStart) || -		       (m1.bufStart == m2.bufStart && m1.len > m2.len); +		// Prefer the mark with the smaller bufStart +		if (m1.bufStart < m2.bufStart) { +			return true; +		} + +		// Special handling for marks with the same bufStart +		if (m1.bufStart == m2.bufStart) { +			// If exactly one of the two marks is special, return true if this +			// one is special +			if (m1.special != m2.special) { +				return m1.special; +			} +			// Otherwise prefer longer marks +			return m1.len > m2.len; +		} +		return false;  	}  };  } @@ -110,9 +136,9 @@ private:  	std::vector<char> buf;  	/** -	 * Vector containing all token marks. +	 * Buffset storing the "protected" flag of the character data.  	 */ -	std::vector<TokenMark> marks; +	std::vector<bool> protectedChars;  	/**  	 * Vector storing all the character offsets efficiently. @@ -120,9 +146,34 @@ private:  	SourceOffsetVector offsets;  	/** +	 * Vector containing all token marks. +	 */ +	mutable std::vector<TokenMark> marks; + +	/** +	 * Position of the first linebreak in a sequence of linebreaks. +	 */ +	size_t firstLinebreak; + +	/** +	 * Current indentation level. +	 */ +	uint16_t currentIndentation; + +	/** +	 * Last indentation level. +	 */ +	uint16_t lastIndentation; + +	/** +	 * Number of linebreaks without any content between them. +	 */ +	uint16_t numLinebreaks; + +	/**  	 * Flag indicating whether the internal "marks" vector is sorted.  	 */ -	bool sorted; +	mutable bool sorted;  public:  	/** @@ -132,7 +183,7 @@ public:  	 * @param sourceId is the source identifier that should be used for  	 * constructing the location when returning tokens.  	 */ -	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} +	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }  	/**  	 * Appends a complete string to the internal character buffer and extends @@ -140,22 +191,22 @@ public:  	 *  	 * @param data is the string that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file. +	 * @param protect if set to true, the appended characters will not be +	 * affected by whitespace handling, they will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 
*/ -	size_t append(const std::string &data, SourceOffset offsStart) -	{  // Append the data to the internal buffer -		buf.insert(buf.end(), data.begin(), data.end()); - -		// Extend the text regions, interpolate the source position (this may -		// yield incorrect results) -		const size_t size = buf.size(); -		for (SourceOffset offs = offsStart; offs < offsStart + data.size(); -		     offs++) { -			offsets.storeOffset(offs, offs + 1); +	size_t append(const std::string &data, SourceOffset offsStart, bool protect) +	{ +		for (size_t i = 0; i < data.size(); i++) { +			if (offsStart != InvalidSourceOffset) { +				append(data[i], offsStart + i, offsStart + i + 1, protect); +			} else { +				append(data[i], InvalidSourceOffset, InvalidSourceOffset, +				       protect); +			}  		} - -		return size; +		return size();  	}  	/** @@ -165,16 +216,86 @@ public:  	 * @param c is the character that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file.  	 * @param offsEnd is the end offset in bytes in the input file. +	 * @param protect if set to true, the appended character will not be +	 * affected by whitespace handling, it will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 */ -	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) +	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, +	              bool protect)  	{  		// Add the character to the list and store the location of the character  		// in the source file  		buf.push_back(c); +		protectedChars.push_back(protect);  		offsets.storeOffset(offsStart, offsEnd); -		return buf.size(); + +		// Insert special tokens +		const size_t size = buf.size(); +		const bool isWhitespace = Utils::isWhitespace(c); +		const bool isLinebreak = Utils::isLinebreak(c); + +		// Handle linebreaks +		if (isLinebreak) { +			// Mark linebreaks as linebreak +			mark(Tokens::Newline, size - 1, 1, false); + +			// The linebreak sequence started at the previous character +			if (numLinebreaks == 0) { +				firstLinebreak = size - 1; +			} + +			// Reset the indentation +			currentIndentation = 0; + +			// Increment the number of linebreaks +			numLinebreaks++; + +			const size_t markStart = firstLinebreak; +			const size_t markLength = size - firstLinebreak; + +			// Issue two consecutive linebreaks as paragraph token +			if (numLinebreaks == 2) { +				mark(Tokens::Paragraph, markStart, markLength, false); +			} + +			// Issue three consecutive linebreaks as paragraph token +			if (numLinebreaks >= 3) { +				mark(Tokens::Section, markStart, markLength, false); +			} +		} else if (isWhitespace) { +			// Count the whitespace characters at the beginning of the line +			if (numLinebreaks > 0) { +				// Implement the UNIX/Pyhton rule for tabs: Tabs extend to the +				// next multiple of eight. 
+				if (c == '\t') { +					currentIndentation = (currentIndentation + 8) & ~7; +				} else { +					currentIndentation++; +				} +			} +		} + +		// Issue indent and unindent tokens +		if (!isWhitespace && numLinebreaks > 0) { +			// Issue a larger indentation than that in the previous line as +			// "Indent" token +			if (currentIndentation > lastIndentation) { +				mark(Tokens::Indent, size - 1, 0, true); +			} + +			// Issue a smaller indentation than that in the previous line as +			// "Dedent" token +			if (currentIndentation < lastIndentation) { +				mark(Tokens::Dedent, size - 1, 0, true); +			} + +			// Reset the internal state machine +			lastIndentation = currentIndentation; +			numLinebreaks = 0; +		} + +		return size;  	}  	/** @@ -184,11 +305,12 @@ public:  	 * @param bufStart is the start position in the internal buffer. Use the  	 * values returned by append to calculate the start position.  	 * @param len is the length of the token. +	 * @param special tags the mark as "special", prefering it in the sort order  	 */ -	void mark(TokenId id, size_t bufStart, TokenLength len) +	void mark(TokenId id, size_t bufStart, TokenLength len, bool special)  	{  		// Push the new instance back onto the list -		marks.emplace_back(id, bufStart, len); +		marks.emplace_back(id, bufStart, len, special);  		// Update the sorted flag as soon as more than one element is in the  		// list @@ -212,9 +334,13 @@ public:  	 * @return true if a token was returned, false if no more tokens are  	 * available.  	 */ -	bool next(Token &token, WhitespaceMode mode, -	          const std::unordered_set<TokenId> &tokens, size_t &cursor) +	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, +	          TokenizedDataCursor &cursor) const  	{ +		// Some variables for convenient access +		size_t &bufPos = cursor.bufPos; +		size_t &markPos = cursor.markPos; +  		// Sort the "marks" vector if it has not been sorted yet.  		if (!sorted) {  			std::sort(marks.begin(), marks.end()); @@ -222,10 +348,11 @@ public:  		}  		// Fetch the next larger TokenMark instance, make sure the token is in -		// the "enabled" list -		auto it = -		    std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); -		while (it != marks.end() && tokens.count(it->id) == 0) { +		// the "enabled" list and within the buffer range +		auto it = std::lower_bound(marks.begin() + markPos, marks.end(), +		                           TokenMark(bufPos)); +		while (it != marks.end() && (tokens.count(it->id) == 0 || +		                             it->bufStart + it->len > buf.size())) {  			it++;  		} @@ -236,15 +363,15 @@ public:  		// Depending on the whitespace mode, fetch all the data between the  		// cursor position and the calculated end position and return a token  		// containing that data. 
-		if (cursor < end && cursor < buf.size()) { +		if (bufPos < end && bufPos < buf.size()) {  			switch (mode) {  				case WhitespaceMode::PRESERVE: {  					token = Token( -					    Tokens::Data, std::string(&buf[cursor], end - cursor), +					    Tokens::Data, std::string(&buf[bufPos], end - bufPos),  					    SourceLocation(sourceId, -					                   offsets.loadOffset(cursor).first, +					                   offsets.loadOffset(bufPos).first,  					                   offsets.loadOffset(end).first)); -					cursor = end; +					bufPos = end;  					return true;  				}  				case WhitespaceMode::TRIM: @@ -254,30 +381,35 @@ public:  					size_t stringStart;  					size_t stringEnd;  					std::string content; +					const char *cBuf = &buf[bufPos]; +					auto filter = [cBuf, this](size_t i) -> bool { +						return Utils::isWhitespace(cBuf[i]) && +						       !protectedChars[i]; +					};  					if (mode == WhitespaceMode::TRIM) { -						content = Utils::trim(&buf[cursor], end - cursor, -						                      stringStart, stringEnd); +						content = Utils::trim(cBuf, end - bufPos, stringStart, +						                      stringEnd, filter);  					} else { -						content = Utils::collapse(&buf[cursor], end - cursor, -						                          stringStart, stringEnd); +						content = Utils::collapse( +						    cBuf, end - bufPos, stringStart, stringEnd, filter);  					}  					// If the resulting string is empty (only whitespaces),  					// abort  					if (content.empty()) { -						cursor = end; +						bufPos = end;  						break;  					}  					// Calculate the absolute positions and return the token -					stringStart += cursor; -					stringEnd += cursor; +					stringStart += bufPos; +					stringEnd += bufPos;  					token = Token(  					    Tokens::Data, content,  					    SourceLocation(sourceId,  					                   offsets.loadOffset(stringStart).first,  					                   offsets.loadOffset(stringEnd).first)); -					cursor = end; +					bufPos = end;  					return true;  				}  			} @@ -286,14 +418,18 @@ public:  		// If start equals end, we're currently directly at a token  		// instance. Return this token and advance the cursor to the end of  		// the token. -		if (cursor == end && it != marks.end()) { +		if (bufPos == end && it != marks.end()) {  			const size_t tokenStart = it->bufStart;  			const size_t tokenEnd = it->bufStart + it->len;  			token = Token(  			    it->id, std::string(&buf[tokenStart], it->len),  			    SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,  			                   offsets.loadOffset(tokenEnd).first)); -			cursor = tokenEnd; + +			// Update the cursor, consume the token by incrementing the marks +			// pos counter +			bufPos = tokenEnd; +			markPos = it - marks.begin() + 1;  			return true;  		} @@ -304,11 +440,64 @@ public:  	}  	/** +	 * Resets the TokenizedDataImpl instance to the state it had when it was +	 * constructred. +	 */ +	void clear() +	{ +		buf.clear(); +		protectedChars.clear(); +		offsets.clear(); +		marks.clear(); +		firstLinebreak = 0; +		currentIndentation = 0; +		lastIndentation = 0; +		numLinebreaks = 1;  // Assume the stream starts with a linebreak +		sorted = true; +	} + +	/** +	 * Trims the length of the TokenizedDataImpl instance to the given length. +	 * +	 * @param length is the number of characters to which the TokenizedData +	 * instance should be trimmed. 
+	 */ +	void trim(size_t length) +	{ +		if (length < size()) { +			buf.resize(length); +			protectedChars.resize(length); +			offsets.trim(length); +		} +	} + +	/**  	 * Returns the current size of the internal buffer.  	 *  	 * @return the size of the internal character buffer.  	 */ -	size_t getSize() { return buf.size(); } +	size_t size() const { return buf.size(); } + +	/** +	 * Returns true if no data is in the data buffer. +	 * +	 * @return true if the "buf" instance has no data. +	 */ +	bool empty() const { return buf.empty(); } + +	/** +	 * Returns the current location of all data in the buffer. +	 * +	 * @return the location of the entire data represented by this instance. +	 */ +	SourceLocation getLocation() const +	{ +		if (empty()) { +			return SourceLocation{sourceId}; +		} +		return SourceLocation{sourceId, offsets.loadOffset(0).first, +		                      offsets.loadOffset(size()).second}; +	}  };  /* Class TokenizedData */ @@ -316,50 +505,90 @@ public:  TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}  TokenizedData::TokenizedData(SourceId sourceId) -    : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) +    : impl(std::make_shared<TokenizedDataImpl>(sourceId))  {  } +TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart, +                             SourceId sourceId) +    : TokenizedData(sourceId) +{ +	append(data, offsStart); +} +  TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, +                             bool protect)  { -	return impl->append(data, offsStart); +	return impl->append(data, offsStart, protect);  }  size_t TokenizedData::append(char c, SourceOffset offsStart, -                             SourceOffset offsEnd) +                             SourceOffset offsEnd, bool protect)  { -	return impl->append(c, offsStart, offsEnd); +	return impl->append(c, offsStart, offsEnd, protect);  }  void TokenizedData::mark(TokenId id, TokenLength len)  { -	impl->mark(id, impl->getSize() - len, len); +	impl->mark(id, impl->size() - len, len, false);  }  void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)  { -	impl->mark(id, bufStart, len); +	impl->mark(id, bufStart, len, false);  } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { impl->clear(); } + +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const  { -	return impl->next(token, mode, tokens, cursor); +	return impl->getLocation();  } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +TokenizedDataReader TokenizedData::reader() const  { -	// Copy the current cursor position to not update the actual cursor position -	// if the operation was not successful -	size_t cursorCopy = cursor; -	if (!impl->next(token, mode, tokens, cursorCopy) || -	    token.id != Tokens::Data) { -		return false; -	} +	return TokenizedDataReader(impl, TokenizedDataCursor(), +	                           TokenizedDataCursor()); +} + +/* Class TokenizedDataReader */ -	// There is indeed a text token, update the internal cursor position -	cursor = cursorCopy; -	return true; +TokenizedDataReader::TokenizedDataReader( +    std::shared_ptr<const TokenizedDataImpl> impl, +    const 
TokenizedDataCursor &readCursor, +    const TokenizedDataCursor &peekCursor) +    : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ +	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const +{ +	return readCursor.bufPos >= impl->size(); +} + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode) +{ +	peekCursor = readCursor; +	return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, +                               WhitespaceMode mode) +{ +	return impl->next(token, mode, tokens, peekCursor);  }  } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..bc937f2 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -37,40 +37,48 @@  #include <core/common/Location.hpp>  #include <core/common/Whitespace.hpp> - -#include "Token.hpp" +#include <core/common/Token.hpp>  namespace ousia {  // Forward declaration  class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork;  /** - * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. + * Internally used structure representing a cursor within the TokenizedData + * stream.   */ -class TokenizedData { -private: +struct TokenizedDataCursor {  	/** -	 * Shared pointer pointing at the internal data. This data is shared when -	 * copying TokenizedData instances, which corresponds to forking a -	 * TokenizedData instance. +	 * Position within the byte buffer.  	 */ -	std::shared_ptr<TokenizedDataImpl> impl; +	size_t bufPos;  	/** -	 * Contains all currently enabled token ids. +	 * Position within the token mark buffer.  	 */ -	std::unordered_set<TokenId> tokens; +	size_t markPos;  	/** -	 * Position from which the last element was read from the internal buffer. -	 * This information is not shared with the other instances of TokenizedData -	 * pointing at the same location. +	 * Default constructor. The resulting cursor points at the beginning of the +	 * stream. +	 */ +	TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * The data stored in TokenizedData + */ +class TokenizedData { +private: +	/** +	 * Shared pointer pointing at the internal data. This data is shared with +	 * all the TokenizedDataReader instances.  	 */ -	size_t cursor; +	std::shared_ptr<TokenizedDataImpl> impl;  public:  	/** @@ -88,6 +96,18 @@ public:  	TokenizedData(SourceId sourceId);  	/** +	 * Creates a new instance of TokenizedData, takes a SourceId and an initial +	 * string buffer. +	 * +	 * @param data is the string that should be appended to the buffer. +	 * @param offsStart is the start offset in bytes in the input file. +	 * @param sourceId is the source identifier that should be used for +	 * constructing the location when returning tokens. +	 */ +	TokenizedData(const std::string &data, SourceOffset offsStart = 0, +	              SourceId sourceId = InvalidSourceId); + +	/**  	 * Destructor. 
Needs to be defined explicitly for freeing a shared pointer  	 * of the incomplete TokenizedDataImpl type.  	 */ @@ -101,10 +121,13 @@ public:  	 *  	 * @param data is the string that should be appended to the buffer.  	 * @param offsStart is the start offset in bytes in the input file. +	 * @param protect if set to true, the appended characters will not be +	 * affected by whitespace handling, they will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 */ -	size_t append(const std::string &data, SourceOffset offsStart = 0); +	size_t append(const std::string &data, SourceOffset offsStart = 0, +	              bool protect = false);  	/**  	 * Appends a single character to the internal character buffer. @@ -112,10 +135,13 @@ public:  	 * @param c is the character that should be appended to the buffer.  	 * @param start is the start offset in bytes in the input file.  	 * @param end is the end offset in bytes in the input file. +	 * @param protect if set to true, the appended character will not be +	 * affected by whitespace handling, it will be returned as is.  	 * @return the current size of the internal byte buffer. The returned value  	 * is intended to be used for the "mark" function.  	 */ -	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); +	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, +	              bool protect = false);  	/**  	 * Stores a token ending at the last character of the current buffer. @@ -136,54 +162,194 @@ public:  	void mark(TokenId id, size_t bufStart, TokenLength len);  	/** -	 * Enables a single token id. Enabled tokens will no longer be returned as -	 * text. Instead, when querying for the next token, TokenizedData will -	 * return them as token and not as part of a Text token. +	 * Resets the TokenizedData instance to the state it had when it was +	 * constructred. +	 */ +	void clear(); + +	/** +	 * Trims the length of the TokenizedData instance to the given length. Note +	 * that this function does not remove any token matches for performance +	 * reasons, it merely renders them incaccessible. Appending new data after +	 * calling trim will make the token marks accessible again. Thus this method +	 * should be the last function called to modify the data buffer and the +	 * token marks.  	 * -	 * @param id is the TokenId of the token that should be enabled. +	 * @param length is the number of characters to which the TokenizedData +	 * instance should be trimmed. +	 */ +	void trim(size_t length); + +	/** +	 * Returns the number of characters currently represented by this +	 * TokenizedData instance.  	 */ -	void enableToken(TokenId id) { tokens.insert(id); } +	size_t size() const;  	/** -	 * Enables a set of token ids. Enabled tokens will no longer be returned as -	 * text. Instead, when querying for the next token, TokenizedData will -	 * return them as token and not as part of a Text token. +	 * Returns true if the TokenizedData instance is empty, false otherwise.  	 * -	 * @param ids is the TokenId of the token that should be enabled. +	 * @return true if not data is stored inside the TokenizedData instance.  	 */ -	void enableToken(const std::unordered_set<TokenId> &ids) -	{ -		tokens.insert(ids.begin(), ids.end()); -	} +	bool empty() const; + +	/** +	 * Returns the location of the entire TokenizedData instance. +	 * +	 * @return the location of the entire data represented by this instance. 
+	 */ +	SourceLocation getLocation() const; + +	/** +	 * Returns a TokenizedDataReader instance that can be used to access the +	 * data. +	 * +	 * @return a new TokenizedDataReader instance pointing at the beginning of +	 * the internal buffer. +	 */ +	TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader + */ +class TokenizedDataReader { +private: +	friend TokenizedData; + +	/** +	 * Shared pointer pointing at the internal data. This data is shared with +	 * all the TokenizedDataReader instances. +	 */ +	std::shared_ptr<const TokenizedDataImpl> impl; + +	/** +	 * Position from which the last element was read from the internal buffer. +	 */ +	TokenizedDataCursor readCursor; + +	/** +	 * Position from which the last element was peeked from the internal buffer. +	 */ +	TokenizedDataCursor peekCursor; + +protected: +	/** +	 * Protected constructor of TokenizedDataReader, taking a reference to the +	 * internal TokenizedDataImpl structure storing the data that is accessed by +	 * the reader. +	 * +	 * @param impl is the TokenizedDataImpl instance that holds the actual data. +	 * @param readCursor is the cursor position from which tokens and text are +	 * read. +	 * @param peekCursor is the cursor position from which tokens and text are +	 * peeked. +	 */ +	TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl, +	                    const TokenizedDataCursor &readCursor, +	                    const TokenizedDataCursor &peekCursor); + +public: +	/** +	 * Returns a new TokenizedDataReaderFork from which tokens and text can be +	 * read without advancing this reader instance. +	 */ +	TokenizedDataReaderFork fork(); + +	/** +	 * Returns true if this TokenizedData instance is at the end. +	 * +	 * @return true if the end of the TokenizedData instance has been reached. +	 */ +	bool atEnd() const;  	/**  	 * Stores the next token in the given token reference, returns true if the -	 * operation was successful, false if there are no more tokens. +	 * operation was successful, false if there are no more tokens. Advances the +	 * internal cursor and re  	 *  	 * @param token is an output parameter into which the read token will be  	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens. +	 * @param tokens is the set of token identifers, representing the currently +	 * enabled tokens.  	 * @param mode is the whitespace mode that should be used when a text token  	 * is returned.  	 * @return true if the operation was successful and there is a next token,  	 * false if there are no more tokens.  	 */ -	bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	bool read(Token &token, const TokenSet &tokens = TokenSet{}, +	          WhitespaceMode mode = WhitespaceMode::TRIM);  	/** -	 * Stores the next text token in the given token reference, returns true if -	 * the operation was successful (there was indeed a text token), false if -	 * the next token is not a text token or there were no more tokens. +	 * Stores the next token in the given token reference, returns true if the +	 * operation was successful, false if there are no more tokens.  	 *  	 * @param token is an output parameter into which the read token will be  	 * stored. The TokenId is set to Tokens::Empty if there are no more tokens. +	 * @param tokens is the set of token identifers, representing the currently +	 * enabled tokens.  	 * @param mode is the whitespace mode that should be used when a text token  	 * is returned.  	 
* @return true if the operation was successful and there is a next token,  	 * false if there are no more tokens.  	 */ -	bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +	bool peek(Token &token, const TokenSet &tokens = TokenSet{}, +	          WhitespaceMode mode = WhitespaceMode::TRIM); + +	/** +	 * Consumes the peeked tokens, the read cursor will now be at the position +	 * of the peek cursor. +	 */ +	void consumePeek() { readCursor = peekCursor; } + +	/** +	 * Resets the peek cursor to the position of the read cursor. +	 */ +	void resetPeek() { peekCursor = readCursor; } +}; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader + */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: +	friend TokenizedDataReader; + +	/** +	 * Reference pointing at the parent TokenizedDataReader to which changes may +	 * be commited. +	 */ +	TokenizedDataReader &parent; + +	/** +	 * Private constructor of TokenizedDataReaderFork, taking a reference to the +	 * internal TokenizedDataImpl structure storing the data that is accessed by +	 * the reader and a reference at the parent TokenizedDataReader. +	 * +	 * @param parent is the TokenizedDataReader instance to which the current +	 * read/peek progress may be commited. +	 * @param impl is the TokenizedDataImpl instance that holds the actual data. +	 * @param readCursor is the cursor position from which tokens and text are +	 * read. +	 * @param peekCursor is the cursor position from which tokens and text are +	 * peeked. +	 */ +	TokenizedDataReaderFork(TokenizedDataReader &parent, +	                        std::shared_ptr<const TokenizedDataImpl> impl, +	                        const TokenizedDataCursor &readCursor, +	                        const TokenizedDataCursor &peekCursor) +	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) +	{ +	} + +public: +	/** +	 * Commits the read/peek progress to the underlying parent. +	 */ +	void commit() { parent = *this; }  };  } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..8d540a6 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@  #include <core/common/CharReader.hpp>  #include <core/common/Exceptions.hpp>  #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> +#include "TokenizedData.hpp"  #include "Tokenizer.hpp"  namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch {  	Token token;  	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. +	 * Position at which this token starts in the TokenizedData instance.  	 */ -	size_t textLength; +	size_t dataStartOffset;  	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. +	 * Set to true if the matched token is a primary token.  	 */ -	size_t textEnd; +	bool primary;  	/**  	 * Constructor of the TokenMatch class.  	 */ -	TokenMatch() : textLength(0), textEnd(0) {} +	TokenMatch() : dataStartOffset(0), primary(false) {}  	/**  	 * Returns true if this TokenMatch instance actually represents a match. +	 * +	 * @return true if the TokenMatch actually has a match. 
+	 */ +	bool hasMatch() const { return token.id != Tokens::Empty; } + +	/** +	 * Returns the length of the matched token. +	 * +	 * @return the length of the token string.  	 */ -	bool hasMatch() { return token.id != Tokens::Empty; } +	size_t size() const { return token.content.size(); }  };  /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private:  	size_t start;  	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. +	 * Position at which this token starts in the TokenizedData instance.  	 */ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; +	size_t dataStartOffset;  public:  	/**  	 * Constructor of the TokenLookup class.  	 *  	 * @param node is the current node. -	 * @param start is the start position. -	 * @param textLength is the text buffer length of the previous text token. -	 * @param textEnd is the current end location of the previous text token. +	 * @param start is the start position in the source file. +	 * @param dataStartOffset is the current length of the TokenizedData buffer.  	 */ -	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, -	            size_t textEnd) -	    : node(node), start(start), textLength(textLength), textEnd(textEnd) +	TokenLookup(const TokenTrie::Node *node, size_t start, +	            size_t dataStartOffset) +	    : node(node), start(start), dataStartOffset(dataStartOffset)  	{  	}  	/**  	 * Tries to extend the current path in the token trie with the given -	 * character. If a complete token is matched, stores this match in the -	 * tokens list (in case it is longer than any previous token). +	 * character. If a complete token is matched, stores the match in the given +	 * TokenMatch reference and returns true.  	 *  	 * @param c is the character that should be appended to the current prefix.  	 * @param lookups is a list to which new TokeLookup instances are added -- @@ -123,73 +122,48 @@ public:  	 * Tokenizer.  	 * @param end is the end byte offset of the current character.  	 * @param sourceId is the source if of this file. +	 * @return true if a token was matched, false otherwise.  	 */ -	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, -	             const std::vector<std::string> &tokens, SourceOffset end, -	             SourceId sourceId) +	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, +	             const std::vector<Tokenizer::TokenDescriptor> &tokens, +	             SourceOffset end, SourceId sourceId)  	{ -		// Check whether we can continue the current token path with the given -		// character without visiting an already visited node +		// Set to true once a token has been matched +		bool res = false; + +		// Check whether we can continue the current token path, if not, abort  		auto it = node->children.find(c);  		if (it == node->children.end()) { -			return; +			return res;  		}  		// Check whether the new node represents a complete token a whether it  		// is longer than the current token. If yes, replace the current token.  		
node = it->second.get(); -		if (node->type != Tokens::Empty) { -			const std::string &str = tokens[node->type]; -			size_t len = str.size(); -			if (len > match.token.content.size()) { -				match.token = -				    Token{node->type, str, {sourceId, start, end}}; -				match.textLength = textLength; -				match.textEnd = textEnd; -			} +		if (node->id != Tokens::Empty) { +			const Tokenizer::TokenDescriptor &descr = tokens[node->id]; +			match.token = Token(node->id, descr.string, +			                    SourceLocation(sourceId, start, end)); +			match.dataStartOffset = dataStartOffset; +			match.primary = descr.primary; +			res = true;  		}  		// If this state can possibly be advanced, store it in the states list.  		if (!node->children.empty()) {  			lookups.emplace_back(*this);  		} +		return res;  	}  }; - -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, -                           SourceId sourceId) -{ -	if (match.hasMatch()) { -		match.token.content = -		    std::string{handler.textBuf.data(), match.textLength}; -		match.token.location = -		    SourceLocation{sourceId, handler.textStart, match.textEnd}; -	} else { -		match.token.content = handler.toString(); -		match.token.location = -		    SourceLocation{sourceId, handler.textStart, handler.textEnd}; -	} -	match.token.id = Tokens::Data; -}  }  /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) -    : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template <typename TextHandler, bool read> -bool Tokenizer::next(CharReader &reader, Token &token) +template <bool read> +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)  {  	// If we're in the read mode, reset the char reader peek position to the  	// current read position @@ -199,45 +173,63 @@ bool Tokenizer::next(CharReader &reader, Token &token)  	// Prepare the lookups in the token trie  	const TokenTrie::Node *root = trie.getRoot(); -	TokenMatch match; +	TokenMatch bestMatch;  	std::vector<TokenLookup> lookups;  	std::vector<TokenLookup> nextLookups; -	// Instantiate the text handler -	TextHandler textHandler; -  	// Peek characters from the reader and try to advance the current token tree  	// cursor  	char c; +	const size_t initialDataSize = data.size();  	size_t charStart = reader.getPeekOffset();  	const SourceId sourceId = reader.getSourceId();  	while (reader.peek(c)) {  		const size_t charEnd = reader.getPeekOffset(); -		const size_t textLength = textHandler.textBuf.size(); -		const size_t textEnd = textHandler.textEnd; +		const size_t dataStartOffset = data.size();  		// If we do not have a match yet, start a new lookup from the root -		if (!match.hasMatch()) { -			TokenLookup{root, charStart, textLength, textEnd}.advance( -			    c, nextLookups, match, tokens, charEnd, sourceId); +		if (!bestMatch.hasMatch() || !bestMatch.primary) { +			lookups.emplace_back(root, charStart, dataStartOffset);  		}  		// Try to advance all other lookups with the new character +		TokenMatch match;  		for (TokenLookup &lookup : lookups) { -			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); +			// Continue if the current lookup +			if 
(!lookup.advance(c, nextLookups, match, tokens, charEnd, +			                    sourceId)) { +				continue; +			} + +			// Replace the best match with longest token +			if (match.size() > bestMatch.size()) { +				bestMatch = match; +			} + +			// If the matched token is a non-primary token -- mark the match in +			// the TokenizedData list +			if (!match.primary) { +				data.mark(match.token.id, data.size() - match.size() + 1, +				          match.size()); +			}  		} -		// We have found a token and there are no more states to advance or the -		// text handler has found something -- abort to return the new token -		if (match.hasMatch()) { -			if ((nextLookups.empty() || textHandler.hasText())) { + +		// If a token has been found and the token is a primary token, check +		// whether we have to abort, otherwise if we have a non-primary match, +		// reset it once it can no longer be advanced +		if (bestMatch.hasMatch() && nextLookups.empty()) { +			if (bestMatch.primary) {  				break; +			} else { +				bestMatch = TokenMatch{};  			} -		} else { -			// Record all incomming characters -			textHandler.append(c, charStart, charEnd);  		} +		// Record all incomming characters +		data.append(c, charStart, charEnd); + +  		// Swap the lookups and the nextLookups list  		lookups = std::move(nextLookups);  		nextLookups.clear(); @@ -246,60 +238,57 @@ bool Tokenizer::next(CharReader &reader, Token &token)  		charStart = charEnd;  	} -	// If we found text, emit that text -	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { -		buildDataToken(textHandler, match, sourceId); +	// If we found data, emit a corresponding data token +	if (data.size() > initialDataSize && +	    (!bestMatch.hasMatch() || !bestMatch.primary || +	     bestMatch.dataStartOffset > initialDataSize)) { +		// If we have a "bestMatch" wich starts after text data has started, +		// trim the TokenizedData to this offset +		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) { +			data.trim(bestMatch.dataStartOffset); +		} + +		// Create a token containing the data location +		bestMatch.token = Token{data.getLocation()}; +	} else if (bestMatch.hasMatch() && bestMatch.primary && +	           bestMatch.dataStartOffset == initialDataSize) { +		data.trim(initialDataSize);  	}  	// Move the read/peek cursor to the end of the token, abort if an error  	// happens while doing so -	if (match.hasMatch()) { +	if (bestMatch.hasMatch()) {  		// Make sure we have a valid location -		if (match.token.location.getEnd() == InvalidSourceOffset) { +		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {  			throw OusiaException{"Token end position offset out of range"};  		}  		// Seek to the end of the current token -		const size_t end = match.token.location.getEnd(); +		const size_t end = bestMatch.token.location.getEnd();  		if (read) {  			reader.seek(end);  		} else {  			reader.seekPeekCursor(end);  		} -		token = match.token; + +		token = bestMatch.token;  	} else {  		token = Token{};  	} -	return match.hasMatch(); +	return bestMatch.hasMatch();  } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)  { -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingWhitespaceHandler, true>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingWhitespaceHandler, true>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingWhitespaceHandler, true>(reader, 
token); -	} -	return false; +	return next<true>(reader, token, data);  } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)  { -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingWhitespaceHandler, false>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingWhitespaceHandler, false>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingWhitespaceHandler, false>(reader, token); -	} -	return false; +	return next<false>(reader, token, data);  } -TokenId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token, bool primary)  {  	// Abort if an empty token should be registered  	if (token.empty()) { @@ -309,8 +298,8 @@ TokenId Tokenizer::registerToken(const std::string &token)  	// Search for a new slot in the tokens list  	TokenId type = Tokens::Empty;  	for (size_t i = nextTokenId; i < tokens.size(); i++) { -		if (tokens[i].empty()) { -			tokens[i] = token; +		if (!tokens[i].valid()) { +			tokens[i] = TokenDescriptor(token, primary);  			type = i;  			break;  		} @@ -320,62 +309,47 @@ TokenId Tokenizer::registerToken(const std::string &token)  	// override the special token type handles  	if (type == Tokens::Empty) {  		type = tokens.size(); -		if (type == Tokens::Data || type == Tokens::Empty) { +		if (type >= Tokens::MaxTokenId) {  			throw OusiaException{"Token type ids depleted!"};  		} -		tokens.emplace_back(token); +		tokens.emplace_back(token, primary);  	}  	nextTokenId = type + 1; -	// Try to register the token in the trie -- if this fails, remove it -	// from the tokens list +	// Try to register the token in the trie -- if this fails, remove it from +	// the tokens list  	if (!trie.registerToken(token, type)) { -		tokens[type] = std::string{}; +		tokens[type] = TokenDescriptor();  		nextTokenId = type;  		return Tokens::Empty;  	}  	return type;  } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id)  {  	// Unregister the token from the trie, abort if an invalid type is given -	if (type < tokens.size() && trie.unregisterToken(tokens[type])) { -		tokens[type] = std::string{}; -		nextTokenId = type; +	if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { +		tokens[id] = TokenDescriptor(); +		nextTokenId = id;  		return true;  	}  	return false;  } -std::string Tokenizer::getTokenString(TokenId type) -{ -	if (type < tokens.size()) { -		return tokens[type]; -	} -	return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const  { -	whitespaceMode = mode; +	if (id < tokens.size()) { +		return tokens[id]; +	} +	return EmptyTokenDescriptor;  } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } -  /* Explicitly instantiate all possible instantiations of the "next" member     function */ -template bool Tokenizer::next<PreservingWhitespaceHandler, false>( -    CharReader &reader, Token &token); -template bool Tokenizer::next<TrimmingWhitespaceHandler, false>( -    CharReader &reader, Token &token); -template bool Tokenizer::next<CollapsingWhitespaceHandler, false>( -    CharReader &reader, Token &token); -template bool Tokenizer::next<PreservingWhitespaceHandler, true>( -    CharReader &reader, Token &token); -template bool 
-    CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
-    CharReader &reader, Token &token);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
 }
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
 /**
  * @file Tokenizer.hpp
  *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
  *
  * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
  */
@@ -28,44 +28,80 @@
 #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
 #define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#include <set>
+#include <cstdint>
 #include <string>
 #include <vector>
 #include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
-#include "Token.hpp"
 #include "TokenTrie.hpp"
 namespace ousia {
 // Forward declarations
 class CharReader;
+class TokenizedData;
 /**
  * The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows registering and unregistering tokens while parsing.
+ * Note that the Tokenizer always tries to extract the longest possible token
+ * from the input. Tokens can be registered as primary or non-primary tokens.
+ * If a token is registered as a primary token, it is returned as a single
+ * Token instance whenever it occurs. A non-primary token is returned as part
+ * of a segmented TokenizedData instance.
 */
 class Tokenizer {
-private:
+public:
 	/**
-	 * Internally used token trie. This object holds all registered tokens.
+	 * Internally used structure describing a registered token.
 	 */
-	TokenTrie trie;
+	struct TokenDescriptor {
+		/**
+		 * String describing the token.
+		 */
+		std::string string;
+
+		/**
+		 * Set to true if this token is primary.
+		 */
+		bool primary;
+
+		/**
+		 * Constructor of the TokenDescriptor class.
+		 *
+		 * @param string is the string representation of the registered token.
+		 * @param primary specifies whether the token is a primary token that
+		 * should be returned as a single token, or a non-primary token that
+		 * should be returned as part of TokenizedData.
+		 */
+		TokenDescriptor(const std::string &string, bool primary)
+		    : string(string), primary(primary)
+		{
+		}
+
+		/**
+		 * Default constructor.
+		 */
+		TokenDescriptor() : primary(false) {}
+
+		/**
+		 * Returns true if the TokenDescriptor represents a valid token.
+		 */
+		bool valid() const { return !string.empty(); }
+	};
+private:
 	/**
-	 * Flag defining whether whitespaces should be preserved or not.
+	 * Internally used token trie. This object holds all registered tokens.
 	 */
-	WhitespaceMode whitespaceMode;
+	TokenTrie trie;
 	/**
 	 * Vector containing all registered token types.
 	 */
-	std::vector<std::string> tokens;
+	std::vector<TokenDescriptor> tokens;
 	/**
 	 * Next index in the tokens list where to search for a new token id.
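To make the primary/non-primary distinction described in the class comment and the TokenDescriptor structure above concrete, here is a minimal usage sketch. It is not part of this patch: the helper function name and the token strings are invented for illustration, and only members visible in this diff (registerToken, lookupToken and the public string/primary fields) are used.

// Minimal sketch, assuming only the Tokenizer interface shown in this diff.
#include <cassert>

#include "Tokenizer.hpp"

using namespace ousia;

void describeTokens(Tokenizer &tokenizer)
{
	// A primary token is emitted as a standalone Token instance
	TokenId comment = tokenizer.registerToken("%{", true);

	// A non-primary token is only marked inside the surrounding TokenizedData
	TokenId emphasis = tokenizer.registerToken("__", false);
	assert(comment != Tokens::Empty && emphasis != Tokens::Empty);

	// lookupToken() hands back the stored descriptor; unknown ids yield an
	// invalid descriptor whose string is empty
	const Tokenizer::TokenDescriptor &desc = tokenizer.lookupToken(emphasis);
	assert(desc.string == "__" && !desc.primary);
}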
@@ -74,90 +110,78 @@ private:
 	/**
 	 * Templated function used internally to read the current token. The
-	 * function is templated in order to force code generation for all six
-	 * combiations of whitespace modes and reading/peeking.
+	 * function is templated in order to force optimized code generation for
+	 * both reading and peeking.
 	 *
-	 * @tparam TextHandler is the type to be used for the textHandler instance.
-	 * @tparam read specifies whether the function should start from and advance
-	 * the read pointer of the char reader.
+	 * @tparam read specifies whether the method should read the token or just
+	 * peek.
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is the token structure into which the token information
 	 * should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return false if the end of the stream has been reached, true otherwise.
 	 */
-	template <typename TextHandler, bool read>
-	bool next(CharReader &reader, Token &token);
+	template <bool read>
+	bool next(CharReader &reader, Token &token, TokenizedData &data);
 public:
 	/**
 	 * Constructor of the Tokenizer class.
-	 *
-	 * @param whitespaceMode specifies how whitespace should be handled.
 	 */
-	Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+	Tokenizer();
 	/**
-	 * Registers the given string as a token. Returns a const pointer at a
-	 * TokenDescriptor that will be used to reference the newly created token.
+	 * Registers the given string as a token. Returns a unique identifier
+	 * describing the registered token.
 	 *
 	 * @param token is the token string that should be registered.
-	 * @return a unique identifier for the registered token or EmptyToken if
+	 * @param primary specifies whether the token is a primary token -- if true,
+	 * the token will be returned as a single, standalone token. Otherwise the
+	 * token will be returned as part of a "TokenizedData" structure.
+	 * @return a unique identifier for the registered token or Tokens::Empty if
 	 * an error occurred.
 	 */
-	TokenId registerToken(const std::string &token);
+	TokenId registerToken(const std::string &token, bool primary = true);
 	/**
 	 * Unregisters the token belonging to the given TokenId.
 	 *
 	 * @param type is the token type that should be unregistered. The
-	 *TokenId
-	 * must have been returned by registerToken.
+	 * TokenId must have been returned by registerToken.
 	 * @return true if the operation was successful, false otherwise (e.g.
-	 * because the given TokenDescriptor was already unregistered).
+	 * because the token with the given TokenId was already unregistered).
 	 */
-	bool unregisterToken(TokenId type);
+	bool unregisterToken(TokenId id);
 	/**
 	 * Returns the token that was registered under the given TokenId id or
-	 *an
-	 * empty string if an invalid TokenId id is given.
+	 * an invalid TokenDescriptor if an invalid TokenId is given.
 	 *
-	 * @param type is the TokenId id for which the corresponding token
-	 *string
+	 * @param id is the TokenId for which the corresponding TokenDescriptor
 	 * should be returned.
-	 * @return the registered token string or an empty string if the given type
-	 * was invalid.
-	 */
-	std::string getTokenString(TokenId type);
-
-	/**
-	 * Sets the whitespace mode.
-	 *
-	 * @param whitespaceMode defines how whitespace should be treated in text
-	 * tokens.
-	 */
-	void setWhitespaceMode(WhitespaceMode mode);
-
-	/**
-	 * Returns the current value of the whitespace mode.
-	 *
-	 * @return the whitespace mode.
+	 * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+	 * the given TokenId is invalid.
 	 */
-	WhitespaceMode getWhitespaceMode();
+	const TokenDescriptor& lookupToken(TokenId id) const;
 	/**
 	 * Reads a new token from the CharReader and stores it in the given
-	 * Token instance.
+	 * Token instance. If the token has the id Tokens::Data, use the "getData"
+	 * method to fetch a reference to the underlying TokenizedData instance
+	 * storing the data.
 	 *
 	 * @param reader is the CharReader instance from which the data should be
 	 * read.
 	 * @param token is a reference at the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool read(CharReader &reader, Token &token);
+	bool read(CharReader &reader, Token &token, TokenizedData &data);
 	/**
 	 * The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
 	 * read.
 	 * @param token is a reference at the token instance into which the Token
 	 * information should be written.
+	 * @param data is a reference to the TokenizedData instance to which the
+	 * token information should be appended.
 	 * @return true if a token could be read, false if the end of the stream
 	 * has been reached.
 	 */
-	bool peek(CharReader &reader, Token &token);
+	bool peek(CharReader &reader, Token &token, TokenizedData &data);
 };
 }
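The changed read()/peek() contract above can be exercised roughly as follows. This is a sketch under assumptions, not part of the patch: the CharReader and TokenizedData instances are taken as parameters because their construction is outside this diff, and consuming the buffered text from TokenizedData is only hinted at in a comment.

// Sketch of the new token loop: primary tokens arrive as standalone Token
// instances, while text and non-primary tokens accumulate in "data" and are
// announced by a token whose id is Tokens::Data.
#include "Tokenizer.hpp"

using namespace ousia;

void tokenLoop(Tokenizer &tokenizer, CharReader &reader, TokenizedData &data)
{
	Token token;
	while (tokenizer.read(reader, token, data)) {
		if (token.id == Tokens::Data) {
			// A run of character data (possibly with non-primary tokens
			// marked inside it) is now buffered in "data" -- hand it on to
			// the next processing stage here
			continue;
		}
		// Otherwise this is a primary token; its registered string can be
		// looked up via the descriptor
		const Tokenizer::TokenDescriptor &desc = tokenizer.lookupToken(token.id);
		(void)desc;
	}
}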

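Finally, the slot-reuse behaviour implemented by registerToken() and unregisterToken() in Tokenizer.cpp above can be summarized in a short sketch; the token strings are arbitrary and the snippet assumes that registration in the token trie succeeds.

// Sketch of id reuse: unregisterToken() clears the TokenDescriptor slot and
// rewinds nextTokenId, so the next registration reclaims the freed id.
#include <cassert>

#include "Tokenizer.hpp"

using namespace ousia;

void idReuseExample()
{
	Tokenizer tokenizer;
	TokenId a = tokenizer.registerToken("<<");        // primary by default
	TokenId b = tokenizer.registerToken(">>", false); // non-primary
	assert(a != b);

	// Freeing "a" marks its slot as invalid and rewinds the search index...
	bool freed = tokenizer.unregisterToken(a);
	assert(freed);

	// ...so the next registration reclaims the same id
	TokenId c = tokenizer.registerToken("%%");
	assert(c == a);
}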