Diffstat (limited to 'src/plugins')
-rw-r--r--  src/plugins/plain/DynamicTokenizer.cpp                                                 | 513
-rw-r--r--  src/plugins/plain/DynamicTokenizer.hpp                                                 | 160
-rw-r--r--  src/plugins/plain/PlainFormatStreamReader.cpp                                          | 276
-rw-r--r--  src/plugins/plain/PlainFormatStreamReader.hpp                                          |  34
-rw-r--r--  src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp)  |  40
-rw-r--r--  src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp)  |  84
6 files changed, 806 insertions, 301 deletions
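At a glance, this commit replaces the pointer-based TokenDescriptor API with integral TokenTypeIds, moves the CharReader from a constructor argument to a per-call parameter of read()/peek(), and renames DynamicTokenTree to TokenTrie. The following is a minimal sketch of how the reworked tokenizer is meant to be driven, based only on the declarations visible in the DynamicTokenizer.hpp hunks below; CharReader construction is project-specific and not part of this diff, so the reader is taken as a parameter, and the "%" token is merely an illustrative choice:

```cpp
#include <iostream>

#include <core/common/CharReader.hpp>
#include "DynamicTokenizer.hpp"

using namespace ousia;

// Sketch only: drives the post-commit API. Tokens are now identified by
// integral TokenTypeIds instead of TokenDescriptor pointers; EmptyToken
// signals a failed registration.
void dumpTokens(CharReader &reader)
{
	DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE};
	TokenTypeId comment = tokenizer.registerToken("%");
	if (comment == EmptyToken) {
		return;  // token could not be registered
	}

	// read() advances the read cursor of the given reader; peek() would
	// only move the peek cursor, letting the caller reset and re-read.
	DynamicToken token;
	while (tokenizer.read(reader, token)) {
		if (token.type == TextToken) {
			std::cout << "text: " << token.content << std::endl;
		} else if (token.type == comment) {
			std::cout << "comment marker" << std::endl;
		}
	}
}
```
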
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp index 7690395..f2cfcd1 100644 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ b/src/plugins/plain/DynamicTokenizer.cpp @@ -17,57 +17,528 @@  */  #include <memory> -#include <string> -#include <unordered_map> +#include <vector>  #include <core/common/CharReader.hpp> +#include <core/common/Exceptions.hpp> +#include <core/common/Utils.hpp>  #include "DynamicTokenizer.hpp"  namespace ousia { +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { +	/** +	 * Token that was matched. +	 */ +	DynamicToken token; + +	/** +	 * Current length of the data within the text handler. The text buffer needs +	 * to be trimmed to this length if this token matches. +	 */ +	size_t textLength; + +	/** +	 * End location of the current text handler. This location needs to be used +	 * for the text token that is emitted before the actual token. +	 */ +	size_t textEnd; + +	/** +	 * Constructor of the TokenMatch class. +	 */ +	TokenMatch() : textLength(0), textEnd(0) {} + +	/** +	 * Returns true if this TokenMatch instance actually represents a match. +	 */ +	bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: +	/** +	 * Current node within the token trie. +	 */ +	TokenTrie::Node const *node; + +	/** +	 * Start offset within the source file. +	 */ +	size_t start; + +	/** +	 * Current length of the data within the text handler. The text buffer needs +	 * to be trimmed to this length if this token matches. +	 */ +	size_t textLength; + +	/** +	 * End location of the current text handler. This location needs to be used +	 * for the text token that is emitted before the actual token. +	 */ +	size_t textEnd; + +public: +	/** +	 * Constructor of the TokenLookup class. +	 * +	 * @param node is the current node. +	 * @param start is the start position. +	 * @param textLength is the text buffer length of the previous text token. +	 * @param textEnd is the current end location of the previous text token. +	 */ +	TokenLookup(const TokenTrie::Node *node, size_t start, +	            size_t textLength, size_t textEnd) +	    : node(node), start(start), textLength(textLength), textEnd(textEnd) +	{ +	} + +	/** +	 * Tries to extend the current path in the token trie with the given +	 * character. If a complete token is matched, stores this match in the +	 * tokens list (in case it is longer than any previous token). +	 * +	 * @param c is the character that should be appended to the current prefix. +	 * @param lookups is a list to which new TokeLookup instances are added -- +	 * which could potentially be expanded in the next iteration. +	 * @param match is the DynamicToken instance to which the matching token +	 * should be written. +	 * @param tokens is a reference at the internal token list of the +	 * DynamicTokenizer. +	 * @param end is the end byte offset of the current character. +	 * @param sourceId is the source if of this file. 
+	 */ +	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, +	             const std::vector<std::string> &tokens, SourceOffset end, +	             SourceId sourceId) +	{ +		// Check whether we can continue the current token path with the given +		// character without visiting an already visited node +		auto it = node->children.find(c); +		if (it == node->children.end()) { +			return; +		} + +		// Check whether the new node represents a complete token a whether it +		// is longer than the current token. If yes, replace the current token. +		node = it->second.get(); +		if (node->type != EmptyToken) { +			const std::string &str = tokens[node->type]; +			size_t len = str.size(); +			if (len > match.token.content.size()) { +				match.token = +				    DynamicToken{node->type, str, {sourceId, start, end}}; +				match.textLength = textLength; +				match.textEnd = textEnd; +			} +		} + +		// If this state can possibly be advanced, store it in the states list. +		if (!node->children.empty()) { +			lookups.emplace_back(*this); +		} +	} +}; + +/* Internal class TextHandlerBase */ + +/** + * Base class used for those classes that may be used as TextHandler in the + * DynamicTokenizer::next function. + */ +class TextHandlerBase { +public: +	/** +	 * Start position of the extracted text. +	 */ +	size_t textStart; + +	/** +	 * End position of the extracted text. +	 */ +	size_t textEnd; + +	/** +	 * Buffer containing the extracted text. +	 */ +	std::vector<char> textBuf; + +	/** +	 * Constructor of the TextHandlerBase base class. Initializes the start and +	 * end position with zeros. +	 */ +	TextHandlerBase() : textStart(0), textEnd(0) {} + +	/** +	 * Transforms the given token into a text token containing the extracted +	 * text. +	 * +	 * @param token is the output token to which the text should be written. +	 * @param sourceId is the source id of the underlying file. +	 */ +	void buildTextToken(TokenMatch &match, SourceId sourceId) +	{ +		if (match.hasMatch()) { +			match.token.content = +			    std::string{textBuf.data(), match.textLength}; +			match.token.location = +			    SourceLocation{sourceId, textStart, match.textEnd}; +		} else { +			match.token.content = std::string{textBuf.data(), textBuf.size()}; +			match.token.location = SourceLocation{sourceId, textStart, textEnd}; +		} +		match.token.type = TextToken; +	} + +	/** +	 * Returns true if this whitespace handler has found any text and a text +	 * token could be emitted. +	 * +	 * @return true if the internal data buffer is non-empty. +	 */ +	bool hasText() { return !textBuf.empty(); } +}; + +/* Internal class PreservingTextHandler */ + +/** + * The PreservingTextHandler class preserves all characters unmodified, + * including whitepace characters. + */ +class PreservingTextHandler : public TextHandlerBase { +public: +	using TextHandlerBase::TextHandlerBase; + +	/** +	 * Appends the given character to the internal text buffer, does not +	 * eliminate whitespace. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 */ +	void append(char c, size_t start, size_t end) +	{ +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; +		textBuf.push_back(c); +	} +}; + +/* Internal class TrimmingTextHandler */ +  /** - * The TokenDescriptor class is a simple wrapper around a standard string - * containing the character sequence of the token. 
+ * The TrimmingTextHandler class trims all whitespace characters at the begin + * and the end of a text section but leaves all other characters unmodified, + * including whitepace characters.   */ -class TokenDescriptor { +class TrimmingTextHandler : public TextHandlerBase { +public: +	using TextHandlerBase::TextHandlerBase; +  	/** -	 * The character sequence of the token. +	 * Buffer used internally to temporarily store all whitespace characters. +	 * They are only added to the output buffer if another non-whitespace +	 * character is reached.  	 */ -	std::string str; +	std::vector<char> whitespaceBuf;  	/** -	 * Default constructor of the TokenDescriptor class. Used to describe -	 * special tokens. +	 * Appends the given character to the internal text buffer, eliminates +	 * whitespace characters at the begin and end of the text. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character.  	 */ -	TokenDescriptor(); +	void append(char c, size_t start, size_t end) +	{ +		// Handle whitespace characters +		if (Utils::isWhitespace(c)) { +			if (!textBuf.empty()) { +				whitespaceBuf.push_back(c); +			} +			return; +		} + +		// Set the start and end offset correctly +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; + +		// Store the character +		if (!whitespaceBuf.empty()) { +			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), +			               whitespaceBuf.end()); +			whitespaceBuf.clear(); +		} +		textBuf.push_back(c); +	} +}; + +/* Internal class CollapsingTextHandler */ + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduced multiple whitespace characters to a single blank. + */ +class CollapsingTextHandler : public TextHandlerBase { +public: +	using TextHandlerBase::TextHandlerBase;  	/** -	 * Constructor initializing the character sequence of the token. +	 * Flag set to true if a whitespace character was reached.  	 */ -	TokenDescriptor(const std::string &str) : str(str) {} +	bool hasWhitespace = false; + +	/** +	 * Appends the given character to the internal text buffer, eliminates +	 * redundant whitespace characters. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. 
+	 */ +	void append(char c, size_t start, size_t end) +	{ +		// Handle whitespace characters +		if (Utils::isWhitespace(c)) { +			if (!textBuf.empty()) { +				hasWhitespace = true; +			} +			return; +		} + +		// Set the start and end offset correctly +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; + +		// Store the character +		if (hasWhitespace) { +			textBuf.push_back(' '); +			hasWhitespace = false; +		} +		textBuf.push_back(c); +	}  }; +}  /* Class DynamicTokenizer */ -void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode) +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) +    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)  { -	whitespaceMode = mode;  } -WhitespaceMode DynamicTokenizer::getWhitespaceMode() +template <typename TextHandler, bool read> +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)  { -	return whitespaceMode; +	// If we're in the read mode, reset the char reader peek position to the +	// current read position +	if (read) { +		reader.resetPeek(); +	} + +	// Prepare the lookups in the token trie +	const TokenTrie::Node *root = trie.getRoot(); +	TokenMatch match; +	std::vector<TokenLookup> lookups; +	std::vector<TokenLookup> nextLookups; + +	// Instantiate the text handler +	TextHandler textHandler; + +	// Peek characters from the reader and try to advance the current token tree +	// cursor +	char c; +	size_t charStart = reader.getPeekOffset(); +	const SourceId sourceId = reader.getSourceId(); +	while (reader.peek(c)) { +		const size_t charEnd = reader.getPeekOffset(); +		const size_t textLength = textHandler.textBuf.size(); +		const size_t textEnd = textHandler.textEnd; + +		// If we do not have a match yet, start a new lookup from the root +		if (!match.hasMatch()) { +			TokenLookup{root, charStart, textLength, textEnd}.advance( +			    c, nextLookups, match, tokens, charEnd, sourceId); +		} + +		// Try to advance all other lookups with the new character +		for (TokenLookup &lookup : lookups) { +			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); +		} + +		// We have found a token and there are no more states to advance or the +		// text handler has found something -- abort to return the new token +		if (match.hasMatch()) { +			if ((nextLookups.empty() || textHandler.hasText())) { +				break; +			} +		} else { +			// Record all incomming characters +			textHandler.append(c, charStart, charEnd); +		} + +		// Swap the lookups and the nextLookups list +		lookups = std::move(nextLookups); +		nextLookups.clear(); + +		// Advance the offset +		charStart = charEnd; +	} + +	// If we found text, emit that text +	if (textHandler.hasText() && +	    (!match.hasMatch() || match.textLength > 0)) { +		textHandler.buildTextToken(match, sourceId); +	} + +	// Move the read/peek cursor to the end of the token, abort if an error +	// happens while doing so +	if (match.hasMatch()) { +		// Make sure we have a valid location +		if (match.token.location.getEnd() == InvalidSourceOffset) { +			throw OusiaException{"Token end position offset out of range"}; +		} + +		// Seek to the end of the current token +		const size_t end = match.token.location.getEnd(); +		if (read) { +			reader.seek(end); +		} else { +			reader.seekPeekCursor(end); +		} +		token = match.token; +	} else { +		token = DynamicToken{}; +	} +	return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) +{ +	switch (whitespaceMode) { +		case WhitespaceMode::PRESERVE: +			return next<PreservingTextHandler, 
true>(reader, token); +		case WhitespaceMode::TRIM: +			return next<TrimmingTextHandler, true>(reader, token); +		case WhitespaceMode::COLLAPSE: +			return next<CollapsingTextHandler, true>(reader, token); +	} +	return false; +} + +bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) +{ +	switch (whitespaceMode) { +		case WhitespaceMode::PRESERVE: +			return next<PreservingTextHandler, false>(reader, token); +		case WhitespaceMode::TRIM: +			return next<TrimmingTextHandler, false>(reader, token); +		case WhitespaceMode::COLLAPSE: +			return next<CollapsingTextHandler, false>(reader, token); +	} +	return false;  } +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ +	// Abort if an empty token should be registered +	if (token.empty()) { +		return EmptyToken; +	} + +	// Search for a new slot in the tokens list +	TokenTypeId type = EmptyToken; +	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { +		if (tokens[i].empty()) { +			tokens[i] = token; +			type = i; +			break; +		} +	} -/* Constant initializations */ +	// No existing slot was found, add a new one -- make sure we do not +	// override the special token type handles +	if (type == EmptyToken) { +		type = tokens.size(); +		if (type == TextToken || type == EmptyToken) { +			throw OusiaException{"Token type ids depleted!"}; +		} +		tokens.emplace_back(token); +	} +	nextTokenTypeId = type + 1; -static const TokenDescriptor Empty; -static const TokenDescriptor Text; -static const TokenDescriptor* DynamicTokenizer::Empty = &Empty; -static const TokenDescriptor* DynamicTokenizer::Token = &Text; +	// Try to register the token in the trie -- if this fails, remove it +	// from the tokens list +	if (!trie.registerToken(token, type)) { +		tokens[type] = std::string(); +		nextTokenTypeId = type; +		return EmptyToken; +	} +	return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ +	// Unregister the token from the trie, abort if an invalid type is given +	if (type < tokens.size() && trie.unregisterToken(tokens[type])) { +		tokens[type] = std::string{}; +		nextTokenTypeId = type; +		return true; +	} +	return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ +	if (type < tokens.size()) { +		return tokens[type]; +	} +	return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ +	whitespaceMode = mode; +} +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } +/* Explicitly instantiate all possible instantiations of the "next" member +   function */ +template bool DynamicTokenizer::next<PreservingTextHandler, false>( +    CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, false>( +    CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, false>( +    CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<PreservingTextHandler, true>( +    CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, true>( +    CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, true>( +    CharReader &reader,DynamicToken &token);  } diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp index f7fef13..0b4dd39 100644 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ b/src/plugins/plain/DynamicTokenizer.hpp @@ -28,34 +28,63 @@  #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_  #define 
_OUSIA_DYNAMIC_TOKENIZER_HPP_ +#include <set> +#include <string> +#include <vector> +  #include <core/common/Location.hpp> +#include "TokenTrie.hpp" +  namespace ousia {  // Forward declarations  class CharReader; -class TokenDescriptor;  /**   * The DynamicToken structure describes a token discovered by the Tokenizer.   */  struct DynamicToken {  	/** -	 * Pointer pointing at the TokenDescriptor instance this token corresponds -	 * to. May be one of the special TokenDescriptors defined as static members -	 * of the DynamicTokenizer class. +	 * Id of the type of this token.  	 */ -	TokenDescriptor const *descriptor; +	TokenTypeId type;  	/**  	 * String that was matched.  	 */ -	std::string str; +	std::string content;  	/**  	 * Location from which the string was extracted.  	 */  	SourceLocation location; + +	/** +	 * Default constructor. +	 */ +	DynamicToken() : type(EmptyToken) {} + +	/** +	 * Constructor of the DynamicToken struct. +	 * +	 * @param id represents the token type. +	 * @param content is the string content that has been extracted. +	 * @param location is the location of the extracted string content in the +	 * source file. +	 */ +	DynamicToken(TokenTypeId type, const std::string &content, +	             SourceLocation location) +	    : type(type), content(content), location(location) +	{ +	} + +	/** +	 * Constructor of the DynamicToken struct, only initializes the token type +	 * +	 * @param type is the id corresponding to the type of the token. +	 */ +	DynamicToken(TokenTypeId type) : type(type) {}  };  /** @@ -64,33 +93,35 @@ struct DynamicToken {   */  enum class WhitespaceMode {  	/** -	 * Preserves all whitespaces as they are found in the source file. -	 */ +     * Preserves all whitespaces as they are found in the source file. +     */  	PRESERVE,  	/** -	 * Trims whitespace at the beginning and the end of the found text. -	 */ +     * Trims whitespace at the beginning and the end of the found text. +     */  	TRIM,  	/** -	 * Whitespaces are trimmed and collapsed, multiple whitespace characters -	 * are replaced by a single space character. -	 */ +     * Whitespaces are trimmed and collapsed, multiple whitespace characters +     * are replaced by a single space character. +     */  	COLLAPSE  };  /**   * The DynamicTokenizer is used to extract tokens and chunks of text from a   * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer.   */  class DynamicTokenizer {  private:  	/** -	 * Reference at the char reader. +	 * Internally used token trie. This object holds all registered tokens.  	 */ -	CharReader &reader; +	TokenTrie trie;  	/**  	 * Flag defining whether whitespaces should be preserved or not. @@ -98,53 +129,73 @@ private:  	WhitespaceMode whitespaceMode;  	/** -	 * Vector containing all registered token descriptors. +	 * Vector containing all registered token types.  	 */ -	std::vector<std::unique_ptr<TokenDescriptor>> descriptors; +	std::vector<std::string> tokens; -public:  	/** -	 * Constructor of the DynamicTokenizer class. +	 * Next index in the tokens list where to search for a new token id. +	 */ +	size_t nextTokenTypeId; + +	/** +	 * Templated function used internally to read the current token. 
The +	 * function is templated in order to force code generation for all six +	 * combiations of whitespace modes and reading/peeking.  	 * -	 * @param reader is the CharReader that should be used for reading the -	 * tokens. -	 * @param preserveWhitespaces should be set to true if all whitespaces -	 * should be preserved (for preformated environments). -	 */ -	DynamicTokenizer(CharReader &reader) -	    : reader(reader), -	      preserveWhitespaces(preserveWhitespaces), -	      location(reader.getSourceId()), -	      empty(true), -	      hasWhitespace(false) -	{ -	} +	 * @tparam TextHandler is the type to be used for the textHandler instance. +	 * @tparam read specifies whether the function should start from and advance +	 * the read pointer of the char reader. +	 * @param reader is the CharReader instance from which the data should be +	 * read. +	 * @param token is the token structure into which the token information +	 * should be written. +	 * @return false if the end of the stream has been reached, true otherwise. +	 */ +	template <typename TextHandler, bool read> +	bool next(CharReader &reader, DynamicToken &token); +public:  	/** -	 * Destructor of the DynamicTokenizer class. +	 * Constructor of the DynamicTokenizer class. +	 * +	 * @param whitespaceMode specifies how whitespace should be handled.  	 */ -	~DynamicTokenizer(); +	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);  	/**  	 * Registers the given string as a token. Returns a const pointer at a  	 * TokenDescriptor that will be used to reference the newly created token.  	 *  	 * @param token is the token string that should be registered. -	 * @return a pointer at a TokenDescriptor which is representative for the -	 * newly registered token. Returns nullptr if a token with this string -	 * was already registered. +	 * @return a unique identifier for the registered token or EmptyToken if +	 * an error occured.  	 */ -	const TokenDescriptor* registerToken(const std::string &token); +	TokenTypeId registerToken(const std::string &token);  	/** -	 * Unregisters the token belonging to the given TokenDescriptor. +	 * Unregisters the token belonging to the given TokenTypeId.  	 * -	 * @param descr is a TokenDescriptor that was previously returned by -	 * registerToken. +	 * @param type is the token type that should be unregistered. The +	 *TokenTypeId +	 * must have been returned by registerToken.  	 * @return true if the operation was successful, false otherwise (e.g.  	 * because the given TokenDescriptor was already unregistered).  	 */ -	bool unregisterToken(const TokenDescriptor *descr); +	bool unregisterToken(TokenTypeId type); + +	/** +	 * Returns the token that was registered under the given TokenTypeId id or +	 *an +	 * empty string if an invalid TokenTypeId id is given. +	 * +	 * @param type is the TokenTypeId id for which the corresponding token +	 *string +	 * should be returned. +	 * @return the registered token string or an empty string if the given type +	 * was invalid. +	 */ +	std::string getTokenString(TokenTypeId type);  	/**  	 * Sets the whitespace mode. @@ -165,25 +216,28 @@ public:  	 * Reads a new token from the CharReader and stores it in the given  	 * DynamicToken instance.  	 * +	 * @param reader is the CharReader instance from which the data should be +	 * read.  	 * @param token is a reference at the token instance into which the Token  	 * information should be written.  	 * @return true if a token could be read, false if the end of the stream  	 * has been reached.  	 
*/ -	bool read(DynamicToken &token); +	bool read(CharReader &reader, DynamicToken &token);  	/** -	 * TokenDescriptor representing an empty token. -	 */ -	static const *TokenDescriptor Empty; - -	/** -	 * TokenDescriptor representing generic text. +	 * The peek method does not advance the read position of the char reader, +	 * but reads the next token from the current char reader peek position. +	 * +	 * @param reader is the CharReader instance from which the data should be +	 * read. +	 * @param token is a reference at the token instance into which the Token +	 * information should be written. +	 * @return true if a token could be read, false if the end of the stream +	 * has been reached.  	 */ -	static const *TokenDescriptor Text; - +	bool peek(CharReader &reader, DynamicToken &token);  }; -  }  #endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp index 15ca403..498cd43 100644 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -16,9 +16,6 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ -#include <sstream> -#include <unordered_set> -  #include <core/common/CharReader.hpp>  #include <core/common/Logger.hpp>  #include <core/common/Utils.hpp> @@ -27,123 +24,89 @@  namespace ousia { -/* Internally used types, protected from spilling the exports by a namespace */ -  namespace { -/** - * Enum used to specify the state of the parseBlockComment state machine. - */ -enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT };  /** - * Class taking care of recording plain text data found withing the file. + * Class used internally to collect data issued via "DATA" event.   */  class DataHandler {  private:  	/** -	 * Const reference at the reader, used for reading the current location. +	 * Internal character buffer.  	 */ -	const CharReader &reader; +	std::vector<char> buf;  	/** -	 * Flag defining whether whitespaces should be preserved or not. +	 * Start location of the character data.  	 */ -	const bool preserveWhitespaces; +	SourceOffset start;  	/** -	 * Current source range of the data in the buffer. +	 * End location of the character data.  	 */ -	SourceLocation location; +	SourceOffset end; -	/** -	 * Current buffer containing all read characters. -	 */ -	std::stringstream buffer; +public:  	/** -	 * Set to false, once a non-whitespace character was reached. +	 * Default constructor, initializes start and end with zeros.  	 */ -	bool empty; +	DataHandler() : start(0), end(0) {}  	/** -	 * Set to true if a whitespace was found -- these are normalized to a single -	 * space. +	 * Returns true if the internal buffer is empty. +	 * +	 * @return true if no characters were added to the internal buffer, false +	 * otherwise.  	 */ -	bool hasWhitespace; +	bool isEmpty() { return buf.empty(); } -public:  	/** -	 * Constructor of the DataHandler class. +	 * Appends a single character to the internal buffer.  	 * -	 * @param reader is the CharReader that should be used for reading the data -	 * location. -	 * @param preserveWhitespaces should be set to true if all whitespaces -	 * should be preserved (for preformated environments). +	 * @param c is the character that should be added to the internal buffer. +	 * @param charStart is the start position of the character. +	 * @param charEnd is the end position of the character.  	 
*/ -	DataHandler(const CharReader &reader, bool preserveWhitespaces = false) -	    : reader(reader), -	      preserveWhitespaces(preserveWhitespaces), -	      location(reader.getSourceId()), -	      empty(true), -	      hasWhitespace(false) +	void append(char c, SourceOffset charStart, SourceOffset charEnd)  	{ +		if (isEmpty()) { +			start = charStart; +		} +		buf.push_back(c); +		end = charEnd;  	}  	/** -	 * Appends the given character to the internal buffer. +	 * Appends a string to the internal buffer.  	 * -	 * @param c is the character that should be appended. -	 * @param wasEscaped is set to true if the character was escaped (prepended -	 * with a backslash), this allows whitespace characters to be explicitly -	 * included. +	 * @param s is the string that should be added to the internal buffer. +	 * @param stringStart is the start position of the string. +	 * @param stringEnd is the end position of the string.  	 */ -	void append(char c, bool wasEscaped = false) +	void append(const std::string &s, SourceOffset stringStart, +	            SourceOffset stringEnd)  	{ -		// Check whether the character is a whitespace -		const bool isWhitespace = -		    !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c); - -		// Trim leading and trailing whitespaces -		if (isWhitespace) { -			if (!empty) { -				hasWhitespace = true; -			} -		} else { -			// Compress whitespaces to a single space -			if (hasWhitespace) { -				buffer << ' '; -				hasWhitespace = false; -			} - -			// Append the character -			buffer << c; - -			// Update the "empty" flag and set the start and end offset -			if (empty) { -				location.setStart(reader.getOffset()); -				empty = false; -			} -			location.setEnd(reader.getPeekOffset()); +		if (isEmpty()) { +			start = stringStart;  		} +		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); +		end = stringEnd;  	}  	/** -	 * Returns true if no non-whitespace character has been found until now. -	 * -	 * @return true if the internal buffer is still empty. -	 */ -	bool isEmpty() { return empty; } - -	/** -	 * Returns a variant containg the read data and its location. +	 * Converts the internal buffer to a variant with attached location +	 * information.  	 * -	 * @return a variant with a string value containing the read data and the -	 * location being set to +	 * @param sourceId is the source id which is needed for building the +	 * location information. +	 * @return a Variant with the internal buffer content as string and +	 * the correct start and end location.  	 
*/ -	Variant getData() +	Variant toVariant(SourceId sourceId)  	{ -		Variant res = Variant::fromString(buffer.str()); -		res.setLocation(location); +		Variant res = Variant::fromString(std::string(buf.data(), buf.size())); +		res.setLocation({sourceId, start, end});  		return res;  	}  }; @@ -153,35 +116,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,                                                   Logger &logger)      : reader(reader), logger(logger), fieldIdx(0)  { +	tokenBackslash = tokenizer.registerToken("\\"); +	tokenLinebreak = tokenizer.registerToken("\n"); +	tokenLineComment = tokenizer.registerToken("%"); +	tokenBlockCommentStart = tokenizer.registerToken("%{"); +	tokenBlockCommentEnd = tokenizer.registerToken("}%");  } -/* Comment handling */ -  void PlainFormatStreamReader::parseBlockComment()  { -	char c; -	BlockCommentState state = BlockCommentState::DEFAULT; -	while (reader.read(c)) { -		switch (state) { -			case BlockCommentState::DEFAULT: -				if (c == '%') { -					state = BlockCommentState::HAS_PERCENT; -				} else if (c == '}') { -					state = BlockCommentState::HAS_CURLY_CLOSE; -				} -				break; -			case BlockCommentState::HAS_PERCENT: -				if (c == '{') { -					parseBlockComment(); -				} -				state = BlockCommentState::DEFAULT; -				break; -			case BlockCommentState::HAS_CURLY_CLOSE: -				if (c == '%') { -					return; -				} -				state = BlockCommentState::DEFAULT; -				break; +	DynamicToken token; +	size_t depth = 1; +	while (tokenizer.read(reader, token)) { +		if (token.type == tokenBlockCommentEnd) { +			depth--; +			if (depth == 0) { +				return; +			} +		} +		if (token.type == tokenBlockCommentStart) { +			depth++;  		}  	} @@ -189,102 +143,84 @@ void PlainFormatStreamReader::parseBlockComment()  	logger.error("File ended while being in a block comment", reader);  } -void PlainFormatStreamReader::parseComment() +void PlainFormatStreamReader::parseLineComment()  {  	char c; -	bool first = true;  	reader.consumePeek();  	while (reader.read(c)) { -		// Continue parsing a block comment if a '{' is found -		if (c == '{' && first) { -			parseBlockComment(); -			return; -		}  		if (c == '\n') {  			return;  		} -		first = false;  	}  } -/* Top level parse function */ - -static const std::unordered_set<char> EscapeableCharacters{'\\', '<', '>', -                                                    '{',  '}', '%'}; -  PlainFormatStreamReader::State PlainFormatStreamReader::parse()  {  // Macro (sorry for that) used for checking whether there is data to issue, and  // if yes, aborting the loop, allowing for a reentry on a later parse call by  // resetting the peek cursor -#define CHECK_ISSUE_DATA()      \ -	{                           \ -		if (!dataHandler.isEmpty()) {   \ -			reader.resetPeek(); \ -			abort = true;       \ -			break;              \ -		}                       \ +#define CHECK_ISSUE_DATA()            \ +	{                                 \ +		if (!dataHandler.isEmpty()) { \ +			reader.resetPeek();       \ +			abort = true;             \ +			break;                    \ +		}                             \  	} -	// Data handler -	DataHandler dataHandler(reader); +	// Handler for incomming data +	DataHandler dataHandler;  	// Variable set to true if the parser loop should be left  	bool abort = false; -	// Happily add characters to the dataHandler and handle escaping until a -	// special character is reached. 
Then go to a specialiced parsing routine -	char c; -	while (!abort && reader.peek(c)) { -		switch (c) { -			case '\\': -				reader.peek(c); -				// Check whether this backslash just escaped some special or -				// whitespace character or was the beginning of a command -				if (EscapeableCharacters.count(c) == 0 && -				    !Utils::isWhitespace(c)) { -					CHECK_ISSUE_DATA(); -					// TODO: Parse command (starting from the backslash) -					return State::COMMAND; -				} -				// A character was escaped, add it to the buffer, with the -				// wasEscaped flag set to true -				dataHandler.append(c, true); -				break; -			case '<': -				// TODO: Annotations -				break; -			case '>': -				// TODO: Annotations -				break; -			case '{': -				// TODO: Issue start of field -				break; -			case '}': -			// TODO: Issue end of field -			case '%': -				CHECK_ISSUE_DATA(); -				parseComment(); -				break; -			case '\n': +	// Read tokens until the outer loop should be left +	DynamicToken token; +	while (!abort && tokenizer.peek(reader, token)) { +		// Check whether this backslash just escaped some special or +		// whitespace character or was the beginning of a command +		if (token.type == tokenBackslash) { +			// Check whether this character could be the start of a command +			char c; +			reader.consumePeek(); +			reader.peek(c); +			if (Utils::isIdentifierStart(c)) {  				CHECK_ISSUE_DATA(); -				reader.consumePeek(); -				return State::LINEBREAK; -			default: -				dataHandler.append(c, false); +				// TODO: Parse a command +				return State::COMMAND; +			} + +			// This was not a special character, just append the given character +			// to the data buffer, use the escape character start as start +			// location and the peek offset as end location +			dataHandler.append(c, token.location.getStart(), +			                   reader.getPeekOffset()); +		} else if (token.type == tokenLineComment) { +			CHECK_ISSUE_DATA(); +			reader.consumePeek(); +			parseLineComment(); +		} else if (token.type == tokenBlockCommentStart) { +			CHECK_ISSUE_DATA(); +			reader.consumePeek(); +			parseBlockComment(); +		} else if (token.type == tokenLinebreak) { +			CHECK_ISSUE_DATA(); +			reader.consumePeek(); +			return State::LINEBREAK; +		} else if (token.type == TextToken) { +			dataHandler.append(token.content, token.location.getStart(), +			                   token.location.getEnd());  		}  		// Consume the peeked character if we did not abort, otherwise abort  		if (!abort) {  			reader.consumePeek(); -		} else { -			break;  		}  	}  	// Send out pending output data, otherwise we are at the end of the stream  	if (!dataHandler.isEmpty()) { -		data = dataHandler.getData(); +		data = dataHandler.toVariant(reader.getSourceId());  		return State::DATA;  	}  	return State::END; diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp index 1a136cd..b2ea378 100644 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ b/src/plugins/plain/PlainFormatStreamReader.hpp @@ -31,6 +31,8 @@  #include <core/common/Variant.hpp> +#include "DynamicTokenizer.hpp" +  namespace ousia {  // Forward declarations @@ -123,6 +125,11 @@ private:  	Logger &logger;  	/** +	 * Tokenizer instance used to read individual tokens from the text. +	 */ +	DynamicTokenizer tokenizer; + +	/**  	 * Variant containing the current command name (always is a string variant,  	 * but additionally contains the correct locatino of the name).  	 
*/ @@ -141,6 +148,31 @@ private:  	Variant data;  	/** +	 * Id of the backslash token. +	 */ +	TokenTypeId tokenBackslash; + +	/** +	 * Id of the linebreak token. +	 */ +	TokenTypeId tokenLinebreak; + +	/** +	 * Id of the line comment token. +	 */ +	TokenTypeId tokenLineComment; + +	/** +	 * Id of the block comment start token. +	 */ +	TokenTypeId tokenBlockCommentStart; + +	/** +	 * If of the block comment end token. +	 */ +	TokenTypeId tokenBlockCommentEnd; + +	/**  	 * Contains the field index of the current command.  	 */  	size_t fieldIdx; @@ -153,7 +185,7 @@ private:  	/**  	 * Function used internally to parse a generic comment.  	 */ -	void parseComment(); +	void parseLineComment();  public:  	/** diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp index 8b7bfc2..4a0430b 100644 --- a/src/plugins/plain/DynamicTokenTree.cpp +++ b/src/plugins/plain/TokenTrie.cpp @@ -16,18 +16,18 @@      along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ -#include "DynamicTokenTree.hpp" +#include "TokenTrie.hpp"  namespace ousia {  /* Class DynamicTokenTree::Node */ -DynamicTokenTree::Node::Node() : descriptor(nullptr) {} +TokenTrie::Node::Node() : type(EmptyToken) {}  /* Class DynamicTokenTree */ -bool DynamicTokenTree::registerToken(const std::string &token, -                                     const TokenDescriptor *descriptor) noexcept +bool TokenTrie::registerToken(const std::string &token, +                              TokenTypeId type) noexcept  {  	// Abort if the token is empty -- this would taint the root node  	if (token.empty()) { @@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token,  		const char c = token[i];  		auto it = node->children.find(c);  		if (it == node->children.end()) { -			it = node->children.emplace(c, std::unique_ptr<Node>(new Node{})) -			         .first; +			it = node->children.emplace(c, std::make_shared<Node>()).first;  		}  		node = it->second.get();  	} -	// If the resulting node already has a descriptor set, we're screwed. -	if (node->descriptor != nullptr) { +	// If the resulting node already has a type set, we're screwed. +	if (node->type != EmptyToken) {  		return false;  	} -	// Otherwise just set the descriptor to the given descriptor. -	node->descriptor = descriptor; +	// Otherwise just set the type to the given type. +	node->type = type;  	return true;  } -bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept +bool TokenTrie::unregisterToken(const std::string &token) noexcept  {  	// We cannot remove empty tokens as we need to access the fist character  	// upfront @@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept  			return false;  		} -		// Reset the subtree handler if this node has another descriptor +		// Reset the subtree handler if this node has another type  		node = it->second.get(); -		if ((node->descriptor != nullptr || node->children.size() > 1) && +		if ((node->type != EmptyToken || node->children.size() > 1) &&  		    (i + 1 != token.size())) {  			subtreeRoot = node;  			subtreeKey = token[i + 1];  		}  	} -	// If the node descriptor is already nullptr, we cannot do anything here -	if (node->descriptor == nullptr) { +	// If the node type is already EmptyToken, we cannot do anything here +	if (node->type == EmptyToken) {  		return false;  	}  	// If the target node has children, we cannot delete the subtree. 
Set the -	// descriptor to nullptr instead +	// type to EmptyToken instead  	if (!node->children.empty()) { -		node->descriptor = nullptr; +		node->type = EmptyToken;  		return true;  	} @@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept  	return true;  } -const TokenDescriptor *DynamicTokenTree::hasToken( -    const std::string &token) const noexcept +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept  {  	Node const *node = &root;  	for (size_t i = 0; i < token.size(); i++) {  		const char c = token[i];  		auto it = node->children.find(c);  		if (it == node->children.end()) { -			return nullptr; +			return EmptyToken;  		}  		node = it->second.get();  	} -	return node->descriptor; +	return node->type;  }  } diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp index c5dc4de..36c2ffa 100644 --- a/src/plugins/plain/DynamicTokenTree.hpp +++ b/src/plugins/plain/TokenTrie.hpp @@ -17,54 +17,61 @@  */  /** - * @file DynamicTokenTree.hpp + * @file TokenTrie.hpp   * - * Class representing a token tree that can be updated dynamically. + * Class representing a token trie that can be updated dynamically.   *   * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)   * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)   */ -#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ -#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ +#include <cstdint>  #include <memory> +#include <limits>  #include <unordered_map>  namespace ousia { -class TokenDescriptor; +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t;  /** - * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to - * identify the longest consecutive token in the text. This is equivalent to a - * prefix trie. + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie.   * - * A token tree is a construct that structures all special tokens a - * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then - * the token tree would look like this: + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this:   *   * \code{*.txt} - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ + *        ~ (0) + *       /     \ + *      a (2)  b (0) + *      |      | + *      a (0)  a (0) + *      |      | + *      b (1)  c (0)   * \endcode   * - * Every node in the token tree is a valid end state that has a $ attached to - * it. During the search algorithm the Tokenizer goes through the tree and - * stores the last valid position. If a character follows that does not lead to - * a new node in the TokenTree the search ends (and starts again at this - * character). The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. 
Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). + * Where the number indicates the corresponding token descriptor identifier.   */ -class DynamicTokenTree { +class TokenTrie {  public:  	/**  	 * Structure used to build the node tree. @@ -73,7 +80,7 @@ public:  		/**  		 * Type used for the child map.  		 */ -		using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>; +		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;  		/**  		 * Map from single characters at the corresponding child nodes. @@ -84,7 +91,7 @@ public:  		 * Reference at the corresponding token descriptor. Set to nullptr if  		 * no token is attached to this node.  		 */ -		TokenDescriptor const *descriptor; +		TokenTypeId type;  		/**  		 * Default constructor, initializes the descriptor with nullptr. @@ -105,11 +112,10 @@ public:  	 *  	 * @param token is the character sequence that should be registered as  	 * token. -	 * @param descriptor is the descriptor that should be set for this token. +	 * @param type is the descriptor that should be set for this token.  	 * @return true if the operation is successful, false otherwise.  	 */ -	bool registerToken(const std::string &token, -	                   const TokenDescriptor *descriptor) noexcept; +	bool registerToken(const std::string &token, TokenTypeId type) noexcept;  	/**  	 * Unregisters the token from the token tree. Returns true if the token was @@ -128,9 +134,17 @@ public:  	 * @return the attached token descriptor or nullptr if the given token is  	 * not found.  	 */ -	const TokenDescriptor* hasToken(const std::string &token) const noexcept; +	TokenTypeId hasToken(const std::string &token) const noexcept; + +	/** +	 * Returns a reference at the root node to be used for traversing the token +	 * tree. +	 * +	 * @return a reference at the root node. +	 */ +	const Node *getRoot() const noexcept { return &root; }  };  } -#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */ +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */  | 

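The core matching technique of the commit — a greedy longest match over a prefix trie, as documented in TokenTrie.hpp and implemented by the TokenLookup::advance threads — condenses to the self-contained sketch below. It uses only the standard library; the names mirror the diff but are illustrative rather than part of the codebase:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>

using TokenTypeId = uint32_t;
constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();

struct Node {
	std::unordered_map<char, std::shared_ptr<Node>> children;
	TokenTypeId type = EmptyToken;
};

// Register a token by walking (and creating) one trie node per character.
void registerToken(Node &root, const std::string &token, TokenTypeId type)
{
	Node *node = &root;
	for (char c : token) {
		auto it = node->children.find(c);
		if (it == node->children.end()) {
			it = node->children.emplace(c, std::make_shared<Node>()).first;
		}
		node = it->second.get();
	}
	node->type = type;
}

// Greedy longest match at position `pos`: keep walking while the trie has a
// matching child and remember the last node that carried a complete token.
size_t longestMatch(const Node &root, const std::string &text, size_t pos,
                    TokenTypeId &type)
{
	const Node *node = &root;
	size_t matchLen = 0;
	type = EmptyToken;
	for (size_t i = pos; i < text.size(); i++) {
		auto it = node->children.find(text[i]);
		if (it == node->children.end()) {
			break;
		}
		node = it->second.get();
		if (node->type != EmptyToken) {
			type = node->type;
			matchLen = i - pos + 1;
		}
	}
	return matchLen;
}

int main()
{
	Node root;
	registerToken(root, "%", 0);
	registerToken(root, "%{", 1);  // shares the "%" prefix
	TokenTypeId type;
	size_t len = longestMatch(root, "%{ comment", 0, type);
	std::cout << "matched token " << type << " of length " << len << "\n";
	// Prints "matched token 1 of length 2": "%{" wins over the shorter "%".
}
```

This is exactly why PlainFormatStreamReader can register both "%" and "%{" and still receive the block-comment start token when both would match.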
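The three WhitespaceMode values differ only in how the text handlers buffer characters before a text token is emitted. A standalone approximation of the COLLAPSE behaviour follows — note it uses std::isspace as a stand-in for the project's Utils::isWhitespace, which this diff does not define:

```cpp
#include <cctype>
#include <iostream>
#include <string>

// Collapse semantics as in CollapsingTextHandler: leading and trailing
// whitespace is dropped, inner whitespace runs become a single blank.
std::string collapse(const std::string &s)
{
	std::string out;
	bool hasWhitespace = false;
	for (char c : s) {
		if (std::isspace(static_cast<unsigned char>(c))) {
			hasWhitespace = !out.empty();  // ignore leading whitespace
			continue;
		}
		if (hasWhitespace) {
			out.push_back(' ');
			hasWhitespace = false;
		}
		out.push_back(c);
	}
	return out;
}

int main()
{
	std::cout << '"' << collapse("  hello   world \n") << '"' << std::endl;
	// Prints "hello world" -- TRIM would keep the inner whitespace verbatim
	// and only strip the ends, while PRESERVE would keep the string as-is.
}
```
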