diff options
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-10 16:42:20 +0100 | 
|---|---|---|
| committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-10 16:42:20 +0100 | 
| commit | 6352751f87a4acdd9a32baf4f1133819678d06b0 (patch) | |
| tree | 23f1764605227d25ee9861fcf23e2dc381bdc187 /src/plugins/plain | |
| parent | e85e174028de377cee9f6dfcccbe418d185cc7b4 (diff) | |
Renamed some files and changed folder structure
Diffstat (limited to 'src/plugins/plain')
| -rw-r--r-- | src/plugins/plain/DynamicTokenizer.cpp | 544 | ||||
| -rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 252 | ||||
| -rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.cpp | 641 | ||||
| -rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.hpp | 347 | ||||
| -rw-r--r-- | src/plugins/plain/TokenTrie.cpp | 119 | ||||
| -rw-r--r-- | src/plugins/plain/TokenTrie.hpp | 150 | 
6 files changed, 0 insertions, 2053 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp deleted file mode 100644 index f2cfcd1..0000000 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <memory> -#include <vector> - -#include <core/common/CharReader.hpp> -#include <core/common/Exceptions.hpp> -#include <core/common/Utils.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { -	/** -	 * Token that was matched. -	 */ -	DynamicToken token; - -	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. -	 */ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; - -	/** -	 * Constructor of the TokenMatch class. -	 */ -	TokenMatch() : textLength(0), textEnd(0) {} - -	/** -	 * Returns true if this TokenMatch instance actually represents a match. -	 */ -	bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: -	/** -	 * Current node within the token trie. -	 */ -	TokenTrie::Node const *node; - -	/** -	 * Start offset within the source file. -	 */ -	size_t start; - -	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. -	 */ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; - -public: -	/** -	 * Constructor of the TokenLookup class. -	 * -	 * @param node is the current node. -	 * @param start is the start position. -	 * @param textLength is the text buffer length of the previous text token. -	 * @param textEnd is the current end location of the previous text token. -	 */ -	TokenLookup(const TokenTrie::Node *node, size_t start, -	            size_t textLength, size_t textEnd) -	    : node(node), start(start), textLength(textLength), textEnd(textEnd) -	{ -	} - -	/** -	 * Tries to extend the current path in the token trie with the given -	 * character. If a complete token is matched, stores this match in the -	 * tokens list (in case it is longer than any previous token). -	 * -	 * @param c is the character that should be appended to the current prefix. -	 * @param lookups is a list to which new TokeLookup instances are added -- -	 * which could potentially be expanded in the next iteration. -	 * @param match is the DynamicToken instance to which the matching token -	 * should be written. -	 * @param tokens is a reference at the internal token list of the -	 * DynamicTokenizer. -	 * @param end is the end byte offset of the current character. -	 * @param sourceId is the source if of this file. -	 */ -	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, -	             const std::vector<std::string> &tokens, SourceOffset end, -	             SourceId sourceId) -	{ -		// Check whether we can continue the current token path with the given -		// character without visiting an already visited node -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			return; -		} - -		// Check whether the new node represents a complete token a whether it -		// is longer than the current token. If yes, replace the current token. -		node = it->second.get(); -		if (node->type != EmptyToken) { -			const std::string &str = tokens[node->type]; -			size_t len = str.size(); -			if (len > match.token.content.size()) { -				match.token = -				    DynamicToken{node->type, str, {sourceId, start, end}}; -				match.textLength = textLength; -				match.textEnd = textEnd; -			} -		} - -		// If this state can possibly be advanced, store it in the states list. -		if (!node->children.empty()) { -			lookups.emplace_back(*this); -		} -	} -}; - -/* Internal class TextHandlerBase */ - -/** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. - */ -class TextHandlerBase { -public: -	/** -	 * Start position of the extracted text. -	 */ -	size_t textStart; - -	/** -	 * End position of the extracted text. -	 */ -	size_t textEnd; - -	/** -	 * Buffer containing the extracted text. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Constructor of the TextHandlerBase base class. Initializes the start and -	 * end position with zeros. -	 */ -	TextHandlerBase() : textStart(0), textEnd(0) {} - -	/** -	 * Transforms the given token into a text token containing the extracted -	 * text. -	 * -	 * @param token is the output token to which the text should be written. -	 * @param sourceId is the source id of the underlying file. -	 */ -	void buildTextToken(TokenMatch &match, SourceId sourceId) -	{ -		if (match.hasMatch()) { -			match.token.content = -			    std::string{textBuf.data(), match.textLength}; -			match.token.location = -			    SourceLocation{sourceId, textStart, match.textEnd}; -		} else { -			match.token.content = std::string{textBuf.data(), textBuf.size()}; -			match.token.location = SourceLocation{sourceId, textStart, textEnd}; -		} -		match.token.type = TextToken; -	} - -	/** -	 * Returns true if this whitespace handler has found any text and a text -	 * token could be emitted. -	 * -	 * @return true if the internal data buffer is non-empty. -	 */ -	bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Appends the given character to the internal text buffer, does not -	 * eliminate whitespace. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; -		textBuf.push_back(c); -	} -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Buffer used internally to temporarily store all whitespace characters. -	 * They are only added to the output buffer if another non-whitespace -	 * character is reached. -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * whitespace characters at the begin and end of the text. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				whitespaceBuf.push_back(c); -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (!whitespaceBuf.empty()) { -			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), -			               whitespaceBuf.end()); -			whitespaceBuf.clear(); -		} -		textBuf.push_back(c); -	} -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Flag set to true if a whitespace character was reached. -	 */ -	bool hasWhitespace = false; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * redundant whitespace characters. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				hasWhitespace = true; -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (hasWhitespace) { -			textBuf.push_back(' '); -			hasWhitespace = false; -		} -		textBuf.push_back(c); -	} -}; -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) -    : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template <typename TextHandler, bool read> -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ -	// If we're in the read mode, reset the char reader peek position to the -	// current read position -	if (read) { -		reader.resetPeek(); -	} - -	// Prepare the lookups in the token trie -	const TokenTrie::Node *root = trie.getRoot(); -	TokenMatch match; -	std::vector<TokenLookup> lookups; -	std::vector<TokenLookup> nextLookups; - -	// Instantiate the text handler -	TextHandler textHandler; - -	// Peek characters from the reader and try to advance the current token tree -	// cursor -	char c; -	size_t charStart = reader.getPeekOffset(); -	const SourceId sourceId = reader.getSourceId(); -	while (reader.peek(c)) { -		const size_t charEnd = reader.getPeekOffset(); -		const size_t textLength = textHandler.textBuf.size(); -		const size_t textEnd = textHandler.textEnd; - -		// If we do not have a match yet, start a new lookup from the root -		if (!match.hasMatch()) { -			TokenLookup{root, charStart, textLength, textEnd}.advance( -			    c, nextLookups, match, tokens, charEnd, sourceId); -		} - -		// Try to advance all other lookups with the new character -		for (TokenLookup &lookup : lookups) { -			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); -		} - -		// We have found a token and there are no more states to advance or the -		// text handler has found something -- abort to return the new token -		if (match.hasMatch()) { -			if ((nextLookups.empty() || textHandler.hasText())) { -				break; -			} -		} else { -			// Record all incomming characters -			textHandler.append(c, charStart, charEnd); -		} - -		// Swap the lookups and the nextLookups list -		lookups = std::move(nextLookups); -		nextLookups.clear(); - -		// Advance the offset -		charStart = charEnd; -	} - -	// If we found text, emit that text -	if (textHandler.hasText() && -	    (!match.hasMatch() || match.textLength > 0)) { -		textHandler.buildTextToken(match, sourceId); -	} - -	// Move the read/peek cursor to the end of the token, abort if an error -	// happens while doing so -	if (match.hasMatch()) { -		// Make sure we have a valid location -		if (match.token.location.getEnd() == InvalidSourceOffset) { -			throw OusiaException{"Token end position offset out of range"}; -		} - -		// Seek to the end of the current token -		const size_t end = match.token.location.getEnd(); -		if (read) { -			reader.seek(end); -		} else { -			reader.seekPeekCursor(end); -		} -		token = match.token; -	} else { -		token = DynamicToken{}; -	} -	return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) -{ -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingTextHandler, true>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingTextHandler, true>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingTextHandler, true>(reader, token); -	} -	return false; -} - -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) -{ -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingTextHandler, false>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingTextHandler, false>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingTextHandler, false>(reader, token); -	} -	return false; -} - -TokenTypeId DynamicTokenizer::registerToken(const std::string &token) -{ -	// Abort if an empty token should be registered -	if (token.empty()) { -		return EmptyToken; -	} - -	// Search for a new slot in the tokens list -	TokenTypeId type = EmptyToken; -	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { -		if (tokens[i].empty()) { -			tokens[i] = token; -			type = i; -			break; -		} -	} - -	// No existing slot was found, add a new one -- make sure we do not -	// override the special token type handles -	if (type == EmptyToken) { -		type = tokens.size(); -		if (type == TextToken || type == EmptyToken) { -			throw OusiaException{"Token type ids depleted!"}; -		} -		tokens.emplace_back(token); -	} -	nextTokenTypeId = type + 1; - -	// Try to register the token in the trie -- if this fails, remove it -	// from the tokens list -	if (!trie.registerToken(token, type)) { -		tokens[type] = std::string(); -		nextTokenTypeId = type; -		return EmptyToken; -	} -	return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ -	// Unregister the token from the trie, abort if an invalid type is given -	if (type < tokens.size() && trie.unregisterToken(tokens[type])) { -		tokens[type] = std::string{}; -		nextTokenTypeId = type; -		return true; -	} -	return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ -	if (type < tokens.size()) { -		return tokens[type]; -	} -	return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ -	whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member -   function */ -template bool DynamicTokenizer::next<PreservingTextHandler, false>( -    CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, false>( -    CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, false>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<PreservingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -} - diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp deleted file mode 100644 index 0cac2e8..0000000 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ /dev/null @@ -1,252 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include <set> -#include <string> -#include <vector> - -#include <core/common/Location.hpp> - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { -	/** -	 * Id of the type of this token. -	 */ -	TokenTypeId type; - -	/** -	 * String that was matched. -	 */ -	std::string content; - -	/** -	 * Location from which the string was extracted. -	 */ -	SourceLocation location; - -	/** -	 * Default constructor. -	 */ -	DynamicToken() : type(EmptyToken) {} - -	/** -	 * Constructor of the DynamicToken struct. -	 * -	 * @param id represents the token type. -	 * @param content is the string content that has been extracted. -	 * @param location is the location of the extracted string content in the -	 * source file. -	 */ -	DynamicToken(TokenTypeId type, const std::string &content, -	             SourceLocation location) -	    : type(type), content(content), location(location) -	{ -	} - -	/** -	 * Constructor of the DynamicToken struct, only initializes the token type -	 * -	 * @param type is the id corresponding to the type of the token. -	 */ -	DynamicToken(TokenTypeId type) : type(type) {} - -	/** -	 * The getLocation function allows the tokens to be directly passed as -	 * parameter to Logger or LoggableException instances. -	 * -	 * @return a reference at the location field -	 */ -	const SourceLocation &getLocation() const { return location; } -}; - -/** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { -	/** -     * Preserves all whitespaces as they are found in the source file. -     */ -	PRESERVE, - -	/** -     * Trims whitespace at the beginning and the end of the found text. -     */ -	TRIM, - -	/** -     * Whitespaces are trimmed and collapsed, multiple whitespace characters -     * are replaced by a single space character. -     */ -	COLLAPSE -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: -	/** -	 * Internally used token trie. This object holds all registered tokens. -	 */ -	TokenTrie trie; - -	/** -	 * Flag defining whether whitespaces should be preserved or not. -	 */ -	WhitespaceMode whitespaceMode; - -	/** -	 * Vector containing all registered token types. -	 */ -	std::vector<std::string> tokens; - -	/** -	 * Next index in the tokens list where to search for a new token id. -	 */ -	size_t nextTokenTypeId; - -	/** -	 * Templated function used internally to read the current token. The -	 * function is templated in order to force code generation for all six -	 * combiations of whitespace modes and reading/peeking. -	 * -	 * @tparam TextHandler is the type to be used for the textHandler instance. -	 * @tparam read specifies whether the function should start from and advance -	 * the read pointer of the char reader. -	 * @param reader is the CharReader instance from which the data should be -	 * read. -	 * @param token is the token structure into which the token information -	 * should be written. -	 * @return false if the end of the stream has been reached, true otherwise. -	 */ -	template <typename TextHandler, bool read> -	bool next(CharReader &reader, DynamicToken &token); - -public: -	/** -	 * Constructor of the DynamicTokenizer class. -	 * -	 * @param whitespaceMode specifies how whitespace should be handled. -	 */ -	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - -	/** -	 * Registers the given string as a token. Returns a const pointer at a -	 * TokenDescriptor that will be used to reference the newly created token. -	 * -	 * @param token is the token string that should be registered. -	 * @return a unique identifier for the registered token or EmptyToken if -	 * an error occured. -	 */ -	TokenTypeId registerToken(const std::string &token); - -	/** -	 * Unregisters the token belonging to the given TokenTypeId. -	 * -	 * @param type is the token type that should be unregistered. The -	 *TokenTypeId -	 * must have been returned by registerToken. -	 * @return true if the operation was successful, false otherwise (e.g. -	 * because the given TokenDescriptor was already unregistered). -	 */ -	bool unregisterToken(TokenTypeId type); - -	/** -	 * Returns the token that was registered under the given TokenTypeId id or -	 *an -	 * empty string if an invalid TokenTypeId id is given. -	 * -	 * @param type is the TokenTypeId id for which the corresponding token -	 *string -	 * should be returned. -	 * @return the registered token string or an empty string if the given type -	 * was invalid. -	 */ -	std::string getTokenString(TokenTypeId type); - -	/** -	 * Sets the whitespace mode. -	 * -	 * @param whitespaceMode defines how whitespace should be treated in text -	 * tokens. -	 */ -	void setWhitespaceMode(WhitespaceMode mode); - -	/** -	 * Returns the current value of the whitespace mode. -	 * -	 * @return the whitespace mode. -	 */ -	WhitespaceMode getWhitespaceMode(); - -	/** -	 * Reads a new token from the CharReader and stores it in the given -	 * DynamicToken instance. -	 * -	 * @param reader is the CharReader instance from which the data should be -	 * read. -	 * @param token is a reference at the token instance into which the Token -	 * information should be written. -	 * @return true if a token could be read, false if the end of the stream -	 * has been reached. -	 */ -	bool read(CharReader &reader, DynamicToken &token); - -	/** -	 * The peek method does not advance the read position of the char reader, -	 * but reads the next token from the current char reader peek position. -	 * -	 * @param reader is the CharReader instance from which the data should be -	 * read. -	 * @param token is a reference at the token instance into which the Token -	 * information should be written. -	 * @return true if a token could be read, false if the end of the stream -	 * has been reached. -	 */ -	bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp deleted file mode 100644 index 05769f0..0000000 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ /dev/null @@ -1,641 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <core/common/CharReader.hpp> -#include <core/common/Logger.hpp> -#include <core/common/Utils.hpp> -#include <core/common/VariantReader.hpp> - -#include "PlainFormatStreamReader.hpp" - -namespace ousia { - -/** - * Plain format default tokenizer. - */ -class PlainFormatTokens : public DynamicTokenizer { -public: -	/** -	 * Id of the backslash token. -	 */ -	TokenTypeId Backslash; - -	/** -	 * Id of the line comment token. -	 */ -	TokenTypeId LineComment; - -	/** -	 * Id of the block comment start token. -	 */ -	TokenTypeId BlockCommentStart; - -	/** -	 * Id of the block comment end token. -	 */ -	TokenTypeId BlockCommentEnd; - -	/** -	 * Id of the field start token. -	 */ -	TokenTypeId FieldStart; - -	/** -	 * Id of the field end token. -	 */ -	TokenTypeId FieldEnd; - -	/** -	 * Registers the plain format tokens in the internal tokenizer. -	 */ -	PlainFormatTokens() -	{ -		Backslash = registerToken("\\"); -		LineComment = registerToken("%"); -		BlockCommentStart = registerToken("%{"); -		BlockCommentEnd = registerToken("}%"); -		FieldStart = registerToken("{"); -		FieldEnd = registerToken("}"); -	} -}; - -static const PlainFormatTokens Tokens; - -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: -	/** -	 * Internal character buffer. -	 */ -	std::vector<char> buf; - -	/** -	 * Start location of the character data. -	 */ -	SourceOffset start; - -	/** -	 * End location of the character data. -	 */ -	SourceOffset end; - -public: -	/** -	 * Default constructor, initializes start and end with zeros. -	 */ -	DataHandler() : start(0), end(0) {} - -	/** -	 * Returns true if the internal buffer is empty. -	 * -	 * @return true if no characters were added to the internal buffer, false -	 * otherwise. -	 */ -	bool isEmpty() { return buf.empty(); } - -	/** -	 * Appends a single character to the internal buffer. -	 * -	 * @param c is the character that should be added to the internal buffer. -	 * @param charStart is the start position of the character. -	 * @param charEnd is the end position of the character. -	 */ -	void append(char c, SourceOffset charStart, SourceOffset charEnd) -	{ -		if (isEmpty()) { -			start = charStart; -		} -		buf.push_back(c); -		end = charEnd; -	} - -	/** -	 * Appends a string to the internal buffer. -	 * -	 * @param s is the string that should be added to the internal buffer. -	 * @param stringStart is the start position of the string. -	 * @param stringEnd is the end position of the string. -	 */ -	void append(const std::string &s, SourceOffset stringStart, -	            SourceOffset stringEnd) -	{ -		if (isEmpty()) { -			start = stringStart; -		} -		std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); -		end = stringEnd; -	} - -	/** -	 * Converts the internal buffer to a variant with attached location -	 * information. -	 * -	 * @param sourceId is the source id which is needed for building the -	 * location information. -	 * @return a Variant with the internal buffer content as string and -	 * the correct start and end location. -	 */ -	Variant toVariant(SourceId sourceId) -	{ -		Variant res = Variant::fromString(std::string(buf.data(), buf.size())); -		res.setLocation({sourceId, start, end}); -		return res; -	} -}; - -PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader, -                                                 Logger &logger) -    : reader(reader), logger(logger), tokenizer(Tokens) -{ -	// Place an intial command representing the complete file on the stack -	commands.push(Command{"", Variant::mapType{}, true, true, true}); -} - -Variant PlainFormatStreamReader::parseIdentifier(size_t start, bool allowNSSep) -{ -	bool first = true; -	bool hasCharSiceNSSep = false; -	std::vector<char> identifier; -	size_t end = reader.getPeekOffset(); -	char c, c2; -	while (reader.peek(c)) { -		// Abort if this character is not a valid identifer character -		if ((first && Utils::isIdentifierStartCharacter(c)) || -		    (!first && Utils::isIdentifierCharacter(c))) { -			identifier.push_back(c); -		} else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { -			identifier.push_back(c); -		} else { -			if (c == ':' && allowNSSep) { -				logger.error( -				    "Expected character before and after namespace separator \":\"", -				    reader); -			} -			reader.resetPeek(); -			break; -		} - -		// This is no longer the first character -		first = false; - -		// Advance the hasCharSiceNSSep flag -		hasCharSiceNSSep = allowNSSep && (c != ':'); - -		end = reader.getPeekOffset(); -		reader.consumePeek(); -	} - -	// Return the identifier at its location -	Variant res = -	    Variant::fromString(std::string(identifier.data(), identifier.size())); -	res.setLocation({reader.getSourceId(), start, end}); -	return res; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseBeginCommand() -{ -	// Expect a '{' after the command -	reader.consumeWhitespace(); -	if (!reader.expect('{')) { -		logger.error("Expected \"{\" after \\begin", reader); -		return State::NONE; -	} - -	// Parse the name of the command that should be opened -	Variant commandName = parseIdentifier(reader.getOffset(), true); -	if (commandName.asString().empty()) { -		logger.error("Expected identifier", commandName); -		return State::ERROR; -	} - -	// Check whether the next character is a '#', indicating the start of the -	// command name -	Variant commandArgName; -	SourceOffset start = reader.getOffset(); -	if (reader.expect('#')) { -		commandArgName = parseIdentifier(start); -		if (commandArgName.asString().empty()) { -			logger.error("Expected identifier after \"#\"", commandArgName); -		} -	} - -	if (!reader.expect('}')) { -		logger.error("Expected \"}\"", reader); -		return State::ERROR; -	} - -	// Parse the arguments -	Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - -	// Push the command onto the command stack -	pushCommand(std::move(commandName), std::move(commandArguments), true); - -	return State::COMMAND; -} - -static bool checkStillInField(const PlainFormatStreamReader::Command &cmd, -                              const Variant &endName, Logger &logger) -{ -	if (cmd.inField && !cmd.inRangeField) { -		logger.error(std::string("\\end in open field of command \"") + -		                 cmd.name.asString() + std::string("\""), -		             endName); -		logger.note(std::string("Open command started here:"), cmd.name); -		return true; -	} -	return false; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseEndCommand() -{ -	// Expect a '{' after the command -	if (!reader.expect('{')) { -		logger.error("Expected \"{\" after \\end", reader); -		return State::NONE; -	} - -	// Fetch the name of the command that should be ended here -	Variant name = parseIdentifier(reader.getOffset(), true); - -	// Make sure the given command name is not empty -	if (name.asString().empty()) { -		logger.error("Expected identifier", name); -		return State::ERROR; -	} - -	// Make sure the command name is terminated with a '}' -	if (!reader.expect('}')) { -		logger.error("Expected \"}\"", reader); -		return State::ERROR; -	} - -	// Unroll the command stack up to the last range command -	while (!commands.top().hasRange) { -		if (checkStillInField(commands.top(), name, logger)) { -			return State::ERROR; -		} -		commands.pop(); -	} - -	// Make sure we're not in an open field of this command -	if (checkStillInField(commands.top(), name, logger)) { -		return State::ERROR; -	} - -	// Special error message if the top-level command is reached -	if (commands.size() == 1) { -		logger.error(std::string("Cannot end command \"") + name.asString() + -		                 std::string("\" here, no command open"), -		             name); -		return State::ERROR; -	} - -	// Inform the about command mismatches -	const Command &cmd = commands.top(); -	if (commands.top().name.asString() != name.asString()) { -		logger.error(std::string("Trying to end command \"") + -		                 cmd.name.asString() + -		                 std::string("\", but open command is \"") + -		                 name.asString() + std::string("\""), -		             name); -		logger.note("Last command was opened here:", cmd.name); -		return State::ERROR; -	} - -	// Set the location to the location of the command that was ended, then end -	// the current command -	location = name.getLocation(); -	commands.pop(); -	return cmd.inRangeField ? State::FIELD_END : State::NONE; -} - -Variant PlainFormatStreamReader::parseCommandArguments(Variant commandArgName) -{ -	// Parse the arguments using the universal VariantReader -	Variant commandArguments; -	if (reader.expect('[')) { -		auto res = VariantReader::parseObject(reader, logger, ']'); -		commandArguments = res.second; -	} else { -		commandArguments = Variant::mapType{}; -	} - -	// Insert the parsed name, make sure "name" was not specified in the -	// arguments -	if (commandArgName.isString()) { -		auto res = -		    commandArguments.asMap().emplace("name", std::move(commandArgName)); -		if (!res.second) { -			logger.error("Name argument specified multiple times", -			             SourceLocation{}, MessageMode::NO_CONTEXT); -			logger.note("First occurance is here: ", commandArgName); -			logger.note("Second occurance is here: ", res.first->second); -		} -	} -	return commandArguments; -} - -void PlainFormatStreamReader::pushCommand(Variant commandName, -                                          Variant commandArguments, -                                          bool hasRange) -{ -	// Store the location on the stack -	location = commandName.getLocation(); - -	// Place the command on the command stack, remove the last commands if we're -	// not currently inside a field of these commands -	while (!commands.top().inField) { -		commands.pop(); -	} -	commands.push(Command{std::move(commandName), std::move(commandArguments), -	                      hasRange, false, false}); -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseCommand( -    size_t start) -{ -	// Parse the commandName as a first identifier -	Variant commandName = parseIdentifier(start, true); -	if (commandName.asString().empty()) { -		logger.error("Empty command name", reader); -		return State::NONE; -	} - -	// Handle the special "begin" and "end" commands -	const auto commandNameComponents = -	    Utils::split(commandName.asString(), ':'); -	const bool isBegin = commandNameComponents[0] == "begin"; -	const bool isEnd = commandNameComponents[0] == "end"; -	if (isBegin || isEnd) { -		if (commandNameComponents.size() > 1) { -			logger.error( -			    "Special commands \"\\begin\" and \"\\end\" may not contain a " -			    "namespace separator \":\"", -			    commandName); -		} -		if (isBegin) { -			return parseBeginCommand(); -		} else if (isEnd) { -			return parseEndCommand(); -		} -	} - -	// Check whether the next character is a '#', indicating the start of the -	// command name -	Variant commandArgName; -	start = reader.getOffset(); -	if (reader.expect('#')) { -		commandArgName = parseIdentifier(start); -		if (commandArgName.asString().empty()) { -			logger.error("Expected identifier after \"#\"", commandArgName); -		} -	} - -	// Parse the arugments -	Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - -	// Push the command onto the command stack -	pushCommand(std::move(commandName), std::move(commandArguments), false); - -	return State::COMMAND; -} - -void PlainFormatStreamReader::parseBlockComment() -{ -	DynamicToken token; -	size_t depth = 1; -	while (tokenizer.read(reader, token)) { -		if (token.type == Tokens.BlockCommentEnd) { -			depth--; -			if (depth == 0) { -				return; -			} -		} -		if (token.type == Tokens.BlockCommentStart) { -			depth++; -		} -	} - -	// Issue an error if the file ends while we are in a block comment -	logger.error("File ended while being in a block comment", reader); -} - -void PlainFormatStreamReader::parseLineComment() -{ -	char c; -	while (reader.read(c)) { -		if (c == '\n') { -			return; -		} -	} -} - -bool PlainFormatStreamReader::checkIssueData(DataHandler &handler) -{ -	if (!handler.isEmpty()) { -		data = handler.toVariant(reader.getSourceId()); -		location = data.getLocation(); -		reader.resetPeek(); -		return true; -	} -	return false; -} - -bool PlainFormatStreamReader::checkIssueFieldStart() -{ -	// Fetch the current command, and check whether we're currently inside a -	// field of this command -	Command &cmd = commands.top(); -	if (!cmd.inField) { -		// If this is a range command, we're now implicitly inside the field of -		// this command -- we'll have to issue a field start command! -		if (cmd.hasRange) { -			cmd.inField = true; -			cmd.inRangeField = true; -			reader.resetPeek(); -			return true; -		} - -		// This was not a range command, so obviously we're now inside within -		// a field of some command -- so unroll the commands stack until a -		// command with open field is reached -		while (!commands.top().inField) { -			commands.pop(); -		} -	} -	return false; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parse() -{ -	// Handler for incomming data -	DataHandler handler; - -	// Read tokens until the outer loop should be left -	DynamicToken token; -	while (tokenizer.peek(reader, token)) { -		const TokenTypeId type = token.type; - -		// Special handling for Backslash and Text -		if (type == Tokens.Backslash) { -			// Before appending anything to the output data or starting a new -			// command, check whether FIELD_START has to be issued, as the -			// current command is a command with range -			if (checkIssueFieldStart()) { -				location = token.location; -				return State::FIELD_START; -			} - -			// Check whether a command starts now, without advancing the peek -			// cursor -			char c; -			if (!reader.fetchPeek(c)) { -				logger.error("Trailing backslash at the end of the file.", -				             token); -				return State::END; -			} - -			// Try to parse a command -			if (Utils::isIdentifierStartCharacter(c)) { -				// Make sure to issue any data before it is to late -				if (checkIssueData(handler)) { -					return State::DATA; -				} - -				// Parse the actual command -				State res = parseCommand(token.location.getStart()); -				switch (res) { -					case State::ERROR: -						throw LoggableException( -						    "Last error was irrecoverable, ending parsing " -						    "process"); -					case State::NONE: -						continue; -					default: -						return res; -				} -			} - -			// This was not a special character, just append the given character -			// to the data buffer, use the escape character start as start -			// location and the peek offset as end location -			reader.peek(c);  // Peek the previously fetched character -			handler.append(c, token.location.getStart(), -			               reader.getPeekOffset()); -			reader.consumePeek(); -			continue; -		} else if (type == TextToken) { -			// Check whether FIELD_START has to be issued before appending text -			if (checkIssueFieldStart()) { -				location = token.location; -				return State::FIELD_START; -			} - -			// Append the text to the data handler -			handler.append(token.content, token.location.getStart(), -			               token.location.getEnd()); - -			reader.consumePeek(); -			continue; -		} - -		// A non-text token was reached, make sure all pending data commands -		// have been issued -		if (checkIssueData(handler)) { -			return State::DATA; -		} - -		// We will handle the token now, consume the peeked characters -		reader.consumePeek(); - -		// Update the location to the current token location -		location = token.location; - -		if (token.type == Tokens.LineComment) { -			parseLineComment(); -		} else if (token.type == Tokens.BlockCommentStart) { -			parseBlockComment(); -		} else if (token.type == Tokens.FieldStart) { -			Command &cmd = commands.top(); -			if (!cmd.inField) { -				cmd.inField = true; -				return State::FIELD_START; -			} -			logger.error( -			    "Got field start token \"{\", but no command for which to " -			    "start the field. Did you mean \"\\{\"?", -			    token); -		} else if (token.type == Tokens.FieldEnd) { -			// Try to end an open field of the current command -- if the current -			// command is not inside an open field, end this command and try to -			// close the next one -			for (int i = 0; i < 2 && commands.size() > 1; i++) { -				Command &cmd = commands.top(); -				if (!cmd.inRangeField) { -					if (cmd.inField) { -						cmd.inField = false; -						return State::FIELD_END; -					} -					commands.pop(); -				} else { -					break; -				} -			} -			logger.error( -			    "Got field end token \"}\", but there is no field to end. Did " -			    "you mean \"\\}\"?", -			    token); -		} else { -			logger.error("Unexpected token \"" + token.content + "\"", token); -		} -	} - -	// Issue available data -	if (checkIssueData(handler)) { -		return State::DATA; -	} - -	// Make sure all open commands and fields have been ended at the end of the -	// stream -	while (commands.size() > 1) { -		Command &cmd = commands.top(); -		if (cmd.inField || cmd.hasRange) { -			logger.error("Reached end of stream, but command \"" + -			                 cmd.name.asString() + "\" has not been ended", -			             cmd.name); -		} -		commands.pop(); -	} - -	location = SourceLocation{reader.getSourceId(), reader.getOffset()}; -	return State::END; -} - -const Variant &PlainFormatStreamReader::getCommandName() -{ -	return commands.top().name; -} - -const Variant &PlainFormatStreamReader::getCommandArguments() -{ -	return commands.top().arguments; -} -} - diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp deleted file mode 100644 index 2ee261c..0000000 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ /dev/null @@ -1,347 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file PlainFormatStreamReader.hpp - * - * Provides classes for low-level classes for reading the plain TeX-esque - * format. The class provided here do not build any model objects and does not - * implement the Parser interfaces. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ -#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ - -#include <stack> - -#include <core/common/Variant.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; -class Logger; -class DataHandler; - -/** - * The PlainFormatStreamReader class provides a low-level reader for the plain - * TeX-esque format. The parser is constructed around a "parse" function, which - * reads data from the underlying CharReader until a new state is reached and - * indicates this state in a return value. The calling code then has to pull - * corresponding data from the stream reader. The reader already handles some - * invalid cases, but recovers from most errors and happily continues parsing. - */ -class PlainFormatStreamReader { -public: -	/** -	 * Enum used to indicate which state the PlainFormatStreamReader class is in -	 * after calling the "parse" function. -	 */ -	enum class State { -		/** -	     * State returned if a fully featured command has been read. A command -	     * consists of the command name and its arguments (which optionally -	     * includes the name). -	     */ -		COMMAND, - -		/** -	     * State returned if data is given. The reader must decide which field -	     * or command this should be routed to. Trailing or leading whitespace -	     * has been removed. Only called if the data is non-empty. -	     */ -		DATA, - -		/** -	     * A user-defined entity has been found. The entity sequence is stored -	     * in the command name. -	     */ -		ENTITY, - -		/** -	     * State returned if an annotation was started. An annotation consists -	     * of the command name and its arguments (which optionally include the -	     * name). -	     */ -		ANNOTATION_START, - -		/** -	     * State returned if an annotation ends. The reader indicates which -	     * annotation ends. -	     */ -		ANNOTATION_END, - -		/** -	     * State returned if a new field started. The reader assures that the -	     * current field ends before a new field is started and that the field -	     * is not started if data has been given outside of a field. The -	     * field number is set to the current field index. -	     */ -		FIELD_START, - -		/** -	     * State returned if the current field ends. The reader assures that a -	     * field was actually open. -	     */ -		FIELD_END, - -		/** -	     * The end of the stream has been reached. -	     */ -		END, - -		/** -		 * Returned from internal functions if nothing should be done. -		 */ -		NONE, - -		/** -		 * Returned from internal function to indicate irrecoverable errors. -		 */ -		ERROR -	}; - -	/** -	 * Entry used for the command stack. -	 */ -	struct Command { -		/** -		 * Name and location of the current command. -		 */ -		Variant name; - -		/** -		 * Arguments that were passed to the command. -		 */ -		Variant arguments; - -		/** -		 * Set to true if this is a command with clear begin and end. -		 */ -		bool hasRange; - -		/** -		 * Set to true if we are currently inside a field of this command. -		 */ -		bool inField; - -		/** -		 * Set to true if we are currently in the range field of the command -		 * (implies inField being set to true). -		 */ -		bool inRangeField; - -		/** -		 * Default constructor. -		 */ -		Command() : hasRange(false), inField(false), inRangeField(false) {} - -		/** -		 * Constructor of the Command class. -		 * -		 * @param name is a string variant with name and location of the -		 * command. -		 * @param arguments is a map variant with the arguments given to the -		 * command. -		 * @param hasRange should be set to true if this is a command with -		 * explicit range. -		 * @param inField is set to true if we currently are inside a field -		 * of this command. -		 * @param inRangeField is set to true if we currently inside the outer -		 * field of the command. -		 */ -		Command(Variant name, Variant arguments, bool hasRange, -		        bool inField, bool inRangeField) -		    : name(std::move(name)), -		      arguments(std::move(arguments)), -		      hasRange(hasRange), -		      inField(inField), -		      inRangeField(inRangeField) -		{ -		} -	}; - -private: -	/** -	 * Reference to the CharReader instance from which the incomming bytes are -	 * read. -	 */ -	CharReader &reader; - -	/** -	 * Reference at the logger instance to which all error messages are sent. -	 */ -	Logger &logger; - -	/** -	 * Tokenizer instance used to read individual tokens from the text. -	 */ -	DynamicTokenizer tokenizer; - -	/** -	 * Stack containing the current commands. -	 */ -	std::stack<Command> commands; - -	/** -	 * Variant containing the data that has been read (always is a string, -	 * contains the exact location of the data in the source file). -	 */ -	Variant data; - -	/** -	 * Contains the location of the last token. -	 */ -	SourceLocation location; - -	/** -	 * Contains the field index of the current command. -	 */ -	size_t fieldIdx; - -	/** -	 * Function used internall to parse an identifier. -	 * -	 * @param start is the start byte offset of the identifier (including the -	 * backslash). -	 * @param allowNSSep should be set to true if the namespace separator is -	 * allowed in the identifier name. Issues error if the namespace separator -	 * is placed incorrectly. -	 */ -	Variant parseIdentifier(size_t start, bool allowNSSep = false); - -	/** -	 * Function used internally to handle the special "\begin" command. -	 */ -	State parseBeginCommand(); - -	/** -	 * Function used internally to handle the special "\end" command. -	 */ -	State parseEndCommand(); - -	/** -	 * Pushes the parsed command onto the command stack. -	 */ -	void pushCommand(Variant commandName, Variant commandArguments, bool hasRange); - -	/** -	 * Parses the command arguments. -	 */ -	Variant parseCommandArguments(Variant commandArgName); - -	/** -	 * Function used internally to parse a command. -	 * -	 * @param start is the start byte offset of the command (including the -	 * backslash) -	 * @return true if a command was actuall parsed, false otherwise. -	 */ -	State parseCommand(size_t start); - -	/** -	 * Function used internally to parse a block comment. -	 */ -	void parseBlockComment(); - -	/** -	 * Function used internally to parse a generic comment. -	 */ -	void parseLineComment(); - -	/** -	 * Checks whether there is any data pending to be issued, if yes, issues it. -	 * -	 * @param handler is the data handler that contains the data that may be -	 * returned to the user. -	 * @return true if there was any data and DATA should be returned by the -	 * parse function, false otherwise. -	 */ -	bool checkIssueData(DataHandler &handler); - -	/** -	 * Called before any data is appended to the internal data handler. Checks -	 * whether a new field should be started or implicitly ended. -	 * -	 * @return true if FIELD_START should be returned by the parse function. -	 */ -	bool checkIssueFieldStart(); - -public: -	/** -	 * Constructor of the PlainFormatStreamReader class. Attaches the new -	 * PlainFormatStreamReader to the given CharReader and Logger instances. -	 * -	 * @param reader is the reader instance from which incomming characters -	 * should be read. -	 * @param logger is the logger instance to which errors should be written. -	 */ -	PlainFormatStreamReader(CharReader &reader, Logger &logger); - -	/** -	 * Continues parsing. Returns one of the states defined in the State enum. -	 * Callers should stop once the State::END state is reached. Use the getter -	 * functions to get more information about the current state, such as the -	 * command name or the data or the current field index. -	 * -	 * @return the new state the parser has reached. -	 */ -	State parse(); - -	/** -	 * Returns a reference at the internally stored data. Only valid if -	 * State::DATA was returned by the "parse" function. -	 * -	 * @return a reference at a variant containing the data parsed by the -	 * "parse" function. -	 */ -	const Variant &getData() { return data; } - -	/** -	 * Returns a reference at the internally stored command name. Only valid if -	 * State::COMMAND was returned by the "parse" function. -	 * -	 * @return a reference at a variant containing name and location of the -	 * parsed command. -	 */ -	const Variant &getCommandName(); - -	/** -	 * Returns a reference at the internally stored command name. Only valid if -	 * State::COMMAND was returned by the "parse" function. -	 * -	 * @return a reference at a variant containing arguments given to the -	 * command. -	 */ -	const Variant &getCommandArguments(); - -	/** -	 * Returns a reference at the char reader. -	 * -	 * @return the last internal token location. -	 */ -	SourceLocation &getLocation() {return location;} -}; -} - -#endif /* _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ */ - diff --git a/src/plugins/plain/TokenTrie.cpp b/src/plugins/plain/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/plugins/plain/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, -                              TokenTypeId type) noexcept -{ -	// Abort if the token is empty -- this would taint the root node -	if (token.empty()) { -		return false; -	} - -	// Iterate over each character in the given string and insert them as -	// (new) nodes -	Node *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		// Insert a new node if this one does not exist -		const char c = token[i]; -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			it = node->children.emplace(c, std::make_shared<Node>()).first; -		} -		node = it->second.get(); -	} - -	// If the resulting node already has a type set, we're screwed. -	if (node->type != EmptyToken) { -		return false; -	} - -	// Otherwise just set the type to the given type. -	node->type = type; -	return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ -	// We cannot remove empty tokens as we need to access the fist character -	// upfront -	if (token.empty()) { -		return false; -	} - -	// First pass -- search the node in the path that can be deleted -	Node *subtreeRoot = &root; -	char subtreeKey = token[0]; -	Node *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		// Go to the next node, abort if the tree ends unexpectedly -		auto it = node->children.find(token[i]); -		if (it == node->children.end()) { -			return false; -		} - -		// Reset the subtree handler if this node has another type -		node = it->second.get(); -		if ((node->type != EmptyToken || node->children.size() > 1) && -		    (i + 1 != token.size())) { -			subtreeRoot = node; -			subtreeKey = token[i + 1]; -		} -	} - -	// If the node type is already EmptyToken, we cannot do anything here -	if (node->type == EmptyToken) { -		return false; -	} - -	// If the target node has children, we cannot delete the subtree. Set the -	// type to EmptyToken instead -	if (!node->children.empty()) { -		node->type = EmptyToken; -		return true; -	} - -	// If we end up here, we can safely delete the complete subtree -	subtreeRoot->children.erase(subtreeKey); -	return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ -	Node const *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		const char c = token[i]; -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			return EmptyToken; -		} -		node = it->second.get(); -	} -	return node->type; -} -} - diff --git a/src/plugins/plain/TokenTrie.hpp b/src/plugins/plain/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/plugins/plain/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include <cstdint> -#include <memory> -#include <limits> -#include <unordered_map> - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. - */ -constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - *        ~ (0) - *       /     \ - *      a (2)  b (0) - *      |      | - *      a (0)  a (0) - *      |      | - *      b (1)  c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: -	/** -	 * Structure used to build the node tree. -	 */ -	struct Node { -		/** -		 * Type used for the child map. -		 */ -		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; - -		/** -		 * Map from single characters at the corresponding child nodes. -		 */ -		ChildMap children; - -		/** -		 * Reference at the corresponding token descriptor. Set to nullptr if -		 * no token is attached to this node. -		 */ -		TokenTypeId type; - -		/** -		 * Default constructor, initializes the descriptor with nullptr. -		 */ -		Node(); -	}; - -private: -	/** -	 * Root node of the internal token tree. -	 */ -	Node root; - -public: -	/** -	 * Registers a token containing the given string. Returns false if the -	 * token already exists, true otherwise. -	 * -	 * @param token is the character sequence that should be registered as -	 * token. -	 * @param type is the descriptor that should be set for this token. -	 * @return true if the operation is successful, false otherwise. -	 */ -	bool registerToken(const std::string &token, TokenTypeId type) noexcept; - -	/** -	 * Unregisters the token from the token tree. Returns true if the token was -	 * unregistered successfully, false otherwise. -	 * -	 * @param token is the character sequence that should be unregistered. -	 * @return true if the operation was successful, false otherwise. -	 */ -	bool unregisterToken(const std::string &token) noexcept; - -	/** -	 * Returns true, if the given token exists within the TokenTree. This -	 * function is mostly thought for debugging and unit testing. -	 * -	 * @param token is the character sequence that should be searched. -	 * @return the attached token descriptor or nullptr if the given token is -	 * not found. -	 */ -	TokenTypeId hasToken(const std::string &token) const noexcept; - -	/** -	 * Returns a reference at the root node to be used for traversing the token -	 * tree. -	 * -	 * @return a reference at the root node. -	 */ -	const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ -  | 
