diff options
Diffstat (limited to 'src/formats')
| -rw-r--r-- | src/formats/osdm/DynamicTokenizer.cpp | 544 | ||||
| -rw-r--r-- | src/formats/osdm/DynamicTokenizer.hpp | 252 | ||||
| -rw-r--r-- | src/formats/osdm/TokenTrie.cpp | 119 | ||||
| -rw-r--r-- | src/formats/osdm/TokenTrie.hpp | 150 | ||||
| -rw-r--r-- | src/formats/osml/OsmlParser.cpp | 57 | ||||
| -rw-r--r-- | src/formats/osml/OsmlParser.hpp | 48 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp (renamed from src/formats/osdm/OsdmStreamParser.cpp) | 226 | ||||
| -rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp (renamed from src/formats/osdm/OsdmStreamParser.hpp) | 90 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.cpp | 144 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.hpp | 67 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 547 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 217 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 98 | ||||
| -rw-r--r-- | src/formats/osxml/OsxmlParser.hpp | 55 | 
14 files changed, 1465 insertions, 1149 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp deleted file mode 100644 index f2cfcd1..0000000 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <memory> -#include <vector> - -#include <core/common/CharReader.hpp> -#include <core/common/Exceptions.hpp> -#include <core/common/Utils.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { -	/** -	 * Token that was matched. -	 */ -	DynamicToken token; - -	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. -	 */ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; - -	/** -	 * Constructor of the TokenMatch class. -	 */ -	TokenMatch() : textLength(0), textEnd(0) {} - -	/** -	 * Returns true if this TokenMatch instance actually represents a match. 
-	 */ -	bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: -	/** -	 * Current node within the token trie. -	 */ -	TokenTrie::Node const *node; - -	/** -	 * Start offset within the source file. -	 */ -	size_t start; - -	/** -	 * Current length of the data within the text handler. The text buffer needs -	 * to be trimmed to this length if this token matches. -	 */ -	size_t textLength; - -	/** -	 * End location of the current text handler. This location needs to be used -	 * for the text token that is emitted before the actual token. -	 */ -	size_t textEnd; - -public: -	/** -	 * Constructor of the TokenLookup class. -	 * -	 * @param node is the current node. -	 * @param start is the start position. -	 * @param textLength is the text buffer length of the previous text token. -	 * @param textEnd is the current end location of the previous text token. -	 */ -	TokenLookup(const TokenTrie::Node *node, size_t start, -	            size_t textLength, size_t textEnd) -	    : node(node), start(start), textLength(textLength), textEnd(textEnd) -	{ -	} - -	/** -	 * Tries to extend the current path in the token trie with the given -	 * character. If a complete token is matched, stores this match in the -	 * tokens list (in case it is longer than any previous token). -	 * -	 * @param c is the character that should be appended to the current prefix. -	 * @param lookups is a list to which new TokeLookup instances are added -- -	 * which could potentially be expanded in the next iteration. -	 * @param match is the DynamicToken instance to which the matching token -	 * should be written. -	 * @param tokens is a reference at the internal token list of the -	 * DynamicTokenizer. -	 * @param end is the end byte offset of the current character. -	 * @param sourceId is the source if of this file. 
-	 */ -	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, -	             const std::vector<std::string> &tokens, SourceOffset end, -	             SourceId sourceId) -	{ -		// Check whether we can continue the current token path with the given -		// character without visiting an already visited node -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			return; -		} - -		// Check whether the new node represents a complete token a whether it -		// is longer than the current token. If yes, replace the current token. -		node = it->second.get(); -		if (node->type != EmptyToken) { -			const std::string &str = tokens[node->type]; -			size_t len = str.size(); -			if (len > match.token.content.size()) { -				match.token = -				    DynamicToken{node->type, str, {sourceId, start, end}}; -				match.textLength = textLength; -				match.textEnd = textEnd; -			} -		} - -		// If this state can possibly be advanced, store it in the states list. -		if (!node->children.empty()) { -			lookups.emplace_back(*this); -		} -	} -}; - -/* Internal class TextHandlerBase */ - -/** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. - */ -class TextHandlerBase { -public: -	/** -	 * Start position of the extracted text. -	 */ -	size_t textStart; - -	/** -	 * End position of the extracted text. -	 */ -	size_t textEnd; - -	/** -	 * Buffer containing the extracted text. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Constructor of the TextHandlerBase base class. Initializes the start and -	 * end position with zeros. -	 */ -	TextHandlerBase() : textStart(0), textEnd(0) {} - -	/** -	 * Transforms the given token into a text token containing the extracted -	 * text. -	 * -	 * @param token is the output token to which the text should be written. -	 * @param sourceId is the source id of the underlying file. 
-	 */ -	void buildTextToken(TokenMatch &match, SourceId sourceId) -	{ -		if (match.hasMatch()) { -			match.token.content = -			    std::string{textBuf.data(), match.textLength}; -			match.token.location = -			    SourceLocation{sourceId, textStart, match.textEnd}; -		} else { -			match.token.content = std::string{textBuf.data(), textBuf.size()}; -			match.token.location = SourceLocation{sourceId, textStart, textEnd}; -		} -		match.token.type = TextToken; -	} - -	/** -	 * Returns true if this whitespace handler has found any text and a text -	 * token could be emitted. -	 * -	 * @return true if the internal data buffer is non-empty. -	 */ -	bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Appends the given character to the internal text buffer, does not -	 * eliminate whitespace. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; -		textBuf.push_back(c); -	} -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Buffer used internally to temporarily store all whitespace characters. -	 * They are only added to the output buffer if another non-whitespace -	 * character is reached. 
-	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * whitespace characters at the begin and end of the text. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				whitespaceBuf.push_back(c); -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (!whitespaceBuf.empty()) { -			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), -			               whitespaceBuf.end()); -			whitespaceBuf.clear(); -		} -		textBuf.push_back(c); -	} -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: -	using TextHandlerBase::TextHandlerBase; - -	/** -	 * Flag set to true if a whitespace character was reached. -	 */ -	bool hasWhitespace = false; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * redundant whitespace characters. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. 
-	 */ -	void append(char c, size_t start, size_t end) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				hasWhitespace = true; -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (hasWhitespace) { -			textBuf.push_back(' '); -			hasWhitespace = false; -		} -		textBuf.push_back(c); -	} -}; -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) -    : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template <typename TextHandler, bool read> -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ -	// If we're in the read mode, reset the char reader peek position to the -	// current read position -	if (read) { -		reader.resetPeek(); -	} - -	// Prepare the lookups in the token trie -	const TokenTrie::Node *root = trie.getRoot(); -	TokenMatch match; -	std::vector<TokenLookup> lookups; -	std::vector<TokenLookup> nextLookups; - -	// Instantiate the text handler -	TextHandler textHandler; - -	// Peek characters from the reader and try to advance the current token tree -	// cursor -	char c; -	size_t charStart = reader.getPeekOffset(); -	const SourceId sourceId = reader.getSourceId(); -	while (reader.peek(c)) { -		const size_t charEnd = reader.getPeekOffset(); -		const size_t textLength = textHandler.textBuf.size(); -		const size_t textEnd = textHandler.textEnd; - -		// If we do not have a match yet, start a new lookup from the root -		if (!match.hasMatch()) { -			TokenLookup{root, charStart, textLength, textEnd}.advance( -			    c, nextLookups, match, tokens, charEnd, sourceId); -		} - -		// Try to advance all other lookups with the new character -		for (TokenLookup &lookup : lookups) { -			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); -		} - -		// We have found a token and there are no more states to 
advance or the -		// text handler has found something -- abort to return the new token -		if (match.hasMatch()) { -			if ((nextLookups.empty() || textHandler.hasText())) { -				break; -			} -		} else { -			// Record all incomming characters -			textHandler.append(c, charStart, charEnd); -		} - -		// Swap the lookups and the nextLookups list -		lookups = std::move(nextLookups); -		nextLookups.clear(); - -		// Advance the offset -		charStart = charEnd; -	} - -	// If we found text, emit that text -	if (textHandler.hasText() && -	    (!match.hasMatch() || match.textLength > 0)) { -		textHandler.buildTextToken(match, sourceId); -	} - -	// Move the read/peek cursor to the end of the token, abort if an error -	// happens while doing so -	if (match.hasMatch()) { -		// Make sure we have a valid location -		if (match.token.location.getEnd() == InvalidSourceOffset) { -			throw OusiaException{"Token end position offset out of range"}; -		} - -		// Seek to the end of the current token -		const size_t end = match.token.location.getEnd(); -		if (read) { -			reader.seek(end); -		} else { -			reader.seekPeekCursor(end); -		} -		token = match.token; -	} else { -		token = DynamicToken{}; -	} -	return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) -{ -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingTextHandler, true>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingTextHandler, true>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingTextHandler, true>(reader, token); -	} -	return false; -} - -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) -{ -	switch (whitespaceMode) { -		case WhitespaceMode::PRESERVE: -			return next<PreservingTextHandler, false>(reader, token); -		case WhitespaceMode::TRIM: -			return next<TrimmingTextHandler, false>(reader, token); -		case WhitespaceMode::COLLAPSE: -			return next<CollapsingTextHandler, 
false>(reader, token); -	} -	return false; -} - -TokenTypeId DynamicTokenizer::registerToken(const std::string &token) -{ -	// Abort if an empty token should be registered -	if (token.empty()) { -		return EmptyToken; -	} - -	// Search for a new slot in the tokens list -	TokenTypeId type = EmptyToken; -	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { -		if (tokens[i].empty()) { -			tokens[i] = token; -			type = i; -			break; -		} -	} - -	// No existing slot was found, add a new one -- make sure we do not -	// override the special token type handles -	if (type == EmptyToken) { -		type = tokens.size(); -		if (type == TextToken || type == EmptyToken) { -			throw OusiaException{"Token type ids depleted!"}; -		} -		tokens.emplace_back(token); -	} -	nextTokenTypeId = type + 1; - -	// Try to register the token in the trie -- if this fails, remove it -	// from the tokens list -	if (!trie.registerToken(token, type)) { -		tokens[type] = std::string(); -		nextTokenTypeId = type; -		return EmptyToken; -	} -	return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ -	// Unregister the token from the trie, abort if an invalid type is given -	if (type < tokens.size() && trie.unregisterToken(tokens[type])) { -		tokens[type] = std::string{}; -		nextTokenTypeId = type; -		return true; -	} -	return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ -	if (type < tokens.size()) { -		return tokens[type]; -	} -	return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ -	whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member -   function */ -template bool DynamicTokenizer::next<PreservingTextHandler, false>( -    CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, false>( -    CharReader &reader, DynamicToken &token); 
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<PreservingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, true>( -    CharReader &reader,DynamicToken &token); -} - diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp deleted file mode 100644 index 0cac2e8..0000000 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ /dev/null @@ -1,252 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include <set> -#include <string> -#include <vector> - -#include <core/common/Location.hpp> - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. 
- */ -struct DynamicToken { -	/** -	 * Id of the type of this token. -	 */ -	TokenTypeId type; - -	/** -	 * String that was matched. -	 */ -	std::string content; - -	/** -	 * Location from which the string was extracted. -	 */ -	SourceLocation location; - -	/** -	 * Default constructor. -	 */ -	DynamicToken() : type(EmptyToken) {} - -	/** -	 * Constructor of the DynamicToken struct. -	 * -	 * @param id represents the token type. -	 * @param content is the string content that has been extracted. -	 * @param location is the location of the extracted string content in the -	 * source file. -	 */ -	DynamicToken(TokenTypeId type, const std::string &content, -	             SourceLocation location) -	    : type(type), content(content), location(location) -	{ -	} - -	/** -	 * Constructor of the DynamicToken struct, only initializes the token type -	 * -	 * @param type is the id corresponding to the type of the token. -	 */ -	DynamicToken(TokenTypeId type) : type(type) {} - -	/** -	 * The getLocation function allows the tokens to be directly passed as -	 * parameter to Logger or LoggableException instances. -	 * -	 * @return a reference at the location field -	 */ -	const SourceLocation &getLocation() const { return location; } -}; - -/** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { -	/** -     * Preserves all whitespaces as they are found in the source file. -     */ -	PRESERVE, - -	/** -     * Trims whitespace at the beginning and the end of the found text. -     */ -	TRIM, - -	/** -     * Whitespaces are trimmed and collapsed, multiple whitespace characters -     * are replaced by a single space character. -     */ -	COLLAPSE -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. 
Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: -	/** -	 * Internally used token trie. This object holds all registered tokens. -	 */ -	TokenTrie trie; - -	/** -	 * Flag defining whether whitespaces should be preserved or not. -	 */ -	WhitespaceMode whitespaceMode; - -	/** -	 * Vector containing all registered token types. -	 */ -	std::vector<std::string> tokens; - -	/** -	 * Next index in the tokens list where to search for a new token id. -	 */ -	size_t nextTokenTypeId; - -	/** -	 * Templated function used internally to read the current token. The -	 * function is templated in order to force code generation for all six -	 * combiations of whitespace modes and reading/peeking. -	 * -	 * @tparam TextHandler is the type to be used for the textHandler instance. -	 * @tparam read specifies whether the function should start from and advance -	 * the read pointer of the char reader. -	 * @param reader is the CharReader instance from which the data should be -	 * read. -	 * @param token is the token structure into which the token information -	 * should be written. -	 * @return false if the end of the stream has been reached, true otherwise. -	 */ -	template <typename TextHandler, bool read> -	bool next(CharReader &reader, DynamicToken &token); - -public: -	/** -	 * Constructor of the DynamicTokenizer class. -	 * -	 * @param whitespaceMode specifies how whitespace should be handled. -	 */ -	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - -	/** -	 * Registers the given string as a token. Returns a const pointer at a -	 * TokenDescriptor that will be used to reference the newly created token. -	 * -	 * @param token is the token string that should be registered. -	 * @return a unique identifier for the registered token or EmptyToken if -	 * an error occured. 
-	 */ -	TokenTypeId registerToken(const std::string &token); - -	/** -	 * Unregisters the token belonging to the given TokenTypeId. -	 * -	 * @param type is the token type that should be unregistered. The -	 *TokenTypeId -	 * must have been returned by registerToken. -	 * @return true if the operation was successful, false otherwise (e.g. -	 * because the given TokenDescriptor was already unregistered). -	 */ -	bool unregisterToken(TokenTypeId type); - -	/** -	 * Returns the token that was registered under the given TokenTypeId id or -	 *an -	 * empty string if an invalid TokenTypeId id is given. -	 * -	 * @param type is the TokenTypeId id for which the corresponding token -	 *string -	 * should be returned. -	 * @return the registered token string or an empty string if the given type -	 * was invalid. -	 */ -	std::string getTokenString(TokenTypeId type); - -	/** -	 * Sets the whitespace mode. -	 * -	 * @param whitespaceMode defines how whitespace should be treated in text -	 * tokens. -	 */ -	void setWhitespaceMode(WhitespaceMode mode); - -	/** -	 * Returns the current value of the whitespace mode. -	 * -	 * @return the whitespace mode. -	 */ -	WhitespaceMode getWhitespaceMode(); - -	/** -	 * Reads a new token from the CharReader and stores it in the given -	 * DynamicToken instance. -	 * -	 * @param reader is the CharReader instance from which the data should be -	 * read. -	 * @param token is a reference at the token instance into which the Token -	 * information should be written. -	 * @return true if a token could be read, false if the end of the stream -	 * has been reached. -	 */ -	bool read(CharReader &reader, DynamicToken &token); - -	/** -	 * The peek method does not advance the read position of the char reader, -	 * but reads the next token from the current char reader peek position. -	 * -	 * @param reader is the CharReader instance from which the data should be -	 * read. 
-	 * @param token is a reference at the token instance into which the Token -	 * information should be written. -	 * @return true if a token could be read, false if the end of the stream -	 * has been reached. -	 */ -	bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/formats/osdm/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, -                              TokenTypeId type) noexcept -{ -	// Abort if the token is empty -- this would taint the root node -	if (token.empty()) { -		return false; -	} - -	// Iterate over each character in the given string and insert them as -	// (new) nodes -	Node *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		// Insert a new node if this one does not exist -		const char c = token[i]; -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			it = node->children.emplace(c, std::make_shared<Node>()).first; -		} -		node = it->second.get(); -	} - -	// If the resulting node already has a type set, we're screwed. -	if (node->type != EmptyToken) { -		return false; -	} - -	// Otherwise just set the type to the given type. -	node->type = type; -	return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ -	// We cannot remove empty tokens as we need to access the fist character -	// upfront -	if (token.empty()) { -		return false; -	} - -	// First pass -- search the node in the path that can be deleted -	Node *subtreeRoot = &root; -	char subtreeKey = token[0]; -	Node *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		// Go to the next node, abort if the tree ends unexpectedly -		auto it = node->children.find(token[i]); -		if (it == node->children.end()) { -			return false; -		} - -		// Reset the subtree handler if this node has another type -		node = it->second.get(); -		if ((node->type != EmptyToken || node->children.size() > 1) && -		    (i + 1 != token.size())) { -			subtreeRoot = node; -			subtreeKey = token[i + 1]; -		} -	} - -	// If the node type is already EmptyToken, we cannot do anything here -	if (node->type == EmptyToken) { -		return false; -	} - -	// If 
the target node has children, we cannot delete the subtree. Set the -	// type to EmptyToken instead -	if (!node->children.empty()) { -		node->type = EmptyToken; -		return true; -	} - -	// If we end up here, we can safely delete the complete subtree -	subtreeRoot->children.erase(subtreeKey); -	return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ -	Node const *node = &root; -	for (size_t i = 0; i < token.size(); i++) { -		const char c = token[i]; -		auto it = node->children.find(c); -		if (it == node->children.end()) { -			return EmptyToken; -		} -		node = it->second.get(); -	} -	return node->type; -} -} - diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/formats/osdm/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. 
- * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include <cstdint> -#include <memory> -#include <limits> -#include <unordered_map> - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. - */ -constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - *        ~ (0) - *       /     \ - *      a (2)  b (0) - *      |      | - *      a (0)  a (0) - *      |      | - *      b (1)  c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: -	/** -	 * Structure used to build the node tree. -	 */ -	struct Node { -		/** -		 * Type used for the child map. -		 */ -		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; - -		/** -		 * Map from single characters at the corresponding child nodes. -		 */ -		ChildMap children; - -		/** -		 * Reference at the corresponding token descriptor. Set to nullptr if -		 * no token is attached to this node. -		 */ -		TokenTypeId type; - -		/** -		 * Default constructor, initializes the descriptor with nullptr. -		 */ -		Node(); -	}; - -private: -	/** -	 * Root node of the internal token tree. 
-	 */ -	Node root; - -public: -	/** -	 * Registers a token containing the given string. Returns false if the -	 * token already exists, true otherwise. -	 * -	 * @param token is the character sequence that should be registered as -	 * token. -	 * @param type is the descriptor that should be set for this token. -	 * @return true if the operation is successful, false otherwise. -	 */ -	bool registerToken(const std::string &token, TokenTypeId type) noexcept; - -	/** -	 * Unregisters the token from the token tree. Returns true if the token was -	 * unregistered successfully, false otherwise. -	 * -	 * @param token is the character sequence that should be unregistered. -	 * @return true if the operation was successful, false otherwise. -	 */ -	bool unregisterToken(const std::string &token) noexcept; - -	/** -	 * Returns true, if the given token exists within the TokenTree. This -	 * function is mostly thought for debugging and unit testing. -	 * -	 * @param token is the character sequence that should be searched. -	 * @return the attached token descriptor or nullptr if the given token is -	 * not found. -	 */ -	TokenTypeId hasToken(const std::string &token) const noexcept; - -	/** -	 * Returns a reference at the root node to be used for traversing the token -	 * tree. -	 * -	 * @return a reference at the root node. -	 */ -	const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ - diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/generic/ParserStateCallbacks.hpp> +#include <core/parser/generic/ParserStateStack.hpp> + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. +  + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: +	/** +	 * OsdmStreamParser instance. +	 */ +	OsdmStreamParser parser; + +	/** +	 * Instance of the ParserStateStack. +	 */ +	ParserStateStack stack; + +public: +	OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ +	OsdmParserImplementation parser(reader, ctx); +	parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. + */ +class OsdmParser : public Parser { +protected: +	void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 8cb8caf..0174fa4 100644 --- a/src/formats/osdm/OsdmStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -21,14 +21,14 @@  #include <core/common/Utils.hpp>  #include <core/common/VariantReader.hpp> -#include "OsdmStreamParser.hpp" +#include "OsmlStreamParser.hpp"  namespace ousia {  /**   * Plain format default tokenizer.   */ -class PlainFormatTokens : public DynamicTokenizer { +class PlainFormatTokens : public Tokenizer {  public:  	/**  	 * Id of the backslash token. @@ -61,6 +61,21 @@ public:  	TokenTypeId FieldEnd;  	/** +	 * Id of the default field start token. +	 */ +	TokenTypeId DefaultFieldStart; + +	/** +	 * Id of the annotation start token. +	 */ +	TokenTypeId AnnotationStart; + +	/** +	 * Id of the annotation end token. 
+	 */ +	TokenTypeId AnnotationEnd; + +	/**  	 * Registers the plain format tokens in the internal tokenizer.  	 */  	PlainFormatTokens() @@ -71,6 +86,9 @@ public:  		BlockCommentEnd = registerToken("}%");  		FieldStart = registerToken("{");  		FieldEnd = registerToken("}"); +		DefaultFieldStart = registerToken("{!"); +		AnnotationStart = registerToken("<\\"); +		AnnotationEnd = registerToken("\\>");  	}  }; @@ -160,14 +178,14 @@ public:  	}  }; -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)      : reader(reader), logger(logger), tokenizer(Tokens)  {  	// Place an intial command representing the complete file on the stack -	commands.push(Command{"", Variant::mapType{}, true, true, true}); +	commands.push(Command{"", Variant::mapType{}, true, true, true, false});  } -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)  {  	bool first = true;  	bool hasCharSiceNSSep = false; @@ -210,7 +228,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)  	return res;  } -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()  {  	// Expect a '{' after the command  	reader.consumeWhitespace(); @@ -251,7 +269,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()  	return State::COMMAND;  } -static bool checkStillInField(const OsdmStreamParser::Command &cmd, +static bool checkStillInField(const OsmlStreamParser::Command &cmd,                                const Variant &endName, Logger &logger)  {  	if (cmd.inField && !cmd.inRangeField) { @@ -264,7 +282,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd,  	return false;  } -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +OsmlStreamParser::State OsmlStreamParser::parseEndCommand()  {  	// Expect a '{' 
after the command  	if (!reader.expect('{')) { @@ -327,7 +345,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand()  	return cmd.inRangeField ? State::FIELD_END : State::NONE;  } -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)  {  	// Parse the arguments using the universal VariantReader  	Variant commandArguments; @@ -353,7 +371,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)  	return commandArguments;  } -void OsdmStreamParser::pushCommand(Variant commandName, +void OsmlStreamParser::pushCommand(Variant commandName,                                     Variant commandArguments, bool hasRange)  {  	// Store the location on the stack @@ -365,10 +383,11 @@ void OsdmStreamParser::pushCommand(Variant commandName,  		commands.pop();  	}  	commands.push(Command{std::move(commandName), std::move(commandArguments), -	                      hasRange, false, false}); +	                      hasRange, false, false, false});  } -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, +                                                       bool isAnnotation)  {  	// Parse the commandName as a first identifier  	Variant commandName = parseIdentifier(start, true); @@ -382,6 +401,9 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)  	    Utils::split(commandName.asString(), ':');  	const bool isBegin = commandNameComponents[0] == "begin";  	const bool isEnd = commandNameComponents[0] == "end"; + +	// Parse the begin or end command +	State res = State::COMMAND;  	if (isBegin || isEnd) {  		if (commandNameComponents.size() > 1) {  			logger.error( @@ -390,35 +412,81 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)  			    commandName);  		}  		if (isBegin) { -			return parseBeginCommand(); +			res = parseBeginCommand();  		
} else if (isEnd) { -			return parseEndCommand(); +			res = parseEndCommand();  		} +	} else { +		// Check whether the next character is a '#', indicating the start of +		// the command name +		Variant commandArgName; +		start = reader.getOffset(); +		if (reader.expect('#')) { +			commandArgName = parseIdentifier(start); +			if (commandArgName.asString().empty()) { +				logger.error("Expected identifier after \"#\"", commandArgName); +			} +		} + +		// Parse the arugments +		Variant commandArguments = +		    parseCommandArguments(std::move(commandArgName)); + +		// Push the command onto the command stack +		pushCommand(std::move(commandName), std::move(commandArguments), false);  	} -	// Check whether the next character is a '#', indicating the start of the -	// command name -	Variant commandArgName; -	start = reader.getOffset(); -	if (reader.expect('#')) { -		commandArgName = parseIdentifier(start); -		if (commandArgName.asString().empty()) { -			logger.error("Expected identifier after \"#\"", commandArgName); +	// Check whether a ">" character is the next character that is to be read. +	// In that case the current command could be an annotation end command! +	char c; +	if (reader.fetch(c) && c == '>') { +		// Ignore the character after a begin or end command +		if (isBegin || isEnd) { +			logger.warning( +			    "Ignoring annotation end character \">\" after special " +			    "commands \"begin\" or \"end\". Write \"\\>\" to end a " +			    "\"begin\"/\"end\" enclosed annotation.", +			    reader); +			return res;  		} -	} -	// Parse the arugments -	Variant commandArguments = parseCommandArguments(std::move(commandArgName)); +		// If this should be an annoation, ignore the character +		if (isAnnotation) { +			logger.warning( +			    "Ignoring annotation end character \">\" after annotation " +			    "start command. 
Write \"\\>\" to end the annotation.", +			    reader); +		} else { +			// Make sure no arguments apart from the "name" argument are given +			// to an annotation end +			Variant::mapType &map = commands.top().arguments.asMap(); +			if (!map.empty()) { +				if (map.count("name") == 0 || map.size() > 1U) { +					logger.error( +					    "An annotation end command may not have any arguments " +					    "other than \"name\""); +					return res; +				} +			} -	// Push the command onto the command stack -	pushCommand(std::move(commandName), std::move(commandArguments), false); +			// If we got here, this is a valid ANNOTATION_END command, issue it +			reader.peek(c); +			reader.consumePeek(); +			return State::ANNOTATION_END; +		} +	} -	return State::COMMAND; +	// If we're starting an annotation, return the command as annotation start +	// instead of command +	if (isAnnotation && res == State::COMMAND) { +		return State::ANNOTATION_START; +	} +	return res;  } -void OsdmStreamParser::parseBlockComment() +void OsmlStreamParser::parseBlockComment()  { -	DynamicToken token; +	Token token;  	size_t depth = 1;  	while (tokenizer.read(reader, token)) {  		if (token.type == Tokens.BlockCommentEnd) { @@ -436,7 +504,7 @@ void OsdmStreamParser::parseBlockComment()  	logger.error("File ended while being in a block comment", reader);  } -void OsdmStreamParser::parseLineComment() +void OsmlStreamParser::parseLineComment()  {  	char c;  	while (reader.read(c)) { @@ -446,7 +514,7 @@ void OsdmStreamParser::parseLineComment()  	}  } -bool OsdmStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData(DataHandler &handler)  {  	if (!handler.isEmpty()) {  		data = handler.toVariant(reader.getSourceId()); @@ -457,7 +525,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler)  	return false;  } -bool OsdmStreamParser::checkIssueFieldStart() +bool OsmlStreamParser::checkIssueFieldStart()  {  	// Fetch the current command, and check whether we're 
currently inside a  	// field of this command @@ -482,18 +550,41 @@ bool OsdmStreamParser::checkIssueFieldStart()  	return false;  } -OsdmStreamParser::State OsdmStreamParser::parse() +bool OsmlStreamParser::closeField() +{ +	// Try to end an open field of the current command -- if the current command +	// is not inside an open field, end this command and try to close the next +	// one +	for (int i = 0; i < 2 && commands.size() > 1; i++) { +		Command &cmd = commands.top(); +		if (!cmd.inRangeField) { +			if (cmd.inField) { +				cmd.inField = false; +				if (cmd.inDefaultField) { +					commands.pop(); +				} +				return true; +			} +			commands.pop(); +		} else { +			return false; +		} +	} +	return false; +} + +OsmlStreamParser::State OsmlStreamParser::parse()  {  	// Handler for incomming data  	DataHandler handler;  	// Read tokens until the outer loop should be left -	DynamicToken token; +	Token token;  	while (tokenizer.peek(reader, token)) {  		const TokenTypeId type = token.type;  		// Special handling for Backslash and Text -		if (type == Tokens.Backslash) { +		if (type == Tokens.Backslash || type == Tokens.AnnotationStart) {  			// Before appending anything to the output data or starting a new  			// command, check whether FIELD_START has to be issued, as the  			// current command is a command with range @@ -519,7 +610,8 @@ OsdmStreamParser::State OsdmStreamParser::parse()  				}  				// Parse the actual command -				State res = parseCommand(token.location.getStart()); +				State res = parseCommand(token.location.getStart(), +				                         type == Tokens.AnnotationStart);  				switch (res) {  					case State::ERROR:  						throw LoggableException( @@ -536,6 +628,14 @@ OsdmStreamParser::State OsdmStreamParser::parse()  			// to the data buffer, use the escape character start as start  			// location and the peek offset as end location  			reader.peek(c);  // Peek the previously fetched character + +			// If this was an annotation start token, 
add the parsed < to the +			// output +			if (type == Tokens.AnnotationStart) { +				handler.append('<', token.location.getStart(), +				               token.location.getStart() + 1); +			} +  			handler.append(c, token.location.getStart(),  			               reader.getPeekOffset());  			reader.consumePeek(); @@ -579,28 +679,37 @@ OsdmStreamParser::State OsdmStreamParser::parse()  			}  			logger.error(  			    "Got field start token \"{\", but no command for which to " -			    "start the field. Did you mean \"\\{\"?", +			    "start the field. Write \"\\{\" to insert this sequence as " +			    "text.",  			    token);  		} else if (token.type == Tokens.FieldEnd) { -			// Try to end an open field of the current command -- if the current -			// command is not inside an open field, end this command and try to -			// close the next one -			for (int i = 0; i < 2 && commands.size() > 1; i++) { -				Command &cmd = commands.top(); -				if (!cmd.inRangeField) { -					if (cmd.inField) { -						cmd.inField = false; -						return State::FIELD_END; -					} -					commands.pop(); -				} else { -					break; -				} +			if (closeField()) { +				return State::FIELD_END;  			}  			logger.error( -			    "Got field end token \"}\", but there is no field to end. Did " -			    "you mean \"\\}\"?", +			    "Got field end token \"}\", but there is no field to end. " +			    "Write \"\\}\" to insert this sequence as text.",  			    token); +		} else if (token.type == Tokens.DefaultFieldStart) { +			// Try to start a default field the first time the token is reached +			Command &topCmd = commands.top(); +			if (!topCmd.inField) { +				topCmd.inField = true; +				topCmd.inDefaultField = true; +				return State::FIELD_START; +			} +			logger.error( +			    "Got default field start token \"{!\", but no command for " +			    "which to start the field. 
Write \"\\{!\" to insert this " +			    "sequence as text", +			    token); +		} else if (token.type == Tokens.AnnotationEnd) { +			// We got a single annotation end token "\>" -- simply issue the +			// ANNOTATION_END event +			Variant annotationName = Variant::fromString(""); +			annotationName.setLocation(token.location); +			pushCommand(annotationName, Variant::mapType{}, false); +			return State::ANNOTATION_END;  		} else {  			logger.error("Unexpected token \"" + token.content + "\"", token);  		} @@ -627,14 +736,19 @@ OsdmStreamParser::State OsdmStreamParser::parse()  	return State::END;  } -const Variant &OsdmStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() const  {  	return commands.top().name;  } -const Variant &OsdmStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() const  {  	return commands.top().arguments;  } + +bool OsmlStreamParser::inDefaultField() const +{ +	return commands.top().inRangeField || commands.top().inDefaultField; +}  } diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 48d8fb7..dc3034c 100644 --- a/src/formats/osdm/OsdmStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -17,23 +17,22 @@  */  /** - * @file OsdmStreamParser.hpp + * @file OsmlStreamParser.hpp   * - * Provides classes for low-level classes for reading the TeX-esque osdm + * Provides classes for low-level classes for reading the TeX-esque osml   * format. The class provided here does not build any model objects and does not   * implement the Parser interface.   
*   * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)   */ -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ +#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ +#define _OUSIA_OSML_STREAM_PARSER_HPP_  #include <stack>  #include <core/common/Variant.hpp> - -#include "DynamicTokenizer.hpp" +#include <core/parser/utils/Tokenizer.hpp>  namespace ousia { @@ -43,7 +42,7 @@ class Logger;  class DataHandler;  /** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml   * format. The parser is constructed around a "parse" function, which reads data   * from the underlying CharReader until a new state is reached and indicates   * this state in a return value. The calling code then has to pull corresponding @@ -53,10 +52,10 @@ class DataHandler;   * fields, as this would lead to too many consecutive errors) a   * LoggableException is thrown.   */ -class OsdmStreamParser { +class OsmlStreamParser {  public:  	/** -	 * Enum used to indicate which state the OsdmStreamParser class is in +	 * Enum used to indicate which state the OsmlStreamParser class is in  	 * after calling the "parse" function.  	 */  	enum class State { @@ -140,23 +139,35 @@ public:  		/**  		 * Set to true if this is a command with clear begin and end.  		 */ -		bool hasRange; +		bool hasRange : 1;  		/**  		 * Set to true if we are currently inside a field of this command.  		 */ -		bool inField; +		bool inField : 1;  		/**  		 * Set to true if we are currently in the range field of the command  		 * (implies inField being set to true).  		 */ -		bool inRangeField; +		bool inRangeField : 1; + +		/** +		 * Set to true if we are currently in a field that has been especially +		 * marked as default field (using the "|") syntax. +		 */ +		bool inDefaultField : 1;  		/**  		 * Default constructor.  		 
*/ -		Command() : hasRange(false), inField(false), inRangeField(false) {} +		Command() +		    : hasRange(false), +		      inField(false), +		      inRangeField(false), +		      inDefaultField() +		{ +		}  		/**  		 * Constructor of the Command class. @@ -169,16 +180,19 @@ public:  		 * explicit range.  		 * @param inField is set to true if we currently are inside a field  		 * of this command. -		 * @param inRangeField is set to true if we currently inside the outer -		 * field of the command. +		 * @param inRangeField is set to true if we currently are inside the +		 * outer field of a ranged command. +		 * @param inDefaultField is set to true if we currently are in a +		 * specially marked default field.  		 */ -		Command(Variant name, Variant arguments, bool hasRange, bool inField, -		        bool inRangeField) +		Command(Variant name, Variant arguments, bool hasRange, +		        bool inField, bool inRangeField, bool inDefaultField)  		    : name(std::move(name)),  		      arguments(std::move(arguments)),  		      hasRange(hasRange),  		      inField(inField), -		      inRangeField(inRangeField) +		      inRangeField(inRangeField), +		      inDefaultField(inDefaultField)  		{  		}  	}; @@ -198,7 +212,7 @@ private:  	/**  	 * Tokenizer instance used to read individual tokens from the text.  	 */ -	DynamicTokenizer tokenizer; +	Tokenizer tokenizer;  	/**  	 * Stack containing the current commands. @@ -258,9 +272,11 @@ private:  	 *  	 * @param start is the start byte offset of the command (including the  	 * backslash) +	 * @param isAnnotation if true, the command is not returned as command, but +	 * as annotation start.  	 * @return true if a command was actuall parsed, false otherwise.  	 */ -	State parseCommand(size_t start); +	State parseCommand(size_t start, bool isAnnotation);  	/**  	 * Function used internally to parse a block comment. @@ -290,16 +306,26 @@ private:  	 */  	bool checkIssueFieldStart(); +	/** +	 * Closes a currently open field. 
Note that the command will be removed from +	 * the internal command stack if the field that is being closed is a +	 * field marked as default field. +	 * +	 * @return true if the field could be closed, false if there was no field +	 * to close. +	 */ +	bool closeField(); +  public:  	/** -	 * Constructor of the OsdmStreamParser class. Attaches the new -	 * OsdmStreamParser to the given CharReader and Logger instances. +	 * Constructor of the OsmlStreamParser class. Attaches the new +	 * OsmlStreamParser to the given CharReader and Logger instances.  	 *  	 * @param reader is the reader instance from which incomming characters  	 * should be read.  	 * @param logger is the logger instance to which errors should be written.  	 */ -	OsdmStreamParser(CharReader &reader, Logger &logger); +	OsmlStreamParser(CharReader &reader, Logger &logger);  	/**  	 * Continues parsing. Returns one of the states defined in the State enum. @@ -318,7 +344,7 @@ public:  	 * @return a reference at a variant containing the data parsed by the  	 * "parse" function.  	 */ -	const Variant &getData() { return data; } +	const Variant &getData() const { return data; }  	/**  	 * Returns a reference at the internally stored command name. Only valid if @@ -327,7 +353,7 @@ public:  	 * @return a reference at a variant containing name and location of the  	 * parsed command.  	 */ -	const Variant &getCommandName(); +	const Variant &getCommandName() const;  	/**  	 * Returns a reference at the internally stored command name. Only valid if @@ -336,16 +362,24 @@ public:  	 * @return a reference at a variant containing arguments given to the  	 * command.  	 */ -	const Variant &getCommandArguments(); +	const Variant &getCommandArguments() const; + +	/** +	 * Returns true if the current field is the "default" field. This is true if +	 * the parser either is in the outer range of a range command or inside a +	 * field that has been especially marked as "default" field (using the "|" +	 * syntax). 
+	 */ +	bool inDefaultField() const;  	/**  	 * Returns a reference at the char reader.  	 *  	 * @return the last internal token location.  	 */ -	SourceLocation &getLocation() { return location; } +	const SourceLocation &getLocation() const { return location; }  };  } -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ +#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */ diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp new file mode 100644 index 0000000..e37446a --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.cpp @@ -0,0 +1,144 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/Location.hpp> +#include <core/common/CharReader.hpp> +#include <core/common/Utils.hpp> + +#include "OsxmlAttributeLocator.hpp" + +namespace ousia { + +/** + * Enum used internally in the statemachine of the xml argument parser. + */ +enum class XmlAttributeState { +	IN_TAG_NAME, +	SEARCH_ATTR, +	IN_ATTR_NAME, +	HAS_ATTR_NAME, +	HAS_ATTR_EQUALS, +	IN_ATTR_DATA +}; + +std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate( +    CharReader &reader, size_t offs) +{ +	std::map<std::string, SourceLocation> res; + +	// Fork the reader, we don't want to mess up the XML parsing process, do we? 
+	CharReaderFork readerFork = reader.fork(); + +	// Move the read cursor to the start location, abort if this does not work +	if (offs != readerFork.seek(offs)) { +		return res; +	} + +	// Now all we need to do is to implement one half of an XML parser. As this +	// is inherently complicated we'll totaly fail at it. Don't care. All we +	// want to get is those darn offsets for pretty error messages... (and we +	// can assume the XML is valid as it was already read by expat) +	XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; +	char c; +	std::stringstream attrName; +	while (readerFork.read(c)) { +		// Abort at the end of the tag +		if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { +			return res; +		} + +		// One state machine to rule them all, one state machine to find them, +		// One state machine to bring them all and in the darkness bind them +		// (the byte offsets) +		switch (state) { +			case XmlAttributeState::IN_TAG_NAME: +				if (Utils::isWhitespace(c)) { +					res.emplace("$tag", +					            SourceLocation{reader.getSourceId(), offs + 1, +					                           readerFork.getOffset() - 1}); +					state = XmlAttributeState::SEARCH_ATTR; +				} +				break; +			case XmlAttributeState::SEARCH_ATTR: +				if (!Utils::isWhitespace(c)) { +					state = XmlAttributeState::IN_ATTR_NAME; +					attrName << c; +				} +				break; +			case XmlAttributeState::IN_ATTR_NAME: +				if (Utils::isWhitespace(c)) { +					state = XmlAttributeState::HAS_ATTR_NAME; +				} else if (c == '=') { +					state = XmlAttributeState::HAS_ATTR_EQUALS; +				} else { +					attrName << c; +				} +				break; +			case XmlAttributeState::HAS_ATTR_NAME: +				if (!Utils::isWhitespace(c)) { +					if (c == '=') { +						state = XmlAttributeState::HAS_ATTR_EQUALS; +						break; +					} +					// Well, this is a strange XML file... We expected to +					// see a '=' here! 
Try to continue with the +					// "HAS_ATTR_EQUALS" state as this state will hopefully +					// inlcude some error recovery +				} else { +					// Skip whitespace here +					break; +				} +			// Fallthrough +			case XmlAttributeState::HAS_ATTR_EQUALS: +				if (!Utils::isWhitespace(c)) { +					if (c == '"') { +						// Here we are! We have found the beginning of an +						// attribute. Let's quickly lock the current offset away +						// in the result map +						res.emplace(attrName.str(), +						            SourceLocation{reader.getSourceId(), +						                           readerFork.getOffset()}); +						state = XmlAttributeState::IN_ATTR_DATA; +					} else { +						// No, this XML file is not well formed. Assume we're in +						// an attribute name once again +						attrName.str(std::string{&c, 1}); +						state = XmlAttributeState::IN_ATTR_NAME; +					} +				} +				break; +			case XmlAttributeState::IN_ATTR_DATA: +				if (c == '"') { +					// We're at the end of the attribute data, set the end +					// location +					auto it = res.find(attrName.str()); +					if (it != res.end()) { +						it->second.setEnd(readerFork.getOffset() - 1); +					} + +					// Reset the attribute name and restart the search +					attrName.str(std::string{}); +					state = XmlAttributeState::SEARCH_ATTR; +				} +				break; +		} +	} +	return res; +} +} + diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp new file mode 100644 index 0000000..f9a3437 --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.hpp @@ -0,0 +1,67 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlAttributeLocator.hpp + * + * Contains a class used for locating the byte offsets of the attributes given + * in a XML tag. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ +#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ + +#include <map> + +namespace ousia { + +// Forward declarations +class CharReader; +class SourceLocation; + +/** + * Class containing one static function for locating the byte offsets of the + * attributes in a XML tag. This are not retrieved by our xml parser, so we have + * to do this manually. + */ +class OsxmlAttributeLocator { +public: +	/** +	 * Function used to reconstruct the location of the attributes of a XML tag +	 * in the source code. This is necessary, as the xml parser only returns an +	 * offset to the begining of a tag and not to the position of the individual +	 * arguments. +	 * +	 * @param reader is the char reader from which the character data should be +	 * read. +	 * @param offs is a byte offset in the xml file pointing at the "<" +	 * character of the tag. +	 * @return a map from attribute keys to the corresponding location +	 * (including range) of the atribute. Also contains the location of the +	 * tagname in the form of the virtual attribute "$tag". 
+	 */ +	static std::map<std::string, SourceLocation> locate(CharReader &reader, +	                                                    size_t offs); +}; + +} + +#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */ + diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp new file mode 100644 index 0000000..7404960 --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -0,0 +1,547 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <expat.h> + +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Variant.hpp> +#include <core/common/VariantReader.hpp> +#include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> + +#include "OsxmlAttributeLocator.hpp" +#include "OsxmlEventParser.hpp" + +namespace ousia { + +/* Class OsxmlEventParser */ + +/** + * Class containing data used by the internal functions. + */ +class OsxmlEventParserData { +public: +	/** +	 * Contains the current depth of the parsing process. +	 */ +	ssize_t depth; + +	/** +	 * Set to a value larger or equal to zero if the parser is currently inside +	 * an annotation end tag -- the value represents the depth in which the +	 * tag was opened. 
+	 */ +	ssize_t annotationEndTagDepth; + +	/** +	 * Current character data buffer. +	 */ +	std::vector<char> textBuf; + +	/** +	 * Current whitespace buffer (for the trimming whitespace mode) +	 */ +	std::vector<char> whitespaceBuf; + +	/** +	 * Flag indicating whether a whitespace character was present (for the +	 * collapsing whitespace mode). +	 */ +	bool hasWhitespace; + +	/** +	 * Current character data start. +	 */ +	size_t textStart; + +	/** +	 * Current character data end. +	 */ +	size_t textEnd; + +	/** +	 * Default constructor. +	 */ +	OsxmlEventParserData(); + +	/** +	 * Increments the depth. +	 */ +	void incrDepth(); + +	/** +	 * Decrements the depth (never below zero) and resets the +	 * annotationEndTagDepth marker to -1 once the depth has dropped below it. +	 */ +	void decrDepth(); + +	/** +	 * Returns true if we're currently inside an annotation end tag. +	 */ +	bool inAnnotationEndTag(); + +	/** +	 * Returns true if character data is available. +	 * +	 * @return true if character data is available. +	 */ +	bool hasText(); + +	/** +	 * Returns a Variant containing the character data and its location and +	 * resets the internal text buffers. +	 * +	 * @param sourceId is the source id stored in the location of the returned +	 * variant. +	 * @return a string variant containing the text data and the character +	 * location. +	 */ +	Variant getText(SourceId sourceId); +}; + +/* Class GuardedExpatXmlParser */ + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class GuardedExpatXmlParser { +private: +	/** +	 * Internal pointer to the XML_Parser instance. +	 */ +	XML_Parser parser; + +public: +	/** +	 * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreate +	 * from the expat library. Throws a LoggableException if the XML parser +	 * cannot be initialized. +	 * +	 * @param encoding is the protocol-defined encoding passed to expat (or +	 * nullptr if expat should determine the encoding by itself). 
+	 */ +	GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) +	{ +		parser = XML_ParserCreate(encoding); +		if (!parser) { +			throw LoggableException{ +			    "Internal error: Could not create expat XML parser!"}; +		} +	} + +	/** +	 * Destructor of the GuardedExpatXmlParser, frees the XML parser instance. +	 */ +	~GuardedExpatXmlParser() +	{ +		if (parser) { +			XML_ParserFree(parser); +			parser = nullptr; +		} +	} + +	/** +	 * Returns the XML_Parser pointer. Note that this overloads the address-of +	 * operator: "&p" yields the wrapped XML_Parser handle and not the address +	 * of the guard object. +	 */ +	XML_Parser operator&() { return parser; } +}; + +/** + * Name of the special outer tag used for allowing multiple top-level elements + * in an xml file. + */ +static const std::string TOP_LEVEL_TAG{"ousia"}; + +/** + * Prefix used to indicate the start of an annotation (note the trailing colon) + */ +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; + +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; + +/** + * Synchronizes the position of the xml parser with the default location of the + * logger instance. + * + * @param p is a pointer at the xml parser instance. + * @param len is the length of the string that should be referred to. + * @return the SourceLocation that has been set in the logger. + */ +static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) +{ +	// Fetch the OsxmlEventParser instance +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Fetch the current location in the XML file and set the default location +	// in the logger +	// NOTE(review): XML_GetCurrentByteIndex returns a signed index which is +	// presumably negative if no position is available -- confirm that the +	// implicit conversion to size_t is acceptable here +	size_t offs = XML_GetCurrentByteIndex(p); +	SourceLocation loc = +	    SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; +	parser->getLogger().setDefaultLocation(loc); + +	// Return the fetched location +	return loc; +} + +/** + * Callback called by eXpat whenever an element start tag is reached. 
+ */ +static void xmlStartElementHandler(void *ref, const XML_Char *name, +                                   const XML_Char **attrs) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// If there is any text data in the buffer, issue that first +	if (parser->getData().hasText()) { +		parser->getEvents().data( +		    parser->getData().getText(parser->getReader().getSourceId())); +	} + +	// Read the argument locations -- this is only a stupid and slow hack, +	// but it is necessary, as expat doesn't give use the byte offset of the +	// arguments. +	std::map<std::string, SourceLocation> attributeOffsets = +	    OsxmlAttributeLocator::locate(parser->getReader(), +	                                  XML_GetCurrentByteIndex(p)); + +	// Update the logger position +	SourceLocation loc = xmlSyncLoggerPosition(p); + +	// Fetch the location of the name +	SourceLocation nameLoc = loc; +	auto it = attributeOffsets.find("$tag"); +	if (it != attributeOffsets.end()) { +		nameLoc = it->second; +	} +	// Increment the current depth +	parser->getData().incrDepth(); + +	// Make sure we're currently not inside an annotation end tag -- this would +	// be highly illegal! 
+	if (parser->getData().inAnnotationEndTag()) { +		parser->getLogger().error( +		    "No tags allowed inside an annotation end tag", nameLoc); +		return; +	} + +	// Assemble the arguments +	Variant::mapType args; +	const XML_Char **attr = attrs; +	while (*attr) { +		// Convert the C string to a std::string +		const std::string key{*(attr++)}; + +		// Search the location of the key +		SourceLocation keyLoc; +		auto it = attributeOffsets.find(key); +		if (it != attributeOffsets.end()) { +			keyLoc = it->second; +		} + +		// Parse the string, pass the location of the key +		std::pair<bool, Variant> value = VariantReader::parseGenericString( +		    *(attr++), parser->getLogger(), keyLoc.getSourceId(), +		    keyLoc.getStart()); + +		// Set the overall location of the parsed element to the attribute +		// location +		value.second.setLocation(keyLoc); + +		// Store the keys in the map +		args.emplace(key, value.second).second; +	} + +	// Fetch the name of the tag, check for special tags +	std::string nameStr(name); +	if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { +		// We're in the top-level and the magic tag is reached -- just +		// ignore it and issue a warning for each argument that has been given +		for (const auto &arg : args) { +			parser->getLogger().warning(std::string("Ignoring attribute \"") + +			                                arg.first + +			                                std::string("\" for magic tag \"") + +			                                TOP_LEVEL_TAG + std::string("\""), +			                            arg.second); +		} +	} else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { +		// Assemble a name variant containing the name minus the prefix +		Variant nameVar = +		    Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); +		nameVar.setLocation(nameLoc); + +		// Issue the "annotationStart" event +		parser->getEvents().annotationStart(nameVar, args); +	} else if (Utils::startsWith(nameStr, 
ANNOTATION_END_PREFIX)) { +		// Assemble a name variant containing the name minus the prefix +		nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size()); + +		// Discard a potentially leading colon +		if (!nameStr.empty() && nameStr[0] == ':') { +			nameStr = nameStr.substr(1); +		} + +		// Assemble the variant containing the name and its location +		Variant nameVar = Variant::fromString(nameStr); +		nameVar.setLocation(nameLoc); + +		// Check whether a "name" attribute was given, warn about any other +		// attribute as it has no meaning in an annotation end tag +		Variant elementName; +		for (const auto &arg : args) { +			if (arg.first == "name") { +				elementName = arg.second; +			} else { +				parser->getLogger().warning( +				    std::string("Ignoring attribute \"") + arg.first + +				        "\" in annotation end tag", +				    arg.second); +			} +		} + +		// Set the annotationEndTagDepth to disallow any further tags to be +		// opened inside the annotation end tag. +		parser->getData().annotationEndTagDepth = parser->getData().depth; + +		// Issue the "annotationEnd" event. The second parameter of +		// OsxmlEvents::annotationEnd is the element name, so pass the value of +		// the "name" attribute (a default-constructed Variant if it was not +		// given) instead of the complete argument map. +		parser->getEvents().annotationEnd(nameVar, elementName); +	} else { +		// Just issue a "commandStart" event in any other case +		Variant nameVar = Variant::fromString(nameStr); +		nameVar.setLocation(nameLoc); +		parser->getEvents().command(nameVar, args); +	} +} + +static void xmlEndElementHandler(void *ref, const XML_Char *name) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Synchronize the position of the logger with the position of the parser +	xmlSyncLoggerPosition(p); + +	// Abort as long as we're in an annotation end tag +	if (parser->getData().inAnnotationEndTag()) { +		parser->getData().decrDepth(); +		return; +	} + +	// Decrement the current depth +	parser->getData().decrDepth(); + +	// If there is any text data in the buffer, issue that first +	if (parser->getData().hasText()) { +		parser->getEvents().data( +		  
  parser->getData().getText(parser->getReader().getSourceId())); +	} + +	// Abort if the special ousia tag ends here +	std::string nameStr{name}; +	if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { +		return; +	} + +	// Issue the "fieldEnd" event +	parser->getEvents().fieldEnd(); +} + +static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) +{ +	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser +	XML_Parser p = static_cast<XML_Parser>(ref); +	OsxmlEventParser *parser = +	    static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + +	// Abort as long as we're in an annotation end tag +	if (parser->getData().inAnnotationEndTag()) { +		return; +	} + +	// Convert the signed (smell the 90's C library here?) length to an usigned +	// value +	size_t ulen = len > 0 ? static_cast<size_t>(len) : 0; + +	// Synchronize the logger position +	SourceLocation loc = xmlSyncLoggerPosition(p, ulen); + +	// Fetch some variables for convenience +	const WhitespaceMode mode = parser->getWhitespaceMode(); +	OsxmlEventParserData &data = parser->getData(); +	std::vector<char> &textBuf = data.textBuf; +	std::vector<char> &whitespaceBuf = data.whitespaceBuf; +	bool &hasWhitespace = data.hasWhitespace; +	size_t &textStart = data.textStart; +	size_t &textEnd = data.textEnd; + +	size_t pos = loc.getStart(); +	for (size_t i = 0; i < ulen; i++, pos++) { +		switch (mode) { +			case WhitespaceMode::PRESERVE: +				PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                    textStart, textEnd); +				break; +			case WhitespaceMode::TRIM: +				TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                  textStart, textEnd, +				                                  whitespaceBuf); +				break; +			case WhitespaceMode::COLLAPSE: +				CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, +				                                    textStart, textEnd, +				          
                          hasWhitespace); +				break; +		} +	} +} + +/* Class OsxmlEvents */ + +OsxmlEvents::~OsxmlEvents() {} + +/* Class OsxmlEventParser */ + +OsxmlEventParserData::OsxmlEventParserData() +    : depth(0), +      annotationEndTagDepth(-1), +      hasWhitespace(false), +      textStart(0), +      textEnd(0) +{ +} + +void OsxmlEventParserData::incrDepth() { depth++; } + +void OsxmlEventParserData::decrDepth() +{ +	if (depth > 0) { +		depth--; +	} +	if (depth < annotationEndTagDepth) { +		annotationEndTagDepth = -1; +	} +} + +bool OsxmlEventParserData::inAnnotationEndTag() +{ +	return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); +} + +bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } + +Variant OsxmlEventParserData::getText(SourceId sourceId) +{ +	// Create a variant containing the string data and the location +	Variant var = +	    Variant::fromString(std::string{textBuf.data(), textBuf.size()}); +	var.setLocation({sourceId, textStart, textEnd}); + +	// Reset the text buffers +	textBuf.clear(); +	whitespaceBuf.clear(); +	hasWhitespace = false; +	textStart = 0; +	textEnd = 0; + +	// Return the variant +	return var; +} + +/* Class OsxmlEventParser */ + +OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, +                                   Logger &logger) +    : reader(reader), +      events(events), +      logger(logger), +      whitespaceMode(WhitespaceMode::TRIM), +      data(new OsxmlEventParserData()) +{ +} + +OsxmlEventParser::~OsxmlEventParser() {} + +void OsxmlEventParser::parse() +{ +	// Create the parser object +	GuardedExpatXmlParser p{"UTF-8"}; + +	// Reset the depth +	data->depth = 0; + +	// Pass the reference to this parser instance to the XML handler +	XML_SetUserData(&p, this); +	XML_UseParserAsHandlerArg(&p); + +	// Set the callback functions +	XML_SetStartElementHandler(&p, xmlStartElementHandler); +	XML_SetEndElementHandler(&p, xmlEndElementHandler); +	
XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + +	// Feed data into expat while there is data to process +	constexpr size_t BUFFER_SIZE = 64 * 1024; +	while (true) { +		// Fetch a buffer from expat for the input data +		char *buf = static_cast<char *>(XML_GetBuffer(&p, BUFFER_SIZE)); +		if (!buf) { +			throw OusiaException{"Internal error: XML parser out of memory!"}; +		} + +		// Read into the buffer +		size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + +		// Parse the data and handle any XML error as exception +		if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { +			throw LoggableException{ +			    "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, +			    xmlSyncLoggerPosition(&p)}; +		} + +		// Abort once there are no more bytes in the stream +		if (bytesRead == 0) { +			break; +		} +	} +} + +void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ +	this->whitespaceMode = whitespaceMode; +} + +WhitespaceMode OsxmlEventParser::getWhitespaceMode() const +{ +	return whitespaceMode; +} + +CharReader &OsxmlEventParser::getReader() const { return reader; } + +Logger &OsxmlEventParser::getLogger() const { return logger; } + +OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } + +OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } +} + diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp new file mode 100644 index 0000000..e39245f --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -0,0 +1,217 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlEventParser.hpp + * + * The OsxmlEventParser class is responsible for parsing an XML file and calling + * the corresponding event handler functions if an XML item is found. Event + * handling is performed using a listener interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OSXML_EVENT_PARSER_HPP_ +#define _OSXML_EVENT_PARSER_HPP_ + +#include <memory> +#include <string> + +#include <core/common/Whitespace.hpp> + +namespace ousia { + +// Forward declarations +class Logger; +class Variant; +class OsxmlEventParserData; + +/** + * Interface which defines the callback functions which are called by the + * OsxmlEventParser whenever an event occurs. + */ +class OsxmlEvents { +public: +	/** +	 * Virtual destructor. +	 */ +	virtual ~OsxmlEvents(); + +	/** +	 * Called whenever a command starts. Note that this implicitly always starts +	 * the default field of the command. +	 * +	 * @param name is a string variant containing name and location of the +	 * command. +	 * @param args is a map containing the arguments that were given to the +	 * command. +	 */ +	virtual void command(const Variant &name, const Variant::mapType &args) = 0; + +	/** +	 * Called whenever an annotation starts. Note that this implicitly always +	 * starts the default field of the annotation. +	 * +	 * @param className is a string variant containing the name of the +	 * annotation class and the location of the annotation definition. +	 * @param args is a map variant containing the arguments that were given +	 * to the annotation definition. 
+	 */ +	virtual void annotationStart(const Variant &className, +	                             const Variant::mapType &args) = 0; + +	/** +	 * Called whenever the range of an annotation ends. The callee must +	 * disambiguate the actual annotation that is finished here. +	 * +	 * @param className is a string variant containing the name of the +	 * annotation class that should end here. May be empty (or nullptr), if no +	 * elementName has been specified at the end of the annotation. +	 * @param elementName is the name of the annotation element that should be +	 * ended here. May be empty (or nullptr), if no elementName has been +	 * specified at the end of the annotation. +	 */ +	virtual void annotationEnd(const Variant &className, +	                           const Variant &elementName) = 0; + +	/** +	 * Called whenever the default field which was implicitly started by +	 * commandStart or annotationStart ends. Note that this does not end the +	 * range of an annotation, but the default field of the annotation. To +	 * signal the end of the annotation this, the annotationEnd method will be +	 * invoked. +	 */ +	virtual void fieldEnd() = 0; + +	/** +	 * Called whenever data is found. Whitespace data is handled as specified +	 * and the data has been parsed to the specified variant type. This function +	 * is not called if the parsing failed, the parser prints an error message +	 * instead. +	 * +	 * @param data is the already parsed data that should be passed to the +	 * handler. +	 */ +	virtual void data(const Variant &data) = 0; +}; + +/** + * The OsxmlEventParser class is a wrapper around eXpat which implements the + * specialities of the osxml formats class (like annotation ranges). It notifies + * a specified event handler whenever a command, annotation or data has been + * reached. + */ +class OsxmlEventParser { +private: +	/** +	 * Reference at the internal CharReader instance. 
+	 */ +	CharReader &reader; + +	/** +	 * Set of callback functions to be called whenever an event is triggered. +	 */ +	OsxmlEvents &events; + +	/** +	 * Reference at the Logger object to which error messages or warnings should +	 * be logged. +	 */ +	Logger &logger; + +	/** +	 * Current whitespace mode. +	 */ +	WhitespaceMode whitespaceMode; + +	/** +	 * Data to be used by the internal functions. +	 */ +	std::unique_ptr<OsxmlEventParserData> data; + +public: +	/** +	 * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents +	 * of which the callback functions are called. +	 * +	 * @param reader is a reference to the CharReader instance from which the +	 * XML should be read. +	 * @param events is a refence at an instance of the OsxmlEvents class. All +	 * events are forwarded to this class. +	 * @param logger is the Logger instance to which log messages should be +	 * written. +	 */ +	OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + +	/** +	 * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type) +	 */ +	~OsxmlEventParser(); + +	/** +	 * Performs the actual parsing. Reads the XML using eXpat and calles the +	 * callbacks in the event listener instance whenever something interesting +	 * happens. +	 */ +	void parse(); + +	/** +	 * Sets the whitespace handling mode. +	 * +	 * @param whitespaceMode defines how whitespace in the data should be +	 * handled. +	 */ +	void setWhitespaceMode(WhitespaceMode whitespaceMode); + +	/** +	 * Returns the current whitespace handling mode. +	 * +	 * @return the currently set whitespace handling mode. +	 */ +	WhitespaceMode getWhitespaceMode() const; + +	/** +	 * Returns the internal CharReader reference. +	 * +	 * @return the CharReader reference. +	 */ +	CharReader &getReader() const; + +	/** +	 * Returns the internal Logger reference. +	 * +	 * @return the internal Logger reference. 
+	 */ +	Logger &getLogger() const; + +	/** +	 * Returns the internal OsxmlEvents reference. +	 * +	 * @return the internal OsxmlEvents reference. +	 */ +	OsxmlEvents &getEvents() const; + +	/** +	 * Returns a reference at the internal data. +	 */ +	OsxmlEventParserData &getData() const; +}; +} + +#endif /* _OSXML_EVENT_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp new file mode 100644 index 0000000..c216855 --- /dev/null +++ b/src/formats/osxml/OsxmlParser.cpp @@ -0,0 +1,98 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/stack/GenericParserStates.hpp> +#include <core/parser/stack/Stack.hpp> +#include <core/parser/ParserContext.hpp> + +#include "OsxmlEventParser.hpp" +#include "OsxmlParser.hpp" + +namespace ousia { + +using namespace parser_stack; + +/** + * Class containing the actual OsxmlParser implementation. + */ +class OsxmlParserImplementation : public OsxmlEvents { +private: +	/** +	 * Actual xml parser -- converts the xml stream into a set of events. +	 */ +	OsxmlEventParser parser; + +	/** +	 * Pushdown automaton responsible for converting the xml events into an +	 * actual Node tree. +	 */ +	Stack stack; + +public: +	/** +	 * Constructor of the OsxmlParserImplementation class. 
+	 * +	 * @param reader is a reference to the CharReader instance from which the +	 * XML should be read. +	 * @param ctx is a reference to the ParserContext instance that should be +	 * used. +	 */ +	OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) +	    : parser(reader, *this, ctx.getLogger()), +	      stack(ctx, GenericParserStates) +	{ +	} + +	/** +	 * Starts the actual parsing process. +	 */ +	void parse() { parser.parse(); } + +	void command(const Variant &name, const Variant::mapType &args) override +	{ +		stack.command(name, args); +		stack.fieldStart(true); +	} + +	void annotationStart(const Variant &name, +	                     const Variant::mapType &args) override +	{ +		stack.annotationStart(name, args); +		stack.fieldStart(true); +	} + +	void annotationEnd(const Variant &className, +	                   const Variant &elementName) override +	{ +		stack.annotationEnd(className, elementName); +	} + +	void fieldEnd() override { stack.fieldEnd(); } + +	void data(const Variant &data) override { stack.data(data); } +}; + +/* Class OsxmlParser */ + +void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ +	OsxmlParserImplementation impl(reader, ctx); +	impl.parse(); +} +} + diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp new file mode 100644 index 0000000..0fbf83c --- /dev/null +++ b/src/formats/osxml/OsxmlParser.hpp @@ -0,0 +1,55 @@ +/* +    Ousía +    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_PARSER_HPP_ +#define _OUSIA_OSXML_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * The OsxmlParser class implements parsing the various types of Ousía XML + * documents using the OsxmlEventParser and Stack classes. + */ +class OsxmlParser : public Parser { +protected: +	/** +	 * Parses the given input stream as XML file and returns the parsed +	 * top-level node. +	 * +	 * @param reader is the CharReader from which the input should be read. +	 * @param ctx is a reference to the ParserContext instance that should be +	 * used. +	 */ +	void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSXML_PARSER_HPP_ */ +  | 
