diff options
Diffstat (limited to 'src/plugins')
| -rw-r--r-- | src/plugins/css/CodeTokenizer.cpp | 169 | ||||
| -rw-r--r-- | src/plugins/css/CodeTokenizer.hpp | 136 | ||||
| -rw-r--r-- | src/plugins/css/Tokenizer.cpp | 204 | ||||
| -rw-r--r-- | src/plugins/css/Tokenizer.hpp | 227 | ||||
| -rw-r--r-- | src/plugins/xml/XmlParser.cpp | 575 | ||||
| -rw-r--r-- | src/plugins/xml/XmlParser.hpp | 55 | 
6 files changed, 736 insertions, 630 deletions
diff --git a/src/plugins/css/CodeTokenizer.cpp b/src/plugins/css/CodeTokenizer.cpp new file mode 100644 index 0000000..d65c514 --- /dev/null +++ b/src/plugins/css/CodeTokenizer.cpp @@ -0,0 +1,169 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <cassert> + +#include "CodeTokenizer.hpp" + +namespace ousia { + +Token CodeTokenizer::constructToken(const Token &t) +{ +	std::string content = buf.str(); +	buf.str(std::string()); +	return Token{ +	    returnTokenId, content, +	    SourceLocation{t.location.getSourceId(), startToken.location.getStart(), +	                   t.location.getEnd()}}; +} + +void CodeTokenizer::buffer(const Token &t) { buf << t.content; } + +bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) +{ +	auto it = descriptors.find(t.tokenId); +	CodeTokenMode mode = CodeTokenMode::NONE; +	if (it != descriptors.end()) { +		mode = it->second.mode; +	} + +	switch (state) { +		case CodeTokenizerState::NORMAL: +			switch (mode) { +				case CodeTokenMode::STRING_START_END: +					state = CodeTokenizerState::IN_STRING; +					break; +				case CodeTokenMode::BLOCK_COMMENT_START: +					state = CodeTokenizerState::IN_BLOCK_COMMENT; +					break; +				case CodeTokenMode::LINE_COMMENT: +					state = CodeTokenizerState::IN_LINE_COMMENT; +					break; +			
	case CodeTokenMode::LINEBREAK: +					if (!ignoreLinebreaks) { +						peeked.push_back( +						    {it->second.id, t.content, t.location}); +					} +					return !ignoreLinebreaks; +				default: +					bool empty = true; +					if (t.tokenId == TOKEN_TEXT) { +						int begin = -1; +						for (size_t c = 0; c < t.content.length(); c++) { +							bool isWhitespace = +							    t.content[c] == ' ' || t.content[c] == '\t'; +							if (begin < 0) { +								// if we have not yet set our beginning, +								// we wait for the first +								// non-whitespace-character to set it. +								if (!isWhitespace) { +									begin = c; +								} +							} else { +								// if we have set our beginning, we wait for the +								// first whitespace character, which marks the +								// end of the current word. +								if (isWhitespace) { +									peeked.push_back(Token{ +									    TOKEN_TEXT, +									    t.content.substr(begin, (int)c - begin), +									    SourceLocation{ +									        t.location.getSourceId(), +									        t.location.getStart() + begin, +									        t.location.getStart() + c}}); +									begin = -1; +									empty = false; +								} +							} +						} +						if (begin >= 0) { +							peeked.push_back(Token{ +							    TOKEN_TEXT, t.content.substr(begin), +							    SourceLocation{t.location.getSourceId(), +							                   t.location.getStart() + begin, +							                   t.location.getEnd()}}); +							empty = false; +						} +					} else { +						empty = false; +						peeked.push_back(t); +					} +					return !empty; +			} +			startToken = t; +			returnTokenId = it->second.id; +			return false; +		case CodeTokenizerState::IN_LINE_COMMENT: +			switch (mode) { +				case CodeTokenMode::LINEBREAK: +					state = CodeTokenizerState::NORMAL; +					if (!ignoreComments) { +						peeked.push_back(constructToken(t)); +					} +					return !ignoreComments; +				default: +					if (!ignoreComments) { +						buffer(t); +					} +			
		return false; +			} +		case CodeTokenizerState::IN_BLOCK_COMMENT: +			switch (mode) { +				case CodeTokenMode::BLOCK_COMMENT_END: +					state = CodeTokenizerState::NORMAL; +					if (!ignoreComments) { +						peeked.push_back(constructToken(t)); +					} +					return !ignoreComments; +				default: +					if (!ignoreComments) { +						buffer(t); +					} +					return false; +			} +		case CodeTokenizerState::IN_STRING: +			switch (mode) { +				case CodeTokenMode::ESCAPE: +					if (escaped) { +						buffer(t); +					} +					escaped = !escaped; +					return false; +				case CodeTokenMode::STRING_START_END: +					if (escaped) { +						buffer(t); +						escaped = false; +						return false; +					} else { +						peeked.push_back(constructToken(t)); +						state = CodeTokenizerState::NORMAL; +						return true; +					} +				default: +					if (escaped) { +						// TODO: handle escaped characters? +						escaped = false; +					} +					buffer(t); +					return false; +			} +	} +	assert(false); +	return false; +} +} diff --git a/src/plugins/css/CodeTokenizer.hpp b/src/plugins/css/CodeTokenizer.hpp new file mode 100644 index 0000000..154f949 --- /dev/null +++ b/src/plugins/css/CodeTokenizer.hpp @@ -0,0 +1,136 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/** + * @file CodeTokenizer.hpp +  + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) + */ +#ifndef _OUSIA_CODE_TOKENIZER_HPP_ +#define _OUSIA_CODE_TOKENIZER_HPP_ + +#include <map> +#include <sstream> + +#include <core/common/CharReader.hpp> +#include "Tokenizer.hpp" + +namespace ousia { + +/* + * This enum contains all special Tokens the CodeTokenizer supports, namely: + * + * 1.) An ambiguous token - in most programming languages single-quotes ' or + * double-quotes " - to delimit string tokens. + * 2.) A start token for line comments, which would e.g. be // in Java. + * 3.) A start token for a block comment + * 4.) An end token for a block comment. + * 5.) A linebreak token + * 6.) The escape token, which would e.g. be \ in java. + */ +enum class CodeTokenMode { +	STRING_START_END, +	LINE_COMMENT, +	BLOCK_COMMENT_START, +	BLOCK_COMMENT_END, +	LINEBREAK, +	ESCAPE, +	NONE +}; + +/** + * A CodeTokenDescriptor defines the id the user likes to have returned for + * a Token of the mode specified, e.g. if you want to get the id 4 for a + * String Token the corresponding CodeTokenDescriptor would be initialized + * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; + */ +struct CodeTokenDescriptor { +	CodeTokenMode mode; +	int id; + +	CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} +}; + +/** + * The CodeTokenizer is a finite state machine with the states NORMAL, being + * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. + */ +enum class CodeTokenizerState { +	NORMAL, +	IN_BLOCK_COMMENT, +	IN_LINE_COMMENT, +	IN_STRING +}; + +/** + * The purpose of a CodeTokenizer is to make it easier to parse classical + * programming Code. It adds the following features to a regular Tokenizer: + * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens + * for the opening delimiter, the text and the closing delimiter. + * 2.) Escaping in String tokens. + * 3.) 
Comment Tokens (for line comments as well as block comments) + */ +class CodeTokenizer : public Tokenizer { +private: +	std::map<int, CodeTokenDescriptor> descriptors; +	CodeTokenizerState state; +	std::stringstream buf; +	Token startToken; +	int returnTokenId; +	bool escaped = false; + +	Token constructToken(const Token &t); +	void buffer(const Token &t); + +protected: +	bool doPrepare(const Token &t, std::deque<Token> &peeked) override; + +public: +	/** +	 * If you do not want comment tokens to be returned you can set this to +	 * true. +	 */ +	bool ignoreComments = false; +	/** +	 * If you do not want linebreaks to be returned you can set this to true. +	 */ +	 bool ignoreLinebreaks = false; + +	/** +	 * +	 * @param input a CharReader containing the input for this tokenizer, as +	 * with a regular tokenizer. +	 * @param root a TokenTreeNode representing the root of the TokenTree. +	 * Please note that you have to specify all tokenIDs here that you use +	 * in the descriptors map. +	 * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. +	 * In this way you can specify the meaning of certain Tokens. Say you +	 * specified the Token "//" with the id 1 in the TokenTree. Then you could +	 * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map +	 * and this CodeTokenizer would recognize the token "//" as starting a +	 * line comment. 
+	 */ +	CodeTokenizer(CharReader &input, const TokenTreeNode &root, +	              std::map<int, CodeTokenDescriptor> descriptors) +	    : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) +	{ +	} +}; +} + +#endif diff --git a/src/plugins/css/Tokenizer.cpp b/src/plugins/css/Tokenizer.cpp new file mode 100644 index 0000000..ab4735a --- /dev/null +++ b/src/plugins/css/Tokenizer.cpp @@ -0,0 +1,204 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <sstream> + +#include "Tokenizer.hpp" + +namespace ousia { + +static std::map<char, TokenTreeNode> buildChildren( +    const std::map<std::string, int> &inputs) +{ +	std::map<char, TokenTreeNode> children; +	std::map<char, std::map<std::string, int>> nexts; + +	for (auto &e : inputs) { +		const std::string &s = e.first; +		const int id = e.second; +		if (s.empty()) { +			continue; +		} +		char start = s[0]; +		const std::string suffix = s.substr(1); +		if (nexts.find(start) != nexts.end()) { +			nexts[start].insert(std::make_pair(suffix, id)); +		} else { +			nexts.insert(std::make_pair( +			    start, std::map<std::string, int>{{suffix, id}})); +		} +	} + +	for (auto &n : nexts) { +		children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); +	} + +	return children; +} + +static int buildId(const std::map<std::string, int> &inputs) +{ +	int tokenId = TOKEN_NONE; +	for (auto &e : inputs) { +		if (e.first.empty()) { +			if (tokenId != TOKEN_NONE) { +				throw TokenizerException{std::string{"Ambigous token found: "} + +				                         std::to_string(e.second)}; +			} else { +				tokenId = e.second; +			} +		} +	} +	return tokenId; +} + +TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs) +    : children(buildChildren(inputs)), tokenId(buildId(inputs)) +{ +} + +Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) +    : input(input), root(root) +{ +} + +bool Tokenizer::prepare() +{ +	std::stringstream buffer; +	char c; +	SourcePosition start = input.getOffset(); +	bool bufEmpty = true; +	while (input.peek(c)) { +		if (root.children.find(c) != root.children.end()) { +			// if there might be a special token, keep peeking forward +			// until we find the token (or we don't). 
+			TokenTreeNode const *n = &root; +			std::stringstream tBuf; +			int match = TOKEN_NONE; +			while (true) { +				tBuf << c; +				n = &(n->children.at(c)); +				if (n->tokenId != TOKEN_NONE) { +					match = n->tokenId; +					// from here on we found a token. If we have something +					// in our buffer already, we end the search now. +					if (!bufEmpty) { +						break; +					} else { +						// if we want to return this token ( = we have nothing +						// in our buffer yet) we look greedily for the longest +						// possible token we can construct. +						input.consumePeek(); +					} +				} +				if (!input.peek(c)) { +					// if we are at the end we break off the search. +					break; +				} +				if (n->children.find(c) == n->children.end()) { +					// if we do not find a possible continuation anymore, +					// break off the search. +					break; +				} +			} +			//reset the peek pointer to the last valid position. +			input.resetPeek(); +			// check if we did indeed find a special token. +			if (match != TOKEN_NONE) { +				if (bufEmpty) { +					// if we did not have text before, construct that token. +					if (doPrepare( +					        Token{match, tBuf.str(), input.getLocation(start)}, +					        peeked)) { +						return true; +					} else { +						start = input.getOffset(); +						continue; +					} +				} else { +					// otherwise we return the text before the token. +					if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, +					              peeked)) { +						return true; +					} else{ +						//we need to clear the buffer here. After all the token +						//corresponding to this buffer segment is already +						//constructed. +						buffer.str(std::string()); +						bufEmpty = true; +						start = input.getOffset(); +						continue; +					}  +				} +			} else{ +				//if we found nothing, read at least one character. 
+				input.peek(c); +			} +		} +		buffer << c; +		bufEmpty = false; +		input.consumePeek(); +	} +	if (!bufEmpty) { +		return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)}, +		                 peeked); +	} +	return false; +} + +bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) +{ +	peeked.push_back(t); +	return true; +} + +bool Tokenizer::next(Token &t) +{ +	if (peeked.empty()) { +		if (!prepare()) { +			return false; +		} +	} +	t = peeked.front(); +	peeked.pop_front(); +	resetPeek(); +	return true; +} + +bool Tokenizer::peek(Token &t) +{ +	if (peekCursor >= peeked.size()) { +		if (!prepare()) { +			return false; +		} +	} +	t = peeked[peekCursor]; +	peekCursor++; +	return true; +} + +void Tokenizer::resetPeek() { peekCursor = 0; } + +void Tokenizer::consumePeek() +{ +	while (peekCursor > 0) { +		peeked.pop_front(); +		peekCursor--; +	} +} +} diff --git a/src/plugins/css/Tokenizer.hpp b/src/plugins/css/Tokenizer.hpp new file mode 100644 index 0000000..50e458c --- /dev/null +++ b/src/plugins/css/Tokenizer.hpp @@ -0,0 +1,227 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _OUSIA_TOKENIZER_HPP_ +#define _OUSIA_TOKENIZER_HPP_ + +#include <cstdint> +#include <deque> +#include <istream> +#include <map> + +#include <core/common/CharReader.hpp> + +namespace ousia { + +/** + * This exception is currently only thrown if errors are made during the + * initialization of the Tokenizer. Have a closer look at the documentation + * of the TokenTreeNode constructor for more information. + */ +class TokenizerException : public std::exception { +public: +	const std::string msg; + +	TokenizerException(const std::string &msg) : msg(msg){}; + +	virtual const char *what() const noexcept override { return msg.c_str(); } +}; + +/** + * The Tokenizer internally uses a TokenTree to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * The TokenTree is a construct that structures all special tokens this + * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then + * the TokenTree would look like this: + * + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * + * Every node in the TokenTree is a valid end state that has a $ attached to it. + * During the search algorithm the Tokenizer goes through the tree and stores + * the last valid position. If a character follows that does not lead to a new + * node in the TokenTree the search ends (and starts again at this character). + * The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + * + * It is also not allowed that tokens have common middle parts but varying + * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and + * the input string "abc". 
In that case we start looking for "abd" at the + * start, won't find it, when we hit "c" and start the scanning process + * anew. Thus the "bc" token is not found. + * + * For most (well-behaved) tokenization schemes this is not the case, + * though. + */ +class TokenTreeNode { +public: +	const std::map<char, TokenTreeNode> children; +	const int tokenId; + +	/** +	 * The TokenTreeNode constructor builds a TokenTree from the given token +	 * specifications. The node returned by this constructor then is the root of +	 * said TokenTree. +	 * @param inputs Specifications of tokens in map form. Each specification +	 * is a tuple of the text that should be matched and some unique ID (>= 0) +	 * that is returned to you if that Token is found in the text. +	 * An example for such a map would be +	 * { +	 *	{ "#" , 1}, +	 *  { "##", 2}, +	 *  { "/" , 3} +	 * } +	 * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE +	 * (-1) and TOKEN_TEXT (-2). +	 */ +	TokenTreeNode(const std::map<std::string, int> &inputs); +}; + +/** + * This is a reserved constant for the empty token. + */ +static const int TOKEN_NONE = -1; +/** + * This is a reserved constant for every part of the input text that is not a + * specified token. + */ +static const int TOKEN_TEXT = -2; + +/** + * A token for us is identified by an integer tokenID (either one of the + * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). + * Additionally we return the matched text (which should only be really + * interesting in case of TOKEN_TEXT tokens) and the position in the input text. + */ +struct Token { +	int tokenId; +	std::string content; +	SourceLocation location; + +	Token(int tokenId, std::string content, SourceLocation location) +	    : tokenId(tokenId), +	      content(content), +	      location(location) +	{ +	} + +	Token() : tokenId(TOKEN_NONE) {} +}; + +/** + * A Tokenizer has the purpose of subdividing an input text into tokens. 
In our + * definition here we distinguish between two kinds of tokens: + * 1.) User-specified tokens that match a fixed text. + * 2.) Any other text between those tokens. + * The user might want to specify the tokens '#{' and '#}' for example, because + * they have some meaning in her code. The user sets the IDs to 1 and 2. + * Given the input text + * "some text #{ special command #} some text" + * the tokenizer would return the tokens: + * 1.) "some text " with the id TOKEN_TEXT (-2). + * 2.) "#{" with the id 1. + * 3.) " special command " with the id TOKEN_TEXT (-2). + * 4.) "#}" with the id 2. + * 5.) " some text" with the id TOKEN_TEXT (-2). + * This makes the subsequent parsing of files of a specific type easier. + * Note that in case of tokens that are prefixes of other tokens the + * longest possible match is returned. + */ +class Tokenizer { +private: +	CharReader &input; +	const TokenTreeNode &root; +	std::deque<Token> peeked; +	unsigned int peekCursor = 0; + +	bool prepare(); + +protected: +	/** +	* This method is an interface to build multiple tokens from a single one in +	* derived classes. This might be interesting if you want to implement +	* further logic on text tokens or similar applications. +	* +	* @param t a Token the "basic" tokenizer found. +	* @param peeked a reference to the deque containing all temporary Tokens. +	* You are supposed to append your tokens there. In the trivial case you just +	* put the given Token on top of the deque. +	* @return false if no token was appended to the deque (meaning that you want +	* to ignore the given token explicitly) and true in all other cases. +	*/ +	virtual bool doPrepare(const Token &t, std::deque<Token> &peeked); + +public: +	/** +	 * @param input The input of a Tokenizer is given in the form of a +	 * CharReader. Please refer to the respective documentation. 
+	 * @param root This is meant to be the root of a TokenTree giving the +	 * specification of user-defined tokens this Tokenizer should recognize. +	 * The Tokenizer promises to not change the TokenTree such that you can +	 * re-use the same specification for multiple inputs. +	 * Please refer to the TokenTreeNode documentation for more information. +	 */ +	Tokenizer(CharReader &input, const TokenTreeNode &root); + +	/** +	 * The next method consumes one Token from the input stream and gives +	 * it to the user (stored in the input argument). +	 * +	 * @param t a Token reference that is set to the next found token. +	 * @return true if a next token was found and false if the input is at its +	 * end. +	 */ +	bool next(Token &t); +	/** +	 * The peek method does not consume the next Token but buffers it and +	 * shows it to the user (stored in the input argument). +	 * +	 * @param t a Token reference that is set to the next found token. +	 * @return true if a next token was found and false if the input is at its +	 * end. +	 */ +	bool peek(Token &t); + +	/** +	 * Resets the peek pointer to the current position in the stream (to the +	 * beginning of the buffer). +	 */ +	void resetPeek(); + +	/** +	 * Clears the peek buffer, such that all peeked Tokens are consumed. +	 */ +	void consumePeek(); + +	const CharReader &getInput() const { return input; } +	 +	CharReader &getInput() { return input; } +}; +} + +#endif diff --git a/src/plugins/xml/XmlParser.cpp b/src/plugins/xml/XmlParser.cpp deleted file mode 100644 index 6dfad49..0000000 --- a/src/plugins/xml/XmlParser.cpp +++ /dev/null @@ -1,575 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. 
- -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <iostream> -#include <map> -#include <sstream> -#include <vector> - -#include <expat.h> - -#include <core/common/CharReader.hpp> -#include <core/common/Utils.hpp> -#include <core/common/VariantReader.hpp> -#include <core/parser/ParserScope.hpp> -#include <core/parser/ParserStack.hpp> -#include <core/parser/stack/DocumentHandler.hpp> -#include <core/parser/stack/DomainHandler.hpp> -#include <core/parser/stack/ImportIncludeHandler.hpp> -#include <core/parser/stack/TypesystemHandler.hpp> -#include <core/model/Document.hpp> -#include <core/model/Domain.hpp> -#include <core/model/Typesystem.hpp> - -#include "XmlParser.hpp" - -namespace ousia { - -namespace ParserStates { -/* Document states */ -static const ParserState Document = -    ParserStateBuilder() -        .parent(&None) -        .createdNodeType(&RttiTypes::Document) -        .elementHandler(DocumentHandler::create) -        .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = -    ParserStateBuilder() -        .parents({&Document, &DocumentChild}) -        .createdNodeTypes({&RttiTypes::StructureNode, -                           &RttiTypes::AnnotationEntity, -                           &RttiTypes::DocumentField}) -        .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() -                                      .parents({&None, &Document}) -                                      .createdNodeType(&RttiTypes::Domain) -                                      .elementHandler(DomainHandler::create) 
-                                      .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = -    ParserStateBuilder() -        .parent(&Domain) -        .createdNodeType(&RttiTypes::StructuredClass) -        .elementHandler(DomainStructHandler::create) -        .arguments({Argument::String("name"), -                    Argument::Cardinality("cardinality", Cardinality::any()), -                    Argument::Bool("isRoot", false), -                    Argument::Bool("transparent", false), -                    Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = -    ParserStateBuilder() -        .parent(&Domain) -        .createdNodeType(&RttiTypes::AnnotationClass) -        .elementHandler(DomainAnnotationHandler::create) -        .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = -    ParserStateBuilder() -        .parents({&DomainStruct, &DomainAnnotation}) -        .createdNodeType(&RttiTypes::StructType) -        .elementHandler(DomainAttributesHandler::create) -        .arguments({}); - -static const ParserState DomainAttribute = -    ParserStateBuilder() -        .parent(&DomainAttributes) -        .elementHandler(TypesystemStructFieldHandler::create) -        .arguments({Argument::String("name"), Argument::String("type"), -                    Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = -    ParserStateBuilder() -        .parents({&DomainStruct, &DomainAnnotation}) -        .createdNodeType(&RttiTypes::FieldDescriptor) -        .elementHandler(DomainFieldHandler::create) -        .arguments({Argument::String("name", ""), -                    Argument::Bool("isSubtree", false), -                    Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = -    ParserStateBuilder() -        .parents({&DomainStruct, &DomainAnnotation}) -        .createdNodeType(&RttiTypes::FieldDescriptor) -     
   .elementHandler(DomainFieldRefHandler::create) -        .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = -    ParserStateBuilder() -        .parents({&DomainStruct, &DomainAnnotation}) -        .createdNodeType(&RttiTypes::FieldDescriptor) -        .elementHandler(DomainPrimitiveHandler::create) -        .arguments( -            {Argument::String("name", ""), Argument::Bool("isSubtree", false), -             Argument::Bool("optional", false), Argument::String("type")}); - -static const ParserState DomainStructChild = -    ParserStateBuilder() -        .parent(&DomainField) -        .elementHandler(DomainChildHandler::create) -        .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = -    ParserStateBuilder() -        .parent(&DomainStruct) -        .createdNodeType(&RttiTypes::DomainParent) -        .elementHandler(DomainParentHandler::create) -        .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParentField = -    ParserStateBuilder() -        .parent(&DomainStructParent) -        .createdNodeType(&RttiTypes::FieldDescriptor) -        .elementHandler(DomainParentFieldHandler::create) -        .arguments({Argument::String("name", ""), -                    Argument::Bool("isSubtree", false), -                    Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = -    ParserStateBuilder() -        .parent(&DomainStructParent) -        .createdNodeType(&RttiTypes::FieldDescriptor) -        .elementHandler(DomainParentFieldRefHandler::create) -        .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = -    ParserStateBuilder() -        .parents({&None, &Domain}) -        .createdNodeType(&RttiTypes::Typesystem) -        .elementHandler(TypesystemHandler::create) -        .arguments({Argument::String("name", "")}); - 
-static const ParserState TypesystemEnum = -    ParserStateBuilder() -        .parent(&Typesystem) -        .createdNodeType(&RttiTypes::EnumType) -        .elementHandler(TypesystemEnumHandler::create) -        .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = -    ParserStateBuilder() -        .parent(&TypesystemEnum) -        .elementHandler(TypesystemEnumEntryHandler::create) -        .arguments({}); - -static const ParserState TypesystemStruct = -    ParserStateBuilder() -        .parent(&Typesystem) -        .createdNodeType(&RttiTypes::StructType) -        .elementHandler(TypesystemStructHandler::create) -        .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = -    ParserStateBuilder() -        .parent(&TypesystemStruct) -        .elementHandler(TypesystemStructFieldHandler::create) -        .arguments({Argument::String("name"), Argument::String("type"), -                    Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = -    ParserStateBuilder() -        .parent(&Typesystem) -        .createdNodeType(&RttiTypes::Constant) -        .elementHandler(TypesystemConstantHandler::create) -        .arguments({Argument::String("name"), Argument::String("type"), -                    Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = -    ParserStateBuilder() -        .parents({&Document, &Typesystem, &Domain}) -        .elementHandler(ImportHandler::create) -        .arguments({Argument::String("rel", ""), Argument::String("type", ""), -                    Argument::String("src", "")}); - -static const ParserState Include = -    ParserStateBuilder() -        .parent(&All) -        .elementHandler(IncludeHandler::create) -        .arguments({Argument::String("rel", ""), Argument::String("type", ""), -                    
Argument::String("src", "")}); - -static const std::multimap<std::string, const ParserState *> XmlStates{ -    {"document", &Document}, -    {"*", &DocumentChild}, -    {"domain", &Domain}, -    {"struct", &DomainStruct}, -    {"annotation", &DomainAnnotation}, -    {"attributes", &DomainAttributes}, -    {"attribute", &DomainAttribute}, -    {"field", &DomainField}, -    {"fieldRef", &DomainFieldRef}, -    {"primitive", &DomainStructPrimitive}, -    {"childRef", &DomainStructChild}, -    {"parentRef", &DomainStructParent}, -    {"field", &DomainStructParentField}, -    {"fieldRef", &DomainStructParentFieldRef}, -    {"typesystem", &Typesystem}, -    {"enum", &TypesystemEnum}, -    {"entry", &TypesystemEnumEntry}, -    {"struct", &TypesystemStruct}, -    {"field", &TypesystemStructField}, -    {"constant", &TypesystemConstant}, -    {"import", &Import}, -    {"include", &Include}}; -} - -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { -	/** -	 * Containing the depth of the current XML file -	 */ -	size_t depth; - -	/** -	 * Reference at the ParserStack instance. -	 */ -	ParserStack *stack; - -	/** -	 * Reference at the CharReader instance. -	 */ -	CharReader *reader; - -	/** -	 * Constructor of the XMLUserData struct. -	 * -	 * @param stack is a pointer at the ParserStack instance. -	 * @param reader is a pointer at the CharReader instance. -	 */ -	XMLUserData(ParserStack *stack, CharReader *reader) -	    : depth(0), stack(stack), reader(reader) -	{ -	} -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: -	/** -	 * Internal pointer to the XML_Parser instance. -	 */ -	XML_Parser parser; - -public: -	/** -	 * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS -	 * from the expat library. 
Throws a parser exception if the XML parser -	 * cannot be initialized. -	 * -	 * @param encoding is the protocol-defined encoding passed to expat (or -	 * nullptr if expat should determine the encoding by itself). -	 */ -	ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) -	{ -		parser = XML_ParserCreate(encoding); -		if (!parser) { -			throw LoggableException{ -			    "Internal error: Could not create expat XML parser!"}; -		} -	} - -	/** -	 * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. -	 */ -	~ScopedExpatXmlParser() -	{ -		if (parser) { -			XML_ParserFree(parser); -			parser = nullptr; -		} -	} - -	/** -	 * Returns the XML_Parser pointer. -	 */ -	XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ -	// Fetch the parser stack and the associated user data -	XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p)); -	ParserStack *stack = userData->stack; - -	// Fetch the current location in the XML file -	size_t offs = XML_GetCurrentByteIndex(p); - -	// Build the source location and update the default location of the -	// current -	// logger instance -	SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; -	stack->getContext().getLogger().setDefaultLocation(loc); -	return loc; -} - -enum class XMLAttributeState { -	IN_TAG_NAME, -	SEARCH_ATTR, -	IN_ATTR_NAME, -	HAS_ATTR_NAME, -	HAS_ATTR_EQUALS, -	IN_ATTR_DATA -}; - -static std::map<std::string, SourceLocation> reconstructXMLAttributeOffsets( -    CharReader &reader, SourceLocation location) -{ -	std::map<std::string, SourceLocation> res; - -	// Fork the reader, we don't want to mess up the XML parsing process, do we? 
-	CharReaderFork readerFork = reader.fork(); - -	// Move the read cursor to the start location, abort if this does not work -	size_t offs = location.getStart(); -	if (!location.isValid() || offs != readerFork.seek(offs)) { -		return res; -	} - -	// Now all we need to do is to implement one half of an XML parser. As this -	// is inherently complicated we'll totaly fail at it. Don't care. All we -	// want to get is those darn offsets for pretty error messages... (and we -	// can assume the XML is valid as it was already read by expat) -	XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; -	char c; -	std::stringstream attrName; -	while (readerFork.read(c)) { -		// Abort at the end of the tag -		if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { -			return res; -		} - -		// One state machine to rule them all, one state machine to find them, -		// One state machine to bring them all and in the darkness bind them -		// (the byte offsets) -		switch (state) { -			case XMLAttributeState::IN_TAG_NAME: -				if (Utils::isWhitespace(c)) { -					state = XMLAttributeState::SEARCH_ATTR; -				} -				break; -			case XMLAttributeState::SEARCH_ATTR: -				if (!Utils::isWhitespace(c)) { -					state = XMLAttributeState::IN_ATTR_NAME; -					attrName << c; -				} -				break; -			case XMLAttributeState::IN_ATTR_NAME: -				if (Utils::isWhitespace(c)) { -					state = XMLAttributeState::HAS_ATTR_NAME; -				} else if (c == '=') { -					state = XMLAttributeState::HAS_ATTR_EQUALS; -				} else { -					attrName << c; -				} -				break; -			case XMLAttributeState::HAS_ATTR_NAME: -				if (!Utils::isWhitespace(c)) { -					if (c == '=') { -						state = XMLAttributeState::HAS_ATTR_EQUALS; -						break; -					} -					// Well, this is a strange XML file... We expected to -					// see a '=' here! 
Try to continue with the -					// "HAS_ATTR_EQUALS" state as this state will hopefully -					// inlcude some error recovery -				} else { -					// Skip whitespace here -					break; -				} -			// Fallthrough -			case XMLAttributeState::HAS_ATTR_EQUALS: -				if (!Utils::isWhitespace(c)) { -					if (c == '"') { -						// Here we are! We have found the beginning of an -						// attribute. Let's quickly lock the current offset away -						// in the result map -						res.emplace(attrName.str(), -						            SourceLocation{reader.getSourceId(), -						                           readerFork.getOffset()}); -						attrName.str(std::string{}); -						state = XMLAttributeState::IN_ATTR_DATA; -					} else { -						// No, this XML file is not well formed. Assume we're in -						// an attribute name once again -						attrName.str(std::string{&c, 1}); -						state = XMLAttributeState::IN_ATTR_NAME; -					} -				} -				break; -			case XMLAttributeState::IN_ATTR_DATA: -				if (c == '"') { -					// We're at the end of the attribute data, start anew -					state = XMLAttributeState::SEARCH_ATTR; -				} -				break; -		} -	} -	return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, -                                   const XML_Char **attrs) -{ -	XML_Parser parser = static_cast<XML_Parser>(p); -	XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p)); -	ParserStack *stack = userData->stack; - -	SourceLocation loc = syncLoggerPosition(parser); - -	// Read the argument locations -- this is only a stupid and slow hack, -	// but it is necessary, as expat doesn't give use the byte offset of the -	// arguments. 
-	std::map<std::string, SourceLocation> offs = -	    reconstructXMLAttributeOffsets(*userData->reader, loc); - -	// Assemble the arguments -	Variant::mapType args; - -	const XML_Char **attr = attrs; -	while (*attr) { -		// Convert the C string to a std::string -		const std::string key{*(attr++)}; - -		// Search the location of the key -		SourceLocation keyLoc; -		auto it = offs.find(key); -		if (it != offs.end()) { -			keyLoc = it->second; -		} - -		// Parse the string, pass the location of the key -		std::pair<bool, Variant> value = VariantReader::parseGenericString( -		    *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), -		    keyLoc.getStart()); -		args.emplace(key, value.second); -	} - -	// Call the start function -	std::string nameStr(name); -	if (nameStr != "ousia" || userData->depth > 0) { -		stack->start(std::string(name), args, loc); -	} - -	// Increment the current depth -	userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ -	XML_Parser parser = static_cast<XML_Parser>(p); -	XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p)); -	ParserStack *stack = userData->stack; - -	syncLoggerPosition(parser); - -	// Decrement the current depth -	userData->depth--; - -	// Call the end function -	std::string nameStr(name); -	if (nameStr != "ousia" || userData->depth > 0) { -		stack->end(); -	} -} - -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ -	XML_Parser parser = static_cast<XML_Parser>(p); -	XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p)); -	ParserStack *stack = userData->stack; - -	size_t ulen = len > 0 ? 
static_cast<size_t>(len) : 0; -	syncLoggerPosition(parser, ulen); -	const std::string data = Utils::trim(std::string{s, ulen}); -	if (!data.empty()) { -		stack->data(data); -	} -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ -	// Create the parser object -	ScopedExpatXmlParser p{"UTF-8"}; - -	// Create the parser stack instance, if we're starting on a non-empty scope, -	// try to deduce the parser state -	ParserStack stack(ctx, ParserStates::XmlStates); -	if (!ctx.getScope().isEmpty()) { -		if (!stack.deduceState()) { -			return; -		} -	} - -	// Pass the reference to the ParserStack to the XML handler -	XMLUserData data(&stack, &reader); -	XML_SetUserData(&p, &data); -	XML_UseParserAsHandlerArg(&p); - -	// Set the callback functions -	XML_SetStartElementHandler(&p, xmlStartElementHandler); -	XML_SetEndElementHandler(&p, xmlEndElementHandler); -	XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - -	// Feed data into expat while there is data to process -	constexpr size_t BUFFER_SIZE = 64 * 1024; -	while (true) { -		// Fetch a buffer from expat for the input data -		char *buf = static_cast<char *>(XML_GetBuffer(&p, BUFFER_SIZE)); -		if (!buf) { -			throw LoggableException{ -			    "Internal error: XML parser out of memory!"}; -		} - -		// Read into the buffer -		size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - -		// Parse the data and handle any XML error -		if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { -			// Fetch the xml parser byte offset -			size_t offs = XML_GetCurrentByteIndex(&p); - -			// Throw a corresponding exception -			XML_Error code = XML_GetErrorCode(&p); -			std::string msg = std::string{XML_ErrorString(code)}; -			throw LoggableException{"XML: " + msg, -			                        SourceLocation{ctx.getSourceId(), offs}}; -		} - -		// Abort once there are no more bytes in the stream -		if (bytesRead == 0) { -			break; -		} -	} -} -} - diff --git a/src/plugins/xml/XmlParser.hpp 
b/src/plugins/xml/XmlParser.hpp deleted file mode 100644 index c8b6302..0000000 --- a/src/plugins/xml/XmlParser.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file XmlParser.hpp - * - * Contains the parser responsible for reading Ousía XML Documents (extension - * oxd) and Ousía XML Modules (extension oxm). - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_XML_PARSER_HPP_ -#define _OUSIA_XML_PARSER_HPP_ - -#include <core/parser/Parser.hpp> - -namespace ousia { - -/** - * The XmlParser class implements parsing the various types of Ousía XML - * documents using the expat stream XML parser. - */ -class XmlParser : public Parser { -protected: -	/** -	 * Parses the given input stream as XML file and returns the parsed -	 * top-level node. -	 * -	 * @param reader is the CharReader from which the input should be read. -	 * @param ctx is a reference to the ParserContext instance that should be -	 * used. -	 */ -	void doParse(CharReader &reader, ParserContext &ctx) override; -}; - -} - -#endif /* _OUSIA_XML_PARSER_HPP_ */ -  | 
