diff options
Diffstat (limited to 'src/core/common')
| -rw-r--r-- | src/core/common/Argument.cpp | 4 | ||||
| -rw-r--r-- | src/core/common/Utils.cpp | 54 | ||||
| -rw-r--r-- | src/core/common/Utils.hpp | 86 | ||||
| -rw-r--r-- | src/core/common/Variant.hpp | 25 | ||||
| -rw-r--r-- | src/core/common/VariantReader.cpp | 2 | ||||
| -rw-r--r-- | src/core/common/VariantReader.hpp | 2 | ||||
| -rw-r--r-- | src/core/common/Whitespace.hpp | 60 | ||||
| -rw-r--r-- | src/core/common/WhitespaceHandler.hpp | 284 | 
8 files changed, 488 insertions, 29 deletions
diff --git a/src/core/common/Argument.cpp b/src/core/common/Argument.cpp index bfe74a4..b10fad3 100644 --- a/src/core/common/Argument.cpp +++ b/src/core/common/Argument.cpp @@ -302,10 +302,10 @@ bool Arguments::validateMap(Variant::mapType &map, Logger &logger,  		} else {  			if (ignoreUnknown) {  				logger.note(std::string("Ignoring argument \"") + e.first + -				            std::string("\"")); +				            std::string("\""), e.second);  			} else {  				logger.error(std::string("Unknown argument \"") + e.first + -				             std::string("\"")); +				             std::string("\""), e.second);  				ok = false;  			}  		} diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 563fe2a..f8b53c6 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -18,19 +18,13 @@  #include <algorithm>  #include <cctype> -#include <limits>  #include <string>  #include "Utils.hpp" +#include "WhitespaceHandler.hpp"  namespace ousia { -std::string Utils::trim(const std::string &s) -{ -	std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); -	return s.substr(bounds.first, bounds.second - bounds.first); -} -  bool Utils::isIdentifier(const std::string &name)  {  	bool first = true; @@ -43,7 +37,27 @@ bool Utils::isIdentifier(const std::string &name)  		}  		first = false;  	} -	return true; +	return !first; +} + +bool Utils::isIdentifierOrEmpty(const std::string &name) +{ +	return name.empty() || isIdentifier(name); +} + +bool Utils::isNamespacedIdentifier(const std::string &name) +{ +	bool first = true; +	for (char c : name) { +		if (first && !isIdentifierStartCharacter(c)) { +			return false; +		} +		if (!first && (!isIdentifierCharacter(c) && c != ':')) { +			return false; +		} +		first = (c == ':'); +	} +	return !first;  }  bool Utils::hasNonWhitepaceChar(const std::string &s) @@ -94,5 +108,29 @@ std::string Utils::extractFileExtension(const std::string &filename)  	}  	return std::string{};  } + +std::string Utils::trim(const 
std::string &s) +{ +	std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); +	return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ +	CollapsingWhitespaceHandler h; +	appendToWhitespaceHandler(h, s, 0); +	return h.toString(); +} + +bool Utils::startsWith(const std::string &s, const std::string &prefix) +{ +	return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; +} + +bool Utils::endsWith(const std::string &s, const std::string &suffix) +{ +	return suffix.size() <= s.size() && +	       s.substr(s.size() - suffix.size(), suffix.size()) == suffix; +}  } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 2c8a5b3..b5a54fc 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -74,16 +74,45 @@ public:  	}  	/** -	 * Returns true if the given character is in [A-Za-z][A-Za-z0-9_-]* +	 * Returns true if the given string is in +	 * \code{.txt} +	 * [A-Za-z][A-Za-z0-9_-]* +	 * \endcode +	 * +	 * @param name is the string that should be tested. +	 * @return true if the string matches the regular expression given above,  +	 * false otherwise.  	 */  	static bool isIdentifier(const std::string &name);  	/** +	 * Returns true if the given string is an identifier or an empty string. +	 */ +	static bool isIdentifierOrEmpty(const std::string &name); + +	/** +	 * Returns true if the given string is in +	 * \code{.txt} +	 * ([A-Za-z][A-Za-z0-9_-]*)(:[A-Za-z][A-Za-z0-9_-]*)* +	 * \endcode +	 * +	 * @param name is the string that should be tested. +	 * @return true if the string matches the regular expression given above,  +	 * false otherwise. +	 */ +	static bool isNamespacedIdentifier(const std::string &name); + +	/** +	 * Returns true if the given character is a linebreak character. +	 */ +	static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } + +	/**  	 * Returns true if the given character is a whitespace character.  	 
*/  	static bool isWhitespace(const char c)  	{ -		return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); +		return (c == ' ') || (c == '\t') || isLinebreak(c);  	}  	/** @@ -95,11 +124,6 @@ public:  	static bool hasNonWhitepaceChar(const std::string &s);  	/** -	 * Returns true if the given character is a whitespace character. -	 */ -	static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } - -	/**  	 * Removes whitespace at the beginning and the end of the given string.  	 *  	 * @param s is the string that should be trimmed. @@ -120,8 +144,25 @@ public:  	template <class T, class Filter>  	static std::pair<size_t, size_t> trim(const T &s, Filter f)  	{ +		return trim(s, s.size(), f); +	} + +	/** +	 * Trims the given string or vector of chars by returning the start and end +	 * index. +	 * +	 * @param s is the container that should be trimmed. +	 * @param len is the number of elements in the container. +	 * @param f is a function that returns true for values that should be +	 * removed. +	 * @return start and end index. Note that "end" points at the character +	 * beyond the end, thus "end" minus "start" is the trimmed length. +	 */ +	template <class T, class Filter> +	static std::pair<size_t, size_t> trim(const T &s, size_t len, Filter f) +	{  		size_t start = 0; -		for (size_t i = 0; i < s.size(); i++) { +		for (size_t i = 0; i < len; i++) {  			if (!f(s[i])) {  				start = i;  				break; @@ -129,7 +170,7 @@ public:  		}  		size_t end = 0; -		for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) { +		for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {  			if (!f(s[i])) {  				end = i + 1;  				break; @@ -145,6 +186,15 @@ public:  	}  	/** +	 * Collapses the whitespace in the given string (trims the string and +	 * replaces each run of whitespace characters by a single space). +	 * +	 * @param s is the string in which the whitespace should be collapsed. +	 * @return a copy of s with collapsed whitespace. 
+	 */ +	static std::string collapse(const std::string &s); + +	/**  	 * Turns the elements of a collection into a string separated by the  	 * given delimiter.  	 * @@ -205,6 +255,24 @@ public:  	static std::string extractFileExtension(const std::string &filename);  	/** +	 * Checks whether the given string starts with the given prefix. +	 * +	 * @param s is the string. +	 * @param prefix is the string which should be checked for being a prefix of +	 * s. +	 */ +	static bool startsWith(const std::string &s, const std::string &prefix); + +	/** +	 * Checks whether the given string ends with the given suffix. +	 * +	 * @param s is the string. +	 * @param suffix is the string which should be checked for being a suffix of +	 * s. +	 */ +	static bool endsWith(const std::string &s, const std::string &suffix); + +	/**  	 * Hash functional to be used for enum classes.  	 * See http://stackoverflow.com/a/24847480/2188211  	 */ diff --git a/src/core/common/Variant.hpp b/src/core/common/Variant.hpp index 6eae7e1..ddd17d7 100644 --- a/src/core/common/Variant.hpp +++ b/src/core/common/Variant.hpp @@ -884,6 +884,21 @@ public:  	}  	/** +	 * If the value of the variant already is a string, the markAsMagic function +	 * marks this string as a "magic" value (a variant which might also be an +	 * identifier). Throws an exception if the variant is not a string or magic +	 * value. +	 */ +	void markAsMagic() +	{ +		if (getType() == VariantType::STRING) { +			meta.setType(VariantType::MAGIC); +			return; +		} +		throw TypeException{getType(), VariantType::STRING}; +	} + +	/**  	 * Returns the value of the Variant as boolean, performs type conversion.  	 *  	 * @return the Variant value converted to a boolean value. @@ -1146,10 +1161,7 @@ public:  	 *  	 * @retun true if the  	 */ -	bool hasLocation() const -	{ -		return meta.hasLocation(); -	} +	bool hasLocation() const { return meta.hasLocation(); }  	/**  	 * Unpacks ans returns the stored source location. 
Note that the returned @@ -1158,10 +1170,7 @@ public:  	 *  	 * @return the stored SourceLocation.  	 */ -	SourceLocation getLocation() const -	{ -		return meta.getLocation(); -	} +	SourceLocation getLocation() const { return meta.getLocation(); }  	/**  	 * Packs the given source location and stores it in the metadata. Not all diff --git a/src/core/common/VariantReader.cpp b/src/core/common/VariantReader.cpp index 3f02226..fb93ad0 100644 --- a/src/core/common/VariantReader.cpp +++ b/src/core/common/VariantReader.cpp @@ -495,7 +495,7 @@ std::pair<bool, Variant::boolType> VariantReader::parseBool(CharReader &reader,  	bool val = false;  	CharReaderFork readerFork = reader.fork();  	LoggerFork loggerFork = logger.fork(); -	auto res = parseToken(readerFork, loggerFork, {}); +	auto res = parseToken(readerFork, loggerFork, std::unordered_set<char>{});  	if (res.first) {  		bool valid = false;  		if (res.second == "true") { diff --git a/src/core/common/VariantReader.hpp b/src/core/common/VariantReader.hpp index 1232f6e..44132a0 100644 --- a/src/core/common/VariantReader.hpp +++ b/src/core/common/VariantReader.hpp @@ -322,7 +322,7 @@ public:  	 */  	static std::pair<bool, Variant> parseTyped(  	    VariantType type, CharReader &reader, Logger &logger, -	    const std::unordered_set<char> &delims = {}); +	    const std::unordered_set<char> &delims = std::unordered_set<char>{});  	/**  	 * Tries to parse an instance of the given type from the given string. 
The  	 * called method is one of the parse methods defined here and adheres to the diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp new file mode 100644 index 0000000..72a2291 --- /dev/null +++ b/src/core/common/Whitespace.hpp @@ -0,0 +1,60 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Whitespace.hpp + * + * Contains the WhitespaceMode enum used in various places. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HPP_ +#define _OUSIA_WHITESPACE_HPP_ + +#include <string> +#include <utility> + +namespace ousia { + +/** + * Enum specifying the whitespace handling mode of the tokenizer and the + * parsers. + */ +enum class WhitespaceMode { +	/** +     * Preserves all whitespaces as they are found in the source file. +     */ +	PRESERVE, + +	/** +     * Trims whitespace at the beginning and the end of the found text. +     */ +	TRIM, + +	/** +     * Whitespaces are trimmed and collapsed, multiple whitespace characters +     * are replaced by a single space character. 
+     */ +	COLLAPSE +}; + +} + +#endif /* _OUSIA_WHITESPACE_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp new file mode 100644 index 0000000..ed52ea3 --- /dev/null +++ b/src/core/common/WhitespaceHandler.hpp @@ -0,0 +1,284 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file WhitespaceHandler.hpp + * + * Contains the WhitespaceHandler classes which are used in multiple places to + * trim, compact or preserve whitespaces while at the same time maintaining the + * position information associated with the input strings. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ +#define _OUSIA_WHITESPACE_HANDLER_HPP_ + +#include <string> +#include <vector> + +#include "Utils.hpp" + +namespace ousia { + +/** + * WhitespaceHandler is a base class that can be used to collect text on a + * character-by-character basis. Note that this class and its descendants are + * hoped to be inlined by the compiler (and used in conjunction with templates), + * thus they are fully defined inside this header. + */ +class WhitespaceHandler { +public: +	/** +	 * Start position of the extracted text. +	 */ +	size_t textStart; + +	/** +	 * End position of the extracted text. 
+	 */ +	size_t textEnd; + +	/** +	 * Buffer containing the extracted text. +	 */ +	std::vector<char> textBuf; + +	/** +	 * Constructor of the WhitespaceHandler class. Initializes the start and +	 * end position with zeros. +	 */ +	WhitespaceHandler() : textStart(0), textEnd(0) {} + +	/** +	 * Returns true if this whitespace handler has found any text and a text +	 * token could be emitted. +	 * +	 * @return true if the internal data buffer is non-empty. +	 */ +	bool hasText() { return !textBuf.empty(); } + +	/** +	 * Returns the content of the WhitespaceHandler as string. +	 */ +	std::string toString() const +	{ +		return std::string(textBuf.data(), textBuf.size()); +	} +}; + +/** + * The PreservingWhitespaceHandler class preserves all characters unmodified, + * including whitespace characters. + */ +class PreservingWhitespaceHandler : public WhitespaceHandler { +public: +	/** +	 * Appends the given character to the internal text buffer, does not +	 * eliminate whitespace. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 */ +	void append(char c, size_t start, size_t end) +	{ +		append(c, start, end, textBuf, textStart, textEnd); +	} + +	/** +	 * Static version of PreservingWhitespaceHandler append +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 * @param textBuf is a reference at the text buffer that is to be used. +	 * @param textStart is a reference at the text start variable that is to be +	 * used. +	 * @param textEnd is a reference at the text end variable that is to be +	 * used. 
+	 */ +	static void append(char c, size_t start, size_t end, +	                   std::vector<char> &textBuf, size_t &textStart, +	                   size_t &textEnd) +	{ +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; +		textBuf.push_back(c); +	} +}; + +/** + * The TrimmingWhitespaceHandler class trims all whitespace characters at the + * beginning and the end of a text section but leaves all other characters + * unmodified, including whitespace characters. + */ +class TrimmingWhitespaceHandler : public WhitespaceHandler { +public: +	/** +	 * Buffer used internally to temporarily store all whitespace characters. +	 * They are only added to the output buffer if another non-whitespace +	 * character is reached. +	 */ +	std::vector<char> whitespaceBuf; + +	/** +	 * Appends the given character to the internal text buffer, eliminates +	 * whitespace characters at the beginning and end of the text. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 */ +	void append(char c, size_t start, size_t end) +	{ +		append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); +	} + +	/** +	 * Static version of TrimmingWhitespaceHandler append +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 * @param textBuf is a reference at the text buffer that is to be used. +	 * @param textStart is a reference at the text start variable that is to be +	 * used. +	 * @param textEnd is a reference at the text end variable that is to be +	 * used. +	 * @param whitespaceBuf is a reference at the buffer for storing whitespace +	 * characters. 
+	 */ +	static void append(char c, size_t start, size_t end, +	                   std::vector<char> &textBuf, size_t &textStart, +	                   size_t &textEnd, std::vector<char> &whitespaceBuf) +	{ +		// Handle whitespace characters +		if (Utils::isWhitespace(c)) { +			if (!textBuf.empty()) { +				whitespaceBuf.push_back(c); +			} +			return; +		} + +		// Set the start and end offset correctly +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; + +		// Store the character +		if (!whitespaceBuf.empty()) { +			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), +			               whitespaceBuf.end()); +			whitespaceBuf.clear(); +		} +		textBuf.push_back(c); +	} +}; + +/** + * The CollapsingWhitespaceHandler trims whitespace at the beginning and end of + * the text and reduces multiple whitespace characters to a single blank. + */ +class CollapsingWhitespaceHandler : public WhitespaceHandler { +public: +	/** +	 * Flag set to true if a whitespace character was reached. +	 */ +	bool hasWhitespace = false; + +	/** +	 * Appends the given character to the internal text buffer, eliminates +	 * redundant whitespace characters. +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 */ +	void append(char c, size_t start, size_t end) +	{ +		append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); +	} + +	/** +	 * Static version of CollapsingWhitespaceHandler append +	 * +	 * @param c is the character that should be appended to the internal buffer. +	 * @param start is the start byte offset of the given character. +	 * @param end is the end byte offset of the given character. +	 * @param textBuf is a reference at the text buffer that is to be used. +	 * @param textStart is a reference at the text start variable that is to be +	 * used. 
+	 * @param textEnd is a reference at the text end variable that is to be +	 * used. +	 * @param hasWhitespace is a reference at the "hasWhitespace" flag. +	 */ +	static void append(char c, size_t start, size_t end, +	                   std::vector<char> &textBuf, size_t &textStart, +	                   size_t &textEnd, bool &hasWhitespace) +	{ +		// Handle whitespace characters +		if (Utils::isWhitespace(c)) { +			if (!textBuf.empty()) { +				hasWhitespace = true; +			} +			return; +		} + +		// Set the start and end offset correctly +		if (textBuf.empty()) { +			textStart = start; +		} +		textEnd = end; + +		// Store the character +		if (hasWhitespace) { +			textBuf.push_back(' '); +			hasWhitespace = false; +		} +		textBuf.push_back(c); +	} +}; + +/** + * Function that can be used to append the given buffer (e.g. a string or a + * vector) to the whitespace handler. + * + * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. + * @tparam Buffer is an iterable type. + * @param handler is the handler to which the characters of the Buffer should be + * appended. + * @param buf is the buffer from which the characters should be read. + * @param start is the start byte offset. Each character is counted as one byte. + */ +template <typename WhitespaceHandler, typename Buffer> +inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, +                                      size_t start) +{ +	for (auto elem : buf) { +		handler.append(elem, start, start + 1); +		start++; +	} +} +} + +#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ +  | 
