diff options
| author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-24 02:13:46 +0100 | 
|---|---|---|
| committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-24 02:13:46 +0100 | 
| commit | 5a67fc7d682ddba6a862aacf616d02cd20b727eb (patch) | |
| tree | 34a6e34d835f70459f3cb6aed9543cc22319a92b /src/core/common | |
| parent | 8891dea26a1653a003b4171155e155d3aa6689ae (diff) | |
start of branch, commit log will be rewritten
Diffstat (limited to 'src/core/common')
| -rw-r--r-- | src/core/common/Token.cpp | 24 | ||||
| -rw-r--r-- | src/core/common/Token.hpp | 181 | ||||
| -rw-r--r-- | src/core/common/WhitespaceHandler.hpp | 284 | 
3 files changed, 205 insertions, 284 deletions
diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp new file mode 100644 index 0000000..8bcdbb5 --- /dev/null +++ b/src/core/common/Token.cpp @@ -0,0 +1,24 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Token.hpp" + +namespace ousia { +// Stub to make sure Token.hpp is valid +} + diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp new file mode 100644 index 0000000..07d7c8f --- /dev/null +++ b/src/core/common/Token.hpp @@ -0,0 +1,181 @@ +/* +    Ousía +    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel + +    This program is free software: you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation, either version 3 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/** + * @file Token.hpp + * + * Definition of the TokenId type and constants for some special tokens. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_HPP_ +#define _OUSIA_TOKEN_HPP_ + +#include <cstdint> +#include <limits> +#include <string> +#include <unordered_set> + +#include <core/common/Location.hpp> + +namespace ousia { + +/** + * The TokenId is used to give each token type a unique id. + */ +using TokenId = uint32_t; + +/** + * Type used for storing token lengths. + */ +using TokenLength = uint16_t; + +/** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set<TokenId>; + +/** + * Namespace containing constants for TokenId instances with special meaning. + */ +namespace Tokens { +/** + * Token which is not a token. + */ +constexpr TokenId Empty = std::numeric_limits<TokenId>::max(); + +/** + * Token which represents data (represented as TokenizedData). + */ +constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1; + +/** + * Token which represents a newline token. + */ +constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2; + +/** + * Token which represents a paragraph token -- issued if two consecutive + * newlines occur with optionally any amount of whitespace between them. The + * paragraph token is not repeated until more text is reached. + */ +constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3; + +/** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amount of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; + +/** + * Token which represents an indentation token -- issued if the indentation of + * this line is larger than the indentation of the previous line. 
+ */ +constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; + +/** + * Token which represents an unindentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Unindent = std::numeric_limits<TokenId>::max() - 6; + +/** + * Maximum token id to be used. Tokens allocated for users should not surpass + * this value. + */ +constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255; +} + +/** + * The Token structure describes a token discovered by the Tokenizer or read + * from the TokenizedData struct. + */ +struct Token { +	/** +	 * Id of the type of this token. +	 */ +	TokenId id; + +	/** +	 * String that was matched. +	 */ +	std::string content; + +	/** +	 * Location from which the string was extracted. +	 */ +	SourceLocation location; + +	/** +	 * Default constructor. +	 */ +	Token() : id(Tokens::Empty) {} + +	/** +	 * Constructor of a "data" token with no explicit content. +	 * +	 * @param location is the location of the extracted string content in the +	 * source file. +	 */ +	Token(SourceLocation location) +	    : id(Tokens::Data), location(location) +	{ +	} + +	/** +	 * Constructor of the Token struct. +	 * +	 * @param id represents the token id. +	 * @param content is the string content that has been extracted. +	 * @param location is the location of the extracted string content in the +	 * source file. +	 */ +	Token(TokenId id, const std::string &content, SourceLocation location) +	    : id(id), content(content), location(location) +	{ +	} + +	/** +	 * Constructor of the Token struct, only initializes the token id +	 * +	 * @param id is the id corresponding to the type of the token. +	 */ +	Token(TokenId id) : id(id) {} + +	/** +	 * Returns true if this token is special. +	 * +	 * @return true if the TokenId indicates that this token is a "special" +	 * token. 
+	 */ +	 + +	/** +	 * The getLocation function allows the tokens to be directly passed as +	 * parameter to Logger or LoggableException instances. +	 * +	 * @return a reference at the location field +	 */ +	const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_TOKEN_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* -    Ousía -    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel - -    This program is free software: you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation, either version 3 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program.  If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include <string> -#include <vector> - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. 
Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: -	/** -	 * Start position of the extracted text. -	 */ -	size_t textStart; - -	/** -	 * End position of the extracted text. -	 */ -	size_t textEnd; - -	/** -	 * Buffer containing the extracted text. -	 */ -	std::vector<char> textBuf; - -	/** -	 * Constructor of the TextHandlerBase base class. Initializes the start and -	 * end position with zeros. -	 */ -	WhitespaceHandler() : textStart(0), textEnd(0) {} - -	/** -	 * Returns true if this whitespace handler has found any text and a text -	 * token could be emitted. -	 * -	 * @return true if the internal data buffer is non-empty. -	 */ -	bool hasText() { return !textBuf.empty(); } - -	/** -	 * Returns the content of the WhitespaceHandler as string. -	 */ -	std::string toString() const -	{ -		return std::string(textBuf.data(), textBuf.size()); -	} -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Appends the given character to the internal text buffer, does not -	 * eliminate whitespace. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd); -	} - -	/** -	 * Static version of PreservingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. 
-	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. -	 * @param textEnd is a reference at the text end variable that is to be -	 * used. -	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd) -	{ -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; -		textBuf.push_back(c); -	} -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Buffer used internally to temporarily store all whitespace characters. -	 * They are only added to the output buffer if another non-whitespace -	 * character is reached. -	 */ -	std::vector<char> whitespaceBuf; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * whitespace characters at the begin and end of the text. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); -	} - -	/** -	 * Static version of TrimmingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. 
-	 * @param textEnd is a reference at the text end variable that is to be -	 * used. -	 * @param whitespaceBuf is a reference at the buffer for storing whitespace -	 * characters. -	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd, std::vector<char> &whitespaceBuf) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				whitespaceBuf.push_back(c); -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (!whitespaceBuf.empty()) { -			textBuf.insert(textBuf.end(), whitespaceBuf.begin(), -			               whitespaceBuf.end()); -			whitespaceBuf.clear(); -		} -		textBuf.push_back(c); -	} -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: -	/** -	 * Flag set to true if a whitespace character was reached. -	 */ -	bool hasWhitespace = false; - -	/** -	 * Appends the given character to the internal text buffer, eliminates -	 * redundant whitespace characters. -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. -	 */ -	void append(char c, size_t start, size_t end) -	{ -		append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); -	} - -	/** -	 * Static version of CollapsingWhitespaceHandler append -	 * -	 * @param c is the character that should be appended to the internal buffer. -	 * @param start is the start byte offset of the given character. -	 * @param end is the end byte offset of the given character. 
-	 * @param textBuf is a reference at the text buffer that is to be used. -	 * @param textStart is a reference at the text start variable that is to be -	 * used. -	 * @param textEnd is a reference at the text end variable that is to be -	 * used. -	 * @param hasWhitespace is a reference at the "hasWhitespace" flag. -	 */ -	static void append(char c, size_t start, size_t end, -	                   std::vector<char> &textBuf, size_t &textStart, -	                   size_t &textEnd, bool &hasWhitespace) -	{ -		// Handle whitespace characters -		if (Utils::isWhitespace(c)) { -			if (!textBuf.empty()) { -				hasWhitespace = true; -			} -			return; -		} - -		// Set the start and end offset correctly -		if (textBuf.empty()) { -			textStart = start; -		} -		textEnd = end; - -		// Store the character -		if (hasWhitespace) { -			textBuf.push_back(' '); -			hasWhitespace = false; -		} -		textBuf.push_back(c); -	} -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. - */ -template <typename WhitespaceHandler, typename Buffer> -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, -                                      size_t start) -{ -	for (auto elem : buf) { -		handler.append(elem, start, start + 1); -		start++; -	} -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ -  | 
