diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-24 02:13:46 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-24 02:13:46 +0100 |
commit | 5a67fc7d682ddba6a862aacf616d02cd20b727eb (patch) | |
tree | 34a6e34d835f70459f3cb6aed9543cc22319a92b /src/core/common | |
parent | 8891dea26a1653a003b4171155e155d3aa6689ae (diff) |
start of branch, commit log will be rewritten
Diffstat (limited to 'src/core/common')
-rw-r--r-- | src/core/common/Token.cpp | 24 | ||||
-rw-r--r-- | src/core/common/Token.hpp | 181 | ||||
-rw-r--r-- | src/core/common/WhitespaceHandler.hpp | 284 |
3 files changed, 205 insertions, 284 deletions
diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp new file mode 100644 index 0000000..8bcdbb5 --- /dev/null +++ b/src/core/common/Token.cpp @@ -0,0 +1,24 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Token.hpp" + +namespace ousia { +// Stub to make sure Tokens.hpp is valid +} + diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp new file mode 100644 index 0000000..07d7c8f --- /dev/null +++ b/src/core/common/Token.hpp @@ -0,0 +1,181 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Token.hpp + * + * Definition of the TokenId id and constants for some special tokens. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_HPP_ +#define _OUSIA_TOKEN_HPP_ + +#include <cstdint> +#include <limits> +#include <string> +#include <unordered_set> + +#include <core/common/Location.hpp> + +namespace ousia { + +/** + * The TokenId is used to give each token id a unique id. + */ +using TokenId = uint32_t; + +/** + * Type used for storing token lengths. + */ +using TokenLength = uint16_t; + +/** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set<TokenId>; + +/** + * Namespace containing constants for TokenId instances with special meaning. + */ +namespace Tokens { +/** + * Token which is not a token. + */ +constexpr TokenId Empty = std::numeric_limits<TokenId>::max(); + +/** + * Token which represents data (represented as TokenizedData). + */ +constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1; + +/** + * Token which represents a newline token. + */ +constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2; + +/** + * Token which represents a paragraph token -- issued if two consecutive + * newlines occur with optionally any amout of whitespace between them. The + * paragraph token is not repeated until more text is reached. + */ +constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3; + +/** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amout of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4; + +/** + * Token which represents an indentation token -- issued if the indentation of + * this line is larger than the indentation of the previous line. + */ +constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5; + +/** + * Token which represents an unindentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Unindent = std::numeric_limits<TokenId>::max() - 6; + +/** + * Maximum token id to be used. Tokens allocated for users should not surpass + * this value. + */ +constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255; +} + +/** + * The Token structure describes a token discovered by the Tokenizer or read + * from the TokenizedData struct. + */ +struct Token { + /** + * Id of the id of this token. + */ + TokenId id; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : id(Tokens::Empty) {} + + /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(SourceLocation location) + : id(Tokens::Data), location(location) + { + } + + /** + * Constructor of the Token struct. + * + * @param id represents the token id. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenId id, const std::string &content, SourceLocation location) + : id(id), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token id + * + * @param id is the id corresponding to the id of the token. + */ + Token(TokenId id) : id(id) {} + + /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_TOKENS_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include <string> -#include <vector> - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, std::vector<char> &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. - */ - static void append(char c, size_t start, size_t end, - std::vector<char> &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. - */ -template <typename WhitespaceHandler, typename Buffer> -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - |