diff options
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/common/Utils.cpp | 7 | ||||
-rw-r--r-- | src/core/common/Utils.hpp | 57 | ||||
-rw-r--r-- | src/core/common/Whitespace.cpp | 38 | ||||
-rw-r--r-- | src/core/common/Whitespace.hpp | 120 | ||||
-rw-r--r-- | src/core/common/WhitespaceHandler.hpp | 223 |
5 files changed, 387 insertions, 58 deletions
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 563fe2a..4005143 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -18,19 +18,12 @@ #include <algorithm> #include <cctype> -#include <limits> #include <string> #include "Utils.hpp" namespace ousia { -std::string Utils::trim(const std::string &s) -{ - std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::isIdentifier(const std::string &name) { bool first = true; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 2c8a5b3..af7a773 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -79,11 +79,16 @@ public: static bool isIdentifier(const std::string &name); /** + * Returns true if the given character is a linebreak character. + */ + static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } + + /** * Returns true if the given character is a whitespace character. */ static bool isWhitespace(const char c) { - return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); + return (c == ' ') || (c == '\t') || isLinebreak(c); } /** @@ -95,56 +100,6 @@ public: static bool hasNonWhitepaceChar(const std::string &s); /** - * Returns true if the given character is a whitespace character. - */ - static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } - - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** - * Trims the given string or vector of chars by returning the start and end - * index. - * - * @param s is the container that should be trimmed. - * @param f is a function that returns true for values that should be - * removed. - * @return start and end index. Note that "end" points at the character - * beyond the end, thus "end" minus "start" - */ - template <class T, class Filter> - static std::pair<size_t, size_t> trim(const T &s, Filter f) - { - size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { - if (!f(s[i])) { - start = i; - break; - } - } - - size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) { - if (!f(s[i])) { - end = i + 1; - break; - } - } - - if (end < start) { - start = 0; - end = 0; - } - - return std::pair<size_t, size_t>{start, end}; - } - - /** * Turns the elements of a collection into a string separated by the * given delimiter. * diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp new file mode 100644 index 0000000..4d7c01a --- /dev/null +++ b/src/core/common/Whitespace.cpp @@ -0,0 +1,38 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Whitespace.hpp" +#include "WhitespaceHandler.hpp" + +namespace ousia { + +std::string Utils::trim(const std::string &s) +{ + std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +} + diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp new file mode 100644 index 0000000..1e9f36a --- /dev/null +++ b/src/core/common/Whitespace.hpp @@ -0,0 +1,120 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Whitespace.hpp + * + * Contains the WhitespaceMode enum used in various places, as well es functions + * for trimming and collapsing whitespaces. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HPP_ +#define _OUSIA_WHITESPACE_HPP_ + +#include <string> +#include <utility> + +namespace ousia { + +/** + * Enum specifying the whitespace handling mode of the tokenizer and the + * parsers. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * Collection of functions for trimming or collapsing whitespace. + */ +class Whitespace { + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s); + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template <class T, class Filter> + static std::pair<size_t, size_t> trim(const T &s, Filter f) + { + size_t start = 0; + for (size_t i = 0; i < s.size(); i++) { + if (!f(s[i])) { + start = i; + break; + } + } + + size_t end = 0; + for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) { + if (!f(s[i])) { + end = i + 1; + break; + } + } + + if (end < start) { + start = 0; + end = 0; + } + + return std::pair<size_t, size_t>{start, end}; + } + + /** + * Collapses the whitespaces in the given string (trims the string and + * replaces all whitespace characters by a single one). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. + */ + static std::string collapse(const std::string &s); +}; + +} + +#endif /* _OUSIA_WHITESPACE_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp new file mode 100644 index 0000000..1935c24 --- /dev/null +++ b/src/core/common/WhitespaceHandler.hpp @@ -0,0 +1,223 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file WhitespaceHandler.hpp + * + * Contains the WhitespaceHandler classes which are used in multiple places to + * trim, compact or preserve whitespaces while at the same time maintaining the + * position information associated with the input strings. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ +#define _OUSIA_WHITESPACE_HANDLER_HPP_ + +#include <string> +#include <vector> + +#include "WhitespaceHandler.hpp" + +namespace ousia { + +/** + * WhitespaceHandler is a based class that can be used to collect text on a + * character-by-character basis. Note that this class and its descendants are + * hoped to be inlined by the compiler (and used in conjunction with templates), + * thus they are fully defined inside this header. + */ +class WhitespaceHandler { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the TextHandlerBase base class. Initializes the start and + * end position with zeros. + */ + WhitespaceHandler() : textStart(0), textEnd(0) {} + + /** + * Returns true if this whitespace handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } + + /** + * Returns the content of the WhitespaceHandler as string. + */ + std::string toString() + { + return std::string(textBuf.data(), textBuf.size()); + } +}; + +/** + * The PreservingWhitespaceHandler class preserves all characters unmodified, + * including whitepace characters. + */ +class PreservingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/** + * The TrimmingTextHandler class trims all whitespace characters at the begin + * and the end of a text section but leaves all other characters unmodified, + * including whitepace characters. + */ +class TrimmingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. + */ + std::vector<char> whitespaceBuf; + + /** + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the begin and end of the text. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduced multiple whitespace characters to a single blank. + */ +class CollapsingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Flag set to true if a whitespace character was reached. + */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; + +/** + * Function that can be used to append the given buffer (e.g. a string or a + * vector) to the whitespace handler. + * + * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. + * @tparam Buffer is an iterable type. + * @param handler is the handler to which the characters of the Buffer should be + * appended. + * @param buf is the buffer from which the characters should be read. + * @param start is the start byte offset. Each character is counted as one byte. + */ +template <typename WhitespaceHandler, typename Buffer> +inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, + size_t start) +{ + for (auto elem : buf) { + handler.append(elem, start++); + } +} +} + +#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ + |