summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-14 23:43:32 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-14 23:43:32 +0100
commitce4fd84a714d80859aa01bbca32a81302b93c4d7 (patch)
tree4de6d48f7c1fe0455bca9e3d4b81f69117397bcd /src
parentc771577b9c7c7a3c1b019139ed132101add73cf9 (diff)
Moved code for handling whitespaces to own header, including the "WhitespaceMode" enum
Diffstat (limited to 'src')
-rw-r--r--src/core/common/Utils.cpp7
-rw-r--r--src/core/common/Utils.hpp57
-rw-r--r--src/core/common/Whitespace.cpp38
-rw-r--r--src/core/common/Whitespace.hpp120
-rw-r--r--src/core/common/WhitespaceHandler.hpp223
5 files changed, 387 insertions, 58 deletions
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index 563fe2a..4005143 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -18,19 +18,12 @@
#include <algorithm>
#include <cctype>
-#include <limits>
#include <string>
#include "Utils.hpp"
namespace ousia {
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::isIdentifier(const std::string &name)
{
bool first = true;
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 2c8a5b3..af7a773 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -79,11 +79,16 @@ public:
static bool isIdentifier(const std::string &name);
/**
+ * Returns true if the given character is a linebreak character.
+ */
+ static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
+
+ /**
* Returns true if the given character is a whitespace character.
*/
static bool isWhitespace(const char c)
{
- return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r');
+ return (c == ' ') || (c == '\t') || isLinebreak(c);
}
/**
@@ -95,56 +100,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Returns true if the given character is a whitespace character.
- */
- static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
-
- /**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
- * Trims the given string or vector of chars by returning the start and end
- * index.
- *
- * @param s is the container that should be trimmed.
- * @param f is a function that returns true for values that should be
- * removed.
- * @return start and end index. Note that "end" points at the character
- * beyond the end, thus "end" minus "start"
- */
- template <class T, class Filter>
- static std::pair<size_t, size_t> trim(const T &s, Filter f)
- {
- size_t start = 0;
- for (size_t i = 0; i < s.size(); i++) {
- if (!f(s[i])) {
- start = i;
- break;
- }
- }
-
- size_t end = 0;
- for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
- end = i + 1;
- break;
- }
- }
-
- if (end < start) {
- start = 0;
- end = 0;
- }
-
- return std::pair<size_t, size_t>{start, end};
- }
-
- /**
* Turns the elements of a collection into a string separated by the
* given delimiter.
*
diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp
new file mode 100644
index 0000000..4d7c01a
--- /dev/null
+++ b/src/core/common/Whitespace.cpp
@@ -0,0 +1,38 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Whitespace.hpp"
+#include "WhitespaceHandler.hpp"
+
+namespace ousia {
+
+/*
+ * Returns a copy of s without leading and trailing whitespace. The bounds are
+ * computed by the generic Whitespace::trim(container, filter) overload using
+ * Utils::isWhitespace as filter.
+ */
+std::string Whitespace::trim(const std::string &s)
+{
+    // bounds is the half-open index range [first, second) of the kept text
+    const std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
+    return s.substr(bounds.first, bounds.second - bounds.first);
+}
+
+/*
+ * Returns a copy of s in which the whitespace has been collapsed. All
+ * characters of s are fed through a CollapsingWhitespaceHandler, the text
+ * collected by the handler is returned.
+ */
+std::string Whitespace::collapse(const std::string &s)
+{
+    CollapsingWhitespaceHandler h;
+    // Byte offsets are counted from zero, they are not used for the result
+    appendToWhitespaceHandler(h, s, 0);
+    return h.toString();
+}
+
+}
+
diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp
new file mode 100644
index 0000000..1e9f36a
--- /dev/null
+++ b/src/core/common/Whitespace.hpp
@@ -0,0 +1,120 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Whitespace.hpp
+ *
+ * Contains the WhitespaceMode enum used in various places, as well as functions
+ * for trimming and collapsing whitespace.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HPP_
+#define _OUSIA_WHITESPACE_HPP_
+
+#include <string>
+#include <utility>
+
+namespace ousia {
+
+/**
+ * Enum specifying the whitespace handling mode of the tokenizer and the
+ * parsers. The enumerators are ordered from the least to the most aggressive
+ * whitespace removal.
+ */
+enum class WhitespaceMode {
+ /**
+ * Preserves all whitespace characters exactly as they are found in the
+ * source file.
+ */
+ PRESERVE,
+
+ /**
+ * Trims whitespace at the beginning and the end of the found text,
+ * whitespace inside the text is left untouched.
+ */
+ TRIM,
+
+ /**
+ * Whitespace is trimmed and collapsed: leading and trailing whitespace is
+ * removed and multiple whitespace characters inside the text are replaced
+ * by a single space character.
+ */
+ COLLAPSE
+};
+
+/**
+ * Collection of static helper functions for trimming or collapsing whitespace.
+ */
+class Whitespace {
+public:
+    /**
+     * Removes whitespace at the beginning and the end of the given string.
+     *
+     * @param s is the string that should be trimmed.
+     * @return a trimmed copy of s.
+     */
+    static std::string trim(const std::string &s);
+
+    /**
+     * Trims the given string or vector of chars by returning the start and end
+     * index of the part that should be kept.
+     *
+     * @tparam T is a container type providing size() and operator[].
+     * @tparam Filter is a callable that is invoked with a single element of
+     * the container and returns a value convertible to bool.
+     * @param s is the container that should be trimmed.
+     * @param f is a function that returns true for values that should be
+     * removed.
+     * @return start and end index. Note that "end" points at the element
+     * beyond the last kept element, thus "end" minus "start" is the length of
+     * the trimmed range. If no element is kept (empty container or all
+     * elements are removed by the filter), both indices are zero.
+     */
+    template <class T, class Filter>
+    static std::pair<size_t, size_t> trim(const T &s, Filter f)
+    {
+        const size_t size = s.size();
+
+        // Skip all leading elements that should be removed
+        size_t start = 0;
+        while (start < size && f(s[start])) {
+            start++;
+        }
+
+        // Nothing is left, return an empty range
+        if (start == size) {
+            return std::pair<size_t, size_t>{0, 0};
+        }
+
+        // Skip all trailing elements that should be removed. The element at
+        // "start" is kept, so this loop always terminates with end > start
+        // and no signed index type is needed.
+        size_t end = size;
+        while (f(s[end - 1])) {
+            end--;
+        }
+
+        return std::pair<size_t, size_t>{start, end};
+    }
+
+    /**
+     * Collapses the whitespaces in the given string (trims the string and
+     * replaces each sequence of whitespace characters by a single space).
+     *
+     * @param s is the string in which the whitespace should be collapsed.
+     * @return a copy of s with collapsed whitespace.
+     */
+    static std::string collapse(const std::string &s);
+};
+
+}
+
+#endif /* _OUSIA_WHITESPACE_HPP_ */
+
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
new file mode 100644
index 0000000..1935c24
--- /dev/null
+++ b/src/core/common/WhitespaceHandler.hpp
@@ -0,0 +1,223 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file WhitespaceHandler.hpp
+ *
+ * Contains the WhitespaceHandler classes which are used in multiple places to
+ * trim, compact or preserve whitespaces while at the same time maintaining the
+ * position information associated with the input strings.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
+#define _OUSIA_WHITESPACE_HANDLER_HPP_
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "Utils.hpp"
+#include "WhitespaceHandler.hpp"
+
+namespace ousia {
+
+/**
+ * WhitespaceHandler is a base class that can be used to collect text on a
+ * character-by-character basis. Note that this class and its descendants are
+ * hoped to be inlined by the compiler (and used in conjunction with templates),
+ * thus they are fully defined inside this header.
+ */
+class WhitespaceHandler {
+public:
+    /**
+     * Start position of the extracted text.
+     */
+    size_t textStart;
+
+    /**
+     * End position of the extracted text.
+     */
+    size_t textEnd;
+
+    /**
+     * Buffer containing the extracted text.
+     */
+    std::vector<char> textBuf;
+
+    /**
+     * Constructor of the WhitespaceHandler base class. Initializes the start
+     * and end position with zeros.
+     */
+    WhitespaceHandler() : textStart(0), textEnd(0) {}
+
+    /**
+     * Returns true if this whitespace handler has found any text and a text
+     * token could be emitted.
+     *
+     * @return true if the internal data buffer is non-empty.
+     */
+    bool hasText() const { return !textBuf.empty(); }
+
+    /**
+     * Returns the content of the WhitespaceHandler as string.
+     *
+     * @return a copy of the characters currently stored in the text buffer.
+     */
+    std::string toString() const
+    {
+        // The iterator range constructor is also well defined for an empty
+        // buffer, where data() is allowed to return a null pointer.
+        return std::string(textBuf.begin(), textBuf.end());
+    }
+};
+
+/**
+ * The PreservingWhitespaceHandler class stores every character it receives
+ * without any modification, whitespace characters included.
+ */
+class PreservingWhitespaceHandler : public WhitespaceHandler {
+public:
+    /**
+     * Appends the given character to the internal text buffer. Whitespace is
+     * not eliminated.
+     *
+     * @param c is the character that should be appended to the internal buffer.
+     * @param start is the start byte offset of the given character.
+     * @param end is the end byte offset of the given character.
+     */
+    void append(char c, size_t start, size_t end)
+    {
+        // Only the very first character determines the start offset, the end
+        // offset follows the most recent character
+        textStart = textBuf.empty() ? start : textStart;
+        textEnd = end;
+        textBuf.push_back(c);
+    }
+};
+
+/**
+ * The TrimmingWhitespaceHandler class drops all whitespace characters at the
+ * begin and the end of a text section. Everything in between is stored
+ * unmodified, including whitespace characters.
+ */
+class TrimmingWhitespaceHandler : public WhitespaceHandler {
+public:
+    /**
+     * Whitespace characters seen since the last non-whitespace character.
+     * They are moved to the output buffer once another non-whitespace
+     * character is reached, otherwise they are never emitted.
+     */
+    std::vector<char> whitespaceBuf;
+
+    /**
+     * Appends the given character to the internal text buffer, eliminates
+     * whitespace characters at the begin and end of the text.
+     *
+     * @param c is the character that should be appended to the internal buffer.
+     * @param start is the start byte offset of the given character.
+     * @param end is the end byte offset of the given character.
+     */
+    void append(char c, size_t start, size_t end)
+    {
+        if (!Utils::isWhitespace(c)) {
+            // Offsets only refer to non-whitespace characters
+            textStart = textBuf.empty() ? start : textStart;
+            textEnd = end;
+
+            // Flush the pending inner whitespace (if any) in front of the
+            // character, then store the character itself
+            textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+                           whitespaceBuf.end());
+            whitespaceBuf.clear();
+            textBuf.push_back(c);
+            return;
+        }
+
+        // Whitespace: remember it unless no text has been stored yet, in
+        // which case it is leading whitespace and is discarded
+        if (!textBuf.empty()) {
+            whitespaceBuf.push_back(c);
+        }
+    }
+};
+
+/**
+ * The CollapsingWhitespaceHandler drops whitespace characters at the beginning
+ * and end of the text and reduces multiple whitespace characters inside the
+ * text to a single blank.
+ */
+class CollapsingWhitespaceHandler : public WhitespaceHandler {
+public:
+    /**
+     * Flag set to true if a whitespace character was reached after some text
+     * has already been stored.
+     */
+    bool hasWhitespace = false;
+
+    /**
+     * Appends the given character to the internal text buffer, eliminates
+     * redundant whitespace characters.
+     *
+     * @param c is the character that should be appended to the internal buffer.
+     * @param start is the start byte offset of the given character.
+     * @param end is the end byte offset of the given character.
+     */
+    void append(char c, size_t start, size_t end)
+    {
+        if (!Utils::isWhitespace(c)) {
+            // Offsets only refer to non-whitespace characters
+            textStart = textBuf.empty() ? start : textStart;
+            textEnd = end;
+
+            // A pending run of whitespace is emitted as one single blank
+            if (hasWhitespace) {
+                hasWhitespace = false;
+                textBuf.push_back(' ');
+            }
+            textBuf.push_back(c);
+            return;
+        }
+
+        // Whitespace in front of the first stored character is ignored, any
+        // later whitespace is merely remembered
+        hasWhitespace = hasWhitespace || !textBuf.empty();
+    }
+};
+
+/**
+ * Function that can be used to append the given buffer (e.g. a string or a
+ * vector) to the whitespace handler.
+ *
+ * @tparam WhitespaceHandler is one of the WhitespaceHandler classes, or any
+ * other type providing a compatible append(c, start, end) function.
+ * @tparam Buffer is an iterable type.
+ * @param handler is the handler to which the characters of the Buffer should be
+ * appended.
+ * @param buf is the buffer from which the characters should be read.
+ * @param start is the start byte offset. Each character is counted as one byte,
+ * so the i-th character is reported with the range [start + i, start + i + 1).
+ */
+template <typename WhitespaceHandler, typename Buffer>
+inline void appendToWhitespaceHandler(WhitespaceHandler &handler,
+                                      const Buffer &buf, size_t start)
+{
+    for (auto elem : buf) {
+        // The handlers expect both the start and the end offset of each
+        // character
+        handler.append(elem, start, start + 1);
+        start++;
+    }
+}
+}
+
+#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
+