summary | refs | log | tree | commit | diff
path: root/src/core/common
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/common')
-rw-r--r-- src/core/common/Utils.cpp 54
-rw-r--r-- src/core/common/Utils.hpp 86
-rw-r--r-- src/core/common/Whitespace.hpp 60
-rw-r--r-- src/core/common/WhitespaceHandler.hpp 284
4 files changed, 467 insertions, 17 deletions
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index 563fe2a..f8b53c6 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -18,19 +18,13 @@
#include <algorithm>
#include <cctype>
-#include <limits>
#include <string>
#include "Utils.hpp"
+#include "WhitespaceHandler.hpp"
namespace ousia {
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::isIdentifier(const std::string &name)
{
bool first = true;
@@ -43,7 +37,27 @@ bool Utils::isIdentifier(const std::string &name)
}
first = false;
}
- return true;
+ return !first;
+}
+
+bool Utils::isIdentifierOrEmpty(const std::string &name)
+{
+ return name.empty() || isIdentifier(name);
+}
+
+bool Utils::isNamespacedIdentifier(const std::string &name)
+{
+ bool first = true;
+ for (char c : name) {
+ if (first && !isIdentifierStartCharacter(c)) {
+ return false;
+ }
+ if (!first && (!isIdentifierCharacter(c) && c != ':')) {
+ return false;
+ }
+ first = (c == ':');
+ }
+ return !first;
}
bool Utils::hasNonWhitepaceChar(const std::string &s)
@@ -94,5 +108,29 @@ std::string Utils::extractFileExtension(const std::string &filename)
}
return std::string{};
}
+
+std::string Utils::trim(const std::string &s)
+{
+ std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
+ return s.substr(bounds.first, bounds.second - bounds.first);
+}
+
+std::string Utils::collapse(const std::string &s)
+{
+ CollapsingWhitespaceHandler h;
+ appendToWhitespaceHandler(h, s, 0);
+ return h.toString();
+}
+
+bool Utils::startsWith(const std::string &s, const std::string &prefix)
+{
+ return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
+}
+
+bool Utils::endsWith(const std::string &s, const std::string &suffix)
+{
+ return suffix.size() <= s.size() &&
+ s.substr(s.size() - suffix.size(), suffix.size()) == suffix;
+}
}
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 2c8a5b3..b5a54fc 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -74,16 +74,45 @@ public:
}
/**
- * Returns true if the given character is in [A-Za-z][A-Za-z0-9_-]*
+ * Returns true if the given string is in
+ * \code{.txt}
+ * [A-Za-z][A-Za-z0-9_-]*
+ * \endcode
+ *
+ * @param name is the string that should be tested.
+ * @return true if the string matches the regular expression given above,
+ * false otherwise.
*/
static bool isIdentifier(const std::string &name);
/**
+ * Returns true if the given string is an identifier or an empty string.
+ */
+ static bool isIdentifierOrEmpty(const std::string &name);
+
+ /**
+ * Returns true if the given string is in
+ * \code{.txt}
+ * ([A-Za-z][A-Za-z0-9_-]*)(:[A-Za-z][A-Za-z0-9_-]*)*
+ * \endcode
+ *
+ * @param name is the string that should be tested.
+ * @return true if the string matches the regular expression given above,
+ * false otherwise.
+ */
+ static bool isNamespacedIdentifier(const std::string &name);
+
+ /**
+ * Returns true if the given character is a linebreak character.
+ */
+ static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
+
+ /**
* Returns true if the given character is a whitespace character.
*/
static bool isWhitespace(const char c)
{
- return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r');
+ return (c == ' ') || (c == '\t') || isLinebreak(c);
}
/**
@@ -95,11 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Returns true if the given character is a whitespace character.
- */
- static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
-
- /**
* Removes whitespace at the beginning and the end of the given string.
*
* @param s is the string that should be trimmed.
@@ -120,8 +144,25 @@ public:
template <class T, class Filter>
static std::pair<size_t, size_t> trim(const T &s, Filter f)
{
+ return trim(s, s.size(), f);
+ }
+
+ /**
+ * Trims the given string or vector of chars by returning the start and end
+ * index.
+ *
+ * @param s is the container that should be trimmed.
+ * @param len is the number of elements in the container.
+ * @param f is a function that returns true for values that should be
+ * removed.
+ * @return start and end index. Note that "end" points at the character
+ * beyond the end, thus "end" minus "start" is the trimmed length.
+ */
+ template <class T, class Filter>
+ static std::pair<size_t, size_t> trim(const T &s, size_t len, Filter f)
+ {
size_t start = 0;
- for (size_t i = 0; i < s.size(); i++) {
+ for (size_t i = 0; i < len; i++) {
if (!f(s[i])) {
start = i;
break;
@@ -129,7 +170,7 @@ public:
}
size_t end = 0;
- for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) {
+ for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
if (!f(s[i])) {
end = i + 1;
break;
@@ -145,6 +186,15 @@ public:
}
/**
+ * Collapses the whitespace in the given string (trims the string and
+ * replaces each run of whitespace characters by a single space).
+ *
+ * @param s is the string in which the whitespace should be collapsed.
+ * @return a copy of s with collapsed whitespace.
+ */
+ static std::string collapse(const std::string &s);
+
+ /**
* Turns the elements of a collection into a string separated by the
* given delimiter.
*
@@ -205,6 +255,24 @@ public:
static std::string extractFileExtension(const std::string &filename);
/**
+ * Checks whether the given string starts with the given prefix.
+ *
+ * @param s is the string.
+ * @param prefix is the string which should be checked for being a prefix of
+ * s.
+ */
+ static bool startsWith(const std::string &s, const std::string &prefix);
+
+ /**
+ * Checks whether the given string ends with the given suffix.
+ *
+ * @param s is the string.
+ * @param suffix is the string which should be checked for being a suffix of
+ * s.
+ */
+ static bool endsWith(const std::string &s, const std::string &suffix);
+
+ /**
* Hash functional to be used for enum classes.
* See http://stackoverflow.com/a/24847480/2188211
*/
diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp
new file mode 100644
index 0000000..72a2291
--- /dev/null
+++ b/src/core/common/Whitespace.hpp
@@ -0,0 +1,60 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Whitespace.hpp
+ *
+ * Contains the WhitespaceMode enum used in various places.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HPP_
+#define _OUSIA_WHITESPACE_HPP_
+
+#include <string>
+#include <utility>
+
+namespace ousia {
+
+/**
+ * Enum specifying the whitespace handling mode of the tokenizer and the
+ * parsers.
+ */
+enum class WhitespaceMode {
+ /**
+ * Preserves all whitespaces as they are found in the source file.
+ */
+ PRESERVE,
+
+ /**
+ * Trims whitespace at the beginning and the end of the found text.
+ */
+ TRIM,
+
+ /**
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
+ COLLAPSE
+};
+
+}
+
+#endif /* _OUSIA_WHITESPACE_HPP_ */
+
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
new file mode 100644
index 0000000..ed52ea3
--- /dev/null
+++ b/src/core/common/WhitespaceHandler.hpp
@@ -0,0 +1,284 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file WhitespaceHandler.hpp
+ *
+ * Contains the WhitespaceHandler classes which are used in multiple places to
+ * trim, compact or preserve whitespaces while at the same time maintaining the
+ * position information associated with the input strings.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
+#define _OUSIA_WHITESPACE_HANDLER_HPP_
+
+#include <string>
+#include <vector>
+
+#include "Utils.hpp"
+
+namespace ousia {
+
+/**
+ * WhitespaceHandler is a base class that can be used to collect text on a
+ * character-by-character basis. Note that this class and its descendants are
+ * hoped to be inlined by the compiler (and used in conjunction with templates),
+ * thus they are fully defined inside this header.
+ */
+class WhitespaceHandler {
+public:
+ /**
+ * Start position of the extracted text.
+ */
+ size_t textStart;
+
+ /**
+ * End position of the extracted text.
+ */
+ size_t textEnd;
+
+ /**
+ * Buffer containing the extracted text.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Constructor of the WhitespaceHandler base class. Initializes the start
+ * and end position with zeros.
+ */
+ WhitespaceHandler() : textStart(0), textEnd(0) {}
+
+ /**
+ * Returns true if this whitespace handler has found any text and a text
+ * token could be emitted.
+ *
+ * @return true if the internal data buffer is non-empty.
+ */
+ bool hasText() { return !textBuf.empty(); }
+
+ /**
+ * Returns the content of the WhitespaceHandler as string.
+ */
+ std::string toString() const
+ {
+ return std::string(textBuf.data(), textBuf.size());
+ }
+};
+
+/**
+ * The PreservingWhitespaceHandler class preserves all characters unmodified,
+ * including whitespace characters.
+ */
+class PreservingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Appends the given character to the internal text buffer, does not
+ * eliminate whitespace.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd);
+ }
+
+ /**
+ * Static version of PreservingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference to the text buffer that is to be used.
+ * @param textStart is a reference to the text start variable that is to be
+ * used.
+ * @param textEnd is a reference to the text end variable that is to be
+ * used.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd)
+ {
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * The TrimmingWhitespaceHandler class trims all whitespace characters at the
+ * beginning and the end of a text section but leaves all other characters
+ * unmodified, including whitespace characters.
+ */
+class TrimmingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Buffer used internally to temporarily store all whitespace characters.
+ * They are only added to the output buffer if another non-whitespace
+ * character is reached.
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * whitespace characters at the beginning and end of the text.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
+ }
+
+ /**
+ * Static version of TrimmingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference to the text buffer that is to be used.
+ * @param textStart is a reference to the text start variable that is to be
+ * used.
+ * @param textEnd is a reference to the text end variable that is to be
+ * used.
+ * @param whitespaceBuf is a reference to the buffer for storing whitespace
+ * characters.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd, std::vector<char> &whitespaceBuf)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ whitespaceBuf.push_back(c);
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (!whitespaceBuf.empty()) {
+ textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+ whitespaceBuf.end());
+ whitespaceBuf.clear();
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * The CollapsingWhitespaceHandler trims whitespace at the beginning and end of
+ * the text and reduces multiple whitespace characters to a single blank.
+ */
+class CollapsingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Flag set to true if a whitespace character was reached.
+ */
+ bool hasWhitespace = false;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * redundant whitespace characters.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
+ }
+
+ /**
+ * Static version of CollapsingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference to the text buffer that is to be used.
+ * @param textStart is a reference to the text start variable that is to be
+ * used.
+ * @param textEnd is a reference to the text end variable that is to be
+ * used.
+ * @param hasWhitespace is a reference to the "hasWhitespace" flag.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd, bool &hasWhitespace)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ hasWhitespace = true;
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (hasWhitespace) {
+ textBuf.push_back(' ');
+ hasWhitespace = false;
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * Function that can be used to append the given buffer (e.g. a string or a
+ * vector) to the whitespace handler.
+ *
+ * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
+ * @tparam Buffer is an iterable type.
+ * @param handler is the handler to which the characters of the Buffer should be
+ * appended.
+ * @param buf is the buffer from which the characters should be read.
+ * @param start is the start byte offset. Each character is counted as one byte.
+ */
+template <typename WhitespaceHandler, typename Buffer>
+inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
+ size_t start)
+{
+ for (auto elem : buf) {
+ handler.append(elem, start, start + 1);
+ start++;
+ }
+}
+}
+
+#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
+