summaryrefslogtreecommitdiff
path: root/src/core/common
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/common')
-rw-r--r--src/core/common/SourceContextReader.cpp5
-rw-r--r--src/core/common/Token.cpp24
-rw-r--r--src/core/common/Token.hpp181
-rw-r--r--src/core/common/Utils.cpp6
-rw-r--r--src/core/common/Utils.hpp53
-rw-r--r--src/core/common/WhitespaceHandler.hpp284
6 files changed, 240 insertions, 313 deletions
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
index d5d379c..f7dbdf3 100644
--- a/src/core/common/SourceContextReader.cpp
+++ b/src/core/common/SourceContextReader.cpp
@@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader,
ctx.relLen = end - start; // end >= start (I2)
// Remove linebreaks at the beginning and the end
- const std::pair<size_t, size_t> b =
- Utils::trim(lineBuf, Utils::isLinebreak);
+ const std::pair<size_t, size_t> b = Utils::trim(
+ lineBuf,
+ [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
ssize_t s = b.first, e = b.second;
s = std::min(s, static_cast<ssize_t>(ctx.relPos));
diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp
new file mode 100644
index 0000000..8bcdbb5
--- /dev/null
+++ b/src/core/common/Token.cpp
@@ -0,0 +1,24 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Token.hpp"
+
+namespace ousia {
+// Stub to make sure Token.hpp is valid
+}
+
diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp
new file mode 100644
index 0000000..0cf56b0
--- /dev/null
+++ b/src/core/common/Token.hpp
@@ -0,0 +1,181 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Token.hpp
+ *
+ * Definition of the TokenId type and constants for some special tokens.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_TOKEN_HPP_
+#define _OUSIA_TOKEN_HPP_
+
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include <core/common/Location.hpp>
+
+namespace ousia {
+
+/**
+ * The TokenId type is used to give each token a unique id.
+ */
+using TokenId = uint32_t;
+
+/**
+ * Type used for storing token lengths.
+ */
+using TokenLength = uint16_t;
+
+/**
+ * Type used for storing token sets.
+ */
+using TokenSet = std::unordered_set<TokenId>;
+
+/**
+ * Namespace containing constants for TokenId instances with special meaning.
+ */
+namespace Tokens {
+/**
+ * Token which is not a token.
+ */
+constexpr TokenId Empty = std::numeric_limits<TokenId>::max();
+
+/**
+ * Token which represents data (represented as TokenizedData).
+ */
+constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1;
+
+/**
+ * Token which represents a newline token.
+ */
+constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;
+
+/**
+ * Token which represents a paragraph token -- issued if two consecutive
+ * newlines occur with optionally any amount of whitespace between them. The
+ * paragraph token is not repeated until more text is reached.
+ */
+constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;
+
+/**
+ * Token which represents a section token -- issued if three or more
+ * consecutive newlines occur with optionally any amount of whitespace between
+ * them. The section token is not repeated until more text is reached.
+ */
+constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
+
+/**
+ * Token which represents an indentation token -- issued if the indentation of
+ * this line is larger than the indentation of the previous line.
+ */
+constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
+
+/**
+ * Token which represents a dedentation -- issued if the indentation of
+ * this line is smaller than the indentation of the previous line.
+ */
+constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
+
+/**
+ * Maximum token id to be used. Tokens allocated for users should not surpass
+ * this value.
+ */
+constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255;
+}
+
+/**
+ * The Token structure describes a token discovered by the Tokenizer or read
+ * from the TokenizedData struct.
+ */
+struct Token {
+ /**
+ * Id of this token.
+ */
+ TokenId id;
+
+ /**
+ * String that was matched.
+ */
+ std::string content;
+
+ /**
+ * Location from which the string was extracted.
+ */
+ SourceLocation location;
+
+ /**
+ * Default constructor.
+ */
+ Token() : id(Tokens::Empty) {}
+
+ /**
+ * Constructor of a "data" token with no explicit content.
+ *
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ Token(SourceLocation location)
+ : id(Tokens::Data), location(location)
+ {
+ }
+
+ /**
+ * Constructor of the Token struct.
+ *
+ * @param id represents the token id.
+ * @param content is the string content that has been extracted.
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ Token(TokenId id, const std::string &content, SourceLocation location)
+ : id(id), content(content), location(location)
+ {
+ }
+
+ /**
+ * Constructor of the Token struct, only initializes the token id
+ *
+ * @param id is the id of the token.
+ */
+ Token(TokenId id) : id(id) {}
+
+ /**
+ * Returns true if this token is special.
+ *
+ * @return true if the TokenId indicates that this token is a "special"
+ * token.
+ */
+ bool isSpecial() const {return id > Tokens::MaxTokenId;}
+
+ /**
+ * The getLocation function allows the tokens to be directly passed as
+ * parameter to Logger or LoggableException instances.
+ *
+ * @return a reference to the location field
+ */
+ const SourceLocation &getLocation() const { return location; }
+};
+}
+
+#endif /* _OUSIA_TOKEN_HPP_ */
+
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index a77951e..85d2c28 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename)
return std::string{};
}
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::startsWith(const std::string &s, const std::string &prefix)
{
return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 7d96562..82a8f8c 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -124,14 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
* Trims the given string or vector of chars by returning the start and end
* index.
*
@@ -153,8 +145,8 @@ public:
*
* @param s is the container that should be trimmed.
* @param len is the number of elements in the container.
- * @param f is a function that returns true for values that should be
- * removed.
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
*/
@@ -163,7 +155,7 @@ public:
{
size_t start = 0;
for (size_t i = 0; i < len; i++) {
- if (!f(s[i])) {
+ if (!f(i)) {
start = i;
break;
}
@@ -171,7 +163,7 @@ public:
size_t end = 0;
for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
+ if (!f(i)) {
end = i + 1;
break;
}
@@ -198,17 +190,33 @@ public:
* the collapsed version of the string ends.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
*/
- template <class T>
- static std::string trim(const T &s, size_t len, size_t &start, size_t &end)
+ template <class T, class Filter>
+ static std::string trim(const T &s, size_t len, size_t &start, size_t &end,
+ Filter f)
{
- auto res = trim(s, len, isWhitespace);
+ auto res = trim(s, len, f);
start = res.first;
end = res.second;
return std::string(&s[start], end - start);
}
/**
+ * Removes whitespace at the beginning and the end of the given string.
+ *
+ * @param s is the string that should be trimmed.
+ * @return a trimmed copy of s.
+ */
+ static std::string trim(const std::string &s)
+ {
+ std::pair<size_t, size_t> bounds =
+ trim(s, [&s](size_t i) { return isWhitespace(s[i]); });
+ return s.substr(bounds.first, bounds.second - bounds.first);
+ }
+
+ /**
* Collapses the whitespaces in the given string (trims the string and
* replaces all whitespace characters by a single one).
*
@@ -219,7 +227,8 @@ public:
{
size_t start;
size_t end;
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -236,7 +245,8 @@ public:
static std::string collapse(const std::string &s, size_t &start,
size_t &end)
{
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -244,6 +254,8 @@ public:
* replaces all whitespace characters by a single one).
*
* @tparam T is the string type that should be used.
+ * @tparam Filter is a filter function used for detecting the character
+ * indices that might be removed.
* @param s is the string in which the whitespace should be collapsed.
* @param len is the length of the input string
* @param start is an output parameter which is set to the offset at which
@@ -252,9 +264,9 @@ public:
* the collapsed version of the string ends.
* @return a copy of s with collapsed whitespace.
*/
- template <class T>
+ template <class T, class Filter>
static std::string collapse(const T &s, size_t len, size_t &start,
- size_t &end)
+ size_t &end, Filter f)
{
// Result vector
std::vector<char> res;
@@ -268,8 +280,7 @@ public:
bool hadWhitespace = false;
for (size_t i = 0; i < len; i++) {
const char c = s[i];
- const bool whitespace = isWhitespace(c);
- if (whitespace) {
+ if (f(i)) {
hadWhitespace = !res.empty();
} else {
// Adapt the start and end position
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
deleted file mode 100644
index ed52ea3..0000000
--- a/src/core/common/WhitespaceHandler.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file WhitespaceHandler.hpp
- *
- * Contains the WhitespaceHandler classes which are used in multiple places to
- * trim, compact or preserve whitespaces while at the same time maintaining the
- * position information associated with the input strings.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
-#define _OUSIA_WHITESPACE_HANDLER_HPP_
-
-#include <string>
-#include <vector>
-
-#include "Utils.hpp"
-
-namespace ousia {
-
-/**
- * WhitespaceHandler is a based class that can be used to collect text on a
- * character-by-character basis. Note that this class and its descendants are
- * hoped to be inlined by the compiler (and used in conjunction with templates),
- * thus they are fully defined inside this header.
- */
-class WhitespaceHandler {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- WhitespaceHandler() : textStart(0), textEnd(0) {}
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-
- /**
- * Returns the content of the WhitespaceHandler as string.
- */
- std::string toString() const
- {
- return std::string(textBuf.data(), textBuf.size());
- }
-};
-
-/**
- * The PreservingWhitespaceHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd);
- }
-
- /**
- * Static version of PreservingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
- }
-
- /**
- * Static version of TrimmingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param whitespaceBuf is a reference at the buffer for storing whitespace
- * characters.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, std::vector<char> &whitespaceBuf)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
- }
-
- /**
- * Static version of CollapsingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param hasWhitespace is a reference at the "hasWhitespace" flag.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, bool &hasWhitespace)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * Function that can be used to append the given buffer (e.g. a string or a
- * vector) to the whitespace handler.
- *
- * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
- * @tparam Buffer is an iterable type.
- * @param handler is the handler to which the characters of the Buffer should be
- * appended.
- * @param buf is the buffer from which the characters should be read.
- * @param start is the start byte offset. Each character is counted as one byte.
- */
-template <typename WhitespaceHandler, typename Buffer>
-inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
- size_t start)
-{
- for (auto elem : buf) {
- handler.append(elem, start, start + 1);
- start++;
- }
-}
-}
-
-#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
-