summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core/common/SourceContextReader.cpp5
-rw-r--r--src/core/common/Token.cpp (renamed from src/core/parser/utils/Token.cpp)2
-rw-r--r--src/core/common/Token.hpp (renamed from src/core/parser/utils/Token.hpp)61
-rw-r--r--src/core/common/Utils.cpp39
-rw-r--r--src/core/common/Utils.hpp73
-rw-r--r--src/core/common/WhitespaceHandler.hpp284
-rw-r--r--src/core/model/Ontology.cpp193
-rw-r--r--src/core/model/Ontology.hpp297
-rw-r--r--src/core/model/Syntax.cpp58
-rw-r--r--src/core/model/Syntax.hpp196
-rw-r--r--src/core/parser/stack/Callbacks.cpp10
-rw-r--r--src/core/parser/stack/Callbacks.hpp70
-rw-r--r--src/core/parser/stack/DocumentHandler.cpp108
-rw-r--r--src/core/parser/stack/DocumentHandler.hpp30
-rw-r--r--src/core/parser/stack/Handler.cpp133
-rw-r--r--src/core/parser/stack/Handler.hpp335
-rw-r--r--src/core/parser/stack/OntologyHandler.cpp37
-rw-r--r--src/core/parser/stack/OntologyHandler.hpp22
-rw-r--r--src/core/parser/stack/Stack.cpp897
-rw-r--r--src/core/parser/stack/Stack.hpp278
-rw-r--r--src/core/parser/stack/State.cpp15
-rw-r--r--src/core/parser/stack/State.hpp33
-rw-r--r--src/core/parser/stack/TokenRegistry.cpp80
-rw-r--r--src/core/parser/stack/TokenRegistry.hpp114
-rw-r--r--src/core/parser/stack/TokenStack.cpp45
-rw-r--r--src/core/parser/stack/TokenStack.hpp112
-rw-r--r--src/core/parser/stack/TypesystemHandler.cpp24
-rw-r--r--src/core/parser/stack/TypesystemHandler.hpp10
-rw-r--r--src/core/parser/utils/SourceOffsetVector.hpp89
-rw-r--r--src/core/parser/utils/TokenTrie.cpp16
-rw-r--r--src/core/parser/utils/TokenTrie.hpp11
-rw-r--r--src/core/parser/utils/TokenizedData.cpp361
-rw-r--r--src/core/parser/utils/TokenizedData.hpp246
-rw-r--r--src/core/parser/utils/Tokenizer.cpp276
-rw-r--r--src/core/parser/utils/Tokenizer.hpp142
-rw-r--r--src/formats/osml/OsmlParser.cpp30
-rw-r--r--src/formats/osml/OsmlStreamParser.cpp800
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp331
-rw-r--r--src/formats/osxml/OsxmlEventParser.cpp138
-rw-r--r--src/formats/osxml/OsxmlEventParser.hpp48
-rw-r--r--src/formats/osxml/OsxmlParser.cpp30
41 files changed, 3943 insertions, 2136 deletions
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
index d5d379c..f7dbdf3 100644
--- a/src/core/common/SourceContextReader.cpp
+++ b/src/core/common/SourceContextReader.cpp
@@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader,
ctx.relLen = end - start; // end >= start (I2)
// Remove linebreaks at the beginning and the end
- const std::pair<size_t, size_t> b =
- Utils::trim(lineBuf, Utils::isLinebreak);
+ const std::pair<size_t, size_t> b = Utils::trim(
+ lineBuf,
+ [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); });
ssize_t s = b.first, e = b.second;
s = std::min(s, static_cast<ssize_t>(ctx.relPos));
diff --git a/src/core/parser/utils/Token.cpp b/src/core/common/Token.cpp
index 8bcdbb5..17ce03e 100644
--- a/src/core/parser/utils/Token.cpp
+++ b/src/core/common/Token.cpp
@@ -19,6 +19,6 @@
#include "Token.hpp"
namespace ousia {
-// Stub to make sure Tokens.hpp is valid
+
}
diff --git a/src/core/parser/utils/Token.hpp b/src/core/common/Token.hpp
index f907450..4b56f1a 100644
--- a/src/core/parser/utils/Token.hpp
+++ b/src/core/common/Token.hpp
@@ -30,6 +30,7 @@
#include <cstdint>
#include <limits>
#include <string>
+#include <unordered_set>
#include <core/common/Location.hpp>
@@ -46,6 +47,11 @@ using TokenId = uint32_t;
using TokenLength = uint16_t;
/**
+ * Type used for storing token sets.
+ */
+using TokenSet = std::unordered_set<TokenId>;
+
+/**
* Namespace containing constants for TokenId instances with special meaning.
*/
namespace Tokens {
@@ -66,15 +72,29 @@ constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;
/**
* Token which represents a paragraph token -- issued if two consecutive
- * newlines occur with optionally any amout of whitespace between them.
+ * newlines occur with optionally any amount of whitespace between them. The
+ * paragraph token is not repeated until more text is reached.
*/
constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;
/**
+ * Token which represents a section token -- issued if three or more
+ * consecutive newlines occur with optionally any amount of whitespace between
+ * them. The section token is not repeated until more text is reached.
+ */
+constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
+
+/**
* Token which represents an indentation token -- issued if the indentation of
- * this line is larget than the indentation of the previous line.
+ * this line is larger than the indentation of the previous line.
+ */
+constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
+
+/**
+ * Token which represents a dedentation -- issued if the indentation of
+ * this line is smaller than the indentation of the previous line.
*/
-constexpr TokenId Indentation = std::numeric_limits<TokenId>::max() - 4;
+constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
/**
* Maximum token id to be used. Tokens allocated for users should not surpass
@@ -109,6 +129,16 @@ struct Token {
Token() : id(Tokens::Empty) {}
/**
+ * Constructor of a "data" token with no explicit content.
+ *
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ Token(const SourceLocation &location) : id(Tokens::Data), location(location)
+ {
+ }
+
+ /**
* Constructor of the Token struct.
*
* @param id represents the token id.
@@ -116,12 +146,26 @@ struct Token {
* @param location is the location of the extracted string content in the
* source file.
*/
- Token(TokenId id, const std::string &content, SourceLocation location)
+ Token(TokenId id, const std::string &content,
+ const SourceLocation &location)
: id(id), content(content), location(location)
{
}
/**
+ * Constructor of the a "data" Token with the given string data and
+ * location.
+ *
+ * @param content is the string content that should be stored in the token.
+ * @param location is the location of the content within the source file.
+ */
+ Token(const std::string &content,
+ const SourceLocation &location = SourceLocation{})
+ : id(Tokens::Data), content(content), location(location)
+ {
+ }
+
+ /**
* Constructor of the Token struct, only initializes the token id
*
* @param id is the id corresponding to the id of the token.
@@ -129,6 +173,14 @@ struct Token {
Token(TokenId id) : id(id) {}
/**
+ * Returns true if this token is special.
+ *
+ * @return true if the TokenId indicates that this token is a "special"
+ * token.
+ */
+ bool isSpecial() const { return id > Tokens::MaxTokenId; }
+
+ /**
* The getLocation function allows the tokens to be directly passed as
* parameter to Logger or LoggableException instances.
*
@@ -139,4 +191,3 @@ struct Token {
}
#endif /* _OUSIA_TOKENS_HPP_ */
-
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index a77951e..a87ff6d 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename)
return std::string{};
}
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::startsWith(const std::string &s, const std::string &prefix)
{
return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
@@ -124,5 +118,36 @@ bool Utils::endsWith(const std::string &s, const std::string &suffix)
return suffix.size() <= s.size() &&
s.substr(s.size() - suffix.size(), suffix.size()) == suffix;
}
-}
+bool Utils::isUserDefinedToken(const std::string &token)
+{
+ // Make sure the token is neither empty, nor starts or ends with an
+ // alphanumeric character
+ const size_t len = token.size();
+ if (len == 0 || isAlphanumeric(token[0]) ||
+ isAlphanumeric(token[len - 1])) {
+ return false;
+ }
+
+ // Make sure the token is not any special OSML token
+ if (token == "\\" || token == "%" || token == "%{" || token == "}%" ||
+ token == "{!" || token == "<\\" || token == "\\>") {
+ return false;
+ }
+
+ // Make sure the token does not contain any whitespaces.
+ for (char c : token) {
+ if (isWhitespace(c)) {
+ return false;
+ }
+ }
+
+ // Make sure the token contains characters other than { and }
+ for (char c : token) {
+ if (c != '{' && c != '}') {
+ return true;
+ }
+ }
+ return false;
+}
+} \ No newline at end of file
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 7d96562..d9e26da 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -103,6 +103,26 @@ public:
static bool isNamespacedIdentifier(const std::string &name);
/**
+ * Returns true if the given characters form a valid user-defined token.
+ * This function returns true under the following circumstances:
+ * <ul>
+ * <li>The given token is not empty</li>
+ * <li>The given token starts and ends with a non-alphanumeric character
+ * </li>
+ * <li>The token is none of the following character sequences (which are
+ * special in OSML):
+ * <ul>
+ * <li>'{', '}' or any combined repetition of these characters</li>
+ * <li>'\', '{!', '<\', '\>'</li>
+ * <li>'%', '%{', '}%'</li>
+ * </ul>
+ * </li>
+ * <li>The token does not contain any whitespaces.</li>
+ * </ul>
+ */
+ static bool isUserDefinedToken(const std::string &token);
+
+ /**
* Returns true if the given character is a linebreak character.
*/
static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
@@ -124,14 +144,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Removes whitespace at the beginning and the end of the given string.
- *
- * @param s is the string that should be trimmed.
- * @return a trimmed copy of s.
- */
- static std::string trim(const std::string &s);
-
- /**
* Trims the given string or vector of chars by returning the start and end
* index.
*
@@ -153,8 +165,8 @@ public:
*
* @param s is the container that should be trimmed.
* @param len is the number of elements in the container.
- * @param f is a function that returns true for values that should be
- * removed.
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
*/
@@ -163,7 +175,7 @@ public:
{
size_t start = 0;
for (size_t i = 0; i < len; i++) {
- if (!f(s[i])) {
+ if (!f(i)) {
start = i;
break;
}
@@ -171,7 +183,7 @@ public:
size_t end = 0;
for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
- if (!f(s[i])) {
+ if (!f(i)) {
end = i + 1;
break;
}
@@ -198,17 +210,33 @@ public:
* the collapsed version of the string ends.
* @return start and end index. Note that "end" points at the character
* beyond the end, thus "end" minus "start"
+ * @param f is a function that returns true for values at a certain index
+ * that should be removed.
*/
- template <class T>
- static std::string trim(const T &s, size_t len, size_t &start, size_t &end)
+ template <class T, class Filter>
+ static std::string trim(const T &s, size_t len, size_t &start, size_t &end,
+ Filter f)
{
- auto res = trim(s, len, isWhitespace);
+ auto res = trim(s, len, f);
start = res.first;
end = res.second;
return std::string(&s[start], end - start);
}
/**
+ * Removes whitespace at the beginning and the end of the given string.
+ *
+ * @param s is the string that should be trimmed.
+ * @return a trimmed copy of s.
+ */
+ static std::string trim(const std::string &s)
+ {
+ std::pair<size_t, size_t> bounds =
+ trim(s, [&s](size_t i) { return isWhitespace(s[i]); });
+ return s.substr(bounds.first, bounds.second - bounds.first);
+ }
+
+ /**
* Collapses the whitespaces in the given string (trims the string and
* replaces all whitespace characters by a single one).
*
@@ -219,7 +247,8 @@ public:
{
size_t start;
size_t end;
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -236,7 +265,8 @@ public:
static std::string collapse(const std::string &s, size_t &start,
size_t &end)
{
- return collapse(s, s.size(), start, end);
+ return collapse(s, s.size(), start, end,
+ [&s](size_t i) { return isWhitespace(s[i]); });
}
/**
@@ -244,6 +274,8 @@ public:
* replaces all whitespace characters by a single one).
*
* @tparam T is the string type that should be used.
+ * @tparam Filter is a filter function used for detecting the character
+ * indices that might be removed.
* @param s is the string in which the whitespace should be collapsed.
* @param len is the length of the input string
* @param start is an output parameter which is set to the offset at which
@@ -252,9 +284,9 @@ public:
* the collapsed version of the string ends.
* @return a copy of s with collapsed whitespace.
*/
- template <class T>
+ template <class T, class Filter>
static std::string collapse(const T &s, size_t len, size_t &start,
- size_t &end)
+ size_t &end, Filter f)
{
// Result vector
std::vector<char> res;
@@ -268,8 +300,7 @@ public:
bool hadWhitespace = false;
for (size_t i = 0; i < len; i++) {
const char c = s[i];
- const bool whitespace = isWhitespace(c);
- if (whitespace) {
+ if (f(i)) {
hadWhitespace = !res.empty();
} else {
// Adapt the start and end position
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
deleted file mode 100644
index ed52ea3..0000000
--- a/src/core/common/WhitespaceHandler.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file WhitespaceHandler.hpp
- *
- * Contains the WhitespaceHandler classes which are used in multiple places to
- * trim, compact or preserve whitespaces while at the same time maintaining the
- * position information associated with the input strings.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
-#define _OUSIA_WHITESPACE_HANDLER_HPP_
-
-#include <string>
-#include <vector>
-
-#include "Utils.hpp"
-
-namespace ousia {
-
-/**
- * WhitespaceHandler is a based class that can be used to collect text on a
- * character-by-character basis. Note that this class and its descendants are
- * hoped to be inlined by the compiler (and used in conjunction with templates),
- * thus they are fully defined inside this header.
- */
-class WhitespaceHandler {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- WhitespaceHandler() : textStart(0), textEnd(0) {}
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-
- /**
- * Returns the content of the WhitespaceHandler as string.
- */
- std::string toString() const
- {
- return std::string(textBuf.data(), textBuf.size());
- }
-};
-
-/**
- * The PreservingWhitespaceHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd);
- }
-
- /**
- * Static version of PreservingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
- }
-
- /**
- * Static version of TrimmingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param whitespaceBuf is a reference at the buffer for storing whitespace
- * characters.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, std::vector<char> &whitespaceBuf)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingWhitespaceHandler : public WhitespaceHandler {
-public:
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
- }
-
- /**
- * Static version of CollapsingWhitespaceHandler append
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- * @param textBuf is a reference at the text buffer that is to be used.
- * @param textStart is a reference at the text start variable that is to be
- * used.
- * @param textEnd is a reference at the text end variable that is to be
- * used.
- * @param hasWhitespace is a reference at the "hasWhitespace" flag.
- */
- static void append(char c, size_t start, size_t end,
- std::vector<char> &textBuf, size_t &textStart,
- size_t &textEnd, bool &hasWhitespace)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-
-/**
- * Function that can be used to append the given buffer (e.g. a string or a
- * vector) to the whitespace handler.
- *
- * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
- * @tparam Buffer is an iterable type.
- * @param handler is the handler to which the characters of the Buffer should be
- * appended.
- * @param buf is the buffer from which the characters should be read.
- * @param start is the start byte offset. Each character is counted as one byte.
- */
-template <typename WhitespaceHandler, typename Buffer>
-inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
- size_t start)
-{
- for (auto elem : buf) {
- handler.append(elem, start, start + 1);
- start++;
- }
-}
-}
-
-#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
-
diff --git a/src/core/model/Ontology.cpp b/src/core/model/Ontology.cpp
index 8829139..3af727d 100644
--- a/src/core/model/Ontology.cpp
+++ b/src/core/model/Ontology.cpp
@@ -20,8 +20,9 @@
#include <queue>
#include <set>
-#include <core/common/RttiBuilder.hpp>
#include <core/common/Exceptions.hpp>
+#include <core/common/RttiBuilder.hpp>
+#include <core/common/Utils.hpp>
#include "Ontology.hpp"
@@ -169,52 +170,60 @@ static NodeVector<Node> pathTo(const Node *start, Logger &logger,
return shortest;
}
+struct CollectState {
+ Node *n;
+ size_t depth;
+
+ CollectState(Node *n, size_t depth) : n(n), depth(depth) {}
+};
+
template <typename F>
static NodeVector<Node> collect(const Node *start, F match)
{
// result
NodeVector<Node> res;
// queue for breadth-first search of graph.
- std::queue<Rooted<Node>> q;
+ std::queue<CollectState> q;
// put the initial node on the stack.
- q.push(const_cast<Node *>(start));
+ q.push(CollectState(const_cast<Node *>(start), 0));
// set of visited nodes.
std::unordered_set<const Node *> visited;
while (!q.empty()) {
- Rooted<Node> n = q.front();
+ CollectState state = q.front();
q.pop();
// do not proceed if this node was already visited.
- if (!visited.insert(n.get()).second) {
+ if (!visited.insert(state.n).second) {
continue;
}
- if (n->isa(&RttiTypes::StructuredClass)) {
- Rooted<StructuredClass> strct = n.cast<StructuredClass>();
+ if (state.n->isa(&RttiTypes::Descriptor)) {
+ Rooted<Descriptor> strct{static_cast<Descriptor *>(state.n)};
// look through all fields.
NodeVector<FieldDescriptor> fields = strct->getFieldDescriptors();
for (auto fd : fields) {
// note matches.
- if (match(fd)) {
+ if (match(fd, state.depth)) {
res.push_back(fd);
}
// only continue in the TREE field.
if (fd->getFieldType() == FieldDescriptor::FieldType::TREE) {
- q.push(fd);
+ q.push(CollectState(fd.get(), state.depth));
}
}
} else {
// otherwise this is a FieldDescriptor.
- Rooted<FieldDescriptor> field = n.cast<FieldDescriptor>();
+ Rooted<FieldDescriptor> field{
+ static_cast<FieldDescriptor *>(state.n)};
// and we proceed by visiting all permitted children.
for (auto c : field->getChildrenWithSubclasses()) {
// note matches.
- if (match(c)) {
+ if (match(c, state.depth)) {
res.push_back(c);
}
// We only continue our search via transparent children.
if (c->isTransparent()) {
- q.push(c);
+ q.push(CollectState(c.get(), state.depth + 1));
}
}
}
@@ -222,28 +231,59 @@ static NodeVector<Node> collect(const Node *start, F match)
return res;
}
+static std::vector<SyntaxDescriptor> collectPermittedTokens(
+ const Node *start, Handle<Domain> domain)
+{
+ // gather SyntaxDescriptors for structure children first.
+ std::vector<SyntaxDescriptor> res;
+ collect(start, [&res](Handle<Node> n, size_t depth) {
+ SyntaxDescriptor stx;
+ if (n->isa(&RttiTypes::FieldDescriptor)) {
+ stx = n.cast<FieldDescriptor>()->getSyntaxDescriptor(depth);
+ } else {
+ stx = n.cast<Descriptor>()->getSyntaxDescriptor(depth);
+ }
+ // do not add trivial SyntaxDescriptors.
+ if (!stx.isEmpty()) {
+ res.push_back(stx);
+ }
+ return false;
+ });
+ // gather SyntaxDescriptors for AnnotationClasses.
+ for (auto a : domain->getAnnotationClasses()) {
+ SyntaxDescriptor stx = a->getSyntaxDescriptor();
+ if (!stx.isEmpty()) {
+ res.push_back(stx);
+ }
+ }
+ return res;
+}
+
/* Class FieldDescriptor */
FieldDescriptor::FieldDescriptor(Manager &mgr, Handle<Type> primitiveType,
Handle<Descriptor> parent, FieldType fieldType,
- std::string name, bool optional)
+ std::string name, bool optional,
+ WhitespaceMode whitespaceMode)
: Node(mgr, std::move(name), parent),
children(this),
fieldType(fieldType),
primitiveType(acquire(primitiveType)),
optional(optional),
- primitive(true)
+ primitive(true),
+ whitespaceMode(whitespaceMode)
{
}
FieldDescriptor::FieldDescriptor(Manager &mgr, Handle<Descriptor> parent,
FieldType fieldType, std::string name,
- bool optional)
+ bool optional, WhitespaceMode whitespaceMode)
: Node(mgr, std::move(name), parent),
children(this),
fieldType(fieldType),
optional(optional),
- primitive(false)
+ primitive(false),
+ whitespaceMode(whitespaceMode)
{
}
@@ -272,6 +312,25 @@ bool FieldDescriptor::doValidate(Logger &logger) const
} else {
valid = valid & validateName(logger);
}
+ // check start and end token.
+ if (!startToken.special && !startToken.token.empty() &&
+ !Utils::isUserDefinedToken(startToken.token)) {
+ // TODO: Correct error message.
+ logger.error(std::string("Field \"") + getName() +
+ "\" has an invalid custom start token: " +
+ startToken.token,
+ *this);
+ valid = false;
+ }
+ if (!endToken.special && !endToken.token.empty() &&
+ !Utils::isUserDefinedToken(endToken.token)) {
+ // TODO: Correct error message.
+ logger.error(std::string("Field \"") + getName() +
+ "\" has an invalid custom end token: " +
+ endToken.token,
+ *this);
+ valid = false;
+ }
// check consistency of FieldType with the rest of the FieldDescriptor.
if (primitive) {
@@ -325,7 +384,7 @@ bool FieldDescriptor::doValidate(Logger &logger) const
}
static void gatherSubclasses(
- std::unordered_set<const StructuredClass *>& visited,
+ std::unordered_set<const StructuredClass *> &visited,
NodeVector<StructuredClass> &res, Handle<StructuredClass> strct)
{
// this check is to prevent cycles.
@@ -334,7 +393,7 @@ static void gatherSubclasses(
}
for (auto sub : strct->getSubclasses()) {
// this check is to prevent cycles.
- if(visited.count(sub.get())){
+ if (visited.count(sub.get())) {
continue;
}
res.push_back(sub);
@@ -381,7 +440,7 @@ NodeVector<Node> FieldDescriptor::pathTo(Handle<FieldDescriptor> field,
NodeVector<FieldDescriptor> FieldDescriptor::getDefaultFields() const
{
// TODO: In principle a cast would be nicer here, but for now we copy.
- NodeVector<Node> nodes = collect(this, [](Handle<Node> n) {
+ NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) {
if (!n->isa(&RttiTypes::FieldDescriptor)) {
return false;
}
@@ -396,6 +455,16 @@ NodeVector<FieldDescriptor> FieldDescriptor::getDefaultFields() const
return res;
}
+std::vector<SyntaxDescriptor> FieldDescriptor::getPermittedTokens() const
+{
+ if (getParent() == nullptr ||
+ getParent().cast<Descriptor>()->getParent() == nullptr) {
+ return std::vector<SyntaxDescriptor>();
+ }
+ return collectPermittedTokens(
+ this, getParent().cast<Descriptor>()->getParent().cast<Domain>());
+}
+
/* Class Descriptor */
void Descriptor::doResolve(ResolutionState &state)
@@ -443,6 +512,25 @@ bool Descriptor::doValidate(Logger &logger) const
}
valid = valid & attributesDescriptor->validate(logger);
}
+
+ // check start and end token.
+ if (!startToken.special && !startToken.token.empty() &&
+ !Utils::isUserDefinedToken(startToken.token)) {
+ logger.error(std::string("Descriptor \"") + getName() +
+ "\" has an invalid custom start token: " +
+ startToken.token,
+ *this);
+ valid = false;
+ }
+ if (!endToken.special && !endToken.token.empty() &&
+ !Utils::isUserDefinedToken(endToken.token)) {
+ logger.error(std::string("Descriptor \"") + getName() +
+ "\" has an invalid custom end token: " +
+ endToken.token,
+ *this);
+ valid = false;
+ }
+
// check that only one FieldDescriptor is of type TREE.
auto fds = Descriptor::getFieldDescriptors();
bool hasTREE = false;
@@ -483,7 +571,7 @@ std::pair<NodeVector<Node>, bool> Descriptor::pathTo(
NodeVector<FieldDescriptor> Descriptor::getDefaultFields() const
{
// TODO: In principle a cast would be nicer here, but for now we copy.
- NodeVector<Node> nodes = collect(this, [](Handle<Node> n) {
+ NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) {
if (!n->isa(&RttiTypes::FieldDescriptor)) {
return false;
}
@@ -501,7 +589,7 @@ NodeVector<FieldDescriptor> Descriptor::getDefaultFields() const
NodeVector<StructuredClass> Descriptor::getPermittedChildren() const
{
// TODO: In principle a cast would be nicer here, but for now we copy.
- NodeVector<Node> nodes = collect(this, [](Handle<Node> n) {
+ NodeVector<Node> nodes = collect(this, [](Handle<Node> n, size_t depth) {
return n->isa(&RttiTypes::StructuredClass);
});
NodeVector<StructuredClass> res;
@@ -669,6 +757,14 @@ std::pair<Rooted<FieldDescriptor>, bool> Descriptor::createFieldDescriptor(
return std::make_pair(fd, sorted);
}
+std::vector<SyntaxDescriptor> Descriptor::getPermittedTokens() const
+{
+ if (getParent() == nullptr) {
+ return std::vector<SyntaxDescriptor>();
+ }
+ return collectPermittedTokens(this, getParent().cast<Domain>());
+}
+
/* Class StructuredClass */
StructuredClass::StructuredClass(Manager &mgr, std::string name,
@@ -709,6 +805,16 @@ bool StructuredClass::doValidate(Logger &logger) const
logger.error(cardinality.toString() + " is not a cardinality!", *this);
valid = false;
}
+
+ // check short token.
+ if (!shortToken.special && !shortToken.token.empty() &&
+ !Utils::isUserDefinedToken(shortToken.token)) {
+ logger.error(std::string("Descriptor \"") + getName() +
+ "\" has an invalid custom short form token: " +
+ shortToken.token,
+ *this);
+ valid = false;
+ }
// check the validity of this superclass.
if (superclass != nullptr) {
valid = valid & superclass->validate(logger);
@@ -961,6 +1067,51 @@ Rooted<AnnotationClass> Ontology::createAnnotationClass(std::string name)
new AnnotationClass(getManager(), std::move(name), this)};
}
+static void gatherTokenDescriptors(
+ Handle<Descriptor> desc, std::vector<TokenDescriptor *> &res,
+ std::unordered_set<FieldDescriptor *> &visited)
+{
+ // add the TokenDescriptors for the Descriptor itself.
+ if (!desc->getStartToken().isEmpty()) {
+ res.push_back(desc->getStartTokenPointer());
+ }
+ if (!desc->getEndToken().isEmpty()) {
+ res.push_back(desc->getEndTokenPointer());
+ }
+ // add the TokenDescriptors for its FieldDescriptors.
+ for (auto fd : desc->getFieldDescriptors()) {
+ if (!visited.insert(fd.get()).second) {
+ continue;
+ }
+ if (!fd->getStartToken().isEmpty()) {
+ res.push_back(fd->getStartTokenPointer());
+ }
+ if (!fd->getEndToken().isEmpty()) {
+ res.push_back(fd->getEndTokenPointer());
+ }
+ }
+}
+
+std::vector<TokenDescriptor *> Domain::getAllTokenDescriptors() const
+{
+ std::vector<TokenDescriptor *> res;
+ // note all fields that are already visited because FieldReferences might
+ // lead to doubled fields.
+ std::unordered_set<FieldDescriptor *> visited;
+ // add the TokenDescriptors for the StructuredClasses (and their fields).
+ for (auto s : structuredClasses) {
+ if (!s->getShortToken().isEmpty()) {
+ res.push_back(s->getShortTokenPointer());
+ }
+ gatherTokenDescriptors(s, res, visited);
+ }
+ // add the TokenDescriptors for the AnnotationClasses (and their fields).
+ for (auto a : annotationClasses) {
+ gatherTokenDescriptors(a, res, visited);
+ }
+ return res;
+}
+
/* Type registrations */
namespace RttiTypes {
diff --git a/src/core/model/Ontology.hpp b/src/core/model/Ontology.hpp
index e1fbe96..d682bdf 100644
--- a/src/core/model/Ontology.hpp
+++ b/src/core/model/Ontology.hpp
@@ -168,11 +168,13 @@
#ifndef _OUSIA_MODEL_DOMAIN_HPP_
#define _OUSIA_MODEL_DOMAIN_HPP_
+#include <core/common/Whitespace.hpp>
#include <core/managed/ManagedContainer.hpp>
#include <core/RangeSet.hpp>
#include "Node.hpp"
#include "RootNode.hpp"
+#include "Syntax.hpp"
#include "Typesystem.hpp"
namespace ousia {
@@ -226,6 +228,9 @@ private:
Owned<Type> primitiveType;
bool optional;
bool primitive;
+ TokenDescriptor startToken;
+ TokenDescriptor endToken;
+ WhitespaceMode whitespaceMode;
protected:
bool doValidate(Logger &logger) const override;
@@ -234,39 +239,46 @@ public:
/**
* This is the constructor for primitive fields.
*
- * @param mgr is the global Manager instance.
- * @param parent is a handle of the Descriptor node that has this
- * FieldDescriptor.
- * @param primitiveType is a handle to some Type in some Typesystem of which
- * one instance is allowed to fill this field.
- * @param name is the name of this field.
- * @param optional should be set to 'false' is this field needs to be
- * filled in order for an instance of the parent
- * Descriptor to be valid.
+ * @param mgr is the global Manager instance.
+ * @param parent is a handle of the Descriptor node that has this
+ * FieldDescriptor.
+ * @param primitiveType is a handle to some Type in some Typesystem of
+ *which
+ * one instance is allowed to fill this field.
+ * @param name is the name of this field.
+ * @param optional should be set to 'false' if this field needs to be
+ * filled in order for an instance of the parent
+ * Descriptor to be valid.
+ * @param whitespaceMode the WhitespaceMode to be used when an instance of
+ * this FieldDescriptor is parsed.
*/
FieldDescriptor(Manager &mgr, Handle<Type> primitiveType,
Handle<Descriptor> parent,
FieldType fieldType = FieldType::TREE,
- std::string name = "", bool optional = false);
+ std::string name = "", bool optional = false,
+ WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* This is the constructor for non-primitive fields. You have to provide
* children here later on.
*
- * @param mgr is the global Manager instance.
- * @param parent is a handle of the Descriptor node that has this
- * FieldDescriptor.
- * @param fieldType is the FieldType of this FieldDescriptor, either
- * TREE for the main or default structure or SUBTREE
- * for supporting structures.
- * @param name is the name of this field.
- * @param optional should be set to 'false' is this field needs to be
- * filled in order for an instance of the parent
- * Descriptor to be valid.
+ * @param mgr is the global Manager instance.
+ * @param parent is a handle of the Descriptor node that has this
+ * FieldDescriptor.
+ * @param fieldType is the FieldType of this FieldDescriptor, either
+ * TREE for the main or default structure or SUBTREE
+ * for supporting structures.
+ * @param name is the name of this field.
+ * @param optional should be set to 'false' if this field needs to be
+ * filled in order for an instance of the parent
+ * Descriptor to be valid.
+ * @param whitespaceMode the WhitespaceMode to be used when an instance of
+ * this FieldDescriptor is parsed.
*/
FieldDescriptor(Manager &mgr, Handle<Descriptor> parent = nullptr,
FieldType fieldType = FieldType::TREE,
- std::string name = "", bool optional = false);
+ std::string name = "", bool optional = false,
+ WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* Returns a const reference to the NodeVector of StructuredClasses whose
@@ -455,6 +467,109 @@ public:
return std::move(name);
}
}
+
+ /**
+ * Returns a pointer to the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor starts.
+ *
+ * Note that this does not invalidate the FieldDescriptor. So use with
+ * care.
+ *
+ * @return a pointer to the start TokenDescriptor.
+ */
+ TokenDescriptor *getStartTokenPointer() { return &startToken; }
+
+ /**
+ * Returns a copy of the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor starts.
+ *
+ * @return a copy of the start TokenDescriptor.
+ */
+ TokenDescriptor getStartToken() const { return startToken; }
+
+ /**
+ * Sets the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor starts.
+ *
+ * @param st the new start TokenDescriptor.
+ */
+ void setStartToken(TokenDescriptor st)
+ {
+ invalidate();
+ startToken = st;
+ }
+
+ /**
+ * Returns a pointer to the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor ends.
+ *
+ * @return a pointer to the end TokenDescriptor.
+ */
+ TokenDescriptor *getEndTokenPointer() { return &endToken; }
+
+ /**
+ * Returns a copy of the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor ends.
+ *
+ * @return a copy of the end TokenDescriptor.
+ */
+ TokenDescriptor getEndToken() const { return endToken; }
+
+ /**
+ * Sets the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this FieldDescriptor ends.
+ *
+ * @param e the new end TokenDescriptor.
+ */
+ void setEndToken(TokenDescriptor e)
+ {
+ invalidate();
+ endToken = e;
+ }
+
+ /**
+ * Returns the WhitespaceMode to be used when an instance of this
+ * FieldDescriptor is parsed.
+ *
+ * @return the WhitespaceMode to be used when an instance of this
+ * FieldDescriptor is parsed.
+ */
+ WhitespaceMode getWhitespaceMode() const { return whitespaceMode; }
+
+ /**
+ * Sets the WhitespaceMode to be used when an instance of this
+ * FieldDescriptor is parsed.
+ *
+ * @param wm the WhitespaceMode to be used when an instance of this
+ * FieldDescriptor is parsed.
+ */
+ WhitespaceMode setWhitespaceMode(WhitespaceMode wm)
+ {
+ return whitespaceMode = wm;
+ }
+
+ /**
+ * Returns the SyntaxDescriptor for this FieldDescriptor.
+ *
+ * @return the SyntaxDescriptor for this FieldDescriptor.
+ */
+ SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1)
+ {
+ SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty,
+ const_cast<FieldDescriptor *>(this), depth};
+ return stx;
+ }
+
+ /**
+ * Returns a vector of SyntaxDescriptors, one for each Descriptor
+ * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is
+ * permitted as child of this FieldDescriptor. This also makes use
+ * of transparency.
+ *
+ * @return a vector of SyntaxDescriptors, one for each Descriptor that is
+ * permitted as child of this FieldDescriptor
+ */
+ std::vector<SyntaxDescriptor> getPermittedTokens() const;
};
/**
@@ -478,7 +593,10 @@ public:
* </A>
* \endcode
*
- * key="value" inside the A-node would be an attribute, while <key>value</key>
+ * key="value" inside the A-node would be an attribute, while
+ * \code{.xml}
+ * <key>value</key>
+ * \endcode
* would be a primitive field. While equivalent in XML the semantics are
* different: An attribute describes indeed attributes, features of one single
* node whereas a primitive field describes the _content_ of a node.
@@ -490,6 +608,8 @@ class Descriptor : public Node {
private:
Owned<StructType> attributesDescriptor;
NodeVector<FieldDescriptor> fieldDescriptors;
+ TokenDescriptor startToken;
+ TokenDescriptor endToken;
bool addAndSortFieldDescriptor(Handle<FieldDescriptor> fd, Logger &logger);
@@ -738,6 +858,85 @@ public:
* of an instance of this Descriptor in the structure tree.
*/
NodeVector<StructuredClass> getPermittedChildren() const;
+
+ /**
+ * Returns a pointer to the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor starts.
+ *
+ * @return a pointer to the start TokenDescriptor.
+ */
+ TokenDescriptor *getStartTokenPointer() { return &startToken; }
+
+ /**
+ * Returns a copy of the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor starts.
+ *
+ * @return a copy of the start TokenDescriptor.
+ */
+ TokenDescriptor getStartToken() const { return startToken; }
+
+ /**
+ * Sets the start TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor starts.
+ *
+ * @param st the new start TokenDescriptor.
+ */
+ void setStartToken(TokenDescriptor st)
+ {
+ invalidate();
+ startToken = st;
+ }
+
+ /**
+ * Returns a pointer to the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor ends.
+ *
+ * @return a pointer to the end TokenDescriptor.
+ */
+ TokenDescriptor *getEndTokenPointer() { return &endToken; }
+
+ /**
+ * Returns a copy of the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor ends.
+ *
+ * @return a copy of the end TokenDescriptor.
+ */
+ TokenDescriptor getEndToken() const { return endToken; }
+
+ /**
+ * Sets the end TokenDescriptor. This Token is used as a
+ * signifier during parsing that an instance of this Descriptor ends.
+ *
+ * @param e the new end TokenDescriptor.
+ */
+ void setEndToken(TokenDescriptor e)
+ {
+ invalidate();
+ endToken = e;
+ }
+
+ /**
+ * Returns the SyntaxDescriptor for this Descriptor.
+ *
+ * @return the SyntaxDescriptor for this Descriptor.
+ */
+ virtual SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1)
+ {
+ SyntaxDescriptor stx{startToken.id, endToken.id, Tokens::Empty,
+ const_cast<Descriptor *>(this), depth};
+ return stx;
+ }
+
+ /**
+ * Returns a vector of SyntaxDescriptors, one for each Descriptor
+ * (StructuredClasses, AnnotationClasses or FieldDescriptors) that is
+ * permitted as child of this Descriptor. This also makes use
+ * of transparency.
+ *
+ * @return a vector of SyntaxDescriptors, one for each Descriptor that is
+ * permitted as child of this Descriptor.
+ */
+ std::vector<SyntaxDescriptor> getPermittedTokens() const;
};
/*
* TODO: We should discuss Cardinalities one more time. Is it smart to define
@@ -824,6 +1023,7 @@ private:
NodeVector<StructuredClass> subclasses;
bool transparent;
bool root;
+ TokenDescriptor shortToken;
/**
* Helper method for getFieldDescriptors.
@@ -981,6 +1181,50 @@ public:
invalidate();
root = std::move(r);
}
+
+ /**
+ * Returns a pointer to the short TokenDescriptor. During parsing an
+ * occurrence of this token will be translated to an empty instance of this
+ * StructuredClass.
+ *
+ * @return a pointer to the short TokenDescriptor.
+ */
+ TokenDescriptor *getShortTokenPointer() { return &shortToken; }
+
+ /**
+ * Returns a copy of the short TokenDescriptor. During parsing an
+ * occurrence of this token will be translated to an empty instance of this
+ * StructuredClass.
+ *
+ * @return a copy of the short TokenDescriptor.
+ */
+ TokenDescriptor getShortToken() const { return shortToken; }
+
+ /**
+ * Sets the short TokenDescriptor. During parsing an
+ * occurrence of this token will be translated to an empty instance of this
+ * StructuredClass.
+ *
+ * @param s the new short TokenDescriptor.
+ */
+ void setShortToken(TokenDescriptor s)
+ {
+ invalidate();
+ shortToken = s;
+ }
+
+ /**
+ * Returns the SyntaxDescriptor for this StructuredClass.
+ *
+ * @return the SyntaxDescriptor for this StructuredClass.
+ */
+ SyntaxDescriptor getSyntaxDescriptor(ssize_t depth = -1) override
+ {
+ SyntaxDescriptor stx{getStartToken().id, getEndToken().id,
+ shortToken.id, const_cast<StructuredClass *>(this),
+ depth};
+ return stx;
+ }
};
/**
@@ -1207,6 +1451,13 @@ public:
{
ontologies.insert(ontologies.end(), ds.begin(), ds.end());
}
+
+ /**
+ * Returns all TokenDescriptors of classes and fields in this Ontology.
+ *
+ * @return all TokenDescriptors of classes and fields in this Ontology.
+ */
+ std::vector<TokenDescriptor *> getAllTokenDescriptors() const;
};
namespace RttiTypes {
@@ -1219,4 +1470,4 @@ extern const Rtti Ontology;
}
}
-#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */ \ No newline at end of file
+#endif /* _OUSIA_MODEL_DOMAIN_HPP_ */
diff --git a/src/core/model/Syntax.cpp b/src/core/model/Syntax.cpp
new file mode 100644
index 0000000..9dbaccc
--- /dev/null
+++ b/src/core/model/Syntax.cpp
@@ -0,0 +1,58 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Syntax.hpp"
+
+#include "Domain.hpp"
+
+namespace ousia {
+
+/* Class SyntaxDescriptor */
+
+bool SyntaxDescriptor::isAnnotation() const
+{
+ return descriptor->isa(&RttiTypes::AnnotationClass);
+}
+bool SyntaxDescriptor::isFieldDescriptor() const
+{
+ return descriptor->isa(&RttiTypes::FieldDescriptor);
+}
+bool SyntaxDescriptor::isStruct() const
+{
+ return descriptor->isa(&RttiTypes::StructuredClass);
+}
+
+void SyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const
+{
+ if (start != Tokens::Empty) {
+ set.insert(start);
+ }
+ if (end != Tokens::Empty) {
+ set.insert(end);
+ }
+ if (shortForm != Tokens::Empty) {
+ set.insert(shortForm);
+ }
+}
+
+bool SyntaxDescriptor::isEmpty() const
+{
+ return start == Tokens::Empty && end == Tokens::Empty &&
+ shortForm == Tokens::Empty;
+}
+} \ No newline at end of file
diff --git a/src/core/model/Syntax.hpp b/src/core/model/Syntax.hpp
new file mode 100644
index 0000000..4da3408
--- /dev/null
+++ b/src/core/model/Syntax.hpp
@@ -0,0 +1,196 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Syntax.hpp
+ *
+ * This header contains the Descriptor classes for user definable syntax for
+ * Document entities or fields. These classes are referenced in Ontology.hpp.
+ */
+
+#ifndef _OUSIA_MODEL_SYNTAX_HPP_
+#define _OUSIA_MODEL_SYNTAX_HPP_
+
+#include <core/common/Token.hpp>
+#include "Node.hpp"
+
+namespace ousia {
+
+/**
+ * Class to describe a single token that shall be used as user-defined syntax.
+ */
+struct TokenDescriptor {
+ /**
+ * The string content of this token, if it is not a special one.
+ */
+ std::string token;
+ /**
+ * A flag to be set true if this TokenDescriptor uses a special token.
+ */
+ bool special;
+ /**
+ * An id to uniquely identify this token.
+ */
+ TokenId id;
+
+ /**
+ * Constructor for non-special tokens. The special flag is set to false and
+ * the id to Tokens::Empty.
+ *
+ * @param token The string content of this token, if it is not a special
+ * one.
+ */
+ TokenDescriptor(std::string token = std::string())
+ : token(std::move(token)), special(false), id(Tokens::Empty)
+ {
+ }
+
+ /**
+ * Constructor for special tokens. The token is set to an empty string and
+ * the special flag to true.
+ *
+ * @param id the id of the special token.
+ */
+ TokenDescriptor(TokenId id) : special(true), id(id) {}
+
+ /**
+ * Returns true if and only if neither a string nor an ID is given.
+ *
+ * @return true if and only if neither a string nor an ID is given.
+ */
+ bool isEmpty() const { return token.empty() && id == Tokens::Empty; }
+};
+
+/**
+ * Class describing the user defined syntax for a StructuredClass,
+ * AnnotationClass or FieldDescriptor.
+ *
+ * This class is used during parsing of a Document. It is used to describe
+ * the tokens relevant for one Descriptor that could be created at this point
+ * during parsing.
+ */
+struct SyntaxDescriptor {
+ /**
+ * Possible start token or Tokens::Empty if no token is set.
+ */
+ TokenId start;
+
+ /**
+ * Possible end token or Tokens::Empty if no token is set.
+ */
+ TokenId end;
+
+ /**
+ * Possible representation token or Tokens::Empty if no token is set.
+ */
+ TokenId shortForm;
+
+ /*
+ * The Descriptor this SyntaxDescriptor belongs to. As this may be
+ * a FieldDescriptor as well as a class Descriptor (StructuredClass or
+ * AnnotationClass) we can only use the class Node as inner argument here.
+ */
+ Rooted<Node> descriptor;
+ /*
+ * Given the current leaf in the parsed document the depth of a
+ * SyntaxDescriptor is defined as the number of transparent elements that
+ * would be needed to construct an instance of the referenced descriptor.
+ */
+ ssize_t depth;
+
+ /**
+ * Default constructor, sets all token ids to Tokens::Empty and the
+ * descriptor handle to nullptr.
+ */
+ SyntaxDescriptor()
+ : start(Tokens::Empty),
+ end(Tokens::Empty),
+ shortForm(Tokens::Empty),
+ descriptor(nullptr),
+ depth(-1)
+ {
+ }
+
+ /**
+ * Member initializer constructor.
+ *
+ * @param start is a possible start token.
+ * @param end is a possible end token.
+ * @param shortForm is a possible short form token.
+ * @param descriptor The Descriptor this SyntaxDescriptor belongs to.
+ * @param depth Given the current leaf in the parsed document the depth of a
+ * SyntaxDescriptor is defined as the number of transparent elements that
+ * would be needed to construct an instance of the referenced descriptor.
+ */
+ SyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm,
+ Handle<Node> descriptor, ssize_t depth)
+ : start(start),
+ end(end),
+ shortForm(shortForm),
+ descriptor(descriptor),
+ depth(depth)
+ {
+ }
+
+ /**
+ * Inserts all tokens referenced in this SyntaxDescriptor into the
+ * given TokenSet. Skips token ids set to Tokens::Empty.
+ *
+ * @param set is the TokenSet instance into which the Tokens should be
+ * inserted.
+ */
+ void insertIntoTokenSet(TokenSet &set) const;
+
+ /**
+ * Returns true if and only if this SyntaxDescriptor belongs to an
+ * AnnotationClass.
+ *
+ * @return true if and only if this SyntaxDescriptor belongs to an
+ * AnnotationClass.
+ */
+ bool isAnnotation() const;
+
+ /**
+ * Returns true if and only if this SyntaxDescriptor belongs to a
+ * StructuredClass.
+ *
+ * @return true if and only if this SyntaxDescriptor belongs to a
+ * StructuredClass.
+ */
+ bool isStruct() const;
+
+ /**
+ * Returns true if and only if this SyntaxDescriptor belongs to a
+ * FieldDescriptor.
+ *
+ * @return true if and only if this SyntaxDescriptor belongs to a
+ * FieldDescriptor.
+ */
+ bool isFieldDescriptor() const;
+
+ /**
+ * Returns true if and only if this SyntaxDescriptor has only empty
+ * entries in start, end and shortForm.
+ *
+ * @return true if and only if this SyntaxDescriptor has only empty
+ * entries in start, end and shortForm.
+ */
+ bool isEmpty() const;
+};
+}
+#endif \ No newline at end of file
diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp
index 6ebc549..44b31c6 100644
--- a/src/core/parser/stack/Callbacks.cpp
+++ b/src/core/parser/stack/Callbacks.cpp
@@ -19,5 +19,15 @@
#include "Callbacks.hpp"
namespace ousia {
+namespace parser_stack {
+
+/* Class ParserCallbacks */
+
+ParserCallbacks::~ParserCallbacks()
+{
+ // Do nothing here
+}
+
+}
}
diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp
index 9c61000..dfe41fc 100644
--- a/src/core/parser/stack/Callbacks.hpp
+++ b/src/core/parser/stack/Callbacks.hpp
@@ -30,68 +30,80 @@
#define _OUSIA_PARSER_STACK_CALLBACKS_HPP_
#include <string>
+#include <vector>
#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
+#include <core/model/Syntax.hpp>
namespace ousia {
+
+// Forward declarations
+class Variant;
+
namespace parser_stack {
/**
- * Interface defining a set of callback functions that act as a basis for the
- * StateStackCallbacks and the ParserCallbacks.
+ * Interface between the Stack class and the underlying parser used for
+ * registering and unregistering tokens.
*/
-class Callbacks {
+class ParserCallbacks {
public:
/**
* Virtual descructor.
*/
- virtual ~Callbacks() {};
-
- /**
- * Sets the whitespace mode that specifies how string data should be
- * processed.
- *
- * @param whitespaceMode specifies one of the three WhitespaceMode constants
- * PRESERVE, TRIM or COLLAPSE.
- */
- virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0;
+ virtual ~ParserCallbacks();
/**
* Registers the given token as token that should be reported to the handler
* using the "token" function.
*
* @param token is the token string that should be reported.
+ * @return the token id with which the token will be reported. Should return
+ * Tokens::Empty if the given token could not be registered.
*/
- virtual void registerToken(const std::string &token) = 0;
+ virtual TokenId registerToken(const std::string &token) = 0;
/**
* Unregisters the given token, it will no longer be reported to the handler
* using the "token" function.
*
- * @param token is the token string that should be unregistered.
+ * @param id is the token id of the token that should be unregistered.
*/
- virtual void unregisterToken(const std::string &token) = 0;
+ virtual void unregisterToken(TokenId id) = 0;
};
/**
- * Interface defining the callback functions that can be passed from a
- * StateStack to the underlying parser.
+ * Interface defining a set of callback functions that act as a basis for the
+ * StateStackCallbacks and the ParserCallbacks.
*/
-class ParserCallbacks : public Callbacks {
+class HandlerCallbacks : public ParserCallbacks {
+public:
/**
- * Checks whether the given token is supported by the parser. The parser
- * returns true, if the token is supported, false if this token cannot be
- * registered. Note that parsers that do not support the registration of
- * tokens at all should always return "true".
+ * Pushes a list of SyntaxDescriptor instances onto the internal stack.
+ * The tokens described in the token list are the tokens that are currently
+ * enabled.
*
- * @param token is the token that should be checked for support.
- * @return true if the token is generally supported (or the parser does not
- * support registering tokens at all), false if the token is not supported,
- * because e.g. it is a reserved token or it interferes with other tokens.
+ * @param tokens is a list of SyntaxDescriptor instances that should be
+ * stored on the stack.
*/
- virtual bool supportsToken(const std::string &token) = 0;
-};
+ virtual void pushTokens(const std::vector<SyntaxDescriptor> &tokens) = 0;
+
+ /**
+ * Removes the previously pushed list of tokens from the stack.
+ */
+ virtual void popTokens() = 0;
+ /**
+ * Reads a string variant from the current input stream. This function must
+ * be called from the data() method.
+ *
+ * @return a string variant containing the current text data. The return
+ * value depends on the currently set whitespace mode and the tokens that
+ * were enabled using the enableTokens callback method.
+ */
+ virtual Variant readData() = 0;
+};
}
}
diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp
index a307f71..26b9b6e 100644
--- a/src/core/parser/stack/DocumentHandler.cpp
+++ b/src/core/parser/stack/DocumentHandler.cpp
@@ -25,6 +25,7 @@
#include <core/model/Ontology.hpp>
#include <core/model/Project.hpp>
#include <core/model/Typesystem.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
@@ -36,7 +37,7 @@ namespace parser_stack {
/* DocumentHandler */
-bool DocumentHandler::start(Variant::mapType &args)
+bool DocumentHandler::startCommand(Variant::mapType &args)
{
Rooted<Document> document =
context().getProject()->createDocument(args["name"].asString());
@@ -51,6 +52,11 @@ void DocumentHandler::end() { scope().pop(logger()); }
/* DocumentChildHandler */
+DocumentChildHandler::DocumentChildHandler(const HandlerData &handlerData)
+ : Handler(handlerData), isExplicitField(false)
+{
+}
+
void DocumentChildHandler::preamble(Rooted<Node> &parentNode, size_t &fieldIdx,
DocumentEntity *&parent)
{
@@ -121,10 +127,10 @@ void DocumentChildHandler::createPath(const size_t &firstFieldIdx,
scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, false);
}
-bool DocumentChildHandler::start(Variant::mapType &args)
+bool DocumentChildHandler::startCommand(Variant::mapType &args)
{
- // extract the special "name" attribute from the input arguments.
- // the remaining attributes will be forwarded to the newly constructed
+ // Extract the special "name" attribute from the input arguments.
+ // The remaining attributes will be forwarded to the newly constructed
// element.
std::string nameAttr;
{
@@ -168,13 +174,6 @@ bool DocumentChildHandler::start(Variant::mapType &args)
preamble(parentNode, fieldIdx, parent);
- // TODO: REMOVE
- std::string thisName = name();
- std::string parentClassName;
- if (parent != nullptr) {
- parentClassName = parent->getDescriptor()->getName();
- }
-
/*
* Try to find a FieldDescriptor for the given tag if we are not in
* a field already. This does _not_ try to construct transparent
@@ -191,9 +190,9 @@ bool DocumentChildHandler::start(Variant::mapType &args)
"Data or structure commands have already been "
"given, command \"") +
name() + std::string(
- "\" is not interpreted explicit "
- "field. Move explicit field "
- "references to the beginning."),
+ "\" is not interpreted explicit "
+ "field. Move explicit field "
+ "references to the beginning."),
location());
} else {
Rooted<DocumentField> field{new DocumentField(
@@ -260,15 +259,34 @@ bool DocumentChildHandler::start(Variant::mapType &args)
}
}
+bool DocumentChildHandler::startAnnotation(Variant::mapType &args,
+ AnnotationType annotationType)
+{
+ // TODO: Handle annotation
+ return false;
+}
+
+bool DocumentChildHandler::startToken(Handle<Node> node)
+{
+ // TODO: Handle token start
+ return false;
+}
+
+DocumentChildHandler::EndTokenResult DocumentChildHandler::endToken(
+ const Token &token, Handle<Node> node)
+{
+ // TODO: Handle token end
+ return EndTokenResult::ENDED_NONE;
+}
+
void DocumentChildHandler::end()
{
- // in case of explicit fields we do not want to pop something from the
+ // In case of explicit fields we do not want to pop something from the
// stack.
- if (isExplicitField) {
- return;
+ if (!isExplicitField) {
+ // pop the "main" element.
+ scope().pop(logger());
}
- // pop the "main" element.
- scope().pop(logger());
}
bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx)
@@ -278,6 +296,7 @@ bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx)
isDefault = true;
return fieldIdx == 0;
}
+
Rooted<Node> parentNode = scope().getLeaf();
assert(parentNode->isa(&RttiTypes::StructuredEntity) ||
parentNode->isa(&RttiTypes::AnnotationEntity));
@@ -290,7 +309,7 @@ bool DocumentChildHandler::fieldStart(bool &isDefault, size_t fieldIdx)
parent->getDescriptor()->getFieldDescriptors();
if (isDefault) {
- if(fields.empty()){
+ if (fields.empty()) {
return false;
}
fieldIdx = fields.size() - 1;
@@ -316,33 +335,19 @@ void DocumentChildHandler::fieldEnd()
{
assert(scope().getLeaf()->isa(&RttiTypes::DocumentField));
- // pop the field from the stack.
+ // Pop the field from the stack.
scope().pop(logger());
- // pop all remaining transparent elements.
+ // Pop all remaining transparent elements.
while (scope().getLeaf()->isa(&RttiTypes::StructuredEntity) &&
scope().getLeaf().cast<StructuredEntity>()->isTransparent()) {
- // pop the transparent element.
+ // Pop the transparent element.
scope().pop(logger());
- // pop the transparent field.
+ // Pop the transparent field.
scope().pop(logger());
}
}
-bool DocumentChildHandler::annotationStart(const Variant &className,
- Variant::mapType &args)
-{
- // TODO: Implement
- return false;
-}
-
-bool DocumentChildHandler::annotationEnd(const Variant &className,
- const Variant &elementName)
-{
- // TODO: Implement
- return false;
-}
-
bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field,
Variant &data, Logger &logger)
{
@@ -370,7 +375,7 @@ bool DocumentChildHandler::convertData(Handle<FieldDescriptor> field,
return valid && scope().resolveValue(data, type, logger);
}
-bool DocumentChildHandler::data(Variant &data)
+bool DocumentChildHandler::data()
{
// We're past the region in which explicit fields can be defined in the
// parent structure element
@@ -391,11 +396,12 @@ bool DocumentChildHandler::data(Variant &data)
// If it is a primitive field directly, try to parse the content.
if (field->isPrimitive()) {
// Add it as primitive content.
- if (!convertData(field, data, logger())) {
+ Variant text = readData();
+ if (!convertData(field, text, logger())) {
return false;
}
- parent->createChildDocumentPrimitive(data, fieldIdx);
+ parent->createChildDocumentPrimitive(text, fieldIdx);
return true;
}
@@ -409,7 +415,11 @@ bool DocumentChildHandler::data(Variant &data)
for (auto primitiveField : defaultFields) {
// Then try to parse the content using the type specification.
forks.emplace_back(logger().fork());
- if (!convertData(primitiveField, data, forks.back())) {
+
+ // TODO: Actually the data has to be read after the path has been
+ // created (as createPath may push more tokens onto the stack)
+ Variant text = readData();
+ if (!convertData(primitiveField, text, forks.back())) {
continue;
}
@@ -418,24 +428,24 @@ bool DocumentChildHandler::data(Variant &data)
// Construct the necessary path
NodeVector<Node> path = field->pathTo(primitiveField, logger());
- // TODO: Create methods with indices instead of names.
createPath(fieldIdx, path, parent);
// Then create the primitive element
- parent->createChildDocumentPrimitive(data);
+ parent->createChildDocumentPrimitive(text);
return true;
}
// No field was found that might take the data -- dump the error messages
// from the loggers -- or, if there were no primitive fields, clearly state
// this fact
+ Variant text = readData();
if (defaultFields.empty()) {
logger().error("Got data, but structure \"" + name() +
"\" does not have any primitive field",
- data);
+ text);
} else {
logger().error("Could not read data with any of the possible fields:",
- data);
+ text);
size_t f = 0;
for (auto field : defaultFields) {
logger().note(std::string("Field ") +
@@ -461,7 +471,9 @@ const State DocumentChild = StateBuilder()
.createdNodeTypes({&RttiTypes::StructureNode,
&RttiTypes::AnnotationEntity,
&RttiTypes::DocumentField})
- .elementHandler(DocumentChildHandler::create);
+ .elementHandler(DocumentChildHandler::create)
+ .supportsAnnotations(true)
+ .supportsTokens(true);
}
}
@@ -469,4 +481,4 @@ namespace RttiTypes {
const Rtti DocumentField = RttiBuilder<ousia::parser_stack::DocumentField>(
"DocumentField").parent(&Node);
}
-} \ No newline at end of file
+}
diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp
index 44feb2b..0e35558 100644
--- a/src/core/parser/stack/DocumentHandler.hpp
+++ b/src/core/parser/stack/DocumentHandler.hpp
@@ -53,7 +53,7 @@ class DocumentHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
/**
@@ -92,9 +92,10 @@ public:
*/
class DocumentChildHandler : public Handler {
private:
- bool isExplicitField = false;
- //TODO: REMOVE
- std::string strct_name;
+ /**
+ * If set to true, this handler represents an explicit field.
+ */
+ bool isExplicitField;
/**
* Code shared by both the start(), fieldStart() and the data() method.
@@ -163,22 +164,18 @@ private:
Logger &logger);
public:
- using Handler::Handler;
+ DocumentChildHandler(const HandlerData &handlerData);
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
+ bool startAnnotation(Variant::mapType &args,
+ AnnotationType annotationType) override;
+ bool startToken(Handle<Node> node) override;
+ EndTokenResult endToken(const Token &token, Handle<Node> node) override;
void end() override;
- bool data(Variant &data) override;
-
+ bool data() override;
bool fieldStart(bool &isDefault, size_t fieldIdx) override;
-
void fieldEnd() override;
- bool annotationStart(const Variant &className,
- Variant::mapType &args) override;
-
- bool annotationEnd(const Variant &className,
- const Variant &elementName) override;
-
/**
* Creates a new instance of the DocumentChildHandler.
*
@@ -213,4 +210,5 @@ extern const Rtti DocumentField;
}
}
-#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file
+#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */
+
diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp
index bf5d4ea..c01e74c 100644
--- a/src/core/parser/stack/Handler.cpp
+++ b/src/core/parser/stack/Handler.cpp
@@ -18,6 +18,8 @@
#include <core/common/Exceptions.hpp>
#include <core/common/Logger.hpp>
+#include <core/common/Variant.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserContext.hpp>
#include "Callbacks.hpp"
@@ -29,14 +31,10 @@ namespace parser_stack {
/* Class HandlerData */
-HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/
- const std::string &name, const State &state,
- const SourceLocation &location)
- : ctx(ctx),
- /*callbacks(callbacks),*/
- name(name),
- state(state),
- location(location)
+HandlerData::HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks,
+ const State &state, const Token &token,
+ HandlerType type)
+ : ctx(ctx), callbacks(callbacks), state(state), token(token), type(type)
{
}
@@ -63,28 +61,39 @@ Logger &Handler::logger()
return handlerData.ctx.getLogger();
}
-const SourceLocation &Handler::location() const { return handlerData.location; }
+const std::string &Handler::name() const { return handlerData.token.content; }
-const std::string &Handler::name() const { return handlerData.name; }
+TokenId Handler::tokenId() const { return handlerData.token.id; }
-void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode)
+const Token &Handler::token() const { return handlerData.token; }
+
+const SourceLocation &Handler::location() const
{
- /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/
+ return handlerData.token.location;
}
-void Handler::registerToken(const std::string &token)
+HandlerType Handler::type() const { return handlerData.type; }
+
+const State &Handler::state() const { return handlerData.state; }
+
+Variant Handler::readData() { return handlerData.callbacks.readData(); }
+
+void Handler::pushTokens(const std::vector<SyntaxDescriptor> &tokens)
{
- /*handlerData.callbacks.registerToken(token);*/
+ handlerData.callbacks.pushTokens(tokens);
}
-void Handler::unregisterToken(const std::string &token)
+void Handler::popTokens() { handlerData.callbacks.popTokens(); }
+
+TokenId Handler::registerToken(const std::string &token)
{
- /*handlerData.callbacks.unregisterToken(token);*/
+ return handlerData.callbacks.registerToken(token);
}
-const std::string &Handler::getName() const { return name(); }
-
-const State &Handler::getState() const { return handlerData.state; }
+void Handler::unregisterToken(TokenId id)
+{
+ handlerData.callbacks.unregisterToken(id);
+}
void Handler::setLogger(Logger &logger) { internalLogger = &logger; }
@@ -94,43 +103,50 @@ const SourceLocation &Handler::getLocation() const { return location(); }
/* Class EmptyHandler */
-bool EmptyHandler::start(Variant::mapType &args)
+bool EmptyHandler::startCommand(Variant::mapType &args)
{
- // Just accept anything
+ // Well, we'll support any command we get, won't we?
return true;
}
-void EmptyHandler::end()
+bool EmptyHandler::startAnnotation(Variant::mapType &args,
+ Handler::AnnotationType annotationType)
{
- // Do nothing if a command ends
+ // Do not support annotations. Annotations are too complicated for poor
+ // EmptyHandler.
+ return false;
}
-bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex)
+bool EmptyHandler::startToken(Handle<Node> node)
{
- // Accept any field
- return true;
+ // EmptyHandler does not support tokens.
+ return false;
}
-void EmptyHandler::fieldEnd()
+Handler::EndTokenResult EmptyHandler::endToken(const Token &token,
+ Handle<Node> node)
{
- // Do not handle fields
+ // There are no tokens to end here.
+ return EndTokenResult::ENDED_NONE;
}
-bool EmptyHandler::annotationStart(const Variant &className,
- Variant::mapType &args)
+void EmptyHandler::end()
{
- // Accept any data
- return true;
+ // Do nothing if a command ends
}
-bool EmptyHandler::annotationEnd(const Variant &className,
- const Variant &elementName)
+bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex)
{
- // Accept any annotation
+ // Accept any field
return true;
}
-bool EmptyHandler::data(Variant &data)
+void EmptyHandler::fieldEnd()
+{
+ // Do not handle field ends
+}
+
+bool EmptyHandler::data()
{
// Support any data
return true;
@@ -143,12 +159,26 @@ Handler *EmptyHandler::create(const HandlerData &handlerData)
/* Class StaticHandler */
-bool StaticHandler::start(Variant::mapType &args)
+bool StaticHandler::startCommand(Variant::mapType &args)
{
// Do nothing in the default implementation, accept anything
return true;
}
+bool StaticHandler::startAnnotation(Variant::mapType &args,
+ Handler::AnnotationType annotationType)
+{
+ return false;
+}
+
+bool StaticHandler::startToken(Handle<Node> node) { return false; }
+
+Handler::EndTokenResult StaticHandler::endToken(const Token &token,
+ Handle<Node> node)
+{
+ return EndTokenResult::ENDED_NONE;
+}
+
void StaticHandler::end()
{
// Do nothing here
@@ -170,23 +200,9 @@ void StaticHandler::fieldEnd()
// Do nothing here
}
-bool StaticHandler::annotationStart(const Variant &className,
- Variant::mapType &args)
-{
- // No annotations supported
- return false;
-}
-
-bool StaticHandler::annotationEnd(const Variant &className,
- const Variant &elementName)
+bool StaticHandler::data()
{
- // No annotations supported
- return false;
-}
-
-bool StaticHandler::data(Variant &data)
-{
- logger().error("Did not expect any data here", data);
+ logger().error("Did not expect any data here", readData());
return false;
}
@@ -198,7 +214,7 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData,
{
}
-bool StaticFieldHandler::start(Variant::mapType &args)
+bool StaticFieldHandler::startCommand(Variant::mapType &args)
{
if (!argName.empty()) {
auto it = args.find(argName);
@@ -227,12 +243,15 @@ void StaticFieldHandler::end()
}
}
-bool StaticFieldHandler::data(Variant &data)
+bool StaticFieldHandler::data()
{
+ // Fetch the actual text data
+ Variant stringData = readData();
+
// Call the doHandle function if this has not been done before
if (!handled) {
handled = true;
- doHandle(data, args);
+ doHandle(stringData, args);
return true;
}
@@ -240,7 +259,7 @@ bool StaticFieldHandler::data(Variant &data)
logger().error(
std::string("Found data, but the corresponding argument \"") + argName +
std::string("\" was already specified"),
- data);
+ stringData);
// Print the location at which the attribute was originally specified
auto it = args.find(argName);
diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp
index 7cda7a4..67fde06 100644
--- a/src/core/parser/stack/Handler.hpp
+++ b/src/core/parser/stack/Handler.hpp
@@ -1,6 +1,6 @@
/*
Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,6 +16,15 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/**
+ * @file Handler.hpp
+ *
+ * Contains the definition of the Handler class, used for representing Handlers
+ * for certain syntactic elements.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
#ifndef _OUSIA_PARSER_STACK_HANDLER_HPP_
#define _OUSIA_PARSER_STACK_HANDLER_HPP_
@@ -24,6 +33,9 @@
#include <core/common/Location.hpp>
#include <core/common/Variant.hpp>
#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
+#include <core/model/Node.hpp>
+#include <core/model/Syntax.hpp>
namespace ousia {
@@ -31,14 +43,23 @@ namespace ousia {
class ParserScope;
class ParserContext;
class Logger;
+class TokenizedData;
+class Variant;
namespace parser_stack {
// More forward declarations
-class Callbacks;
+class HandlerCallbacks;
class State;
/**
+ * Enum describing the type of the Handler instance -- a document handler may
+ * be created for handling a simple command, a token or an annotation start and
+ * end.
+ */
+enum class HandlerType { COMMAND, ANNOTATION_START, ANNOTATION_END, TOKEN };
+
+/**
* Class collecting all the data that is being passed to a Handler
* instance.
*/
@@ -51,26 +72,28 @@ public:
ParserContext &ctx;
/**
- * Reference at an instance of the Callbacks class, used for
- * modifying the behaviour of the parser (like registering tokens, setting
- * the data type or changing the whitespace handling mode).
+ * Reference at a class implementing the HandlerCallbacks interface, used
+ * for modifying the behaviour of the parser (like registering tokens,
+ * setting the data type or changing the whitespace handling mode).
*/
- // Callbacks &callbacks;
+ HandlerCallbacks &callbacks;
/**
- * Contains the name of the command that is being handled.
+ * Contains the current state of the state machine.
*/
- std::string name;
+ const State &state;
/**
- * Contains the current state of the state machine.
+ * Token containing the name of the command that is being handled, the
+ * location of the element in the source code or the token id of the token
+ * that is being handled.
*/
- const State &state;
+ Token token;
/**
- * Current source code location.
+ * Type describing for which purpose the HandlerData instance was created.
*/
- SourceLocation location;
+ HandlerType type;
/**
* Constructor of the HandlerData class.
@@ -78,13 +101,13 @@ public:
* @param ctx is the parser context the handler should be executed in.
* @param callbacks is an instance of Callbacks used to notify
* the parser about certain state changes.
- * @param name is the name of the string.
* @param state is the state this handler was called for.
- * @param location is the location at which the handler is created.
+ * @param token contains name, token id and location of the command that is
+ * being handled.
+ * @param type describes the purpose of the Handler instance at hand.
*/
- HandlerData(ParserContext &ctx,
- /*Callbacks &callbacks,*/ const std::string &name,
- const State &state, const SourceLocation &location);
+ HandlerData(ParserContext &ctx, HandlerCallbacks &callbacks,
+ const State &state, const Token &token, HandlerType type);
};
/**
@@ -115,6 +138,94 @@ protected:
Handler(const HandlerData &handlerData);
/**
+ * Calls the corresponding function in the HandlerCallbacks instance. This
+ * method registers the given tokens as tokens that are generally available,
+ * tokens must be explicitly enabled using the "pushTokens" and "popTokens"
+ * method. Tokens that have not been registered are not guaranteed to be
+ * reported (except for special tokens, these do not have to be registered).
+ *
+ * @param token is the token string that should be made available.
+ * @return the TokenId that will be used to refer to the token.
+ */
+ TokenId registerToken(const std::string &token);
+
+ /**
+ * Calls the corresponding function in the HandlerCallbacks instance. This
+ * method unregisters the given token. Note that for a token to be no longer
+ * reported, this function has to be called as many times as registerToken()
+ * for the corresponding token.
+ *
+ * @param id is the id of the Token that should be unregistered.
+ */
+ void unregisterToken(TokenId id);
+
+ /**
+ * Pushes a list of TokenSyntaxDescriptor instances onto the internal stack.
+ * The tokens described in the token list are the tokens that are currently
+ * enabled.
+ *
+ * @param tokens is a list of TokenSyntaxDescriptor instances that should be
+ * stored on the stack.
+ */
+ void pushTokens(const std::vector<SyntaxDescriptor> &tokens);
+
+ /**
+ * Calls the corresponding function in the HandlerCallbacks instance.
+ * Removes the previously pushed list of tokens from the stack.
+ */
+ void popTokens();
+
+ /**
+ * Calls the corresponding method in the HandlerCallbacks instance. Reads a
+ * string variant from the current input stream. This function must be
+ * called from the data() method.
+ *
+ * @return a string variant containing the current text data. The return
+ * value depends on the currently set whitespace mode and the tokens that
+ * were enabled using the enableTokens callback method.
+ */
+ Variant readData();
+
+ /**
+ * Calls the corresponding function in the Callbacks instance. Sets the
+ * whitespace mode that specifies how string data should be processed. The
+ * calls to this function are placed on a stack by the underlying Stack
+ * class. This function should be called from the "fieldStart" callback and
+ * the "start" callback. If no whitespace mode is pushed in the "start"
+ * method the whitespace mode "TRIM" is implicitly assumed.
+ *
+ * @param whitespaceMode specifies one of the three WhitespaceMode constants
+ * PRESERVE, TRIM or COLLAPSE.
+ */
+ // void pushWhitespaceMode(WhitespaceMode whitespaceMode);
+
+ /**
+ * Pops a previously pushed whitespace mode. Calls to this function should
+ * occur in the "end" callback and the "fieldEnd" callback. This function
+ * can only undo pushes that were performed by the pushWhitespaceMode()
+ * method of the same handler.
+ */
+ // void popWhitespaceMode();
+
+public:
+ /**
+ * Enum representing the type of the annotation a Handle instance handles.
+ * It may either handle the start of an annotation or the end of an
+ * annotation.
+ */
+ enum class AnnotationType { START, END };
+
+ /**
+ * Enum type representing the possible outcomes of the endToken() method.
+ */
+ enum class EndTokenResult { ENDED_THIS, ENDED_HIDDEN, ENDED_NONE };
+
+ /**
+ * Virtual destructor.
+ */
+ virtual ~Handler();
+
+ /**
* Returns a reference at the ParserContext.
*
* @return a reference at the ParserContext.
@@ -144,68 +255,55 @@ protected:
Logger &logger();
/**
- * Returns the location of the element in the source file, for which this
- * Handler was created.
+ * Returns the name of the command or annotation the handler is currently
+ * handling. In case the command is currently handling a token, the name
+ * corresponds to the token string sequence.
*
- * @return the location of the Handler in the source file.
- */
- const SourceLocation &location() const;
-
- /**
- * Returns the command name for which the handler was created.
- *
- * @return a const reference at the command name.
+ * @return the name of the command or the string sequence of the token that
+ * is being handled by this handler.
*/
const std::string &name() const;
-public:
- /**
- * Virtual destructor.
- */
- virtual ~Handler();
-
/**
- * Calls the corresponding function in the Callbacks instance. Sets the
- * whitespace mode that specifies how string data should be processed. The
- * calls to this function are placed on a stack by the underlying Stack
- * class.
+ * Returns the token id of the token that is currently being handled by the
+ * handler. In case the handler currently handles a command or annotation,
+ * the token id is set to Tokens::Data.
*
- * @param whitespaceMode specifies one of the three WhitespaceMode constants
- * PRESERVE, TRIM or COLLAPSE.
+ * @return the current token id or Tokens::Data if no token is being
+ * handled.
*/
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
+ TokenId tokenId() const;
/**
- * Calls the corresponding function in the Callbacks instance.
- * Registers the given token as token that should be reported to the handler
- * using the "token" function.
+ * Returns a reference at the Token instance, containing either the token
+ * that is currently being handled or the name of the command and annotation
+ * and their location.
*
- * @param token is the token string that should be reported.
+ * @return a const reference at the internal token instance.
*/
- void registerToken(const std::string &token);
+ const Token &token() const;
/**
- * Calls the corresponding function in the Callbacks instance.
- * Unregisters the given token, it will no longer be reported to the handler
- * using the "token" function.
+ * Returns the location of the element in the source file, for which this
+ * Handler was created.
*
- * @param token is the token string that should be unregistered.
+ * @return the location of the Handler in the source file.
*/
- void unregisterToken(const std::string &token);
+ const SourceLocation &location() const;
/**
- * Returns the command name for which the handler was created.
- *
- * @return a const reference at the command name.
+ * Returns the type describing the purpose for which the handler instance
+ * was created.
*/
- const std::string &getName() const;
+ HandlerType type() const;
/**
- * Reference at the State descriptor for which this Handler was created.
+ * Returns a reference at the State descriptor for which this Handler was
+ * created.
*
* @return a const reference at the constructing State descriptor.
*/
- const State &getState() const;
+ const State &state() const;
/**
* Sets the internal logger to the given logger instance.
@@ -229,14 +327,62 @@ public:
const SourceLocation &getLocation() const;
/**
- * Called when the command that was specified in the constructor is
- * instanciated.
+ * Called whenever the handler should handle the start of a command. This
+ * method (or any other of the "start" methods) is called exactly once,
+ * after the constructor. The name of the command that is started here can
+ * be accessed using the name() method.
+ *
+ * @param args is a map from strings to variants (argument name and value).
+ * @return true if the handler was successful in starting an element with
+ * the given name represents, false otherwise.
+ */
+ virtual bool startCommand(Variant::mapType &args) = 0;
+
+ /**
+ * Called whenever the handler should handle the start of an annotation.
+ * This method (or any other of the "start" methods) is called exactly once,
+ * after the constructor. This method is only called if the
+ * "supportsAnnotations" flag of the State instance referencing this Handler
+ * is set to true. The name of the command that is started here can be
+ * accessed using the name() method.
*
* @param args is a map from strings to variants (argument name and value).
- * @return true if the handler was successful in starting the element it
- * represents, false otherwise.
+ * @param annotationType specifies whether this handler should handle the
+ * start of an annotation or the end of an annotation.
+ */
+ virtual bool startAnnotation(Variant::mapType &args,
+ AnnotationType annotationType) = 0;
+
+ /**
+ * Called whenever the handler should handle the start of a token. This
+ * method (or any other of the "start" methods) is called exactly once,
+ * after the constructor. This method is only called if the "supportsTokens"
+ * flag of the State instance referencing this Handler is set to true. The
+ * token id of the token that is should be handled can be accessed using the
+ * tokenId() method.
+ *
+ * @param node is the node for which this token was registered.
+ */
+ virtual bool startToken(Handle<Node> node) = 0;
+
+ /**
+ * Called whenever a token is marked as "end" token and this handler happens
+ * to be the currently active handler. This operation may have three
+ * outcomes:
+ * <ol>
+ * <li>The token marks the end of the complete handler and the calling
+ * code should call the "end" method.</li>
+ * <li>The token marks the end of some element that is unknown the calling
+ * code. So the operation itself was a success, but the calling code
+ * should not call the "end" method.</li>
+ * <li>The token did not do anything in this context. Basically this should
+ * never happen, but who knows.</li>
+ * </ol>
+ *
+ * @param token is the Token for which the handler should be ended.
+ * @param node is the node for which this token was registered.
*/
- virtual bool start(Variant::mapType &args) = 0;
+ virtual EndTokenResult endToken(const Token &token, Handle<Node> node) = 0;
/**
* Called before the command for which this handler is defined ends (is
@@ -266,44 +412,14 @@ public:
virtual void fieldEnd() = 0;
/**
- * Called whenever an annotation starts while this handler is active. The
- * function should return true if starting the annotation was successful,
- * false otherwise.
- *
- * @param className is a string variant containing the name of the
- * annotation class and the location of the name in the source code.
- * @param args is a map from strings to variants (argument name and value).
- * @return true if the mentioned annotation could be started here, false
- * if an error occurred.
- */
- virtual bool annotationStart(const Variant &className,
- Variant::mapType &args) = 0;
-
- /**
- * Called whenever an annotation ends while this handler is active. The
- * function should return true if ending the annotation was successful,
- * false otherwise.
- *
- * @param className is a string variant containing the name of the
- * annotation class and the location of the class name in the source code.
- * @param elementName is a string variant containing the name of the
- * annotation class and the location of the element name in the source code.
- * @return true if the mentioned annotation could be started here, false if
- * an error occurred.
- */
- virtual bool annotationEnd(const Variant &className,
- const Variant &elementName) = 0;
-
- /**
* Called whenever raw data (int the form of a string) is available for the
* Handler instance. Should return true if the data could be handled, false
- * otherwise.
+ * otherwise. The actual data variant must be retrieved using the "readData()"
+ * callback.
*
- * @param data is a string variant containing the character data and its
- * location.
* @return true if the data could be handled, false otherwise.
*/
- virtual bool data(Variant &data) = 0;
+ virtual bool data() = 0;
};
/**
@@ -325,15 +441,15 @@ protected:
using Handler::Handler;
public:
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
+ bool startAnnotation(Variant::mapType &args,
+ AnnotationType annotationType) override;
+ bool startToken(Handle<Node> node) override;
+ EndTokenResult endToken(const Token &token, Handle<Node> node) override;
void end() override;
bool fieldStart(bool &isDefault, size_t fieldIdx) override;
void fieldEnd() override;
- bool annotationStart(const Variant &className,
- Variant::mapType &args) override;
- bool annotationEnd(const Variant &className,
- const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data() override;
/**
* Creates an instance of the EmptyHandler class.
@@ -351,15 +467,15 @@ protected:
using Handler::Handler;
public:
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
+ bool startAnnotation(Variant::mapType &args,
+ AnnotationType annotationType) override;
+ bool startToken(Handle<Node> node) override;
+ EndTokenResult endToken(const Token &token, Handle<Node> node) override;
void end() override;
bool fieldStart(bool &isDefault, size_t fieldIdx) override;
void fieldEnd() override;
- bool annotationStart(const Variant &className,
- Variant::mapType &args) override;
- bool annotationEnd(const Variant &className,
- const Variant &elementName) override;
- bool data(Variant &data) override;
+ bool data() override;
};
/**
@@ -406,13 +522,12 @@ protected:
* @param fieldData is the captured field data.
* @param args are the arguments that were given in the "start" function.
*/
- virtual void doHandle(const Variant &fieldData,
- Variant::mapType &args) = 0;
+ virtual void doHandle(const Variant &fieldData, Variant::mapType &args) = 0;
public:
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
+ bool data() override;
void end() override;
- bool data(Variant &data) override;
};
}
}
diff --git a/src/core/parser/stack/OntologyHandler.cpp b/src/core/parser/stack/OntologyHandler.cpp
index 8c0e4d9..3b3b386 100644
--- a/src/core/parser/stack/OntologyHandler.cpp
+++ b/src/core/parser/stack/OntologyHandler.cpp
@@ -33,7 +33,7 @@ namespace parser_stack {
/* OntologyHandler */
-bool OntologyHandler::start(Variant::mapType &args)
+bool DomainHandler::startCommand(Variant::mapType &args)
{
// Create the Ontology node
Rooted<Ontology> ontology =
@@ -226,9 +226,9 @@ bool OntologyChildHandler::start(Variant::mapType &args)
{
Rooted<FieldDescriptor> field = scope().selectOrThrow<FieldDescriptor>();
- const std::string &ref = args["ref"].asString();
+ const std::string &name = args["ref"].asString();
scope().resolve<StructuredClass>(
- ref, field, logger(),
+ name, field, logger(),
[](Handle<Node> child, Handle<Node> field, Logger &logger) {
if (child != nullptr) {
field.cast<FieldDescriptor>()->addChild(
@@ -275,7 +275,7 @@ bool OntologyParentFieldHandler::start(Variant::mapType &args)
scope().resolve<Descriptor>(
parentNameNode->getName(), strct, logger(),
[type, name, optional](Handle<Node> parent, Handle<Node> strct,
- Logger &logger) {
+ Logger &logger) {
if (parent != nullptr) {
Rooted<FieldDescriptor> field =
(parent.cast<Descriptor>()->createFieldDescriptor(
@@ -299,21 +299,20 @@ bool OntologyParentFieldRefHandler::start(Variant::mapType &args)
// resolve the parent, get the referenced field and add the declared
// StructuredClass as child to it.
- scope().resolve<Descriptor>(
- parentNameNode->getName(), strct, logger(),
- [name, loc](Handle<Node> parent, Handle<Node> strct, Logger &logger) {
- if (parent != nullptr) {
- Rooted<FieldDescriptor> field =
- parent.cast<Descriptor>()->getFieldDescriptor(name);
- if (field == nullptr) {
- logger.error(
- std::string("Could not find referenced field ") + name,
- loc);
- return;
- }
- field->addChild(strct.cast<StructuredClass>());
- }
- });
+ scope().resolve<Descriptor>(parentNameNode->getName(), strct, logger(),
+ [name, loc](Handle<Node> parent,
+ Handle<Node> strct, Logger &logger) {
+ if (parent != nullptr) {
+ Rooted<FieldDescriptor> field =
+ parent.cast<Descriptor>()->getFieldDescriptor(name);
+ if (field == nullptr) {
+ logger.error(
+ std::string("Could not find referenced field ") + name, loc);
+ return;
+ }
+ field->addChild(strct.cast<StructuredClass>());
+ }
+ });
return true;
}
diff --git a/src/core/parser/stack/OntologyHandler.hpp b/src/core/parser/stack/OntologyHandler.hpp
index caeacc7..66146bd 100644
--- a/src/core/parser/stack/OntologyHandler.hpp
+++ b/src/core/parser/stack/OntologyHandler.hpp
@@ -46,7 +46,7 @@ class OntologyHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -59,7 +59,7 @@ class OntologyStructHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -72,7 +72,7 @@ class OntologyAnnotationHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -85,7 +85,7 @@ class OntologyAttributesHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -98,7 +98,7 @@ class OntologyFieldHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -111,7 +111,7 @@ class OntologyFieldRefHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -124,7 +124,7 @@ class OntologyPrimitiveHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -137,7 +137,7 @@ class OntologyChildHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
static Handler *create(const HandlerData &handlerData)
{
@@ -154,7 +154,7 @@ class OntologyParentHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
static Handler *create(const HandlerData &handlerData)
@@ -167,7 +167,7 @@ class OntologyParentFieldHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
static Handler *create(const HandlerData &handlerData)
{
@@ -179,7 +179,7 @@ class OntologyParentFieldRefHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
static Handler *create(const HandlerData &handlerData)
{
diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp
index 5b67248..f341f1d 100644
--- a/src/core/parser/stack/Stack.cpp
+++ b/src/core/parser/stack/Stack.cpp
@@ -19,18 +19,148 @@
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
#include <core/common/Exceptions.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/ParserScope.hpp>
#include <core/parser/ParserContext.hpp>
+#include "Callbacks.hpp"
#include "Handler.hpp"
#include "Stack.hpp"
#include "State.hpp"
+#include "TokenRegistry.hpp"
+#include "TokenStack.hpp"
+
+#define STACK_DEBUG_OUTPUT 0
+#if STACK_DEBUG_OUTPUT
+#include <iostream>
+#endif
namespace ousia {
namespace parser_stack {
+namespace {
/* Class HandlerInfo */
+/**
+ * The HandlerInfo class is used internally by the stack to associate additional
+ * (mutable) data with a handler instance.
+ */
+class HandlerInfo {
+public:
+ /**
+ * Pointer pointing at the actual handler instance.
+ */
+ std::shared_ptr<Handler> handler;
+
+ /**
+ * Next field index to be passed to the "fieldStart" function of the Handler
+ * class.
+ */
+ size_t fieldIdx;
+
+ /**
+ * Set to true if the handler is valid (which is the case if the "start"
+ * method has returned true). If the handler is invalid, no more calls are
+ * directed at it until it can be removed from the stack.
+ */
+ bool valid : 1;
+
+ /**
+ * Set to true if this is an implicit handler, that was created when the
+ * current stack state was deduced.
+ */
+ bool implicit : 1;
+
+ /**
+ * Set to true if the handled command or annotation has a range.
+ */
+ bool range : 1;
+
+ /**
+ * Set to true if the handler currently is in a field.
+ */
+ bool inField : 1;
+
+ /**
+ * Set to true if the handler currently is in the default field.
+ */
+ bool inDefaultField : 1;
+
+ /**
+ * Set to true if the handler currently is in an implicitly started default
+ * field.
+ */
+ bool inImplicitDefaultField : 1;
+
+ /**
+ * Set to false if this field is only opened pro-forma and does not accept
+ * any data. Otherwise set to true.
+ */
+ bool inValidField : 1;
+
+ /**
+ * Set to true, if the default field was already started.
+ */
+ bool hadDefaultField : 1;
+
+ /**
+ * Default constructor of the HandlerInfo class.
+ */
+ HandlerInfo();
+
+ /**
+ * Constructor of the HandlerInfo class, allows to set some flags manually.
+ */
+ HandlerInfo(bool implicit, bool inField, bool inDefaultField,
+ bool inImplicitDefaultField);
+
+ /**
+ * Constructor of the HandlerInfo class, taking a shared_ptr to the handler
+ * to which additional information should be attached.
+ */
+ HandlerInfo(std::shared_ptr<Handler> handler);
+
+ /**
+ * Destructor of the HandlerInfo class (to allow Handler to be forward
+ * declared).
+ */
+ ~HandlerInfo();
+
+ /**
+ * Updates the "field" flags according to a "fieldStart" event.
+ */
+ void fieldStart(bool isDefault, bool isImplicit, bool isValid);
+
+ /**
+ * Updates the "fields" flags according to a "fieldEnd" event.
+ */
+ void fieldEnd();
+
+ /**
+ * Returns the name of the referenced handler or an empty string if no
+ * handler is present.
+ *
+ * @return the current handler name.
+ */
+ std::string name() const;
+
+ /**
+ * Returns the type of the referenced handler or COMMAND if no handler is
+ * present.
+ *
+ * @return the current handler type.
+ */
+ HandlerType type() const;
+
+ /**
+ * Returns the state the handler currently is in, or States::None if no
+ * handler is present.
+ *
+ * @return the current state machine state.
+ */
+ const State &state() const;
+};
+
HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {}
HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler)
@@ -38,6 +168,7 @@ HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler)
fieldIdx(0),
valid(true),
implicit(false),
+ range(false),
inField(false),
inDefaultField(false),
inImplicitDefaultField(false),
@@ -46,21 +177,36 @@ HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler)
{
}
-HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField,
- bool inDefaultField, bool inImplicitDefaultField,
- bool inValidField)
+HandlerInfo::HandlerInfo(bool implicit, bool inField, bool inDefaultField,
+ bool inImplicitDefaultField)
: handler(nullptr),
fieldIdx(0),
- valid(valid),
+ valid(true),
implicit(implicit),
+ range(false),
inField(inField),
inDefaultField(inDefaultField),
inImplicitDefaultField(inImplicitDefaultField),
- inValidField(inValidField),
+ inValidField(true),
hadDefaultField(false)
{
}
+std::string HandlerInfo::name() const
+{
+ return handler == nullptr ? std::string{} : handler->name();
+}
+
+HandlerType HandlerInfo::type() const
+{
+ return handler == nullptr ? HandlerType::COMMAND : handler->type();
+}
+
+const State &HandlerInfo::state() const
+{
+ return handler == nullptr ? States::None : handler->state();
+}
+
HandlerInfo::~HandlerInfo()
{
// Do nothing
@@ -87,7 +233,20 @@ void HandlerInfo::fieldEnd()
/**
* Stub instance of HandlerInfo containing no handler information.
*/
-static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true};
+static HandlerInfo EmptyHandlerInfo{true, true, true, true};
+
+/**
+ * Small helper class making sure the pointer stored in some variable is
+ * reset to nullptr once the scope is left.
+ */
+template <class T>
+struct GuardedTemporaryPointer {
+ T **ptr;
+ GuardedTemporaryPointer(T *ref, T **ptr) : ptr(ptr) { *ptr = ref; }
+
+ ~GuardedTemporaryPointer() { *ptr = nullptr; }
+};
+}
/* Helper functions */
@@ -116,11 +275,197 @@ static LoggableException buildInvalidCommandException(
}
}
-/* Class Stack */
-
-Stack::Stack(ParserContext &ctx,
- const std::multimap<std::string, const State *> &states)
- : ctx(ctx), states(states)
+/* Class StackImpl */
+
+class StackImpl : public HandlerCallbacks {
+private:
+ /**
+ * Reference at an implementation of the ParserCallbacks instance to which
+ * certain handler callbacks are directed.
+ */
+ ParserCallbacks &parser;
+
+ /**
+ * Reference at the parser context.
+ */
+ ParserContext &ctx;
+
+ /**
+ * Map containing all registered command names and the corresponding
+ * state descriptors.
+ */
+ const std::multimap<std::string, const State *> &states;
+
+ /**
+ * Registry responsible for registering the tokens proposed by the
+ * Handlers in the parser.
+ */
+ TokenRegistry tokenRegistry;
+
+ /**
+ * Pointer at a TokenizedDataReader instance from which the data should
+ * currently be read.
+ */
+ TokenizedDataReader *dataReader;
+
+ /**
+ * Internal stack used for managing the currently active Handler instances.
+ */
+ std::vector<HandlerInfo> stack;
+
+ /**
+ * Returns a reference to the Logger instance stored within the context.
+ */
+ Logger &logger() { return ctx.getLogger(); }
+
+ /**
+ * Used internally to get all expected command names for the current state.
+ * This function is used to build error messages.
+ *
+ * @return a set of strings containing the names of the expected commands.
+ */
+ std::set<std::string> expectedCommands();
+
+ /**
+ * Returns the targetState for a command with the given name that can be
+ * reached from the current state.
+ *
+ * @param name is the name of the requested command.
+ * @return nullptr if no target state was found, a pointer at the target
+ * state otherwise.
+ */
+ const State *findTargetState(const std::string &name);
+
+ /**
+ * Returns the targetState for a command with the given name that can be
+ * reached from the current state, also including the wildcard "*" state.
+ * Throws an exception if the given target state is not a valid identifier.
+ *
+ * @param name is the name of the requested command.
+ * @return nullptr if no target state was found, a pointer at the target
+ * state otherwise.
+ */
+ const State *findTargetStateOrWildcard(const std::string &name);
+
+ /**
+ * Tries to reconstruct the parser state from the Scope instance of the
+ * ParserContext given in the constructor. This functionality is needed for
+ * including files, as the Parser of the included file needs to be brought to
+ * an equivalent state to the one in the including file.
+ */
+ void deduceState();
+
+ /**
+ * Returns a reference at the current HandlerInfo instance (or a stub
+ * HandlerInfo instance if the stack is empty).
+ */
+ HandlerInfo &currentInfo();
+
+ /**
+ * Returns a reference at the last HandlerInfo instance (or a stub
+ * HandlerInfo instance if the stack has only one element).
+ */
+ HandlerInfo &lastInfo();
+
+ /**
+ * Returns a set containing the tokens that should currently be processed
+ * by the TokenizedData instance.
+ *
+ * @return a TokenSet instance containing all tokens that should currently
+ * be processed.
+ */
+ TokenSet currentTokens() const;
+
+ /**
+ * Returns the whitespace mode defined by the current command.
+ */
+ WhitespaceMode currentWhitespaceMode() const;
+
+ /**
+ * Ends the current handler and removes the corresponding element from the
+ * stack.
+ *
+ * @return true if a command was ended, false otherwise.
+ */
+ bool endCurrentHandler();
+
+ /**
+ * Ends all handlers that currently are not inside a field and already had
+ * a default field. Tries to start a default field for the current handler,
+ * if currently the handler is not inside a field and did not have a default
+ * field yet. This method is called whenever the data(), startAnnotation(),
+ * startToken(), startCommand(), annotationStart() or annotationEnd() events
+ * are reached.
+ *
+ * @return true if the current command is in a valid field.
+ */
+ bool prepareCurrentHandler(bool startImplicitDefaultField = true);
+
+ /**
+ * Returns true if all handlers on the stack are currently valid, or false
+ * if at least one handler is invalid.
+ *
+ * @return true if all handlers on the stack are valid.
+ */
+ bool handlersValid();
+
+ /**
+ * Called whenever there is an actual data pending on the current
+ * TokenizedDataReader. Tries to feed this data to the current handler.
+ */
+ void handleData();
+
+ /**
+ * Called whenever there is a token waiting to be processed. If possible
+ * tries to end a current handler with this token or to start a new handler
+ * with the token.
+ *
+ * @param token is the token that should be handled.
+ */
+ void handleToken(const Token &token);
+
+ /**
+ * Called by the rangeEnd() and fieldEnd() methods to end the current ranged
+ * command.
+ *
+ * @param endRange specifies whether this should end the range of a
+ * command with range.
+ */
+ void handleFieldEnd(bool endRange);
+
+public:
+ StackImpl(ParserCallbacks &parser, ParserContext &ctx,
+ const std::multimap<std::string, const State *> &states);
+
+ ~StackImpl();
+
+ const State &currentState() const;
+ std::string currentCommandName() const;
+
+ void commandStart(const Variant &name, const Variant::mapType &args,
+ bool range);
+ void annotationStart(const Variant &className, const Variant &args,
+ bool range);
+ void annotationEnd(const Variant &className, const Variant &elementName);
+ void rangeEnd();
+ void fieldStart(bool isDefault);
+ void fieldEnd();
+ void data(const TokenizedData &data);
+
+ TokenId registerToken(const std::string &token) override;
+ void unregisterToken(TokenId id) override;
+ Variant readData() override;
+ void pushTokens(const std::vector<SyntaxDescriptor> &tokens) override;
+ void popTokens() override;
+};
+
+StackImpl::StackImpl(ParserCallbacks &parser, ParserContext &ctx,
+ const std::multimap<std::string, const State *> &states)
+ : parser(parser),
+ ctx(ctx),
+ states(states),
+ tokenRegistry(parser),
+ dataReader(nullptr)
{
// If the scope instance is not empty we need to deduce the current parser
// state
@@ -129,7 +474,7 @@ Stack::Stack(ParserContext &ctx,
}
}
-Stack::~Stack()
+StackImpl::~StackImpl()
{
while (!stack.empty()) {
// Fetch the topmost stack element
@@ -142,7 +487,7 @@ Stack::~Stack()
!info.inImplicitDefaultField) {
logger().error(
std::string("Reached end of stream, but command \"") +
- info.handler->getName() +
+ currentCommandName() +
"\" has not ended yet. Command was started here:",
info.handler->getLocation());
}
@@ -153,7 +498,7 @@ Stack::~Stack()
}
}
-void Stack::deduceState()
+void StackImpl::deduceState()
{
// Assemble all states
std::vector<const State *> states;
@@ -176,8 +521,8 @@ void Stack::deduceState()
HandlerConstructor ctor =
state.elementHandler ? state.elementHandler : EmptyHandler::create;
- std::shared_ptr<Handler> handler =
- std::shared_ptr<Handler>{ctor({ctx, "", state, SourceLocation{}})};
+ std::shared_ptr<Handler> handler = std::shared_ptr<Handler>{
+ ctor({ctx, *this, state, SourceLocation{}, HandlerType::COMMAND})};
stack.emplace_back(handler);
// Set the correct flags for this implicit handler
@@ -186,7 +531,7 @@ void Stack::deduceState()
info.fieldStart(true, false, true);
}
-std::set<std::string> Stack::expectedCommands()
+std::set<std::string> StackImpl::expectedCommands()
{
const State *currentState = &(this->currentState());
std::set<std::string> res;
@@ -198,17 +543,7 @@ std::set<std::string> Stack::expectedCommands()
return res;
}
-const State &Stack::currentState()
-{
- return stack.empty() ? States::None : stack.back().handler->getState();
-}
-
-std::string Stack::currentCommandName()
-{
- return stack.empty() ? std::string{} : stack.back().handler->getName();
-}
-
-const State *Stack::findTargetState(const std::string &name)
+const State *StackImpl::findTargetState(const std::string &name)
{
const State *currentState = &(this->currentState());
auto range = states.equal_range(name);
@@ -222,7 +557,7 @@ const State *Stack::findTargetState(const std::string &name)
return nullptr;
}
-const State *Stack::findTargetStateOrWildcard(const std::string &name)
+const State *StackImpl::findTargetStateOrWildcard(const std::string &name)
{
// Try to find the target state with the given name, if none is found, try
// find a matching "*" state.
@@ -233,16 +568,40 @@ const State *Stack::findTargetStateOrWildcard(const std::string &name)
return targetState;
}
-HandlerInfo &Stack::currentInfo()
+const State &StackImpl::currentState() const
+{
+ return stack.empty() ? States::None : stack.back().state();
+}
+
+std::string StackImpl::currentCommandName() const
+{
+ return stack.empty() ? std::string{} : stack.back().name();
+}
+
+TokenSet StackImpl::currentTokens() const
+{
+ // TODO: Implement
+ return TokenSet{};
+}
+
+WhitespaceMode StackImpl::currentWhitespaceMode() const
+{
+ // TODO: Implement
+ return WhitespaceMode::COLLAPSE;
+}
+
+HandlerInfo &StackImpl::currentInfo()
{
return stack.empty() ? EmptyHandlerInfo : stack.back();
}
-HandlerInfo &Stack::lastInfo()
+HandlerInfo &StackImpl::lastInfo()
{
return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2];
}
-void Stack::endCurrentHandler()
+/* Stack helper functions */
+
+bool StackImpl::endCurrentHandler()
{
if (!stack.empty()) {
// Fetch the handler info for the current top-level element
@@ -266,50 +625,59 @@ void Stack::endCurrentHandler()
// Remove the element from the stack
stack.pop_back();
+ return true;
}
+ return false;
}
-void Stack::endOverdueHandlers()
+bool StackImpl::prepareCurrentHandler(bool startImplicitDefaultField)
{
- if (!stack.empty()) {
- // Fetch the handler info for the current top-level element
- HandlerInfo &info = stack.back();
+ // Repeat until a valid handler is found on the stack
+ while (!stack.empty()) {
+ // Fetch the handler for the current top-level element
+ HandlerInfo &info = currentInfo();
- // Abort if this handler currently is inside a field
- if (info.inField || (!info.hadDefaultField && info.valid)) {
- return;
+ // If the current Handler is in a field, there is nothing to be done,
+ // abort
+ if (info.inField) {
+ return true;
}
- // Otherwise end the current handler
- endCurrentHandler();
- }
-}
+ // If the current field already had a default field or is not valid,
+ // end it and repeat
+ if ((info.hadDefaultField || !startImplicitDefaultField) ||
+ !info.valid) {
+ // We cannot end the command if it is marked as "range" command
+ if (info.range) {
+ return false;
+ }
-bool Stack::ensureHandlerIsInField()
-{
- // If the current handler is not in a field (and actually has a handler)
- // try to start a default field
- HandlerInfo &info = currentInfo();
- if (!info.inField && info.handler != nullptr) {
- // Abort if the element already had a default field or the handler is
- // not valid
- if (info.hadDefaultField || !info.valid) {
+ // End the current handler
+ endCurrentHandler();
+ continue;
+ }
+
+ // Abort if starting new default fields is not allowed here
+ if (!startImplicitDefaultField) {
return false;
}
// Try to start a new default field, abort if this did not work
bool isDefault = true;
if (!info.handler->fieldStart(isDefault, info.fieldIdx)) {
- return false;
+ endCurrentHandler();
+ continue;
}
- // Mark the field as started
- info.fieldStart(true, true, true);
+ // Mark the field as started and return -- the field should be marked
+ // is implicit if this is not a field with range
+ info.fieldStart(true, !info.range, true);
+ return true;
}
- return true;
+ return false;
}
-bool Stack::handlersValid()
+bool StackImpl::handlersValid()
{
for (auto it = stack.crbegin(); it != stack.crend(); it++) {
if (!it->valid) {
@@ -319,13 +687,131 @@ bool Stack::handlersValid()
return true;
}
-Logger &Stack::logger() { return ctx.getLogger(); }
+void StackImpl::handleData()
+{
+ // Repeat until we find a handler that willingly consumes the data
+ while (true) {
+ // Prepare the stack -- make sure all overdue handlers are ended and
+ // we currently are in an open field
+ if (stack.empty() || !prepareCurrentHandler()) {
+ throw LoggableException("Did not expect any data here");
+ }
+
+ // Fetch the current handler information
+ HandlerInfo &info = currentInfo();
+
+ // If this field should not get any data, log an error and do not
+ // call the "data" handler
+ if (!info.inValidField) {
+ if (!info.hadDefaultField) {
+ logger().error("Did not expect any data here");
+ }
+ return;
+ }
+
+ // If we're currently in an invalid subtree, just eat the data and abort
+ if (!handlersValid()) {
+ return;
+ }
+
+ // Fork the logger and set it as temporary logger for the "data"
+ // method. We only want to keep error messages if this was not a
+ // try to implicitly open a default field.
+ LoggerFork loggerFork = logger().fork();
+ info.handler->setLogger(loggerFork);
+
+ // Pass the data to the current Handler instance
+ bool valid = false;
+ try {
+ valid = info.handler->data();
+ }
+ catch (LoggableException ex) {
+ loggerFork.log(ex);
+ }
+
+ // Reset the logger instance of the handler as soon as possible
+ info.handler->resetLogger();
+
+ // If placing the data here failed and we're currently in an
+ // implicitly opened field, just unroll the stack to the next field
+ // and try again
+ if (!valid && info.inImplicitDefaultField) {
+ endCurrentHandler();
+ continue;
+ }
+
+ // Commit the content of the logger fork. Do not change the valid flag.
+ loggerFork.commit();
+ return;
+ }
+}
+
+void StackImpl::handleToken(const Token &token)
+{
+ // TODO: Implement
+ // Just eat them for now
+}
+
+void StackImpl::handleFieldEnd(bool endRange)
+{
+ // Throw away all overdue handlers
+ prepareCurrentHandler(false);
+
+ // Close all implicit default fields
+ while (!stack.empty()) {
+ HandlerInfo &info = currentInfo();
+ if (!info.inImplicitDefaultField || info.range) {
+ break;
+ }
+ endCurrentHandler();
+ }
+
+ // Fetch the information attached to the current handler
+ HandlerInfo &info = currentInfo();
+ if (stack.empty() || (!info.inField && !endRange) ||
+ (!info.range && endRange)) {
+ if (endRange) {
+ logger().error(
+ "Got end of range, but there is no command here to end");
+ } else {
+ logger().error("Got field end, but there is no field here to end");
+ }
+ return;
+ }
+
+ // Only continue if the current handler stack is in a valid state, do not
+ // call the fieldEnd function if something went wrong before
+ if (handlersValid()) {
+ // End the current field if it is valid
+ if (info.inValidField) {
+ info.handler->fieldEnd();
+ info.fieldEnd();
+ }
+
+ // End the complete command if this is a range command, start the
+ // default field for once if range command did not have a default field
+ if (info.range && endRange) {
+ if (!info.hadDefaultField) {
+ bool isDefault = true;
+ info.handler->fieldStart(isDefault, true);
+ info.fieldStart(true, true, true);
+ }
+ endCurrentHandler();
+ return;
+ }
+ }
+
+ // This command no longer is in a field
+ info.fieldEnd();
+}
+
+/* Class StackImpl public functions */
-void Stack::command(const Variant &name, const Variant::mapType &args)
+void StackImpl::commandStart(const Variant &name, const Variant::mapType &args,
+ bool range)
{
- // End handlers that already had a default field and are currently not
- // active.
- endOverdueHandlers();
+ // Call prepareCurrentHandler once to end all overdue commands
+ prepareCurrentHandler();
// Make sure the given identifier is valid (preventing "*" from being
// malicously passed to this function)
@@ -336,14 +822,18 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
}
while (true) {
+ // Prepare the stack -- make sure all overdue handlers are ended and
+ // we currently are in an open field
+ prepareCurrentHandler();
+
// Try to find a target state for the given command, if none can be
// found and the current command does not have an open field, then try
// to create an empty default field, otherwise this is an exception
const State *targetState = findTargetStateOrWildcard(name.asString());
if (targetState == nullptr) {
HandlerInfo &info = currentInfo();
- if (info.inImplicitDefaultField || !info.inField) {
- endCurrentHandler();
+ if ((info.inImplicitDefaultField || !info.inField) &&
+ endCurrentHandler()) {
continue;
} else {
throw buildInvalidCommandException(name.asString(),
@@ -351,12 +841,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
}
}
- // Make sure we're currently inside a field
- if (!ensureHandlerIsInField()) {
- endCurrentHandler();
- continue;
- }
-
// Fork the logger. We do not want any validation errors to skip
LoggerFork loggerFork = logger().fork();
@@ -365,10 +849,15 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
? targetState->elementHandler
: EmptyHandler::create;
std::shared_ptr<Handler> handler{
- ctor({ctx, name.asString(), *targetState, name.getLocation()})};
+ ctor({ctx,
+ *this,
+ *targetState,
+ {name.asString(), name.getLocation()},
+ HandlerType::COMMAND})};
stack.emplace_back(handler);
- // Fetch the HandlerInfo for the parent element and the current element
+ // Fetch the HandlerInfo for the parent element and the current
+ // element
HandlerInfo &parentInfo = lastInfo();
HandlerInfo &info = currentInfo();
@@ -387,7 +876,7 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
handler->setLogger(loggerFork);
try {
- info.valid = handler->start(canonicalArgs);
+ info.valid = handler->startCommand(canonicalArgs);
}
catch (LoggableException ex) {
loggerFork.log(ex);
@@ -395,94 +884,65 @@ void Stack::command(const Variant &name, const Variant::mapType &args)
handler->resetLogger();
}
- // We started the command within an implicit default field and it is not
- // valid -- remove both the new handler and the parent field from the
- // stack
+ // We started the command within an implicit default field and it is
+ // not valid -- remove both the new handler and the parent field from
+ // the stack
if (!info.valid && parentInfo.inImplicitDefaultField) {
- endCurrentHandler();
- endCurrentHandler();
- continue;
+ // Only continue if the parent handler could actually be removed
+ if (endCurrentHandler() && endCurrentHandler()) {
+ continue;
+ }
}
- // If we ended up here, starting the command may or may not have worked,
- // but after all, we cannot unroll the stack any further. Update the
- // "valid" flag, commit any potential error messages and return.
+ // If we ended up here, starting the command may or may not have
+ // worked, but after all, we cannot unroll the stack any further. Update
+ // the "valid" flag, commit any potential error messages and return.
info.valid = parentInfo.valid && info.valid;
+ info.range = range;
loggerFork.commit();
return;
}
}
-void Stack::data(const Variant &data)
+void StackImpl::annotationStart(const Variant &className, const Variant &args,
+ bool range)
{
- // End handlers that already had a default field and are currently not
- // active.
- endOverdueHandlers();
-
- while (true) {
- // Check whether there is any command the data can be sent to
- if (stack.empty()) {
- throw LoggableException("No command here to receive data.", data);
- }
-
- // Fetch the current command handler information
- HandlerInfo &info = currentInfo();
-
- // Make sure the current handler has an open field
- if (!ensureHandlerIsInField()) {
- endCurrentHandler();
- continue;
- }
-
- // If this field should not get any data, log an error and do not call
- // the "data" handler
- if (!info.inValidField) {
- // If the "hadDefaultField" flag is set, we already issued an error
- // message
- if (!info.hadDefaultField) {
- logger().error("Did not expect any data here", data);
- }
- }
-
- if (handlersValid() && info.inValidField) {
- // Fork the logger and set it as temporary logger for the "start"
- // method. We only want to keep error messages if this was not a try
- // to implicitly open a default field.
- LoggerFork loggerFork = logger().fork();
- info.handler->setLogger(loggerFork);
-
- // Pass the data to the current Handler instance
- bool valid = false;
- try {
- Variant dataCopy = data;
- valid = info.handler->data(dataCopy);
- }
- catch (LoggableException ex) {
- loggerFork.log(ex);
- }
+ // TODO
+}
- // Reset the logger instance as soon as possible
- info.handler->resetLogger();
+void StackImpl::annotationEnd(const Variant &className,
+ const Variant &elementName)
+{
+ // TODO
+}
- // If placing the data here failed and we're currently in an
- // implicitly opened field, just unroll the stack to the next field
- // and try again
- if (!valid && info.inImplicitDefaultField) {
- endCurrentHandler();
- continue;
- }
+void StackImpl::rangeEnd() { handleFieldEnd(true); }
- // Commit the content of the logger fork. Do not change the valid
- // flag.
- loggerFork.commit();
+void StackImpl::data(const TokenizedData &data)
+{
+ // Fetch a reader for the given tokenized data instance.
+ TokenizedDataReader reader = data.reader();
+
+ // Use the GuardedTemporaryPointer to make sure that the member variable
+ // dataReader is reset to nullptr once this scope is left.
+ GuardedTemporaryPointer<TokenizedDataReader> ptr(&reader, &dataReader);
+
+ // Peek a token from the reader, repeat until all tokens have been read
+ Token token;
+ while (reader.peek(token, currentTokens(), currentWhitespaceMode())) {
+ // Handle the token as text data or as actual token
+ if (token.id == Tokens::Data) {
+ handleData();
+ } else {
+ handleToken(token);
}
- // There was no reason to unroll the stack any further, so continue
- return;
+ // Consume the peeked token
+ reader.consumePeek();
}
}
-void Stack::fieldStart(bool isDefault)
+void StackImpl::fieldStart(bool isDefault)
{
// Make sure the current handler stack is not empty
if (stack.empty()) {
@@ -494,13 +954,14 @@ void Stack::fieldStart(bool isDefault)
HandlerInfo &info = currentInfo();
if (info.inField) {
logger().error(
- "Got field start, but there is no command for which to start the "
+ "Got field start, but there is no command for which to start "
+ "the "
"field.");
return;
}
- // If the handler already had a default field we cannot start a new field
- // (the default field always is the last field) -- mark the command as
+ // If the handler already had a default field we cannot start a new
+ // field (the default field always is the last field) -- mark the command as
// invalid
if (info.hadDefaultField) {
logger().error(std::string("Got field start, but command \"") +
@@ -534,54 +995,132 @@ void Stack::fieldStart(bool isDefault)
info.fieldStart(defaultField, false, valid);
}
-void Stack::fieldEnd()
+void StackImpl::fieldEnd() { handleFieldEnd(false); }
+
+/* Class StackImpl HandlerCallbacks */
+
+TokenId StackImpl::registerToken(const std::string &token)
{
- // Unroll the stack until the next explicitly open field
- while (!stack.empty()) {
- HandlerInfo &info = currentInfo();
- if (info.inField && !info.inImplicitDefaultField) {
- break;
- }
- endCurrentHandler();
- }
+ return tokenRegistry.registerToken(token);
+}
- // Fetch the information attached to the current handler
- HandlerInfo &info = currentInfo();
- if (!info.inField || info.inImplicitDefaultField || stack.empty()) {
- logger().error(
- "Got field end, but there is no command for which to end the "
- "field.");
- return;
- }
+void StackImpl::unregisterToken(TokenId id)
+{
+ tokenRegistry.unregisterToken(id);
+}
- // Only continue if the current handler stack is in a valid state, do not
- // call the fieldEnd function if something went wrong before
- if (handlersValid() && !info.hadDefaultField && info.inValidField) {
- try {
- info.handler->fieldEnd();
- }
- catch (LoggableException ex) {
- logger().log(ex);
+void StackImpl::pushTokens(const std::vector<SyntaxDescriptor> &tokens)
+{
+ // TODO
+}
+
+void StackImpl::popTokens()
+{
+ // TODO
+}
+
+Variant StackImpl::readData()
+{
+ if (dataReader != nullptr) {
+ TokenizedDataReaderFork dataReaderFork = dataReader->fork();
+ Token token;
+ dataReaderFork.read(token, currentTokens(), currentWhitespaceMode());
+ if (token.id == Tokens::Data) {
+ Variant res = Variant::fromString(token.content);
+ res.setLocation(token.getLocation());
+ return res;
}
}
+ return Variant{};
+}
- // This command no longer is in a field
- info.fieldEnd();
+/* Class Stack */
+
+Stack::Stack(ParserCallbacks &parser, ParserContext &ctx,
+ const std::multimap<std::string, const State *> &states)
+ : impl(new StackImpl(parser, ctx, states))
+{
}
-void Stack::annotationStart(const Variant &className, const Variant &args)
+Stack::~Stack()
{
- // TODO
+ // Do nothing here, stub needed because StackImpl is incomplete in hpp
+}
+
+const State &Stack::currentState() const { return impl->currentState(); }
+
+std::string Stack::currentCommandName() const
+{
+ return impl->currentCommandName();
+}
+
+void Stack::commandStart(const Variant &name, const Variant::mapType &args,
+ bool range)
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: commandStart " << name << " " << args << " " << range
+ << std::endl;
+#endif
+ impl->commandStart(name, args, range);
+}
+
+void Stack::annotationStart(const Variant &className, const Variant &args,
+ bool range)
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: annotationStart " << className << " " << args << " "
+ << range << std::endl;
+#endif
+ impl->annotationStart(className, args, range);
}
void Stack::annotationEnd(const Variant &className, const Variant &elementName)
{
- // TODO
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: annotationEnd " << className << " " << elementName
+ << std::endl;
+#endif
+ impl->annotationEnd(className, elementName);
}
-void Stack::token(Variant token)
+void Stack::rangeEnd()
{
- // TODO
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: rangeEnd" << std::endl;
+#endif
+ impl->rangeEnd();
+}
+
+void Stack::fieldStart(bool isDefault)
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: fieldStart " << isDefault << std::endl;
+#endif
+ impl->fieldStart(isDefault);
+}
+
+void Stack::fieldEnd()
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: fieldEnd" << std::endl;
+#endif
+ impl->fieldEnd();
+}
+
+void Stack::data(const TokenizedData &data)
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: data" << std::endl;
+#endif
+ impl->data(data);
+}
+
+void Stack::data(const std::string &str)
+{
+#if STACK_DEBUG_OUTPUT
+ std::cout << "STACK: data (string) " << str << std::endl;
+#endif
+ data(TokenizedData(str));
+}
}
}
-} \ No newline at end of file
diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp
index b67ce82..6d42f10 100644
--- a/src/core/parser/stack/Stack.hpp
+++ b/src/core/parser/stack/Stack.hpp
@@ -29,235 +29,48 @@
#ifndef _OUSIA_PARSER_STACK_STACK_HPP_
#define _OUSIA_PARSER_STACK_STACK_HPP_
-#include <cstdint>
-
#include <map>
#include <memory>
-#include <set>
-#include <vector>
-
-#include <core/common/Variant.hpp>
-#include <core/parser/Parser.hpp>
namespace ousia {
// Forward declarations
class ParserContext;
-class Logger;
+class TokenizedData;
+class Variant;
namespace parser_stack {
// Forward declarations
-class Handler;
+class ParserCallbacks;
+class StackImpl;
class State;
/**
- * The HandlerInfo class is used internally by the stack to associate additional
- * (mutable) data with a handler instance.
- */
-class HandlerInfo {
-public:
- /**
- * Pointer pointing at the actual handler instance.
- */
- std::shared_ptr<Handler> handler;
-
- /**
- * Next field index to be passed to the "fieldStart" function of the Handler
- * class.
- */
- size_t fieldIdx;
-
- /**
- * Set to true if the handler is valid (which is the case if the "start"
- * method has returned true). If the handler is invalid, no more calls are
- * directed at it until it can be removed from the stack.
- */
- bool valid : 1;
-
- /**
- * Set to true if this is an implicit handler, that was created when the
- * current stack state was deduced.
- */
- bool implicit : 1;
-
- /**
- * Set to true if the handler currently is in a field.
- */
- bool inField : 1;
-
- /**
- * Set to true if the handler currently is in the default field.
- */
- bool inDefaultField : 1;
-
- /**
- * Set to true if the handler currently is in an implicitly started default
- * field.
- */
- bool inImplicitDefaultField : 1;
-
- /**
- * Set to false if this field is only opened pro-forma and does not accept
- * any data. Otherwise set to true.
- */
- bool inValidField : 1;
-
- /**
- * Set to true, if the default field was already started.
- */
- bool hadDefaultField : 1;
-
- /**
- * Default constructor of the HandlerInfo class.
- */
- HandlerInfo();
- /**
- * Constructor of the HandlerInfo class, allows to set all flags manually.
- */
- HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField,
- bool inImplicitDefaultField, bool inValidField);
-
- /**
- * Constructor of the HandlerInfo class, taking a shared_ptr to the handler
- * to which additional information should be attached.
- */
- HandlerInfo(std::shared_ptr<Handler> handler);
-
- /**
- * Destructor of the HandlerInfo class (to allow Handler to be forward
- * declared).
- */
- ~HandlerInfo();
-
- /**
- * Updates the "field" flags according to a "fieldStart" event.
- */
- void fieldStart(bool isDefault, bool isImplicit, bool isValid);
-
- /**
- * Updates the "fields" flags according to a "fieldEnd" event.
- */
- void fieldEnd();
-};
-
-/**
* The Stack class is a pushdown automaton responsible for turning a command
* stream into a tree of Node instances. It does so by following a state
* transition graph and creating a set of Handler instances, which are placed
- * on the stack.
+ * on the stack. Additionally it is responsible for the normalization of
+ * Annotations and for handling tokens.
*/
class Stack {
private:
/**
- * Reference at the parser context.
- */
- ParserContext &ctx;
-
- /**
- * Map containing all registered command names and the corresponding
- * state descriptors.
- */
- const std::multimap<std::string, const State *> &states;
-
- /**
- * Internal stack used for managing the currently active Handler instances.
- */
- std::vector<HandlerInfo> stack;
-
- /**
- * Return the reference in the Logger instance stored within the context.
- */
- Logger &logger();
-
- /**
- * Used internally to get all expected command names for the current state.
- * This function is used to build error messages.
- *
- * @return a set of strings containing the names of the expected commands.
+ * Pointer at the internal implementation
*/
- std::set<std::string> expectedCommands();
-
- /**
- * Returns the targetState for a command with the given name that can be
- * reached from the current state.
- *
- * @param name is the name of the requested command.
- * @return nullptr if no target state was found, a pointer at the target
- * state otherwise.
- */
- const State *findTargetState(const std::string &name);
-
- /**
- * Returns the targetState for a command with the given name that can be
- * reached from the current state, also including the wildcard "*" state.
- * Throws an exception if the given target state is not a valid identifier.
- *
- * @param name is the name of the requested command.
- * @return nullptr if no target state was found, a pointer at the target
- * state otherwise.
- */
- const State *findTargetStateOrWildcard(const std::string &name);
-
- /**
- * Tries to reconstruct the parser state from the Scope instance of the
- * ParserContext given in the constructor. This functionality is needed for
- * including files,as the Parser of the included file needs to be brought to
- * an equivalent state as the one in the including file.
- */
- void deduceState();
-
- /**
- * Returns a reference at the current HandlerInfo instance (or a stub
- * HandlerInfo instance if the stack is empty).
- */
- HandlerInfo &currentInfo();
-
- /**
- * Returns a reference at the last HandlerInfo instance (or a stub
- * HandlerInfo instance if the stack has only one element).
- */
- HandlerInfo &lastInfo();
-
- /**
- * Ends all handlers that currently are not inside a field and already had
- * a default field. This method is called whenever the data() and command()
- * events are reached.
- */
- void endOverdueHandlers();
-
- /**
- * Ends the current handler and removes the corresponding element from the
- * stack.
- */
- void endCurrentHandler();
-
- /**
- * Tries to start a default field for the current handler, if currently the
- * handler is not inside a field and did not have a default field yet.
- *
- * @return true if the handler is inside a field, false if no field could
- * be started.
- */
- bool ensureHandlerIsInField();
-
- /**
- * Returns true if all handlers on the stack are currently valid, or false
- * if at least one handler is invalid.
- *
- * @return true if all handlers on the stack are valid.
- */
- bool handlersValid();
+ std::unique_ptr<StackImpl> impl;
public:
/**
* Creates a new instance of the Stack class.
*
+ * @param parser is an implementation of the ParserCallbacks instance to
+ * which certain calls are directed.
* @param ctx is the parser context the parser stack is working on.
* @param states is a map containing the command names and pointers at the
* corresponding State instances.
*/
- Stack(ParserContext &ctx,
+ Stack(ParserCallbacks &parser, ParserContext &ctx,
const std::multimap<std::string, const State *> &states);
/**
@@ -268,10 +81,10 @@ public:
/**
* Returns the state the Stack instance currently is in.
*
- * @return the state of the currently active Handler instance or STATE_NONE
- * if no handler is on the stack.
+ * @return the state of the currently active Handler instance or
+ * States::None if no handler is on the stack.
*/
- const State &currentState();
+ const State &currentState() const;
/**
* Returns the command name that is currently being handled.
@@ -279,7 +92,7 @@ public:
* @return the name of the command currently being handled by the active
* Handler instance or an empty string if no handler is currently active.
*/
- std::string currentCommandName();
+ std::string currentCommandName() const;
/**
* Function that should be called whenever a new command is reached.
@@ -288,17 +101,36 @@ public:
* separator ':') and its corresponding location. Must be a string variant.
* @param args is a map containing the arguments that were passed to the
* command.
+ * @param range if true, the started command has an explicit range.
*/
- void command(const Variant &name, const Variant::mapType &args);
+ void commandStart(const Variant &name, const Variant::mapType &args,
+ bool range = false);
/**
- * Function that shuold be called whenever character data is found in the
- * input stream. May only be called if the currently is a command on the
- * stack.
+ * Function that should be called whenever an annotation starts.
+ *
+ * @param className is the name of the annotation class.
+ * @param args is a map variant containing the arguments that were passed
+ * to the annotation.
+ * @param range if true, the annotation fields have an explicit range.
+ */
+ void annotationStart(const Variant &className, const Variant &args,
+ bool range = false);
+
+ /**
+ * Function that should be called whenever an annotation ends.
*
- * @param data is a string variant containing the data that has been found.
+ * @param className is the name of the annotation class that was ended.
+ * @param elementName is the name of the annotation element that was ended.
*/
- void data(const Variant &data);
+ void annotationEnd(const Variant &className, const Variant &elementName);
+
+ /**
+ * Function that should be called whenever a ranged command or annotation
+ * ends. Must be called if the range parameter range was set to true when
+ * annotationStart() or commandStart() were called.
+ */
+ void rangeEnd();
/**
* Function that should be called whenever a new field starts. Fields of the
@@ -317,29 +149,25 @@ public:
void fieldEnd();
/**
- * Function that should be called whenever an annotation starts.
- *
- * @param name is the name of the annotation class.
- * @param args is a map variant containing the arguments that were passed
- * to the annotation.
- */
- void annotationStart(const Variant &className, const Variant &args);
-
- /**
- * Function that should be called whenever an annotation ends.
+ * Function that should be called whenever character data is found in the
+ * input stream. May only be called if there currently is a command on the
+ * stack.
*
- * @param name is the name of the annotation class that was ended.
- * @param annotationName is the name of the annotation that was ended.
+ * @param data is a TokenizedData instance containing the pre-segmented data
+ * that should be read.
*/
- void annotationEnd(const Variant &className, const Variant &elementName);
+ void data(const TokenizedData &data);
/**
- * Function that should be called whenever a previously registered token
- * is found in the input stream.
+ * Function that may be called whenever character data is found in the
+ * input stream. May only be called if there currently is a command on
+ * the stack. This method is mainly intended for unit testing; it
+ * converts the given string into a TokenizedData instance.
*
- * @param token is string variant containing the token that was encountered.
+ * @param str is a string containing the data that should be passed to the
+ * tokenizer.
*/
- void token(Variant token);
+ void data(const std::string &str);
};
}
}
diff --git a/src/core/parser/stack/State.cpp b/src/core/parser/stack/State.cpp
index d72f533..0feeed6 100644
--- a/src/core/parser/stack/State.cpp
+++ b/src/core/parser/stack/State.cpp
@@ -23,17 +23,19 @@ namespace parser_stack {
/* Class State */
-State::State() : elementHandler(nullptr) {}
+State::State() : elementHandler(nullptr), supportsAnnotations(false), supportsTokens(false) {}
State::State(StateSet parents, Arguments arguments,
RttiSet createdNodeTypes,
HandlerConstructor elementHandler,
- bool supportsAnnotations)
+ bool supportsAnnotations,
+ bool supportsTokens)
: parents(parents),
arguments(arguments),
createdNodeTypes(createdNodeTypes),
elementHandler(elementHandler),
- supportsAnnotations(supportsAnnotations)
+ supportsAnnotations(supportsAnnotations),
+ supportsTokens(supportsTokens)
{
}
@@ -93,6 +95,13 @@ StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations)
return *this;
}
+StateBuilder &StateBuilder::supportsTokens(bool supportsTokens)
+{
+ state.supportsTokens = supportsTokens;
+ return *this;
+}
+
+
const State &StateBuilder::build() const { return state; }
/* Class StateDeductor */
diff --git a/src/core/parser/stack/State.hpp b/src/core/parser/stack/State.hpp
index 4766235..011ccd6 100644
--- a/src/core/parser/stack/State.hpp
+++ b/src/core/parser/stack/State.hpp
@@ -82,13 +82,21 @@ struct State {
/**
* Set to true if this handler does support annotations. This is almost
- * always false (e.g. all description handlers), except for document
+ * always false (e.g. all description handlers), except for document
* element handlers.
*/
- bool supportsAnnotations;
+ bool supportsAnnotations : 1;
/**
- * Default constructor, initializes the handlers with nullptr.
+ * Set to true if this handler does support tokens. This is almost
+ * always false (e.g. all description handlers), except for document
+ * element handlers.
+ */
+ bool supportsTokens : 1;
+
+ /**
+ * Default constructor, initializes the handlers with nullptr and the
+ * supportsAnnotations and supportsTokens flags with false.
*/
State();
@@ -108,11 +116,12 @@ struct State {
* be nullptr in which case no handler instance is created.
* @param supportsAnnotations specifies whether annotations are supported
* here at all.
+ * @param supportsTokens specifies whether tokens are supported here at all.
*/
State(StateSet parents, Arguments arguments = Arguments{},
- RttiSet createdNodeTypes = RttiSet{},
- HandlerConstructor elementHandler = nullptr,
- bool supportsAnnotations = false);
+ RttiSet createdNodeTypes = RttiSet{},
+ HandlerConstructor elementHandler = nullptr,
+ bool supportsAnnotations = false, bool supportsTokens = false);
/**
* Creates this State from the given StateBuilder instance.
@@ -220,6 +229,16 @@ public:
StateBuilder &supportsAnnotations(bool supportsAnnotations);
/**
+ * Sets the state of the "supportsTokens" flag (default value is false).
+ *
+ * @param supportsTokens should be set to true, if the elementHandler
+ * registered for this state is capable of handling tokens.
+ * @return a reference at this StateBuilder instance for method
+ * chaining.
+ */
+ StateBuilder &supportsTokens(bool supportsTokens);
+
+ /**
* Returns a reference at the internal State instance that was built
* using the StateBuilder.
*
@@ -275,7 +294,7 @@ public:
* @param states is a list of states that should be checked.
*/
StateDeductor(std::vector<const Rtti *> signature,
- std::vector<const State *> states);
+ std::vector<const State *> states);
/**
* Selects all active states from the given states. Only considers those
diff --git a/src/core/parser/stack/TokenRegistry.cpp b/src/core/parser/stack/TokenRegistry.cpp
new file mode 100644
index 0000000..c135b98
--- /dev/null
+++ b/src/core/parser/stack/TokenRegistry.cpp
@@ -0,0 +1,80 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Callbacks.hpp"
+#include "TokenRegistry.hpp"
+
+namespace ousia {
+namespace parser_stack {
+
+TokenRegistry::~TokenRegistry()
+{
+ for (const auto &tid: tokenIds) {
+ parser.unregisterToken(tid.first);
+ }
+}
+
+TokenId TokenRegistry::registerToken(const std::string &token)
+{
+ // Check whether the given token is already registered
+ auto it = tokens.find(token);
+ if (it != tokens.end()) {
+ // Increment the reference count
+ size_t &refCount = it->second.second;
+ refCount++;
+
+ // Return the token id
+ return it->second.first;
+ }
+
+ // Register the token in the parser
+ TokenId id = parser.registerToken(token);
+ tokens[token] = std::pair<TokenId, size_t>(id, 1);
+ tokenIds[id] = token;
+ return id;
+}
+
+void TokenRegistry::unregisterToken(TokenId id)
+{
+ // Lookup the token corresponding to the given token id
+ auto tokenIt = tokenIds.find(id);
+ if (tokenIt != tokenIds.end()) {
+ const std::string &token = tokenIt->second;
+ // Lookup the reference count for the corresponding token
+ auto idIt = tokens.find(token);
+ if (idIt != tokens.end()) {
+ // Decrement the reference count, abort if the refCount is larger
+ // than zero
+ size_t &refCount = idIt->second.second;
+ refCount--;
+ if (refCount > 0) {
+ return;
+ }
+
+ // Unregister the token from the parser
+ parser.unregisterToken(id);
+
+ // Unregister the token from the internal tokens map
+ tokens.erase(token);
+ }
+ // Unregister the token from the internal id map
+ tokenIds.erase(id);
+ }
+}
+}
+}
diff --git a/src/core/parser/stack/TokenRegistry.hpp b/src/core/parser/stack/TokenRegistry.hpp
new file mode 100644
index 0000000..545db39
--- /dev/null
+++ b/src/core/parser/stack/TokenRegistry.hpp
@@ -0,0 +1,114 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file TokenRegistry.hpp
+ *
+ * Contains the TokenRegistry class used for registering all user defined tokens
+ * during the parsing process.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_
+#define _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_
+
+#include <string>
+#include <unordered_map>
+
+#include <core/common/Token.hpp>
+
+namespace ousia {
+namespace parser_stack {
+
+// Forward declarations
+class ParserCallbacks;
+
+/**
+ * The TokenRegistry class is used for registering all user defined tokens
+ * during the Parsing process. The TokenRegistry class acts as an adapter
+ * between the parser which allocates a TokenId for each unique token and the
+ * Handler classes which may register the same token multiple times and expect
+ * the same TokenId to be returned for the same token.
+ */
+class TokenRegistry {
+private:
+ /**
+ * Reference at the ParserCallback instance the tokens are relayed to.
+ */
+ ParserCallbacks &parser;
+
+ /**
+ * Store containing all TokenId instances for all registered tokens. The map
+ * maps from the token strings to the corresponding TokenId and a reference
+ * count.
+ */
+ std::unordered_map<std::string, std::pair<TokenId, size_t>> tokens;
+
+ /**
+ * Reverse map containing the string corresponding to a TokenId.
+ */
+ std::unordered_map<TokenId, std::string> tokenIds;
+
+public:
+ /**
+ * Constructor of the TokenRegistry class.
+ *
+ * @param parser is the underlying parser implementing the ParserCallbacks
+ * interface to which all calls are relayed.
+ */
+ TokenRegistry(ParserCallbacks &parser) : parser(parser) {}
+
+ /**
+ * Destructor of the TokenRegistry class, removes all registered tokens from
+ * the parser.
+ */
+ ~TokenRegistry();
+
+ /* No copy construction */
+ TokenRegistry(const TokenRegistry &) = delete;
+
+ /* No assignment */
+ TokenRegistry &operator=(const TokenRegistry &) = delete;
+
+ /**
+ * Registers the given string token in the underlying parser and returns the
+ * TokenId of that token. If the same token string is given multiple times,
+ * the same TokenId is returned. The token is only registered once in the
+ * parser.
+ *
+ * @param token is the token that should be registered.
+ * @return the TokenId associated with this token.
+ */
+ TokenId registerToken(const std::string &token);
+
+ /**
+ * Unregisters the token with the given TokenId from the parser. Note that
+ * the token will only be unregistered if unregisterToken() has been called
+ * as many times as registerToken() for the same token.
+ *
+ * @param id is the id of the token returned by registerToken() that should
+ * be unregistered.
+ */
+ void unregisterToken(TokenId id);
+};
+}
+}
+
+#endif /* _OUSIA_PARSER_STACK_TOKEN_REGISTRY_HPP_ */
+
diff --git a/src/core/parser/stack/TokenStack.cpp b/src/core/parser/stack/TokenStack.cpp
new file mode 100644
index 0000000..ac1d94e
--- /dev/null
+++ b/src/core/parser/stack/TokenStack.cpp
@@ -0,0 +1,45 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "TokenStack.hpp"
+
+namespace ousia {
+namespace parser_stack {
+
+void TokenStack::pushTokens(const std::vector<SyntaxDescriptor> &tokens)
+{
+ stack.push_back(tokens);
+}
+
+void TokenStack::popTokens() { stack.pop_back(); }
+
+TokenSet TokenStack::tokens() const
+{
+ if (stack.empty()) {
+ // Guard against calling back() on an empty stack: fall back to the
+ // parent stack if one exists, otherwise no tokens are enabled.
+ return (parentStack != nullptr) ? parentStack->tokens() : TokenSet{};
+ }
+
+ TokenSet res;
+ for (const SyntaxDescriptor &descr : stack.back()) {
+ descr.insertIntoTokenSet(res);
+ }
+ return res;
+}
+}
+}
+
diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp
new file mode 100644
index 0000000..f2e7edc
--- /dev/null
+++ b/src/core/parser/stack/TokenStack.hpp
@@ -0,0 +1,112 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file TokenStack.hpp
+ *
+ * Contains the TokenStack class used for collecting the currently enabled user
+ * defined tokens on a per-field basis.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_
+#define _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_
+
+#include <memory>
+#include <vector>
+
+#include <core/common/Token.hpp>
+#include <core/model/Syntax.hpp>
+
+namespace ousia {
+namespace parser_stack {
+
+/**
+ * The TokenStack class is used by the Stack class to collect all currently
+ * enabled user defined tokens.
+ */
+class TokenStack {
+private:
+ /**
+ * Shared pointer at the parent TokenStack instance. May be nullptr, in
+ * which case no parent TokenStack instance exists.
+ */
+ const TokenStack *parentStack;
+
+ /**
+ * Stack containing vectors of TokenSyntaxDescriptor instances as given by
+ * the user.
+ */
+ std::vector<std::vector<SyntaxDescriptor>> stack;
+
+ /**
+ * Constructor of the TokenStack class.
+ *
+ * @param parentStack is a pointer at the underlying parentStack instance
+ * to which calls should be forwarded if no data has been pushed onto this
+ * stack instance.
+ */
+ TokenStack(const TokenStack *parentStack) : parentStack(parentStack) {}
+
+public:
+ /**
+ * Default constructor of the TokenStack class with no reference at a parent
+ * stack.
+ */
+ TokenStack() : TokenStack(nullptr) {}
+
+ /**
+ * Constructor of the TokenStack class with a reference at a parent
+ * TokenStack instance.
+ *
+ * @param parentStack is a reference at a parent TokenStack instance. If no
+ * data has yet been pushed onto this instance, calls will be forwarded to
+ * the parent stack.
+ */
+ TokenStack(const TokenStack &parentStack) : TokenStack(&parentStack) {}
+
+ /**
+ * Pushes a list of SyntaxDescriptor instances onto the internal stack.
+ *
+ * @param tokens is a list of SyntaxDescriptor instances that should be
+ * stored on the stack.
+ */
+ void pushTokens(const std::vector<SyntaxDescriptor> &tokens);
+
+ /**
+ * Removes the previously pushed list of tokens from the stack.
+ */
+ void popTokens();
+
+ /**
+ * Returns a set containing all currently enabled tokens. The set of enabled
+ * tokens are those tokens that were pushed last onto the stack. This set
+ * has to be passed to the TokenizedData instance in order to gather all
+ * tokens that are currently possible.
+ *
+ * @return a set of tokens containing all the Tokens that are currently
+ * possible.
+ */
+ TokenSet tokens() const;
+};
+}
+}
+
+#endif /* _OUSIA_PARSER_STACK_TOKEN_STACK_HPP_ */
+
diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp
index b62f684..73bcf62 100644
--- a/src/core/parser/stack/TypesystemHandler.cpp
+++ b/src/core/parser/stack/TypesystemHandler.cpp
@@ -32,7 +32,7 @@ namespace parser_stack {
/* TypesystemHandler */
-bool TypesystemHandler::start(Variant::mapType &args)
+bool TypesystemHandler::startCommand(Variant::mapType &args)
{
// Create the typesystem instance
Rooted<Typesystem> typesystem =
@@ -63,7 +63,7 @@ void TypesystemHandler::end() { scope().pop(logger()); }
/* TypesystemEnumHandler */
-bool TypesystemEnumHandler::start(Variant::mapType &args)
+bool TypesystemEnumHandler::startCommand(Variant::mapType &args)
{
scope().setFlag(ParserFlag::POST_HEAD, true);
@@ -91,17 +91,17 @@ void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData,
/* TypesystemStructHandler */
-bool TypesystemStructHandler::start(Variant::mapType &args)
+bool TypesystemStructHandler::startCommand(Variant::mapType &args)
{
scope().setFlag(ParserFlag::POST_HEAD, true);
// Fetch the arguments used for creating this type
- const std::string &name = args["name"].asString();
+ const std::string &structName = args["name"].asString();
 const std::string &parent = args["parent"].asString();
 
 // Fetch the current typesystem and create the struct node
 Rooted<Typesystem> typesystem = scope().selectOrThrow<Typesystem>();
+ Rooted<StructType> structType = typesystem->createStructType(structName);
structType->setLocation(location());
// Try to resolve the parent type and set it as parent structure
@@ -124,18 +124,18 @@ void TypesystemStructHandler::end() { scope().pop(logger()); }
/* TypesystemStructFieldHandler */
-bool TypesystemStructFieldHandler::start(Variant::mapType &args)
+bool TypesystemStructFieldHandler::startCommand(Variant::mapType &args)
{
// Read the argument values
- const std::string &name = args["name"].asString();
+ const std::string &fieldName = args["name"].asString();
const std::string &type = args["type"].asString();
const Variant &defaultValue = args["default"];
const bool optional =
!(defaultValue.isObject() && defaultValue.asObject() == nullptr);
Rooted<StructType> structType = scope().selectOrThrow<StructType>();
- Rooted<Attribute> attribute =
- structType->createAttribute(name, defaultValue, optional, logger());
+ Rooted<Attribute> attribute = structType->createAttribute(
+ fieldName, defaultValue, optional, logger());
attribute->setLocation(location());
// Try to resolve the type and default value
@@ -163,17 +163,17 @@ bool TypesystemStructFieldHandler::start(Variant::mapType &args)
/* TypesystemConstantHandler */
-bool TypesystemConstantHandler::start(Variant::mapType &args)
+bool TypesystemConstantHandler::startCommand(Variant::mapType &args)
{
scope().setFlag(ParserFlag::POST_HEAD, true);
// Read the argument values
- const std::string &name = args["name"].asString();
+ const std::string &constantName = args["name"].asString();
const std::string &type = args["type"].asString();
const Variant &value = args["value"];
Rooted<Typesystem> typesystem = scope().selectOrThrow<Typesystem>();
- Rooted<Constant> constant = typesystem->createConstant(name, value);
+ Rooted<Constant> constant = typesystem->createConstant(constantName, value);
constant->setLocation(location());
// Try to resolve the type
diff --git a/src/core/parser/stack/TypesystemHandler.hpp b/src/core/parser/stack/TypesystemHandler.hpp
index 85494f1..0773a3a 100644
--- a/src/core/parser/stack/TypesystemHandler.hpp
+++ b/src/core/parser/stack/TypesystemHandler.hpp
@@ -43,7 +43,7 @@ class TypesystemHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
/**
@@ -67,7 +67,7 @@ class TypesystemEnumHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
/**
@@ -114,7 +114,7 @@ class TypesystemStructHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
void end() override;
/**
@@ -139,7 +139,7 @@ class TypesystemStructFieldHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
/**
* Creates a new instance of the TypesystemStructFieldHandler.
@@ -162,7 +162,7 @@ class TypesystemConstantHandler : public StaticHandler {
public:
using StaticHandler::StaticHandler;
- bool start(Variant::mapType &args) override;
+ bool startCommand(Variant::mapType &args) override;
/**
* Creates a new instance of the TypesystemConstantHandler.
diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index d15055a..f322a88 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -33,6 +33,7 @@
#include <limits>
#include <vector>
#include <utility>
+#include <unordered_map>
#include <core/common/Location.hpp>
@@ -43,6 +44,9 @@ namespace ousia {
* a delta compression.
*/
class SourceOffsetVector {
+public:
+ using OffsPair = std::pair<SourceOffset, SourceOffset>;
+
private:
/**
* Type used for representing the length of a character.
@@ -82,9 +86,12 @@ private:
std::vector<SourceOffset> offsets;
/**
+ * Map used to store discontinuities in the character offsets.
+ */
+ std::unordered_map<size_t, OffsPair> gaps;
+
+ /**
* Last position given as "end" position in the storeOffset() method.
- * Used to adapt the length of the previous element in case start and end
- * positions do not match.
*/
SourceOffset lastEnd;
@@ -105,19 +112,22 @@ public:
// Make sure (end - start) is smaller than MAX_LEN
assert(end - start < MAX_LEN);
- // Adapt the length of the previous character in case there is a gap
- if (!lens.empty() && start > lastEnd) {
- lens.back() += start - lastEnd;
- }
- lastEnd = end;
-
// Store an absolute offset every OFFSET_INTERVAL elements
if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) {
offsets.push_back(start);
}
- // Store the length
- lens.push_back(end - start);
+ // Adapt the length of the previous character in case there is a gap
+ if (!lens.empty() && start > lastEnd) {
+ // There is a discontinuity, store the given offsets in the "gaps"
+ // map
+ gaps[lens.size()] = OffsPair(start, end);
+ lens.push_back(MAX_LEN);
+ } else {
+ // Store the length
+ lens.push_back(end - start);
+ }
+ lastEnd = end;
}
/**
@@ -127,14 +137,13 @@ public:
* read.
* @return a pair containing start and end source offset.
*/
- std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx)
+ OffsPair loadOffset(size_t idx) const
{
// Special treatment for the last character
const size_t count = lens.size();
if (idx > 0 && idx == count) {
auto offs = loadOffset(count - 1);
- return std::pair<SourceOffset, SourceOffset>(offs.second,
- offs.second);
+ return OffsPair(offs.second, offs.second);
}
// Calculate the start index in the lens vector and in the offsets
@@ -146,18 +155,66 @@ public:
assert(idx < count);
assert(offsetIdx < offsets.size());
+ // If the length of the last character is MAX_LEN, the position is
+ // stored in the "gaps" list
+ if (lens[idx] == MAX_LEN) {
+ auto it = gaps.find(idx);
+ assert(it != gaps.end());
+ return it->second;
+ }
+
// Sum over the length starting with the start offset
SourceOffset start = offsets[offsetIdx];
for (size_t i = sumStartIdx; i < idx; i++) {
- start += lens[i];
+ if (lens[i] == MAX_LEN) {
+ auto it = gaps.find(i);
+ assert(it != gaps.end());
+ start = it->second.first;
+ } else {
+ start += lens[i];
+ }
}
- return std::pair<SourceOffset, SourceOffset>(start, start + lens[idx]);
+ return OffsPair(start, start + lens[idx]);
}
/**
* Returns the number of characters for which offsets are stored.
*/
- size_t size() { return lens.size(); }
+ size_t size() const { return lens.size(); }
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length.
+ * Removes all token matches that lie within the trimmed region.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length)
+ {
+ if (length < size()) {
+ lens.resize(length);
+ if (length > 0) {
+ offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
+ lastEnd = loadOffset(length - 1).second;
+ } else {
+ offsets.clear();
+ gaps.clear();
+ lastEnd = 0;
+ }
+ }
+ }
+
+ /**
+ * Resets the SourceOffsetVector to the state it had when it was
+ * constructed.
+ */
+ void clear()
+ {
+ lens.clear();
+ offsets.clear();
+ gaps.clear();
+ lastEnd = 0;
+ }
};
}
diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 80cc945..a45d3ff 100644
--- a/src/core/parser/utils/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
@@ -22,12 +22,12 @@ namespace ousia {
/* Class DynamicTokenTree::Node */
-TokenTrie::Node::Node() : type(Tokens::Empty) {}
+TokenTrie::Node::Node() : id(Tokens::Empty) {}
/* Class DynamicTokenTree */
bool TokenTrie::registerToken(const std::string &token,
- TokenId type) noexcept
+ TokenId id) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token,
}
// If the resulting node already has a type set, we're screwed.
- if (node->type != Tokens::Empty) {
+ if (node->id != Tokens::Empty) {
return false;
}
// Otherwise just set the type to the given type.
- node->type = type;
+ node->id = id;
return true;
}
@@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
// Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->type != Tokens::Empty || node->children.size() > 1) &&
+ if ((node->id != Tokens::Empty || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
@@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept
}
// If the node type is already Tokens::Empty, we cannot do anything here
- if (node->type == Tokens::Empty) {
+ if (node->id == Tokens::Empty) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
// type to Tokens::Empty instead
if (!node->children.empty()) {
- node->type = Tokens::Empty;
+ node->id = Tokens::Empty;
return true;
}
@@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept
}
node = it->second.get();
}
- return node->type;
+ return node->id;
}
}
diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index b2d1539..c470acc 100644
--- a/src/core/parser/utils/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
@@ -33,7 +33,7 @@
#include <limits>
#include <unordered_map>
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
@@ -75,10 +75,9 @@ public:
ChildMap children;
/**
- * Reference at the corresponding token descriptor. Set to nullptr if
- * no token is attached to this node.
+ * Id of the token represented by this node.
*/
- TokenId type;
+ TokenId id;
/**
* Default constructor, initializes the descriptor with nullptr.
@@ -99,10 +98,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param type is the descriptor that should be set for this token.
+ * @param id is the descriptor that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token, TokenId type) noexcept;
+ bool registerToken(const std::string &token, TokenId id) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index fc7bfaf..d8a8b37 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -26,6 +26,11 @@
#include "TokenizedData.hpp"
namespace ousia {
+/**
+ * Maximum token length.
+ */
+constexpr TokenLength MaxTokenLength = std::numeric_limits<TokenLength>::max();
+
namespace {
/**
* Structure used to represent the position of a token in the internal
@@ -48,6 +53,11 @@ struct TokenMark {
TokenLength len;
/**
+ * Specifies whether the token is special or not.
+ */
+ bool special;
+
+ /**
* Constructor of the TokenMark structure, initializes all members with the
* given values.
*
@@ -55,9 +65,10 @@ struct TokenMark {
* @param bufStart is the start position of the TokenMark in the internal
* character buffer.
* @param len is the length of the token.
+ * @param special modifies the sort order, special tokens are preferred.
*/
- TokenMark(TokenId id, size_t bufStart, TokenLength len)
- : bufStart(bufStart), id(id), len(len)
+ TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+ : bufStart(bufStart), id(id), len(len), special(special)
{
}
@@ -72,7 +83,8 @@ struct TokenMark {
TokenMark(size_t bufStart)
: bufStart(bufStart),
id(Tokens::Empty),
- len(std::numeric_limits<TokenLength>::max())
+ len(MaxTokenLength),
+ special(true)
{
}
@@ -86,8 +98,22 @@ struct TokenMark {
*/
friend bool operator<(const TokenMark &m1, const TokenMark &m2)
{
- return (m1.bufStart < m2.bufStart) ||
- (m1.bufStart == m2.bufStart && m1.len > m2.len);
+ // Prefer the mark with the smaller bufStart
+ if (m1.bufStart < m2.bufStart) {
+ return true;
+ }
+
+ // Special handling for marks with the same bufStart
+ if (m1.bufStart == m2.bufStart) {
+ // If exactly one of the two marks is special, return true if this
+ // one is special
+ if (m1.special != m2.special) {
+ return m1.special;
+ }
+ // Otherwise prefer longer marks
+ return m1.len > m2.len;
+ }
+ return false;
}
};
}
@@ -110,9 +136,9 @@ private:
std::vector<char> buf;
/**
- * Vector containing all token marks.
+ * Buffer storing the "protected" flag of the character data.
*/
- std::vector<TokenMark> marks;
+ std::vector<bool> protectedChars;
/**
* Vector storing all the character offsets efficiently.
@@ -120,9 +146,34 @@ private:
SourceOffsetVector offsets;
/**
+ * Vector containing all token marks.
+ */
+ mutable std::vector<TokenMark> marks;
+
+ /**
+ * Position of the first linebreak in a sequence of linebreaks.
+ */
+ size_t firstLinebreak;
+
+ /**
+ * Current indentation level.
+ */
+ uint16_t currentIndentation;
+
+ /**
+ * Last indentation level.
+ */
+ uint16_t lastIndentation;
+
+ /**
+ * Number of linebreaks without any content between them.
+ */
+ uint16_t numLinebreaks;
+
+ /**
* Flag indicating whether the internal "marks" vector is sorted.
*/
- bool sorted;
+ mutable bool sorted;
public:
/**
@@ -132,7 +183,7 @@ public:
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
- TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+ TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }
/**
* Appends a complete string to the internal character buffer and extends
@@ -140,22 +191,22 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart)
- { // Append the data to the internal buffer
- buf.insert(buf.end(), data.begin(), data.end());
-
- // Extend the text regions, interpolate the source position (this may
- // yield incorrect results)
- const size_t size = buf.size();
- for (SourceOffset offs = offsStart; offs < offsStart + data.size();
- offs++) {
- offsets.storeOffset(offs, offs + 1);
+ size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+ {
+ for (size_t i = 0; i < data.size(); i++) {
+ if (offsStart != InvalidSourceOffset) {
+ append(data[i], offsStart + i, offsStart + i + 1, protect);
+ } else {
+ append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+ protect);
+ }
}
-
- return size;
+ return size();
}
/**
@@ -165,16 +216,86 @@ public:
* @param c is the character that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param offsEnd is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect)
{
// Add the character to the list and store the location of the character
// in the source file
buf.push_back(c);
+ protectedChars.push_back(protect);
offsets.storeOffset(offsStart, offsEnd);
- return buf.size();
+
+ // Insert special tokens
+ const size_t size = buf.size();
+ const bool isWhitespace = Utils::isWhitespace(c);
+ const bool isLinebreak = Utils::isLinebreak(c);
+
+ // Handle linebreaks
+ if (isLinebreak) {
+ // Mark linebreaks as linebreak
+ mark(Tokens::Newline, size - 1, 1, false);
+
+ // The linebreak sequence started at the previous character
+ if (numLinebreaks == 0) {
+ firstLinebreak = size - 1;
+ }
+
+ // Reset the indentation
+ currentIndentation = 0;
+
+ // Increment the number of linebreaks
+ numLinebreaks++;
+
+ const size_t markStart = firstLinebreak;
+ const size_t markLength = size - firstLinebreak;
+
+ // Issue two consecutive linebreaks as paragraph token
+ if (numLinebreaks == 2) {
+ mark(Tokens::Paragraph, markStart, markLength, false);
+ }
+
+ // Issue three or more consecutive linebreaks as a section token
+ if (numLinebreaks >= 3) {
+ mark(Tokens::Section, markStart, markLength, false);
+ }
+ } else if (isWhitespace) {
+ // Count the whitespace characters at the beginning of the line
+ if (numLinebreaks > 0) {
+ // Implement the UNIX/Python rule for tabs: Tabs extend to the
+ // next multiple of eight.
+ if (c == '\t') {
+ currentIndentation = (currentIndentation + 8) & ~7;
+ } else {
+ currentIndentation++;
+ }
+ }
+ }
+
+ // Issue indent and unindent tokens
+ if (!isWhitespace && numLinebreaks > 0) {
+ // Issue a larger indentation than that in the previous line as
+ // "Indent" token
+ if (currentIndentation > lastIndentation) {
+ mark(Tokens::Indent, size - 1, 0, true);
+ }
+
+ // Issue a smaller indentation than that in the previous line as
+ // "Dedent" token
+ if (currentIndentation < lastIndentation) {
+ mark(Tokens::Dedent, size - 1, 0, true);
+ }
+
+ // Reset the internal state machine
+ lastIndentation = currentIndentation;
+ numLinebreaks = 0;
+ }
+
+ return size;
}
/**
@@ -184,11 +305,12 @@ public:
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
+ * @param special tags the mark as "special", preferring it in the sort order
*/
- void mark(TokenId id, size_t bufStart, TokenLength len)
+ void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
{
// Push the new instance back onto the list
- marks.emplace_back(id, bufStart, len);
+ marks.emplace_back(id, bufStart, len, special);
// Update the sorted flag as soon as more than one element is in the
// list
@@ -212,9 +334,13 @@ public:
* @return true if a token was returned, false if no more tokens are
* available.
*/
- bool next(Token &token, WhitespaceMode mode,
- const std::unordered_set<TokenId> &tokens, size_t &cursor)
+ bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+ TokenizedDataCursor &cursor) const
{
+ // Some variables for convenient access
+ size_t &bufPos = cursor.bufPos;
+ size_t &markPos = cursor.markPos;
+
// Sort the "marks" vector if it has not been sorted yet.
if (!sorted) {
std::sort(marks.begin(), marks.end());
@@ -222,10 +348,11 @@ public:
}
// Fetch the next larger TokenMark instance, make sure the token is in
- // the "enabled" list
- auto it =
- std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
- while (it != marks.end() && tokens.count(it->id) == 0) {
+ // the "enabled" list and within the buffer range
+ auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+ TokenMark(bufPos));
+ while (it != marks.end() && (tokens.count(it->id) == 0 ||
+ it->bufStart + it->len > buf.size())) {
it++;
}
@@ -236,15 +363,15 @@ public:
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
- if (cursor < end && cursor < buf.size()) {
+ if (bufPos < end && bufPos < buf.size()) {
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
- Tokens::Data, std::string(&buf[cursor], end - cursor),
+ Tokens::Data, std::string(&buf[bufPos], end - bufPos),
SourceLocation(sourceId,
- offsets.loadOffset(cursor).first,
+ offsets.loadOffset(bufPos).first,
offsets.loadOffset(end).first));
- cursor = end;
+ bufPos = end;
return true;
}
case WhitespaceMode::TRIM:
@@ -254,30 +381,35 @@ public:
size_t stringStart;
size_t stringEnd;
std::string content;
+ const char *cBuf = &buf[bufPos];
+ auto filter = [cBuf, this](size_t i) -> bool {
+ return Utils::isWhitespace(cBuf[i]) &&
+ !protectedChars[i];
+ };
if (mode == WhitespaceMode::TRIM) {
- content = Utils::trim(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::trim(cBuf, end - bufPos, stringStart,
+ stringEnd, filter);
} else {
- content = Utils::collapse(&buf[cursor], end - cursor,
- stringStart, stringEnd);
+ content = Utils::collapse(
+ cBuf, end - bufPos, stringStart, stringEnd, filter);
}
// If the resulting string is empty (only whitespaces),
// abort
if (content.empty()) {
- cursor = end;
+ bufPos = end;
break;
}
// Calculate the absolute positions and return the token
- stringStart += cursor;
- stringEnd += cursor;
+ stringStart += bufPos;
+ stringEnd += bufPos;
token = Token(
Tokens::Data, content,
SourceLocation(sourceId,
offsets.loadOffset(stringStart).first,
offsets.loadOffset(stringEnd).first));
- cursor = end;
+ bufPos = end;
return true;
}
}
@@ -286,14 +418,18 @@ public:
// If start equals end, we're currently directly at a token
// instance. Return this token and advance the cursor to the end of
// the token.
- if (cursor == end && it != marks.end()) {
+ if (bufPos == end && it != marks.end()) {
const size_t tokenStart = it->bufStart;
const size_t tokenEnd = it->bufStart + it->len;
token = Token(
it->id, std::string(&buf[tokenStart], it->len),
SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
offsets.loadOffset(tokenEnd).first));
- cursor = tokenEnd;
+
+ // Update the cursor, consume the token by incrementing the marks
+ // pos counter
+ bufPos = tokenEnd;
+ markPos = it - marks.begin() + 1;
return true;
}
@@ -304,11 +440,64 @@ public:
}
/**
+ * Resets the TokenizedDataImpl instance to the state it had when it was
+ * constructed.
+ */
+ void clear()
+ {
+ buf.clear();
+ protectedChars.clear();
+ offsets.clear();
+ marks.clear();
+ firstLinebreak = 0;
+ currentIndentation = 0;
+ lastIndentation = 0;
+ numLinebreaks = 1; // Assume the stream starts with a linebreak
+ sorted = true;
+ }
+
+ /**
+ * Trims the length of the TokenizedDataImpl instance to the given length.
+ *
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length)
+ {
+ if (length < size()) {
+ buf.resize(length);
+ protectedChars.resize(length);
+ offsets.trim(length);
+ }
+ }
+
+ /**
* Returns the current size of the internal buffer.
*
* @return the size of the internal character buffer.
*/
- size_t getSize() { return buf.size(); }
+ size_t size() const { return buf.size(); }
+
+ /**
+ * Returns true if no data is in the data buffer.
+ *
+ * @return true if the "buf" instance has no data.
+ */
+ bool empty() const { return buf.empty(); }
+
+ /**
+ * Returns the current location of all data in the buffer.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const
+ {
+ if (empty()) {
+ return SourceLocation{sourceId};
+ }
+ return SourceLocation{sourceId, offsets.loadOffset(0).first,
+ offsets.loadOffset(size()).second};
+ }
};
/* Class TokenizedData */
@@ -316,50 +505,90 @@ public:
TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
TokenizedData::TokenizedData(SourceId sourceId)
- : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+ : impl(std::make_shared<TokenizedDataImpl>(sourceId))
{
}
+TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart,
+ SourceId sourceId)
+ : TokenizedData(sourceId)
+{
+ append(data, offsStart);
+}
+
TokenizedData::~TokenizedData() {}
-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+ bool protect)
{
- return impl->append(data, offsStart);
+ return impl->append(data, offsStart, protect);
}
size_t TokenizedData::append(char c, SourceOffset offsStart,
- SourceOffset offsEnd)
+ SourceOffset offsEnd, bool protect)
{
- return impl->append(c, offsStart, offsEnd);
+ return impl->append(c, offsStart, offsEnd, protect);
}
void TokenizedData::mark(TokenId id, TokenLength len)
{
- impl->mark(id, impl->getSize() - len, len);
+ impl->mark(id, impl->size() - len, len, false);
}
void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
- impl->mark(id, bufStart, len);
+ impl->mark(id, bufStart, len, false);
}
-bool TokenizedData::next(Token &token, WhitespaceMode mode)
+void TokenizedData::clear() { impl->clear(); }
+
+void TokenizedData::trim(size_t length) { impl->trim(length); }
+
+size_t TokenizedData::size() const { return impl->size(); }
+
+bool TokenizedData::empty() const { return impl->empty(); }
+
+SourceLocation TokenizedData::getLocation() const
{
- return impl->next(token, mode, tokens, cursor);
+ return impl->getLocation();
}
-bool TokenizedData::text(Token &token, WhitespaceMode mode)
+TokenizedDataReader TokenizedData::reader() const
{
- // Copy the current cursor position to not update the actual cursor position
- // if the operation was not successful
- size_t cursorCopy = cursor;
- if (!impl->next(token, mode, tokens, cursorCopy) ||
- token.id != Tokens::Data) {
- return false;
- }
+ return TokenizedDataReader(impl, TokenizedDataCursor(),
+ TokenizedDataCursor());
+}
+
+/* Class TokenizedDataReader */
- // There is indeed a text token, update the internal cursor position
- cursor = cursorCopy;
- return true;
+TokenizedDataReader::TokenizedDataReader(
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
+TokenizedDataReaderFork TokenizedDataReader::fork()
+{
+ return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
+}
+
+bool TokenizedDataReader::atEnd() const
+{
+ return readCursor.bufPos >= impl->size();
+}
+
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ peekCursor = readCursor;
+ return impl->next(token, mode, tokens, readCursor);
+}
+
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+ WhitespaceMode mode)
+{
+ return impl->next(token, mode, tokens, peekCursor);
}
}
diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp
index 38125c4..bc937f2 100644
--- a/src/core/parser/utils/TokenizedData.hpp
+++ b/src/core/parser/utils/TokenizedData.hpp
@@ -37,40 +37,48 @@
#include <core/common/Location.hpp>
#include <core/common/Whitespace.hpp>
-
-#include "Token.hpp"
+#include <core/common/Token.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
+class TokenizedDataReader;
+class TokenizedDataReaderFork;
/**
- * The TokenizedData class stores data extracted from a user defined document.
- * As users are capable of defining their own tokens and these are only valid
- * in certain scopes TokenizedData allows to divide the stored data into chunks
- * separated by tokens.
+ * Internally used structure representing a cursor within the TokenizedData
+ * stream.
*/
-class TokenizedData {
-private:
+struct TokenizedDataCursor {
/**
- * Shared pointer pointing at the internal data. This data is shared when
- * copying TokenizedData instances, which corresponds to forking a
- * TokenizedData instance.
+ * Position within the byte buffer.
*/
- std::shared_ptr<TokenizedDataImpl> impl;
+ size_t bufPos;
/**
- * Contains all currently enabled token ids.
+ * Position within the token mark buffer.
*/
- std::unordered_set<TokenId> tokens;
+ size_t markPos;
/**
- * Position from which the last element was read from the internal buffer.
- * This information is not shared with the other instances of TokenizedData
- * pointing at the same location.
+ * Default constructor. The resulting cursor points at the beginning of the
+ * stream.
+ */
+ TokenizedDataCursor() : bufPos(0), markPos(0) {}
+};
+
+/**
+ * The TokenizedData class stores data extracted from a user defined document.
+ * The data stored in TokenizedData can be accessed through TokenizedDataReader
+ * instances created by the reader() method.
+ */
+class TokenizedData {
+private:
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
*/
- size_t cursor;
+ std::shared_ptr<TokenizedDataImpl> impl;
public:
/**
@@ -88,6 +96,18 @@ public:
TokenizedData(SourceId sourceId);
/**
+ * Creates a new instance of TokenizedData, takes a SourceId and an initial
+ * string buffer.
+ *
+ * @param data is the string that should be appended to the buffer.
+ * @param offsStart is the start offset in bytes in the input file.
+ * @param sourceId is the source identifier that should be used for
+ * constructing the location when returning tokens.
+ */
+ TokenizedData(const std::string &data, SourceOffset offsStart = 0,
+ SourceId sourceId = InvalidSourceId);
+
+ /**
* Destructor. Needs to be defined explicitly for freeing a shared pointer
* of the incomplete TokenizedDataImpl type.
*/
@@ -101,10 +121,13 @@ public:
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
+ * @param protect if set to true, the appended characters will not be
+ * affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(const std::string &data, SourceOffset offsStart = 0);
+ size_t append(const std::string &data, SourceOffset offsStart = 0,
+ bool protect = false);
/**
* Appends a single character to the internal character buffer.
@@ -112,10 +135,13 @@ public:
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
+ * @param protect if set to true, the appended character will not be
+ * affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
- size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd);
+ size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+ bool protect = false);
/**
* Stores a token ending at the last character of the current buffer.
@@ -136,54 +162,194 @@ public:
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
- * Enables a single token id. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Resets the TokenizedData instance to the state it had when it was
+ * constructed.
+ */
+ void clear();
+
+ /**
+ * Trims the length of the TokenizedData instance to the given length. Note
+ * that this function does not remove any token matches for performance
+ * reasons, it merely renders them inaccessible. Appending new data after
+ * calling trim will make the token marks accessible again. Thus this method
+ * should be the last function called to modify the data buffer and the
+ * token marks.
*
- * @param id is the TokenId of the token that should be enabled.
+ * @param length is the number of characters to which the TokenizedData
+ * instance should be trimmed.
+ */
+ void trim(size_t length);
+
+ /**
+ * Returns the number of characters currently represented by this
+ * TokenizedData instance.
*/
- void enableToken(TokenId id) { tokens.insert(id); }
+ size_t size() const;
/**
- * Enables a set of token ids. Enabled tokens will no longer be returned as
- * text. Instead, when querying for the next token, TokenizedData will
- * return them as token and not as part of a Text token.
+ * Returns true if the TokenizedData instance is empty, false otherwise.
*
- * @param ids is the TokenId of the token that should be enabled.
+ * @return true if no data is stored inside the TokenizedData instance.
*/
- void enableToken(const std::unordered_set<TokenId> &ids)
- {
- tokens.insert(ids.begin(), ids.end());
- }
+ bool empty() const;
+
+ /**
+ * Returns the location of the entire TokenizedData instance.
+ *
+ * @return the location of the entire data represented by this instance.
+ */
+ SourceLocation getLocation() const;
+
+ /**
+ * Returns a TokenizedDataReader instance that can be used to access the
+ * data.
+ *
+ * @return a new TokenizedDataReader instance pointing at the beginning of
+ * the internal buffer.
+ */
+ TokenizedDataReader reader() const;
+};
+
+/**
+ * The TokenizedDataReader class is used to read and peek tokens and text from
+ * a TokenizedData instance.
+ */
+class TokenizedDataReader {
+private:
+ friend TokenizedData;
+
+ /**
+ * Shared pointer pointing at the internal data. This data is shared with
+ * all the TokenizedDataReader instances.
+ */
+ std::shared_ptr<const TokenizedDataImpl> impl;
+
+ /**
+ * Position from which the last element was read from the internal buffer.
+ */
+ TokenizedDataCursor readCursor;
+
+ /**
+ * Position from which the last element was peeked from the internal buffer.
+ */
+ TokenizedDataCursor peekCursor;
+
+protected:
+ /**
+ * Protected constructor of TokenizedDataReader, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader.
+ *
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor);
+
+public:
+ /**
+ * Returns a new TokenizedDataReaderFork from which tokens and text can be
+ * read without advancing this reader instance.
+ */
+ TokenizedDataReaderFork fork();
+
+ /**
+ * Returns true if this TokenizedData instance is at the end.
+ *
+ * @return true if the end of the TokenizedData instance has been reached.
+ */
+ bool atEnd() const;
/**
* Stores the next token in the given token reference, returns true if the
- * operation was successful, false if there are no more tokens.
+ * operation was successful, false if there are no more tokens. Advances the
+ * internal read cursor and resets the peek cursor to it.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool read(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
/**
- * Stores the next text token in the given token reference, returns true if
- * the operation was successful (there was indeed a text token), false if
- * the next token is not a text token or there were no more tokens.
+ * Stores the next token in the given token reference, returns true if the
+ * operation was successful, false if there are no more tokens.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
+ * @param tokens is the set of token identifiers, representing the currently
+ * enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
- bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE);
+ bool peek(Token &token, const TokenSet &tokens = TokenSet{},
+ WhitespaceMode mode = WhitespaceMode::TRIM);
+
+ /**
+ * Consumes the peeked tokens, the read cursor will now be at the position
+ * of the peek cursor.
+ */
+ void consumePeek() { readCursor = peekCursor; }
+
+ /**
+ * Resets the peek cursor to the position of the read cursor.
+ */
+ void resetPeek() { peekCursor = readCursor; }
+};
+
+/**
+ * The TokenizedDataReaderFork class is created when forking a
+ * TokenizedDataReader. Its read/peek progress can be committed to the parent.
+ */
+class TokenizedDataReaderFork : public TokenizedDataReader {
+private:
+ friend TokenizedDataReader;
+
+ /**
+ * Reference pointing at the parent TokenizedDataReader to which changes may
+ * be committed.
+ */
+ TokenizedDataReader &parent;
+
+ /**
+ * Private constructor of TokenizedDataReaderFork, taking a reference to the
+ * internal TokenizedDataImpl structure storing the data that is accessed by
+ * the reader and a reference at the parent TokenizedDataReader.
+ *
+ * @param parent is the TokenizedDataReader instance to which the current
+ * read/peek progress may be committed.
+ * @param impl is the TokenizedDataImpl instance that holds the actual data.
+ * @param readCursor is the cursor position from which tokens and text are
+ * read.
+ * @param peekCursor is the cursor position from which tokens and text are
+ * peeked.
+ */
+ TokenizedDataReaderFork(TokenizedDataReader &parent,
+ std::shared_ptr<const TokenizedDataImpl> impl,
+ const TokenizedDataCursor &readCursor,
+ const TokenizedDataCursor &peekCursor)
+ : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
+ {
+ }
+
+public:
+ /**
+ * Commits the read/peek progress to the underlying parent.
+ */
+ void commit() { parent = *this; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 2e0ac13..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,8 @@
#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include "TokenizedData.hpp"
#include "Tokenizer.hpp"
namespace ousia {
@@ -42,26 +42,33 @@ struct TokenMatch {
Token token;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
+ size_t dataStartOffset;
/**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
+ * Set to true if the matched token is a primary token.
*/
- size_t textEnd;
+ bool primary;
/**
* Constructor of the TokenMatch class.
*/
- TokenMatch() : textLength(0), textEnd(0) {}
+ TokenMatch() : dataStartOffset(0), primary(false) {}
/**
* Returns true if this TokenMatch instance actually represents a match.
+ *
+ * @return true if the TokenMatch actually has a match.
+ */
+ bool hasMatch() const { return token.id != Tokens::Empty; }
+
+ /**
+ * Returns the length of the matched token.
+ *
+ * @return the length of the token string.
*/
- bool hasMatch() { return token.id != Tokens::Empty; }
+ size_t size() const { return token.content.size(); }
};
/* Internal class TokenLookup */
@@ -83,36 +90,28 @@ private:
size_t start;
/**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
+ * Position at which this token starts in the TokenizedData instance.
*/
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
+ size_t dataStartOffset;
public:
/**
* Constructor of the TokenLookup class.
*
* @param node is the current node.
- * @param start is the start position.
- * @param textLength is the text buffer length of the previous text token.
- * @param textEnd is the current end location of the previous text token.
+ * @param start is the start position in the source file.
+ * @param dataStartOffset is the current length of the TokenizedData buffer.
*/
- TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
- size_t textEnd)
- : node(node), start(start), textLength(textLength), textEnd(textEnd)
+ TokenLookup(const TokenTrie::Node *node, size_t start,
+ size_t dataStartOffset)
+ : node(node), start(start), dataStartOffset(dataStartOffset)
{
}
/**
* Tries to extend the current path in the token trie with the given
- * character. If a complete token is matched, stores this match in the
- * tokens list (in case it is longer than any previous token).
+ * character. If a complete token is matched, stores the match in the given
+ * TokenMatch reference and returns true.
*
* @param c is the character that should be appended to the current prefix.
* @param lookups is a list to which new TokeLookup instances are added --
@@ -123,73 +122,48 @@ public:
* Tokenizer.
* @param end is the end byte offset of the current character.
* @param sourceId is the source if of this file.
+ * @return true if a token was matched, false otherwise.
*/
- void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
- const std::vector<std::string> &tokens, SourceOffset end,
- SourceId sourceId)
+ bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+ const std::vector<Tokenizer::TokenDescriptor> &tokens,
+ SourceOffset end, SourceId sourceId)
{
- // Check whether we can continue the current token path with the given
- // character without visiting an already visited node
+ // Set to true once a token has been matched
+ bool res = false;
+
+ // Check whether we can continue the current token path, if not, abort
auto it = node->children.find(c);
if (it == node->children.end()) {
- return;
+ return res;
}
// Check whether the new node represents a complete token a whether it
// is longer than the current token. If yes, replace the current token.
node = it->second.get();
- if (node->type != Tokens::Empty) {
- const std::string &str = tokens[node->type];
- size_t len = str.size();
- if (len > match.token.content.size()) {
- match.token =
- Token{node->type, str, {sourceId, start, end}};
- match.textLength = textLength;
- match.textEnd = textEnd;
- }
+ if (node->id != Tokens::Empty) {
+ const Tokenizer::TokenDescriptor &descr = tokens[node->id];
+ match.token = Token(node->id, descr.string,
+ SourceLocation(sourceId, start, end));
+ match.dataStartOffset = dataStartOffset;
+ match.primary = descr.primary;
+ res = true;
}
// If this state can possibly be advanced, store it in the states list.
if (!node->children.empty()) {
lookups.emplace_back(*this);
}
+ return res;
}
};
-
-/**
- * Transforms the given token into a data token containing the extracted
- * text.
- *
- * @param handler is the WhitespaceHandler containing the collected data.
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
-static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
- SourceId sourceId)
-{
- if (match.hasMatch()) {
- match.token.content =
- std::string{handler.textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, handler.textStart, match.textEnd};
- } else {
- match.token.content = handler.toString();
- match.token.location =
- SourceLocation{sourceId, handler.textStart, handler.textEnd};
- }
- match.token.id = Tokens::Data;
-}
}
/* Class Tokenizer */
-Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenId(0)
-{
-}
+Tokenizer::Tokenizer() : nextTokenId(0) {}
-template <typename TextHandler, bool read>
-bool Tokenizer::next(CharReader &reader, Token &token)
+template <bool read>
+bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
// If we're in the read mode, reset the char reader peek position to the
// current read position
@@ -199,45 +173,63 @@ bool Tokenizer::next(CharReader &reader, Token &token)
// Prepare the lookups in the token trie
const TokenTrie::Node *root = trie.getRoot();
- TokenMatch match;
+ TokenMatch bestMatch;
std::vector<TokenLookup> lookups;
std::vector<TokenLookup> nextLookups;
- // Instantiate the text handler
- TextHandler textHandler;
-
// Peek characters from the reader and try to advance the current token tree
// cursor
char c;
+ const size_t initialDataSize = data.size();
size_t charStart = reader.getPeekOffset();
const SourceId sourceId = reader.getSourceId();
while (reader.peek(c)) {
const size_t charEnd = reader.getPeekOffset();
- const size_t textLength = textHandler.textBuf.size();
- const size_t textEnd = textHandler.textEnd;
+ const size_t dataStartOffset = data.size();
// If we do not have a match yet, start a new lookup from the root
- if (!match.hasMatch()) {
- TokenLookup{root, charStart, textLength, textEnd}.advance(
- c, nextLookups, match, tokens, charEnd, sourceId);
+ if (!bestMatch.hasMatch() || !bestMatch.primary) {
+ lookups.emplace_back(root, charStart, dataStartOffset);
}
// Try to advance all other lookups with the new character
+ TokenMatch match;
for (TokenLookup &lookup : lookups) {
- lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+			// Continue if the current lookup could not be advanced
+ if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
+ sourceId)) {
+ continue;
+ }
+
+ // Replace the best match with longest token
+ if (match.size() > bestMatch.size()) {
+ bestMatch = match;
+ }
+
+ // If the matched token is a non-primary token -- mark the match in
+ // the TokenizedData list
+ if (!match.primary) {
+ data.mark(match.token.id, data.size() - match.size() + 1,
+ match.size());
+ }
}
- // We have found a token and there are no more states to advance or the
- // text handler has found something -- abort to return the new token
- if (match.hasMatch()) {
- if ((nextLookups.empty() || textHandler.hasText())) {
+
+ // If a token has been found and the token is a primary token, check
+ // whether we have to abort, otherwise if we have a non-primary match,
+ // reset it once it can no longer be advanced
+ if (bestMatch.hasMatch() && nextLookups.empty()) {
+ if (bestMatch.primary) {
break;
+ } else {
+ bestMatch = TokenMatch{};
}
- } else {
- // Record all incomming characters
- textHandler.append(c, charStart, charEnd);
}
+		// Record all incoming characters
+ data.append(c, charStart, charEnd);
+
+
// Swap the lookups and the nextLookups list
lookups = std::move(nextLookups);
nextLookups.clear();
@@ -246,60 +238,57 @@ bool Tokenizer::next(CharReader &reader, Token &token)
charStart = charEnd;
}
- // If we found text, emit that text
- if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
- buildDataToken(textHandler, match, sourceId);
+ // If we found data, emit a corresponding data token
+ if (data.size() > initialDataSize &&
+ (!bestMatch.hasMatch() || !bestMatch.primary ||
+ bestMatch.dataStartOffset > initialDataSize)) {
+		// If we have a "bestMatch" which starts after text data has started,
+ // trim the TokenizedData to this offset
+ if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
+ data.trim(bestMatch.dataStartOffset);
+ }
+
+ // Create a token containing the data location
+ bestMatch.token = Token{data.getLocation()};
+ } else if (bestMatch.hasMatch() && bestMatch.primary &&
+ bestMatch.dataStartOffset == initialDataSize) {
+ data.trim(initialDataSize);
}
// Move the read/peek cursor to the end of the token, abort if an error
// happens while doing so
- if (match.hasMatch()) {
+ if (bestMatch.hasMatch()) {
// Make sure we have a valid location
- if (match.token.location.getEnd() == InvalidSourceOffset) {
+ if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
throw OusiaException{"Token end position offset out of range"};
}
// Seek to the end of the current token
- const size_t end = match.token.location.getEnd();
+ const size_t end = bestMatch.token.location.getEnd();
if (read) {
reader.seek(end);
} else {
reader.seekPeekCursor(end);
}
- token = match.token;
+
+ token = bestMatch.token;
} else {
token = Token{};
}
- return match.hasMatch();
+ return bestMatch.hasMatch();
}
-bool Tokenizer::read(CharReader &reader, Token &token)
+bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, true>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, true>(reader, token);
- }
- return false;
+ return next<true>(reader, token, data);
}
-bool Tokenizer::peek(CharReader &reader, Token &token)
+bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingWhitespaceHandler, false>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingWhitespaceHandler, false>(reader, token);
- }
- return false;
+ return next<false>(reader, token, data);
}
-TokenId Tokenizer::registerToken(const std::string &token)
+TokenId Tokenizer::registerToken(const std::string &token, bool primary)
{
// Abort if an empty token should be registered
if (token.empty()) {
@@ -309,8 +298,8 @@ TokenId Tokenizer::registerToken(const std::string &token)
// Search for a new slot in the tokens list
TokenId type = Tokens::Empty;
for (size_t i = nextTokenId; i < tokens.size(); i++) {
- if (tokens[i].empty()) {
- tokens[i] = token;
+ if (!tokens[i].valid()) {
+ tokens[i] = TokenDescriptor(token, primary);
type = i;
break;
}
@@ -320,62 +309,47 @@ TokenId Tokenizer::registerToken(const std::string &token)
// override the special token type handles
if (type == Tokens::Empty) {
type = tokens.size();
- if (type == Tokens::Data || type == Tokens::Empty) {
+ if (type >= Tokens::MaxTokenId) {
throw OusiaException{"Token type ids depleted!"};
}
- tokens.emplace_back(token);
+ tokens.emplace_back(token, primary);
}
nextTokenId = type + 1;
- // Try to register the token in the trie -- if this fails, remove it
- // from the tokens list
+ // Try to register the token in the trie -- if this fails, remove it from
+ // the tokens list
if (!trie.registerToken(token, type)) {
- tokens[type] = std::string{};
+ tokens[type] = TokenDescriptor();
nextTokenId = type;
return Tokens::Empty;
}
return type;
}
-bool Tokenizer::unregisterToken(TokenId type)
+bool Tokenizer::unregisterToken(TokenId id)
{
// Unregister the token from the trie, abort if an invalid type is given
- if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
- tokens[type] = std::string{};
- nextTokenId = type;
+ if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
+ tokens[id] = TokenDescriptor();
+ nextTokenId = id;
return true;
}
return false;
}
-std::string Tokenizer::getTokenString(TokenId type)
-{
- if (type < tokens.size()) {
- return tokens[type];
- }
- return std::string{};
-}
+static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
-void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
+const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
{
- whitespaceMode = mode;
+ if (id < tokens.size()) {
+ return tokens[id];
+ }
+ return EmptyTokenDescriptor;
}
-WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
-
/* Explicitly instantiate all possible instantiations of the "next" member
function */
-template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
-template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
- CharReader &reader, Token &token);
+template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
+template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
}
diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index f21c6a3..74e3f0d 100644
--- a/src/core/parser/utils/Tokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -19,8 +19,8 @@
/**
* @file Tokenizer.hpp
*
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
+ * Tokenizer that can be reconfigured at runtime and is used for parsing the
+ * plain text format.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
@@ -28,44 +28,80 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#include <set>
+#include <cstdint>
#include <string>
#include <vector>
#include <core/common/Location.hpp>
-#include <core/common/Whitespace.hpp>
+#include <core/common/Token.hpp>
-#include "Token.hpp"
#include "TokenTrie.hpp"
namespace ousia {
// Forward declarations
class CharReader;
+class TokenizedData;
/**
* The Tokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * Tokenizer always tries to extract the longest possible token from the
- * tokenizer.
+ * CharReader. It allows to register and unregister tokens while parsing. Note
+ * that the Tokenizer always tries to extract the longest possible token from
+ * the tokenizer. Tokens can be registered as primary or non-primary token. If
+ * a Token is registered as a primary token, it is returned as a single Token
+ * instance if it occurs. In the non-primary case the token is returned as part
+ * of a segmented TokenizedData instance.
*/
class Tokenizer {
-private:
+public:
/**
- * Internally used token trie. This object holds all registered tokens.
+ * Internally used structure describing a registered token.
*/
- TokenTrie trie;
+ struct TokenDescriptor {
+ /**
+ * String describing the token.
+ */
+ std::string string;
+
+ /**
+ * Set to true if this token is primary.
+ */
+ bool primary;
+
+ /**
+ * Constructor of the TokenDescriptor class.
+ *
+ * @param string is the string representation of the registered token.
+ * @param primary specifies whether the token is a primary token that
+ * should be returned as a single token, or a secondary token, that
+ * should be returned as part of TokenizedData.
+ */
+ TokenDescriptor(const std::string &string, bool primary)
+ : string(string), primary(primary)
+ {
+ }
+
+ /**
+ * Default constructor.
+ */
+ TokenDescriptor() : primary(false) {}
+
+ /**
+ * Returns true if the TokenDescriptor represents a valid token.
+ */
+ bool valid() { return !string.empty(); }
+ };
+private:
/**
- * Flag defining whether whitespaces should be preserved or not.
+ * Internally used token trie. This object holds all registered tokens.
*/
- WhitespaceMode whitespaceMode;
+ TokenTrie trie;
/**
* Vector containing all registered token types.
*/
- std::vector<std::string> tokens;
+ std::vector<TokenDescriptor> tokens;
/**
* Next index in the tokens list where to search for a new token id.
@@ -74,90 +110,78 @@ private:
/**
* Templated function used internally to read the current token. The
- * function is templated in order to force code generation for all six
- * combiations of whitespace modes and reading/peeking.
+ * function is templated in order to force optimized code generation for
+ * both reading and peeking.
*
- * @tparam TextHandler is the type to be used for the textHandler instance.
- * @tparam read specifies whether the function should start from and advance
- * the read pointer of the char reader.
+ * @tparam read specifies whether the method should read the token or just
+ * peek.
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is the token structure into which the token information
* should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return false if the end of the stream has been reached, true otherwise.
*/
- template <typename TextHandler, bool read>
- bool next(CharReader &reader, Token &token);
+ template <bool read>
+ bool next(CharReader &reader, Token &token, TokenizedData &data);
public:
/**
* Constructor of the Tokenizer class.
- *
- * @param whitespaceMode specifies how whitespace should be handled.
*/
- Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+ Tokenizer();
/**
- * Registers the given string as a token. Returns a const pointer at a
- * TokenDescriptor that will be used to reference the newly created token.
+ * Registers the given string as a token. Returns a unique identifier
+ * describing the registered token.
*
* @param token is the token string that should be registered.
- * @return a unique identifier for the registered token or EmptyToken if
+ * @param primary specifies whether the token is a primary token -- if true,
+ * the token will be returned as a single, standalone token. Otherwise the
+ * token will be returned as part of a "TokenizedData" structure.
+ * @return a unique identifier for the registered token or Tokens::Empty if
* an error occured.
*/
- TokenId registerToken(const std::string &token);
+ TokenId registerToken(const std::string &token, bool primary = true);
/**
* Unregisters the token belonging to the given TokenId.
*
* @param type is the token type that should be unregistered. The
- *TokenId
- * must have been returned by registerToken.
+ * TokenId must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
- * because the given TokenDescriptor was already unregistered).
+ * because the token with the given TokenId was already unregistered).
*/
- bool unregisterToken(TokenId type);
+ bool unregisterToken(TokenId id);
/**
* Returns the token that was registered under the given TokenId id or
- *an
- * empty string if an invalid TokenId id is given.
+ * an empty string if an invalid TokenId id is given.
*
- * @param type is the TokenId id for which the corresponding token
- *string
+ * @param id is the TokenId for which the corresponding TokenDescriptor
* should be returned.
- * @return the registered token string or an empty string if the given type
- * was invalid.
- */
- std::string getTokenString(TokenId type);
-
- /**
- * Sets the whitespace mode.
- *
- * @param whitespaceMode defines how whitespace should be treated in text
- * tokens.
- */
- void setWhitespaceMode(WhitespaceMode mode);
-
- /**
- * Returns the current value of the whitespace mode.
- *
- * @return the whitespace mode.
+ * @return the registered TokenDescriptor or an invalid TokenDescriptor if
+ * the given TokenId is invalid.
*/
- WhitespaceMode getWhitespaceMode();
+ const TokenDescriptor& lookupToken(TokenId id) const;
/**
* Reads a new token from the CharReader and stores it in the given
- * Token instance.
+ * Token instance. If the token has the id Tokens::Data, use the "getData"
+ * method to fetch a reference at the underlying TokenizedData instance
+ * storing the data.
*
* @param reader is the CharReader instance from which the data should be
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(CharReader &reader, Token &token);
+ bool read(CharReader &reader, Token &token, TokenizedData &data);
/**
* The peek method does not advance the read position of the char reader,
@@ -167,10 +191,12 @@ public:
* read.
* @param token is a reference at the token instance into which the Token
* information should be written.
+ * @param data is a reference at the TokenizedData instance to which the
+ * token information should be appended.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool peek(CharReader &reader, Token &token);
+ bool peek(CharReader &reader, Token &token, TokenizedData &data);
};
}
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
index 16e7aa4..d169393 100644
--- a/src/formats/osml/OsmlParser.cpp
+++ b/src/formats/osml/OsmlParser.cpp
@@ -73,7 +73,7 @@ public:
: logger(ctx.getLogger()),
ctx(ctx),
parser(reader, logger),
- stack(ctx, GenericParserStates)
+ stack(parser, ctx, GenericParserStates)
{
}
@@ -88,7 +88,7 @@ public:
OsmlStreamParser::State state = parser.parse();
logger.setDefaultLocation(parser.getLocation());
switch (state) {
- case OsmlStreamParser::State::COMMAND: {
+ case OsmlStreamParser::State::COMMAND_START: {
// Implicitly create a "document" element if the first
// command is not any other top-level command
if (needsDocument) {
@@ -96,23 +96,23 @@ public:
parser.getCommandName().asString();
if (cmd != "typesystem" && cmd != "document" &&
cmd != "ontology") {
- stack.command("document", Variant::mapType{});
+ stack.commandStart("document", Variant::mapType{},
+ false);
}
needsDocument = false;
}
- stack.command(parser.getCommandName(),
- parser.getCommandArguments().asMap());
+ stack.commandStart(parser.getCommandName(),
+ parser.getCommandArguments().asMap(),
+ parser.inRangeCommand());
break;
}
- case OsmlStreamParser::State::DATA:
- stack.data(parser.getData());
- break;
- case OsmlStreamParser::State::ENTITY:
- // TODO
+ case OsmlStreamParser::State::RANGE_END:
+ stack.rangeEnd();
break;
case OsmlStreamParser::State::ANNOTATION_START:
stack.annotationStart(parser.getCommandName(),
- parser.getCommandArguments().asMap());
+ parser.getCommandArguments().asMap(),
+ parser.inRangeCommand());
break;
case OsmlStreamParser::State::ANNOTATION_END: {
Variant elementName = Variant::fromString(std::string{});
@@ -130,11 +130,9 @@ public:
case OsmlStreamParser::State::FIELD_END:
stack.fieldEnd();
break;
- case OsmlStreamParser::State::NONE:
- case OsmlStreamParser::State::ERROR:
- // Internally used in OsmlStreamParser, these states should
- // never occur. Just contiunue.
- continue;
+ case OsmlStreamParser::State::DATA:
+ stack.data(parser.getData());
+ break;
case OsmlStreamParser::State::END:
return;
}
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..64a489d 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -16,179 +16,421 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <cassert>
+#include <stack>
+#include <vector>
+
#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
+#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
+#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
+
#include "OsmlStreamParser.hpp"
namespace ousia {
+namespace {
/**
- * Plain format default tokenizer.
+ * Osml format default tokenizer. Registers the primary tokens in its
+ * constructor. A single, static instance of this class is created as
+ * "OsmlTokens", which is copied to the Tokenizer instance of
+ * OsmlStreamParserImpl.
*/
-class PlainFormatTokens : public Tokenizer {
+class OsmlFormatTokens : public Tokenizer {
public:
+ TokenId Backslash;
+ TokenId LineComment;
+ TokenId BlockCommentStart;
+ TokenId BlockCommentEnd;
+ TokenId FieldStart;
+ TokenId FieldEnd;
+ TokenId DefaultFieldStart;
+ TokenId AnnotationStart;
+ TokenId AnnotationEnd;
+
/**
- * Id of the backslash token.
+	 * Registers the OSML format tokens in the internal tokenizer.
*/
- TokenId Backslash;
+ OsmlFormatTokens()
+ {
+ Backslash = registerToken("\\");
+ LineComment = registerToken("%");
+ BlockCommentStart = registerToken("%{");
+ BlockCommentEnd = registerToken("}%");
+ FieldStart = registerToken("{");
+ FieldEnd = registerToken("}");
+ DefaultFieldStart = registerToken("{!");
+ AnnotationStart = registerToken("<\\");
+ AnnotationEnd = registerToken("\\>");
+ }
+};
+
+/**
+ * Instance of OsmlFormatTokens used to initialize the internal tokenizer
+ * instance of OsmlStreamParserImpl.
+ */
+static const OsmlFormatTokens OsmlTokens;
+/**
+ * Structure representing a field.
+ */
+struct Field {
/**
- * Id of the line comment token.
+ * Specifies whether this field was marked as default field.
*/
- TokenId LineComment;
+ bool defaultField;
/**
- * Id of the block comment start token.
+ * Location at which the field was started.
*/
- TokenId BlockCommentStart;
+ SourceLocation location;
/**
- * Id of the block comment end token.
+ * Constructor of the Field structure, initializes all member variables with
+ * the given values.
+ *
+ * @param defaultField is a flag specifying whether this field is a default
+ * field.
+ * @param location specifies the location at which the field was started.
*/
- TokenId BlockCommentEnd;
+ Field(bool defaultField = false,
+ const SourceLocation &location = SourceLocation{})
+ : defaultField(defaultField), location(location)
+ {
+ }
+};
+/**
+ * Entry used for the command stack.
+ */
+class Command {
+private:
/**
- * Id of the field start token.
+ * Name and location of the current command.
*/
- TokenId FieldStart;
+ Variant name;
/**
- * Id of the field end token.
+ * Arguments that were passed to the command.
*/
- TokenId FieldEnd;
+ Variant arguments;
/**
- * Id of the default field start token.
+ * Vector used as stack for holding the number of opening/closing braces
+ * and the corresponding "isDefaultField" flag.
*/
- TokenId DefaultFieldStart;
+ std::vector<Field> fields;
/**
- * Id of the annotation start token.
+ * Set to true if this is a command with clear begin and end.
*/
- TokenId AnnotationStart;
+ bool hasRange;
+public:
/**
- * Id of the annotation end token.
+ * Default constructor, marks this command as normal, non-range command.
*/
- TokenId AnnotationEnd;
+ Command() : hasRange(false) {}
/**
- * Registers the plain format tokens in the internal tokenizer.
+ * Constructor of the Command class.
+ *
+ * @param name is a string variant with name and location of the
+ * command.
+ * @param arguments is a map variant with the arguments given to the
+ * command.
+ * @param hasRange should be set to true if this is a command with
+ * explicit range.
*/
- PlainFormatTokens()
+ Command(Variant name, Variant arguments, bool hasRange)
+ : name(std::move(name)),
+ arguments(std::move(arguments)),
+ hasRange(hasRange)
{
- Backslash = registerToken("\\");
- LineComment = registerToken("%");
- BlockCommentStart = registerToken("%{");
- BlockCommentEnd = registerToken("}%");
- FieldStart = registerToken("{");
- FieldEnd = registerToken("}");
- DefaultFieldStart = registerToken("{!");
- AnnotationStart = registerToken("<\\");
- AnnotationEnd = registerToken("\\>");
}
-};
-static const PlainFormatTokens OsmlTokens;
+ /**
+ * Returns a reference at the variant representing name and location of the
+ * command.
+ *
+ * @return a variant containing name and location of the command.
+ */
+ const Variant &getName() const { return name; }
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
/**
- * Internal character buffer.
+ * Returns a reference at the variant containing name, value and location of
+ * the arguments.
+ *
+ * @return the arguments stored for the command.
*/
- std::vector<char> buf;
+ const Variant &getArguments() const { return arguments; }
/**
- * Start location of the character data.
+ * Returns a reference at the internal field list. This list should be used
+ * for printing error messages when fields are still open although the outer
+ * range field closes.
+ *
+ * @return a const reference at the internal field vector.
*/
- SourceOffset start;
+ const std::vector<Field> &getFields() const { return fields; }
/**
- * End location of the character data.
+ * Returns true if this command is currently in a default field.
+ *
+ * @return true if the current field on the field stack was explicitly
+ * marked as default field. If the field stack is empty, true is returned
+ * if this is a range command.
*/
- SourceOffset end;
+ bool inDefaultField() const
+ {
+ return (!fields.empty() && fields.back().defaultField) ||
+ (fields.empty() && hasRange);
+ }
-public:
/**
- * Default constructor, initializes start and end with zeros.
+ * Returns true if this command currently is in any field.
+ *
+	 * @return true if a field is on the stack or this is a range command.
+ * Range commands always are in a field.
*/
- DataHandler() : start(0), end(0) {}
+ bool inField() const { return !fields.empty() || hasRange; }
/**
- * Returns true if the internal buffer is empty.
+ * Returns true if this command currently is in a range field.
*
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
+ * @return true if the command has a range and no other ranges are on the
+ * stack.
*/
- bool isEmpty() { return buf.empty(); }
+ bool inRangeField() const { return fields.empty() && hasRange; }
/**
- * Appends a single character to the internal buffer.
+ * Returns true if this command currently is in a non-range field.
*
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
+ * @return true if the command is in a field, but the field is not the field
+ * constructed by the "range".
*/
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
+ bool inNonRangeField() const { return !fields.empty(); }
+
+ /**
+ * Pushes another field onto the field stack of this command.
+ *
+ * @param defaultField if true, explicitly marks this field as default
+ * field.
+ * @param location is the source location at which the field was started.
+ * Used for error messages in which the user is notified about an error with
+ * too few closing fields.
+ */
+ void pushField(bool defaultField = false,
+ const SourceLocation &location = SourceLocation{})
{
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
+ fields.emplace_back(defaultField, location);
}
/**
- * Appends a string to the internal buffer.
+ * Removes another field from the field stack of this command, returns true
+ * if the operation was successful.
*
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
+ * @return true if there was a field to pop on the stack, false otherwise.
*/
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
+ bool popField()
{
- if (isEmpty()) {
- start = stringStart;
+ if (!fields.empty()) {
+ fields.pop_back();
+ return true;
}
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
+ return false;
}
+};
+}
+
+/* Class OsmlStreamParserImpl */
+
+/**
+ * Internal implementation of OsmlStreamParser.
+ */
+class OsmlStreamParserImpl {
+public:
+ /**
+ * State enum compatible with OsmlStreamParserState but extended by two more
+ * entries (END and NONE).
+ */
+ enum class State : uint8_t {
+ COMMAND_START = 0,
+ RANGE_END = 1,
+ FIELD_START = 2,
+ FIELD_END = 3,
+ ANNOTATION_START = 4,
+ ANNOTATION_END = 5,
+ DATA = 6,
+ END = 7,
+ RECOVERABLE_ERROR = 8,
+ IRRECOVERABLE_ERROR = 9
+ };
+
+private:
+ /**
+ * Reference to the CharReader instance from which the incoming bytes are
+ * read.
+ */
+ CharReader &reader;
/**
- * Converts the internal buffer to a variant with attached location
- * information.
+ * Reference at the logger instance to which all error messages are sent.
+ */
+ Logger &logger;
+
+ /**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ Tokenizer tokenizer;
+
+ /**
+ * Stack containing the current commands.
+ */
+ std::stack<Command> commands;
+
+ /**
+ * Variant containing the tokenized data that was returned from the
+ * tokenizer as data.
+ */
+ TokenizedData data;
+
+ /**
+ * Variable containing the current location of the parser.
+ */
+ SourceLocation location;
+
+ /**
+ * Function used internally to parse an identifier.
*
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
+ * @param start is the start byte offset of the identifier (including the
+ * backslash).
+ * @param allowNSSep should be set to true if the namespace separator is
+ * allowed in the identifier name. Issues error if the namespace separator
+ * is placed incorrectly.
*/
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
+ Variant parseIdentifier(size_t start, bool allowNSSep = false);
+
+ /**
+ * Function used internally to handle the special "\begin" command.
+ *
+ * @return an internal State specifying whether an error occurred (return
+ * values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
+ * command was actually started (return value State::COMMAND_START).
+ */
+ State parseBeginCommand();
+
+ /**
+ * Function used internally to handle the special "\end" command.
+ *
+ * @return an internal State specifying whether an error occurred (return
+ * values State::RECOVERABLE_ERROR or State::IRRECOVERABLE_ERROR) or a
+ * command was actually ended (return value State::RANGE_END).
+ */
+ State parseEndCommand();
+
+ /**
+ * Parses the command arguments. Handles errors if the name of the command
+ * was given using the hash notation and as a name field.
+ *
+ * @param commandArgName is the name argument that was given using the hash
+ * notation.
+ * @return a map variant containing the arguments.
+ */
+ Variant parseCommandArguments(Variant commandArgName);
+
+ /**
+ * Function used internally to parse a command.
+ *
+ * @param start is the start byte offset of the command (including the
+ * backslash)
+ * @param isAnnotation if true, the command is not returned as command, but
+ * as annotation start.
+ * @return true if a command was actually parsed, false otherwise.
+ */
+ State parseCommand(size_t start, bool isAnnotation);
+
+ /**
+ * Function used internally to parse a block comment.
+ */
+ void parseBlockComment();
+
+ /**
+ * Function used internally to parse a generic comment.
+ */
+ void parseLineComment();
+
+ /**
+ * Pushes the parsed command onto the command stack.
+ */
+ void pushCommand(Variant commandName, Variant commandArguments,
+ bool hasRange);
+
+ /**
+ * Checks whether there is any data pending to be issued, if yes, resets the
+ * currently peeked characters and returns true.
+ *
+ * @return true if there was any data and DATA should be returned by the
+ * parse function, false otherwise.
+ */
+ bool checkIssueData();
+
+ /**
+ * Returns a reference at the current command at the top of the command
+ * stack.
+ *
+ * @return a reference at the top command in the command stack.
+ */
+ Command &cmd() { return commands.top(); }
+
+ /**
+ * Returns a reference at the current command at the top of the command
+ * stack.
+ *
+ * @return a reference at the top command in the command stack.
+ */
+ const Command &cmd() const { return commands.top(); }
+
+public:
+ /**
+ * Constructor of the OsmlStreamParserImpl class. Attaches the new
+ * OsmlStreamParserImpl to the given CharReader and Logger instances.
+ *
+ * @param reader is the reader instance from which incoming characters
+ * should be read.
+ * @param logger is the logger instance to which errors should be written.
+ */
+ OsmlStreamParserImpl(CharReader &reader, Logger &logger);
+
+ State parse();
+
+ TokenId registerToken(const std::string &token);
+ void unregisterToken(TokenId id);
+
+ const TokenizedData &getData() const { return data; }
+ const Variant &getCommandName() const { return cmd().getName(); }
+ const Variant &getCommandArguments() const { return cmd().getArguments(); }
+ const SourceLocation &getLocation() const { return location; }
+ bool inRangeCommand() const { return cmd().inRangeField(); };
+ bool inDefaultField() const { return cmd().inDefaultField(); }
};
-OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
+/* Class OsmlStreamParserImpl */
+
+OsmlStreamParserImpl::OsmlStreamParserImpl(CharReader &reader, Logger &logger)
: reader(reader), logger(logger), tokenizer(OsmlTokens)
{
- // Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true, false});
+ commands.emplace("", Variant::mapType{}, true);
}
-Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+Variant OsmlStreamParserImpl::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
- bool hasCharSiceNSSep = false;
+ bool hasCharSinceNSSep = false;
std::vector<char> identifier;
size_t end = reader.getPeekOffset();
char c, c2;
@@ -197,7 +439,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+ } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
identifier.push_back(c);
} else {
@@ -214,8 +456,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
// This is no longer the first character
first = false;
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
+ // Advance the hasCharSinceNSSep flag
+ hasCharSinceNSSep = allowNSSep && (c != ':');
end = reader.getPeekOffset();
reader.consumePeek();
@@ -228,20 +470,20 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
return res;
}
-OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseBeginCommand()
{
// Expect a '{' after the command
reader.consumeWhitespace();
if (!reader.expect('{')) {
logger.error("Expected \"{\" after \\begin", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Parse the name of the command that should be opened
Variant commandName = parseIdentifier(reader.getOffset(), true);
if (commandName.asString().empty()) {
logger.error("Expected identifier", commandName);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Check whether the next character is a '#', indicating the start of the
@@ -257,7 +499,7 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
if (!reader.expect('}')) {
logger.error("Expected \"}\"", reader);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Parse the arguments
@@ -266,28 +508,15 @@ OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
// Push the command onto the command stack
pushCommand(std::move(commandName), std::move(commandArguments), true);
- return State::COMMAND;
-}
-
-static bool checkStillInField(const OsmlStreamParser::Command &cmd,
- const Variant &endName, Logger &logger)
-{
- if (cmd.inField && !cmd.inRangeField) {
- logger.error(std::string("\\end in open field of command \"") +
- cmd.name.asString() + std::string("\""),
- endName);
- logger.note(std::string("Open command started here:"), cmd.name);
- return true;
- }
- return false;
+ return State::COMMAND_START;
}
-OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseEndCommand()
{
// Expect a '{' after the command
if (!reader.expect('{')) {
logger.error("Expected \"{\" after \\end", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Fetch the name of the command that should be ended here
@@ -296,56 +525,58 @@ OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
// Make sure the given command name is not empty
if (name.asString().empty()) {
logger.error("Expected identifier", name);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
// Make sure the command name is terminated with a '}'
if (!reader.expect('}')) {
logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Unroll the command stack up to the last range command
- while (!commands.top().hasRange) {
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
+ }
+
+ // Unroll the command stack up to the last range command, make sure we do
+ // not intersect with any open field
+ while (!cmd().inRangeField()) {
+ if (cmd().inField()) {
+ logger.error(std::string("\\end in open field of command \"") +
+ cmd().getName().asString() + std::string("\""),
+ name);
+ const std::vector<Field> &fields = cmd().getFields();
+ for (const Field &field : fields) {
+ logger.note(std::string("Still open field started here: "),
+ field.location);
+ }
+ return State::IRRECOVERABLE_ERROR;
}
commands.pop();
}
- // Make sure we're not in an open field of this command
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
-
// Special error message if the top-level command is reached
if (commands.size() == 1) {
logger.error(std::string("Cannot end command \"") + name.asString() +
std::string("\" here, no command open"),
name);
- return State::ERROR;
+ return State::IRRECOVERABLE_ERROR;
}
- // Inform the about command mismatches
- const Command &cmd = commands.top();
- if (commands.top().name.asString() != name.asString()) {
- logger.error(std::string("Trying to end command \"") +
- cmd.name.asString() +
+ // Inform the user about command mismatches, copy the current command
+ // descriptor before popping it from the stack
+ if (getCommandName().asString() != name.asString()) {
+ logger.error(std::string("Trying to end command \"") + name.asString() +
std::string("\", but open command is \"") +
- name.asString() + std::string("\""),
+ getCommandName().asString() + std::string("\""),
name);
- logger.note("Last command was opened here:", cmd.name);
- return State::ERROR;
+ logger.note("Open command started here:", getCommandName());
+ return State::IRRECOVERABLE_ERROR;
}
- // Set the location to the location of the command that was ended, then end
- // the current command
+ // End the current command
location = name.getLocation();
commands.pop();
- return cmd.inRangeField ? State::FIELD_END : State::NONE;
+ return State::RANGE_END;
}
-Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
+Variant OsmlStreamParserImpl::parseCommandArguments(Variant commandArgName)
{
// Parse the arguments using the universal VariantReader
Variant commandArguments;
@@ -371,29 +602,14 @@ Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
return commandArguments;
}
-void OsmlStreamParser::pushCommand(Variant commandName,
- Variant commandArguments, bool hasRange)
-{
- // Store the location on the stack
- location = commandName.getLocation();
-
- // Place the command on the command stack, remove the last commands if we're
- // not currently inside a field of these commands
- while (!commands.top().inField) {
- commands.pop();
- }
- commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false, false});
-}
-
-OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
- bool isAnnotation)
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parseCommand(
+ size_t start, bool isAnnotation)
{
// Parse the commandName as a first identifier
Variant commandName = parseIdentifier(start, true);
if (commandName.asString().empty()) {
logger.error("Empty command name", reader);
- return State::NONE;
+ return State::RECOVERABLE_ERROR;
}
// Handle the special "begin" and "end" commands
@@ -403,7 +619,7 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
const bool isEnd = commandNameComponents[0] == "end";
// Parse the begin or end command
- State res = State::COMMAND;
+ State res = State::COMMAND_START;
if (isBegin || isEnd) {
if (commandNameComponents.size() > 1) {
logger.error(
@@ -459,12 +675,13 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
} else {
// Make sure no arguments apart from the "name" argument are given
// to an annotation end
- Variant::mapType &map = commands.top().arguments.asMap();
+ const Variant::mapType &map = getCommandArguments().asMap();
if (!map.empty()) {
if (map.count("name") == 0 || map.size() > 1U) {
logger.error(
"An annotation end command may not have any arguments "
- "other than \"name\"");
+ "other than \"name\"",
+ reader);
return res;
}
}
@@ -478,17 +695,21 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
// If we're starting an annotation, return the command as annotation start
// instead of command
- if (isAnnotation && res == State::COMMAND) {
+ if (isAnnotation && res == State::COMMAND_START) {
return State::ANNOTATION_START;
}
return res;
}
-void OsmlStreamParser::parseBlockComment()
+void OsmlStreamParserImpl::parseBlockComment()
{
Token token;
+ TokenizedData commentData;
size_t depth = 1;
- while (tokenizer.read(reader, token)) {
+ while (tokenizer.read(reader, token, commentData)) {
+ // Throw the comment data away
+ commentData.clear();
+
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
@@ -504,7 +725,7 @@ void OsmlStreamParser::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void OsmlStreamParser::parseLineComment()
+void OsmlStreamParserImpl::parseLineComment()
{
char c;
while (reader.read(c)) {
@@ -514,86 +735,46 @@ void OsmlStreamParser::parseLineComment()
}
}
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+void OsmlStreamParserImpl::pushCommand(Variant commandName,
+ Variant commandArguments, bool hasRange)
{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
- location = data.getLocation();
- reader.resetPeek();
- return true;
- }
- return false;
-}
-
-bool OsmlStreamParser::checkIssueFieldStart()
-{
- // Fetch the current command, and check whether we're currently inside a
- // field of this command
- Command &cmd = commands.top();
- if (!cmd.inField) {
- // If this is a range command, we're now implicitly inside the field of
- // this command -- we'll have to issue a field start command!
- if (cmd.hasRange) {
- cmd.inField = true;
- cmd.inRangeField = true;
- reader.resetPeek();
- return true;
- }
+ // Store the location of the command
+ location = commandName.getLocation();
- // This was not a range command, so obviously we're now inside within
- // a field of some command -- so unroll the commands stack until a
- // command with open field is reached
- while (!commands.top().inField) {
- commands.pop();
- }
+ // Place the command on the command stack, remove the last commands if we're
+ // not currently inside a field of these commands
+ while (!cmd().inField()) {
+ commands.pop();
}
- return false;
+
+ // Push the new command onto the command stack
+ commands.emplace(std::move(commandName), std::move(commandArguments),
+ hasRange);
}
-bool OsmlStreamParser::closeField()
+bool OsmlStreamParserImpl::checkIssueData()
{
- // Try to end an open field of the current command -- if the current command
- // is not inside an open field, end this command and try to close the next
- // one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- if (cmd.inDefaultField) {
- commands.pop();
- }
- return true;
- }
- commands.pop();
- } else {
- return false;
- }
+ if (!data.empty()) {
+ location = data.getLocation();
+ reader.resetPeek();
+ return true;
}
return false;
}
-OsmlStreamParser::State OsmlStreamParser::parse()
+OsmlStreamParserImpl::State OsmlStreamParserImpl::parse()
{
- // Handler for incomming data
- DataHandler handler;
+ // Reset the data handler
+ data.clear();
// Read tokens until the outer loop should be left
Token token;
- while (tokenizer.peek(reader, token)) {
+ while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
if (type == OsmlTokens.Backslash ||
type == OsmlTokens.AnnotationStart) {
- // Before appending anything to the output data or starting a new
- // command, check whether FIELD_START has to be issued, as the
- // current command is a command with range
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
// Check whether a command starts now, without advancing the peek
// cursor
char c;
@@ -606,7 +787,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -614,11 +795,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
State res = parseCommand(token.location.getStart(),
type == OsmlTokens.AnnotationStart);
switch (res) {
- case State::ERROR:
+ case State::IRRECOVERABLE_ERROR:
throw LoggableException(
"Last error was irrecoverable, ending parsing "
"process");
- case State::NONE:
+ case State::RECOVERABLE_ERROR:
continue;
default:
return res;
@@ -632,78 +813,64 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// If this was an annotation start token, add the parsed < to the
// output
+ SourceOffset charStart = token.location.getStart();
+ SourceOffset charEnd = reader.getPeekOffset();
if (type == OsmlTokens.AnnotationStart) {
- handler.append('<', token.location.getStart(),
- token.location.getStart() + 1);
+ data.append('<', charStart, charStart + 1);
+ charStart = charStart + 1;
}
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
+ // Append the character to the output data, mark it as protected
+ data.append(c, charStart, charEnd, true);
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
- // Check whether FIELD_START has to be issued before appending text
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
reader.consumePeek();
continue;
+ } else if (type == OsmlTokens.LineComment) {
+ reader.consumePeek();
+ parseLineComment();
+ continue;
+ } else if (type == OsmlTokens.BlockCommentStart) {
+ reader.consumePeek();
+ parseBlockComment();
+ continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
// We will handle the token now, consume the peeked characters
reader.consumePeek();
- // Update the location to the current token location
+ // Synchronize the location with the current token location
location = token.location;
- if (token.id == OsmlTokens.LineComment) {
- parseLineComment();
- } else if (token.id == OsmlTokens.BlockCommentStart) {
- parseBlockComment();
- } else if (token.id == OsmlTokens.FieldStart) {
- Command &cmd = commands.top();
- if (!cmd.inField) {
- cmd.inField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got field start token \"{\", but no command for which to "
- "start the field. Write \"\\{\" to insert this sequence as "
- "text.",
- token);
+ if (token.id == OsmlTokens.FieldStart) {
+ cmd().pushField(false, token.location);
+ return State::FIELD_START;
} else if (token.id == OsmlTokens.FieldEnd) {
- if (closeField()) {
+ // Remove all commands from the list that currently are not in any
+ // field
+ while (!cmd().inField()) {
+ commands.pop();
+ }
+
+ // If the remaining command is not in a range field, remove this
+ // command
+ if (cmd().inNonRangeField()) {
+ cmd().popField();
return State::FIELD_END;
}
logger.error(
- "Got field end token \"}\", but there is no field to end. "
- "Write \"\\}\" to insert this sequence as text.",
+ "Got field end token \"}\", but there is no field to end.",
token);
} else if (token.id == OsmlTokens.DefaultFieldStart) {
- // Try to start a default field the first time the token is reached
- Command &topCmd = commands.top();
- if (!topCmd.inField) {
- topCmd.inField = true;
- topCmd.inDefaultField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got default field start token \"{!\", but no command for "
- "which to start the field. Write \"\\{!\" to insert this "
- "sequence as text",
- token);
+ cmd().pushField(true, token.location);
+ return State::FIELD_START;
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event
@@ -717,38 +884,103 @@ OsmlStreamParser::State OsmlStreamParser::parse()
}
// Issue available data
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
// Make sure all open commands and fields have been ended at the end of the
// stream
- while (commands.size() > 1) {
- Command &cmd = commands.top();
- if (cmd.inField || cmd.hasRange) {
- logger.error("Reached end of stream, but command \"" +
- cmd.name.asString() + "\" has not been ended",
- cmd.name);
+ while (true) {
+ bool topLevelCommand = commands.size() == 1U;
+ if (cmd().inField()) {
+ // If the stream ended with an open range field, issue information
+ // about the range field
+ if (cmd().inRangeField() && !topLevelCommand) {
+ // Inform about the still open command itself
+ logger.error("Reached end of stream, but command \"" +
+ getCommandName().asString() +
+ "\" has not been ended",
+ getCommandName());
+ } else {
+ // Issue information about still open fields
+ const std::vector<Field> &fields = cmd().getFields();
+ if (!fields.empty()) {
+ logger.error(
+ std::string(
+ "Reached end of stream, but field is still open."),
+ fields.back().location);
+ }
+ }
+ }
+ if (!topLevelCommand) {
+ commands.pop();
+ } else {
+ break;
}
- commands.pop();
}
location = SourceLocation{reader.getSourceId(), reader.getOffset()};
return State::END;
}
+TokenId OsmlStreamParserImpl::registerToken(const std::string &token)
+{
+ return tokenizer.registerToken(token, false);
+}
+
+void OsmlStreamParserImpl::unregisterToken(TokenId id)
+{
+ assert(tokenizer.unregisterToken(id));
+}
+
+/* Class OsmlStreamParser */
+
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
+ : impl(new OsmlStreamParserImpl(reader, logger))
+{
+}
+
+OsmlStreamParser::~OsmlStreamParser()
+{
+ // Stub needed because OsmlStreamParserImpl is incomplete in header
+}
+
+OsmlStreamParser::State OsmlStreamParser::parse()
+{
+ return static_cast<State>(impl->parse());
+}
+
+const TokenizedData &OsmlStreamParser::getData() const
+{
+ return impl->getData();
+}
+
const Variant &OsmlStreamParser::getCommandName() const
{
- return commands.top().name;
+ return impl->getCommandName();
}
const Variant &OsmlStreamParser::getCommandArguments() const
{
- return commands.top().arguments;
+ return impl->getCommandArguments();
+}
+
+const SourceLocation &OsmlStreamParser::getLocation() const
+{
+ return impl->getLocation();
+}
+
+bool OsmlStreamParser::inDefaultField() const { return impl->inDefaultField(); }
+
+bool OsmlStreamParser::inRangeCommand() const { return impl->inRangeCommand(); }
+
+TokenId OsmlStreamParser::registerToken(const std::string &token)
+{
+ return impl->registerToken(token);
}
-bool OsmlStreamParser::inDefaultField() const
+void OsmlStreamParser::unregisterToken(TokenId id)
{
- return commands.top().inRangeField || commands.top().inDefaultField;
+ impl->unregisterToken(id);
}
}
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..b7e64f7 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,68 +29,53 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <cstdint>
+#include <memory>
-#include <core/common/Variant.hpp>
-#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/stack/Callbacks.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
+class TokenizedData;
+class Variant;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
* format. The parser is constructed around a "parse" function, which reads data
* from the underlying CharReader until a new state is reached and indicates
* this state in a return value. The calling code then has to pull corresponding
- * data from the stream reader. The reader makes sure the incommind file is
+ * data from the stream reader. The reader makes sure the incoming stream is
* syntactically valid and tries to recorver from most errors. If an error is
* irrecoverable (this is the case for errors with wrong nesting of commands or
* fields, as this would lead to too many consecutive errors) a
- * LoggableException is thrown.
+ * LoggableException is thrown. In short, the OsmlStreamParser can be described
+ * as a SAX parser for OSML.
*/
-class OsmlStreamParser {
+class OsmlStreamParser: public parser_stack::ParserCallbacks {
public:
/**
* Enum used to indicate which state the OsmlStreamParser class is in
* after calling the "parse" function.
*/
- enum class State {
+ enum class State : uint8_t {
/**
- * State returned if a fully featured command has been read. A command
- * consists of the command name and its arguments (which optionally
- * includes the name).
+ * State returned if the start of a command has been read. Use the
+ * getCommandName(), getCommandArguments() and inRangeCommand()
+ * functions the retrieve more information about the command that was
+ * just started.
*/
- COMMAND,
+ COMMAND_START = 0,
/**
- * State returned if data is given. The reader must decide which field
- * or command this should be routed to. Trailing or leading whitespace
- * has been removed. Only called if the data is non-empty.
- */
- DATA,
-
- /**
- * A user-defined entity has been found. The entity sequence is stored
- * in the command name.
- */
- ENTITY,
-
- /**
- * State returned if an annotation was started. An annotation consists
- * of the command name and its arguments (which optionally include the
- * name).
- */
- ANNOTATION_START,
-
- /**
- * State returned if an annotation ends. The reader indicates which
- * annotation ends.
+ * State returned if a range command or range annotation has just ended.
+ * This state is not returned for non-range commands (as the actual end
+ * of a command is context dependent).
*/
- ANNOTATION_END,
+ RANGE_END = 1,
/**
* State returned if a new field started. The reader assures that the
@@ -98,223 +83,46 @@ public:
* is not started if data has been given outside of a field. The
* field number is set to the current field index.
*/
- FIELD_START,
+ FIELD_START = 2,
/**
* State returned if the current field ends. The reader assures that a
* field was actually open.
*/
- FIELD_END,
+ FIELD_END = 3,
/**
- * The end of the stream has been reached.
+ * State returned if an annotation was started. An annotation consists
+ * of the command name and its arguments (which optionally include the
+ * name).
*/
- END,
+ ANNOTATION_START = 4,
/**
- * Returned from internal functions if nothing should be done.
+ * State returned if an annotation ends. The reader indicates which
+ * annotation ends.
*/
- NONE,
+ ANNOTATION_END = 5,
/**
- * Returned from internal function to indicate irrecoverable errors.
+ * State returned if data is given. The reader must decide which field
+ * or command this should be routed to. Trailing or leading whitespace
+ * has been removed. Only called if the data is non-empty.
*/
- ERROR
- };
-
- /**
- * Entry used for the command stack.
- */
- struct Command {
- /**
- * Name and location of the current command.
- */
- Variant name;
-
- /**
- * Arguments that were passed to the command.
- */
- Variant arguments;
+ DATA = 6,
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
- */
- bool inRangeField : 1;
-
- /**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
- */
- bool inDefaultField : 1;
-
- /**
- * Default constructor.
- */
- Command()
- : hasRange(false),
- inField(false),
- inRangeField(false),
- inDefaultField()
- {
- }
-
- /**
- * Constructor of the Command class.
- *
- * @param name is a string variant with name and location of the
- * command.
- * @param arguments is a map variant with the arguments given to the
- * command.
- * @param hasRange should be set to true if this is a command with
- * explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
- * @param inDefaultField is set to true if we currently are in a
- * specially marked default field.
- */
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
- : name(std::move(name)),
- arguments(std::move(arguments)),
- hasRange(hasRange),
- inField(inField),
- inRangeField(inRangeField),
- inDefaultField(inDefaultField)
- {
- }
+ * The end of the stream has been reached.
+ */
+ END = 7
};
private:
/**
- * Reference to the CharReader instance from which the incomming bytes are
- * read.
- */
- CharReader &reader;
-
- /**
- * Reference at the logger instance to which all error messages are sent.
+ * Pointer at the class containing the internal implementation (according
+ * to the PIMPL idiom).
*/
- Logger &logger;
-
- /**
- * Tokenizer instance used to read individual tokens from the text.
- */
- Tokenizer tokenizer;
-
- /**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
- */
- Variant data;
-
- /**
- * Contains the location of the last token.
- */
- SourceLocation location;
-
- /**
- * Contains the field index of the current command.
- */
- size_t fieldIdx;
-
- /**
- * Function used internall to parse an identifier.
- *
- * @param start is the start byte offset of the identifier (including the
- * backslash).
- * @param allowNSSep should be set to true if the namespace separator is
- * allowed in the identifier name. Issues error if the namespace separator
- * is placed incorrectly.
- */
- Variant parseIdentifier(size_t start, bool allowNSSep = false);
-
- /**
- * Function used internally to handle the special "\begin" command.
- */
- State parseBeginCommand();
-
- /**
- * Function used internally to handle the special "\end" command.
- */
- State parseEndCommand();
-
- /**
- * Pushes the parsed command onto the command stack.
- */
- void pushCommand(Variant commandName, Variant commandArguments,
- bool hasRange);
-
- /**
- * Parses the command arguments.
- */
- Variant parseCommandArguments(Variant commandArgName);
-
- /**
- * Function used internally to parse a command.
- *
- * @param start is the start byte offset of the command (including the
- * backslash)
- * @param isAnnotation if true, the command is not returned as command, but
- * as annotation start.
- * @return true if a command was actuall parsed, false otherwise.
- */
- State parseCommand(size_t start, bool isAnnotation);
-
- /**
- * Function used internally to parse a block comment.
- */
- void parseBlockComment();
-
- /**
- * Function used internally to parse a generic comment.
- */
- void parseLineComment();
-
- /**
- * Checks whether there is any data pending to be issued, if yes, issues it.
- *
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
- * @return true if there was any data and DATA should be returned by the
- * parse function, false otherwise.
- */
- bool checkIssueData(DataHandler &handler);
-
- /**
- * Called before any data is appended to the internal data handler. Checks
- * whether a new field should be started or implicitly ended.
- *
- * @return true if FIELD_START should be returned by the parse function.
- */
- bool checkIssueFieldStart();
-
- /**
- * Closes a currently open field. Note that the command will be removed from
- * the internal command stack if the field that is being closed is a
- * field marked as default field.
- *
- * @return true if the field could be closed, false if there was no field
- * to close.
- */
- bool closeField();
+ std::unique_ptr<OsmlStreamParserImpl> impl;
public:
/**
@@ -328,6 +136,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed to destroy the incomplete
+ * OsmlStreamParserImpl.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -338,17 +152,9 @@ public:
State parse();
/**
- * Returns a reference at the internally stored data. Only valid if
- * State::DATA was returned by the "parse" function.
- *
- * @return a reference at a variant containing the data parsed by the
- * "parse" function.
- */
- const Variant &getData() const { return data; }
-
- /**
* Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing name and location of the
* parsed command.
@@ -357,7 +163,8 @@ public:
/**
* Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
+ * State::COMMAND_START, State::ANNOTATION_START or State::ANNOTATION_END
+ * was returned by the "parse" function.
*
* @return a reference at a variant containing arguments given to the
* command.
@@ -365,19 +172,43 @@ public:
const Variant &getCommandArguments() const;
/**
- * Returns true if the current field is the "default" field. This is true if
- * the parser either is in the outer range of a range command or inside a
- * field that has been especially marked as "default" field (using the "|"
- * syntax).
+ * Returns a reference at the internally stored data. Only valid if
+ * State::DATA was returned by the "parse" function.
+ *
+ * @return a reference at a variant containing the data parsed by the
+ * "parse" function.
*/
- bool inDefaultField() const;
+ const TokenizedData &getData() const;
+
+ /**
+ * Returns the location of the current token.
+ */
+ const SourceLocation &getLocation() const;
/**
- * Returns a reference at the char reader.
+ * Returns true if the currently started command is a range command, only
+ * valid if State::COMMAND_START or State::ANNOTATION_START was returned by
+ * the "parse" function.
*
- * @return the last internal token location.
+	 * @return true if the command that was started is a range command, false
+ * otherwise.
*/
- const SourceLocation &getLocation() const { return location; }
+ bool inRangeCommand() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been especially marked as "default" field (using the "{!"
+ * syntax). Only valid if State::FIELD_START was returned by the "parse"
+ * function.
+ *
+ * @return true if the current field was marked as default field (using the
+ * "{!" syntax).
+ */
+ bool inDefaultField() const;
+
+ TokenId registerToken(const std::string &token) override;
+ void unregisterToken(TokenId token) override;
};
}
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..79a8dbe 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,7 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -40,6 +40,11 @@ namespace ousia {
class OsxmlEventParserData {
public:
/**
+ * Current character data buffer.
+ */
+ TokenizedData data;
+
+ /**
* Contains the current depth of the parsing process.
*/
ssize_t depth;
@@ -52,35 +57,13 @@ public:
ssize_t annotationEndTagDepth;
/**
- * Current character data buffer.
- */
- std::vector<char> textBuf;
-
- /**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
- * Current character data start.
- */
- size_t textStart;
-
- /**
- * Current character data end.
- */
- size_t textEnd;
-
- /**
- * Default constructor.
+ * Constructor taking the sourceId of the file from which the XML is being
+ * parsed.
+ *
+	 * @param sourceId is the source id of the XML file from which the data is
+ * currently being parsed.
*/
- OsxmlEventParserData();
+ OsxmlEventParserData(SourceId sourceId);
/**
* Increments the depth.
@@ -103,14 +86,6 @@ public:
* @return true if character data is available.
*/
bool hasText();
-
- /**
- * Returns a Variant containing the character data and its location.
- *
- * @return a string variant containing the text data and the character
- * location.
- */
- Variant getText(SourceId sourceId);
};
/* Class GuardedExpatXmlParser */
@@ -168,7 +143,7 @@ public:
static const std::string TOP_LEVEL_TAG{"ousia"};
/**
- * Prefix used to indicate the start of an annoation (note the trailing colon)
+ * Prefix used to indicate the start of an annotation (note the trailing colon).
*/
static const std::string ANNOTATION_START_PREFIX{"a:start:"};
@@ -215,8 +190,9 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Read the argument locations -- this is only a stupid and slow hack,
@@ -335,7 +311,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name,
// Just issue a "commandStart" event in any other case
Variant nameVar = Variant::fromString(nameStr);
nameVar.setLocation(nameLoc);
- parser->getEvents().command(nameVar, args);
+ parser->getEvents().commandStart(nameVar, args);
}
}
@@ -360,8 +336,9 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
// If there is any text data in the buffer, issue that first
if (parser->getData().hasText()) {
- parser->getEvents().data(
- parser->getData().getText(parser->getReader().getSourceId()));
+ TokenizedData &data = parser->getData().data;
+ parser->getEvents().data(data);
+ data.clear();
}
// Abort if the special ousia tag ends here
@@ -370,8 +347,8 @@ static void xmlEndElementHandler(void *ref, const XML_Char *name)
return;
}
- // Issue the "fieldEnd" event
- parser->getEvents().fieldEnd();
+ // Issue the "rangeEnd" event
+ parser->getEvents().rangeEnd();
}
static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
@@ -393,34 +370,8 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
// Synchronize the logger position
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
- // Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
- OsxmlEventParserData &data = parser->getData();
- std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
- }
+ // Append the data to the buffer
+ parser->getData().data.append(std::string(s, ulen), loc.getStart());
}
/* Class OsxmlEvents */
@@ -429,12 +380,8 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
-OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+OsxmlEventParserData::OsxmlEventParserData(SourceId sourceId)
+ : data(sourceId), depth(0), annotationEndTagDepth(-1)
{
}
@@ -455,25 +402,7 @@ bool OsxmlEventParserData::inAnnotationEndTag()
return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth);
}
-bool OsxmlEventParserData::hasText() { return !textBuf.empty(); }
-
-Variant OsxmlEventParserData::getText(SourceId sourceId)
-{
- // Create a variant containing the string data and the location
- Variant var =
- Variant::fromString(std::string{textBuf.data(), textBuf.size()});
- var.setLocation({sourceId, textStart, textEnd});
-
- // Reset the text buffers
- textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
- textStart = 0;
- textEnd = 0;
-
- // Return the variant
- return var;
-}
+bool OsxmlEventParserData::hasText() { return !data.empty(); }
/* Class OsxmlEventParser */
@@ -482,8 +411,7 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
- data(new OsxmlEventParserData())
+ data(new OsxmlEventParserData(reader.getSourceId()))
{
}
@@ -532,16 +460,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..4c5a485 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -61,7 +59,8 @@ public:
* @param args is a map containing the arguments that were given to the
* command.
*/
- virtual void command(const Variant &name, const Variant::mapType &args) = 0;
+ virtual void commandStart(const Variant &name,
+ const Variant::mapType &args) = 0;
/**
* Called whenever an annotation starts. Note that this implicitly always
@@ -90,24 +89,17 @@ public:
const Variant &elementName) = 0;
/**
- * Called whenever the default field which was implicitly started by
- * commandStart or annotationStart ends. Note that this does not end the
- * range of an annotation, but the default field of the annotation. To
- * signal the end of the annotation this, the annotationEnd method will be
- * invoked.
+ * Called whenever the command or annotation tags end.
*/
- virtual void fieldEnd() = 0;
+ virtual void rangeEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a TokenizedData instance containing the string data that
+ * was found in the XML file.
*/
- virtual void data(const Variant &data) = 0;
+ virtual void data(const TokenizedData &data) = 0;
};
/**
@@ -135,11 +127,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +158,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.
@@ -207,7 +179,9 @@ public:
OsxmlEvents &getEvents() const;
/**
- * Returns a reference at the internal data.
+ * Used internally to fetch a reference at the internal data.
+ *
+ * @return a reference at the internal OsxmlEventParserData structure.
*/
OsxmlEventParserData &getData() const;
};
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
index c216855..10cc77a 100644
--- a/src/formats/osxml/OsxmlParser.cpp
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -16,6 +16,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <core/common/Variant.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/parser/stack/Callbacks.hpp>
#include <core/parser/stack/GenericParserStates.hpp>
#include <core/parser/stack/Stack.hpp>
#include <core/parser/ParserContext.hpp>
@@ -30,7 +33,7 @@ using namespace parser_stack;
/**
* Class containing the actual OsxmlParser implementation.
*/
-class OsxmlParserImplementation : public OsxmlEvents {
+class OsxmlParserImplementation : public OsxmlEvents, ParserCallbacks {
private:
/**
* Actual xml parser -- converts the xml stream into a set of events.
@@ -54,7 +57,7 @@ public:
*/
OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)
: parser(reader, *this, ctx.getLogger()),
- stack(ctx, GenericParserStates)
+ stack(*this, ctx, GenericParserStates)
{
}
@@ -63,17 +66,16 @@ public:
*/
void parse() { parser.parse(); }
- void command(const Variant &name, const Variant::mapType &args) override
+ void commandStart(const Variant &name,
+ const Variant::mapType &args) override
{
- stack.command(name, args);
- stack.fieldStart(true);
+ stack.commandStart(name, args, true);
}
void annotationStart(const Variant &name,
const Variant::mapType &args) override
{
- stack.annotationStart(name, args);
- stack.fieldStart(true);
+ stack.annotationStart(name, args, true);
}
void annotationEnd(const Variant &className,
@@ -82,9 +84,19 @@ public:
stack.annotationEnd(className, elementName);
}
- void fieldEnd() override { stack.fieldEnd(); }
+ void rangeEnd() override { stack.rangeEnd(); }
- void data(const Variant &data) override { stack.data(data); }
+ void data(const TokenizedData &data) override { stack.data(data); }
+
+ TokenId registerToken(const std::string &token) override
+ {
+ return Tokens::Empty;
+ }
+
+ void unregisterToken(TokenId id) override
+ {
+ // Do nothing here
+ }
};
/* Class OsxmlParser */