summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core/common/Utils.cpp54
-rw-r--r--src/core/common/Utils.hpp86
-rw-r--r--src/core/common/Whitespace.hpp60
-rw-r--r--src/core/common/WhitespaceHandler.hpp284
-rw-r--r--src/core/model/Node.cpp2
-rw-r--r--src/core/parser/ParserStack.cpp216
-rw-r--r--src/core/parser/ParserStack.hpp361
-rw-r--r--src/core/parser/generic/GenericParser.cpp0
-rw-r--r--src/core/parser/stack/Callbacks.cpp23
-rw-r--r--src/core/parser/stack/Callbacks.hpp99
-rw-r--r--src/core/parser/stack/DocumentHandler.hpp15
-rw-r--r--src/core/parser/stack/DomainHandler.hpp69
-rw-r--r--src/core/parser/stack/Handler.cpp252
-rw-r--r--src/core/parser/stack/Handler.hpp414
-rw-r--r--src/core/parser/stack/ImportIncludeHandler.hpp72
-rw-r--r--src/core/parser/stack/Stack.cpp554
-rw-r--r--src/core/parser/stack/Stack.hpp341
-rw-r--r--src/core/parser/stack/State.cpp (renamed from src/core/parser/ParserState.cpp)66
-rw-r--r--src/core/parser/stack/State.hpp (renamed from src/core/parser/ParserState.hpp)152
-rw-r--r--src/core/parser/utils/TokenTrie.cpp (renamed from src/formats/osdm/TokenTrie.cpp)0
-rw-r--r--src/core/parser/utils/TokenTrie.hpp (renamed from src/formats/osdm/TokenTrie.hpp)0
-rw-r--r--src/core/parser/utils/Tokenizer.cpp (renamed from src/formats/osdm/DynamicTokenizer.cpp)283
-rw-r--r--src/core/parser/utils/Tokenizer.hpp (renamed from src/formats/osdm/DynamicTokenizer.hpp)57
-rw-r--r--src/formats/osml/OsmlParser.cpp57
-rw-r--r--src/formats/osml/OsmlParser.hpp (renamed from src/core/parser/generic/GenericParser.hpp)31
-rw-r--r--src/formats/osml/OsmlStreamParser.cpp (renamed from src/formats/osdm/OsdmStreamParser.cpp)226
-rw-r--r--src/formats/osml/OsmlStreamParser.hpp (renamed from src/formats/osdm/OsdmStreamParser.hpp)90
-rw-r--r--src/formats/osxml/OsxmlAttributeLocator.cpp144
-rw-r--r--src/formats/osxml/OsxmlAttributeLocator.hpp67
-rw-r--r--src/formats/osxml/OsxmlEventParser.cpp547
-rw-r--r--src/formats/osxml/OsxmlEventParser.hpp215
-rw-r--r--src/formats/osxml/OsxmlParser.cpp238
-rw-r--r--src/formats/osxml/OsxmlParser.hpp (renamed from src/plugins/xml/XmlParser.hpp)12
-rw-r--r--src/plugins/css/CodeTokenizer.cpp (renamed from src/core/CodeTokenizer.cpp)0
-rw-r--r--src/plugins/css/CodeTokenizer.hpp (renamed from src/core/CodeTokenizer.hpp)0
-rw-r--r--src/plugins/css/Tokenizer.cpp (renamed from src/core/Tokenizer.cpp)0
-rw-r--r--src/plugins/css/Tokenizer.hpp (renamed from src/core/Tokenizer.hpp)0
-rw-r--r--src/plugins/xml/XmlParser.cpp575
38 files changed, 3962 insertions, 1700 deletions
diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp
index 563fe2a..f8b53c6 100644
--- a/src/core/common/Utils.cpp
+++ b/src/core/common/Utils.cpp
@@ -18,19 +18,13 @@
#include <algorithm>
#include <cctype>
-#include <limits>
#include <string>
#include "Utils.hpp"
+#include "WhitespaceHandler.hpp"
namespace ousia {
-std::string Utils::trim(const std::string &s)
-{
- std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
- return s.substr(bounds.first, bounds.second - bounds.first);
-}
-
bool Utils::isIdentifier(const std::string &name)
{
bool first = true;
@@ -43,7 +37,27 @@ bool Utils::isIdentifier(const std::string &name)
}
first = false;
}
- return true;
+ return !first;
+}
+
+bool Utils::isIdentifierOrEmpty(const std::string &name)
+{
+ return name.empty() || isIdentifier(name);
+}
+
+bool Utils::isNamespacedIdentifier(const std::string &name)
+{
+ bool first = true;
+ for (char c : name) {
+ if (first && !isIdentifierStartCharacter(c)) {
+ return false;
+ }
+ if (!first && (!isIdentifierCharacter(c) && c != ':')) {
+ return false;
+ }
+ first = (c == ':');
+ }
+ return !first;
}
bool Utils::hasNonWhitepaceChar(const std::string &s)
@@ -94,5 +108,29 @@ std::string Utils::extractFileExtension(const std::string &filename)
}
return std::string{};
}
+
+std::string Utils::trim(const std::string &s)
+{
+ std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace);
+ return s.substr(bounds.first, bounds.second - bounds.first);
+}
+
+std::string Utils::collapse(const std::string &s)
+{
+ CollapsingWhitespaceHandler h;
+ appendToWhitespaceHandler(h, s, 0);
+ return h.toString();
+}
+
+bool Utils::startsWith(const std::string &s, const std::string &prefix)
+{
+ return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
+}
+
+bool Utils::endsWith(const std::string &s, const std::string &suffix)
+{
+ return suffix.size() <= s.size() &&
+ s.substr(s.size() - suffix.size(), suffix.size()) == suffix;
+}
}
diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp
index 2c8a5b3..b5a54fc 100644
--- a/src/core/common/Utils.hpp
+++ b/src/core/common/Utils.hpp
@@ -74,16 +74,45 @@ public:
}
/**
- * Returns true if the given character is in [A-Za-z][A-Za-z0-9_-]*
+ * Returns true if the given string is in
+ * \code{.txt}
+ * [A-Za-z][A-Za-z0-9_-]*
+ * \endcode
+ *
+ * @param name is the string that should be tested.
+ * @return true if the string matches the regular expression given above,
+ * false otherwise.
*/
static bool isIdentifier(const std::string &name);
/**
+ * Returns true if the given string is an identifier or an empty string.
+ */
+ static bool isIdentifierOrEmpty(const std::string &name);
+
+ /**
+ * Returns true if the given string is in
+ * \code{.txt}
+ * ([A-Za-z][A-Za-z0-9_-]*)(:[A-Za-z][A-Za-z0-9_-]*)*
+ * \endcode
+ *
+ * @param name is the string that should be tested.
+ * @return true if the string matches the regular expression given above,
+ * false otherwise.
+ */
+ static bool isNamespacedIdentifier(const std::string &name);
+
+ /**
+ * Returns true if the given character is a linebreak character.
+ */
+ static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
+
+ /**
* Returns true if the given character is a whitespace character.
*/
static bool isWhitespace(const char c)
{
- return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r');
+ return (c == ' ') || (c == '\t') || isLinebreak(c);
}
/**
@@ -95,11 +124,6 @@ public:
static bool hasNonWhitepaceChar(const std::string &s);
/**
- * Returns true if the given character is a whitespace character.
- */
- static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); }
-
- /**
* Removes whitespace at the beginning and the end of the given string.
*
* @param s is the string that should be trimmed.
@@ -120,8 +144,25 @@ public:
template <class T, class Filter>
static std::pair<size_t, size_t> trim(const T &s, Filter f)
{
+ return trim(s, s.size(), f);
+ }
+
+ /**
+ * Trims the given string or vector of chars by returning the start and end
+ * index.
+ *
+ * @param s is the container that should be trimmed.
+ * @param len is the number of elements in the container.
+ * @param f is a function that returns true for values that should be
+ * removed.
+ * @return start and end index. Note that "end" points at the character
+ * beyond the end, thus "end" minus "start" is the trimmed length.
+ */
+ template <class T, class Filter>
+ static std::pair<size_t, size_t> trim(const T &s, size_t len, Filter f)
+ {
size_t start = 0;
- for (size_t i = 0; i < s.size(); i++) {
+ for (size_t i = 0; i < len; i++) {
if (!f(s[i])) {
start = i;
break;
@@ -129,7 +170,7 @@ public:
}
size_t end = 0;
- for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) {
+ for (ssize_t i = len - 1; i >= static_cast<ssize_t>(start); i--) {
if (!f(s[i])) {
end = i + 1;
break;
@@ -145,6 +186,15 @@ public:
}
/**
+ * Collapses the whitespace in the given string (trims the string and
+ * replaces each run of whitespace characters by a single space).
+ *
+ * @param s is the string in which the whitespace should be collapsed.
+ * @return a copy of s with collapsed whitespace.
+ */
+ static std::string collapse(const std::string &s);
+
+ /**
* Turns the elements of a collection into a string separated by the
* given delimiter.
*
@@ -205,6 +255,24 @@ public:
static std::string extractFileExtension(const std::string &filename);
/**
+ * Checks whether the given string starts with the given prefix.
+ *
+ * @param s is the string.
+ * @param prefix is the string which should be checked for being a prefix of
+ * s.
+ */
+ static bool startsWith(const std::string &s, const std::string &prefix);
+
+ /**
+ * Checks whether the given string ends with the given suffix.
+ *
+ * @param s is the string.
+ * @param suffix is the string which should be checked for being a suffix of
+ * s.
+ */
+ static bool endsWith(const std::string &s, const std::string &suffix);
+
+ /**
* Hash functional to be used for enum classes.
* See http://stackoverflow.com/a/24847480/2188211
*/
diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp
new file mode 100644
index 0000000..72a2291
--- /dev/null
+++ b/src/core/common/Whitespace.hpp
@@ -0,0 +1,60 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Whitespace.hpp
+ *
+ * Contains the WhitespaceMode enum used in various places.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HPP_
+#define _OUSIA_WHITESPACE_HPP_
+
+#include <string>
+#include <utility>
+
+namespace ousia {
+
+/**
+ * Enum specifying the whitespace handling mode of the tokenizer and the
+ * parsers.
+ */
+enum class WhitespaceMode {
+ /**
+ * Preserves all whitespaces as they are found in the source file.
+ */
+ PRESERVE,
+
+ /**
+ * Trims whitespace at the beginning and the end of the found text.
+ */
+ TRIM,
+
+ /**
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
+ COLLAPSE
+};
+
+}
+
+#endif /* _OUSIA_WHITESPACE_HPP_ */
+
diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp
new file mode 100644
index 0000000..ed52ea3
--- /dev/null
+++ b/src/core/common/WhitespaceHandler.hpp
@@ -0,0 +1,284 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file WhitespaceHandler.hpp
+ *
+ * Contains the WhitespaceHandler classes which are used in multiple places to
+ * trim, compact or preserve whitespaces while at the same time maintaining the
+ * position information associated with the input strings.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
+#define _OUSIA_WHITESPACE_HANDLER_HPP_
+
+#include <string>
+#include <vector>
+
+#include "Utils.hpp"
+
+namespace ousia {
+
+/**
+ * WhitespaceHandler is a base class that can be used to collect text on a
+ * character-by-character basis. Note that this class and its descendants are
+ * hoped to be inlined by the compiler (and used in conjunction with templates),
+ * thus they are fully defined inside this header.
+ */
+class WhitespaceHandler {
+public:
+ /**
+ * Start position of the extracted text.
+ */
+ size_t textStart;
+
+ /**
+ * End position of the extracted text.
+ */
+ size_t textEnd;
+
+ /**
+ * Buffer containing the extracted text.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Constructor of the WhitespaceHandler base class. Initializes the start
+ * and end position with zeros.
+ */
+ WhitespaceHandler() : textStart(0), textEnd(0) {}
+
+ /**
+ * Returns true if this whitespace handler has found any text and a text
+ * token could be emitted.
+ *
+ * @return true if the internal data buffer is non-empty.
+ */
+ bool hasText() { return !textBuf.empty(); }
+
+ /**
+ * Returns the content of the WhitespaceHandler as string.
+ */
+ std::string toString() const
+ {
+ return std::string(textBuf.data(), textBuf.size());
+ }
+};
+
+/**
+ * The PreservingWhitespaceHandler class preserves all characters unmodified,
+ * including whitespace characters.
+ */
+class PreservingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Appends the given character to the internal text buffer, does not
+ * eliminate whitespace.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd);
+ }
+
+ /**
+ * Static version of PreservingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference at the text buffer that is to be used.
+ * @param textStart is a reference at the text start variable that is to be
+ * used.
+ * @param textEnd is a reference at the text end variable that is to be
+ * used.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd)
+ {
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * The TrimmingWhitespaceHandler class trims all whitespace characters at the
+ * beginning and the end of a text section but leaves all other characters
+ * unmodified, including whitespace characters.
+ */
+class TrimmingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Buffer used internally to temporarily store all whitespace characters.
+ * They are only added to the output buffer if another non-whitespace
+ * character is reached.
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * whitespace characters at the beginning and end of the text.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
+ }
+
+ /**
+ * Static version of TrimmingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference at the text buffer that is to be used.
+ * @param textStart is a reference at the text start variable that is to be
+ * used.
+ * @param textEnd is a reference at the text end variable that is to be
+ * used.
+ * @param whitespaceBuf is a reference at the buffer for storing whitespace
+ * characters.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd, std::vector<char> &whitespaceBuf)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ whitespaceBuf.push_back(c);
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (!whitespaceBuf.empty()) {
+ textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+ whitespaceBuf.end());
+ whitespaceBuf.clear();
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * The CollapsingWhitespaceHandler trims whitespace at the beginning and end of
+ * the text and reduces multiple whitespace characters to a single blank.
+ */
+class CollapsingWhitespaceHandler : public WhitespaceHandler {
+public:
+ /**
+ * Flag set to true if a whitespace character was reached.
+ */
+ bool hasWhitespace = false;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * redundant whitespace characters.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
+ }
+
+ /**
+ * Static version of CollapsingWhitespaceHandler append
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ * @param textBuf is a reference at the text buffer that is to be used.
+ * @param textStart is a reference at the text start variable that is to be
+ * used.
+ * @param textEnd is a reference at the text end variable that is to be
+ * used.
+ * @param hasWhitespace is a reference at the "hasWhitespace" flag.
+ */
+ static void append(char c, size_t start, size_t end,
+ std::vector<char> &textBuf, size_t &textStart,
+ size_t &textEnd, bool &hasWhitespace)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ hasWhitespace = true;
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (hasWhitespace) {
+ textBuf.push_back(' ');
+ hasWhitespace = false;
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/**
+ * Function that can be used to append the given buffer (e.g. a string or a
+ * vector) to the whitespace handler.
+ *
+ * @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
+ * @tparam Buffer is an iterable type.
+ * @param handler is the handler to which the characters of the Buffer should be
+ * appended.
+ * @param buf is the buffer from which the characters should be read.
+ * @param start is the start byte offset. Each character is counted as one byte.
+ */
+template <typename WhitespaceHandler, typename Buffer>
+inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
+ size_t start)
+{
+ for (auto elem : buf) {
+ handler.append(elem, start, start + 1);
+ start++;
+ }
+}
+}
+
+#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */
+
diff --git a/src/core/model/Node.cpp b/src/core/model/Node.cpp
index 39ee2e4..ce15cad 100644
--- a/src/core/model/Node.cpp
+++ b/src/core/model/Node.cpp
@@ -448,7 +448,7 @@ bool Node::doValidate(Logger &logger) const { return true; }
bool Node::validateName(Logger &logger) const
{
- if (!Utils::isIdentifier(name)) {
+ if (!Utils::isIdentifierOrEmpty(name)) {
logger.error(type()->name + std::string(" name \"") + name +
std::string("\" is not a valid identifier"),
this);
diff --git a/src/core/parser/ParserStack.cpp b/src/core/parser/ParserStack.cpp
deleted file mode 100644
index 1265851..0000000
--- a/src/core/parser/ParserStack.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <sstream>
-
-#include <core/common/Utils.hpp>
-#include <core/common/Exceptions.hpp>
-#include <core/model/Project.hpp>
-
-#include "ParserScope.hpp"
-#include "ParserStack.hpp"
-
-namespace ousia {
-
-/* A default handler */
-
-/**
- * The DefaultHandler class is used in case no element handler is specified in
- * the ParserState descriptor.
- */
-class DefaultHandler : public Handler {
-public:
- using Handler::Handler;
-
- void start(Variant::mapType &args) override {}
-
- void end() override {}
-
- static Handler *create(const HandlerData &handlerData)
- {
- return new DefaultHandler{handlerData};
- }
-};
-
-/* Class Handler */
-
-void Handler::data(const std::string &data, int field)
-{
- if (Utils::hasNonWhitepaceChar(data)) {
- logger().error("Expected command but found character data.");
- }
-}
-
-/* Class ParserStack */
-
-/**
- * Returns an Exception that should be thrown when a currently invalid command
- * is thrown.
- */
-static LoggableException InvalidCommand(const std::string &name,
- const std::set<std::string> &expected)
-{
- if (expected.empty()) {
- return LoggableException{
- std::string{"No nested elements allowed, but got \""} + name +
- std::string{"\""}};
- } else {
- return LoggableException{
- std::string{"Expected "} +
- (expected.size() == 1 ? std::string{"\""}
- : std::string{"one of \""}) +
- Utils::join(expected, "\", \"") + std::string{"\", but got \""} +
- name + std::string{"\""}};
- }
-}
-
-ParserStack::ParserStack(
- ParserContext &ctx,
- const std::multimap<std::string, const ParserState *> &states)
- : ctx(ctx), states(states)
-{
-}
-
-bool ParserStack::deduceState()
-{
- // Assemble all states
- std::vector<const ParserState *> states;
- for (const auto &e : this->states) {
- states.push_back(e.second);
- }
-
- // Fetch the type signature of the scope and derive all possible states,
- // abort if no unique parser state was found
- std::vector<const ParserState *> possibleStates =
- ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states)
- .deduce();
- if (possibleStates.size() != 1) {
- ctx.getLogger().error(
- "Error while including file: Cannot deduce parser state.");
- return false;
- }
-
- // Switch to this state by creating a dummy handler
- const ParserState *state = possibleStates[0];
- Handler *handler =
- DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}});
- stack.emplace(handler);
- return true;
-}
-
-std::set<std::string> ParserStack::expectedCommands()
-{
- const ParserState *currentState = &(this->currentState());
- std::set<std::string> res;
- for (const auto &v : states) {
- if (v.second->parents.count(currentState)) {
- res.insert(v.first);
- }
- }
- return res;
-}
-
-const ParserState &ParserStack::currentState()
-{
- return stack.empty() ? ParserStates::None : stack.top()->state();
-}
-
-std::string ParserStack::currentCommandName()
-{
- return stack.empty() ? std::string{} : stack.top()->name();
-}
-
-const ParserState *ParserStack::findTargetState(const std::string &name)
-{
- const ParserState *currentState = &(this->currentState());
- auto range = states.equal_range(name);
- for (auto it = range.first; it != range.second; it++) {
- const ParserStateSet &parents = it->second->parents;
- if (parents.count(currentState) || parents.count(&ParserStates::All)) {
- return it->second;
- }
- }
-
- return nullptr;
-}
-
-void ParserStack::start(const std::string &name, Variant::mapType &args,
- const SourceLocation &location)
-{
- ParserState const *targetState = findTargetState(name);
-// TODO: Andreas, please improve this.
-// if (!Utils::isIdentifier(name)) {
-// throw LoggableException(std::string("Invalid identifier \"") + name +
-// std::string("\""));
-// }
-
- if (targetState == nullptr) {
- targetState = findTargetState("*");
- }
- if (targetState == nullptr) {
- throw InvalidCommand(name, expectedCommands());
- }
-
- // Fetch the associated constructor
- HandlerConstructor ctor = targetState->elementHandler
- ? targetState->elementHandler
- : DefaultHandler::create;
-
- // Canonicalize the arguments, allow additional arguments
- targetState->arguments.validateMap(args, ctx.getLogger(), true);
-
- // Instantiate the handler and call its start function
- Handler *handler = ctor({ctx, name, *targetState, currentState(), location});
- handler->start(args);
- stack.emplace(handler);
-}
-
-void ParserStack::start(std::string name, const Variant::mapType &args,
- const SourceLocation &location)
-{
- Variant::mapType argsCopy(args);
- start(name, argsCopy);
-}
-
-void ParserStack::end()
-{
- // Check whether the current command could be ended
- if (stack.empty()) {
- throw LoggableException{"No command to end."};
- }
-
- // Remove the current HandlerInstance from the stack
- std::shared_ptr<Handler> inst{stack.top()};
- stack.pop();
-
- // Call the end function of the last Handler
- inst->end();
-}
-
-void ParserStack::data(const std::string &data, int field)
-{
- // Check whether there is any command the data can be sent to
- if (stack.empty()) {
- throw LoggableException{"No command to receive data."};
- }
-
- // Pass the data to the current Handler instance
- stack.top()->data(data, field);
-}
-}
-
diff --git a/src/core/parser/ParserStack.hpp b/src/core/parser/ParserStack.hpp
deleted file mode 100644
index efc4e4a..0000000
--- a/src/core/parser/ParserStack.hpp
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file ParserStack.hpp
- *
- * Helper classes for document or description parsers. Contains the ParserStack
- * class, which is an pushdown automaton responsible for accepting commands in
- * the correct order and calling specified handlers.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_PARSER_STACK_HPP_
-#define _OUSIA_PARSER_STACK_HPP_
-
-#include <cstdint>
-
-#include <map>
-#include <memory>
-#include <set>
-#include <stack>
-#include <vector>
-
-#include <core/common/Variant.hpp>
-#include <core/common/Logger.hpp>
-#include <core/common/Argument.hpp>
-
-#include "Parser.hpp"
-#include "ParserContext.hpp"
-#include "ParserState.hpp"
-
-namespace ousia {
-
-/**
- * Struct collecting all the data that is being passed to a Handler instance.
- */
-struct HandlerData {
- /**
- * Reference to the ParserContext instance that should be used to resolve
- * references to nodes in the Graph.
- */
- ParserContext &ctx;
-
- /**
- * Contains the name of the tag that is being handled.
- */
- const std::string name;
-
- /**
- * Contains the current state of the state machine.
- */
- const ParserState &state;
-
- /**
- * Contains the state of the state machine when the parent node was handled.
- */
- const ParserState &parentState;
-
- /**
- * Current source code location.
- */
- const SourceLocation location;
-
- /**
- * Constructor of the HandlerData class.
- *
- * @param ctx is the parser context the handler should be executed in.
- * @param name is the name of the string.
- * @param state is the state this handler was called for.
- * @param parentState is the state of the parent command.
- * @param location is the location at which the handler is created.
- */
- HandlerData(ParserContext &ctx, std::string name, const ParserState &state,
- const ParserState &parentState, const SourceLocation location)
- : ctx(ctx),
- name(std::move(name)),
- state(state),
- parentState(parentState),
- location(location){};
-};
-
-/**
- * The handler class provides a context for handling an XML tag. It has to be
- * overridden and registered in the StateStack class to form handlers for
- * concrete XML tags.
- */
-class Handler {
-private:
- /**
- * Structure containing the internal handler data.
- */
- const HandlerData handlerData;
-
-public:
- /**
- * Constructor of the Handler class.
- *
- * @param data is a structure containing all data being passed to the
- * handler.
- */
- Handler(const HandlerData &handlerData) : handlerData(handlerData){};
-
- /**
- * Virtual destructor.
- */
- virtual ~Handler(){};
-
- /**
- * Returns a reference at the ParserContext.
- *
- * @return a reference at the ParserContext.
- */
- ParserContext &context() { return handlerData.ctx; }
-
- /**
- * Returns the command name for which the handler was created.
- *
- * @return a const reference at the command name.
- */
- const std::string &name() { return handlerData.name; }
-
- /**
- * Returns a reference at the ParserScope instance.
- *
- * @return a reference at the ParserScope instance.
- */
- ParserScope &scope() { return handlerData.ctx.getScope(); }
-
- /**
- * Returns a reference at the Manager instance which manages all nodes.
- *
- * @return a referance at the Manager instance.
- */
- Manager &manager() { return handlerData.ctx.getManager(); }
-
- /**
- * Returns a reference at the Logger instance used for logging error
- * messages.
- *
- * @return a reference at the Logger instance.
- */
- Logger &logger() { return handlerData.ctx.getLogger(); }
-
- /**
- * Returns a reference at the Project Node, representing the project into
- * which the file is currently being parsed.
- *
- * @return a referance at the Project Node.
- */
- Rooted<Project> project() { return handlerData.ctx.getProject(); }
-
- /**
- * Reference at the ParserState descriptor for which this Handler was
- * created.
- *
- * @return a const reference at the constructing ParserState descriptor.
- */
- const ParserState &state() { return handlerData.state; }
-
- /**
- * Reference at the ParserState descriptor of the parent state of the state
- * for which this Handler was created. Set to ParserStates::None if there
- * is no parent state.
- *
- * @return a const reference at the parent state of the constructing
- * ParserState descriptor.
- */
- const ParserState &parentState() { return handlerData.parentState; }
-
- /**
- * Returns the current location in the source file.
- *
- * @return the current location in the source file.
- */
- SourceLocation location() { return handlerData.location; }
-
- /**
- * Called when the command that was specified in the constructor is
- * instanciated.
- *
- * @param args is a map from strings to variants (argument name and value).
- */
- virtual void start(Variant::mapType &args) = 0;
-
- /**
- * Called whenever the command for which this handler is defined ends.
- */
- virtual void end() = 0;
-
- /**
- * Called whenever raw data (int the form of a string) is available for the
- * Handler instance. In the default handler an exception is raised if the
- * received data contains non-whitespace characters.
- *
- * @param data is a pointer at the character data that is available for the
- * Handler instance.
- * @param field is the field number (the interpretation of this value
- * depends on the format that is being parsed).
- */
- virtual void data(const std::string &data, int field);
-};
-
-/**
- * HandlerConstructor is a function pointer type used to create concrete
- * instances of the Handler class.
- *
- * @param handlerData is the data that should be passed to the new handler
- * instance.
- * @return a newly created handler instance.
- */
-using HandlerConstructor = Handler *(*)(const HandlerData &handlerData);
-
-/**
- * The ParserStack class is a pushdown automaton responsible for turning a
- * command stream into a tree of Node instances.
- */
-class ParserStack {
-private:
- /**
- * Reference at the parser context.
- */
- ParserContext &ctx;
-
- /**
- * Map containing all registered command names and the corresponding
- * state descriptors.
- */
- const std::multimap<std::string, const ParserState *> &states;
-
- /**
- * Internal stack used for managing the currently active Handler instances.
- */
- std::stack<std::shared_ptr<Handler>> stack;
-
- /**
- * Used internally to get all expected command names for the current state.
- * This function is used to build error messages.
- *
- * @return a set of strings containing the names of the expected commands.
- */
- std::set<std::string> expectedCommands();
-
- /**
- * Returns the targetState for a command with the given name that can be
- * reached from for the current state.
- *
- * @param name is the name of the requested command.
- * @return nullptr if no target state was found, a pointer at the target
- *state
- * otherwise.
- */
- const ParserState *findTargetState(const std::string &name);
-
-public:
- /**
- * Creates a new instance of the ParserStack class.
- *
- * @param ctx is the parser context the parser stack is working on.
- * @param states is a map containing the command names and pointers at the
- * corresponding ParserState instances.
- */
- ParserStack(ParserContext &ctx,
- const std::multimap<std::string, const ParserState *> &states);
-
- /**
- * Tries to reconstruct the parser state from the Scope instance of the
- * ParserContext given in the constructor. This functionality is needed for
- * including files,as the Parser of the included file needs to be brought to
- + an equivalent state as the one in the including file.
- *
- * @param scope is the ParserScope instance from which the ParserState
- * should be reconstructed.
- * @param logger is the logger instance to which error messages should be
- * written.
- * @return true if the operation was sucessful, false otherwise.
- */
- bool deduceState();
-
- /**
- * Returns the state the ParserStack instance currently is in.
- *
- * @return the state of the currently active Handler instance or STATE_NONE
- * if no handler is on the stack.
- */
- const ParserState &currentState();
-
- /**
- * Returns the command name that is currently being handled.
- *
- * @return the name of the command currently being handled by the active
- * Handler instance or an empty string if no handler is currently active.
- */
- std::string currentCommandName();
-
- /**
- * Function that should be called whenever a new command starts.
- *
- * @param name is the name of the command.
- * @param args is a map from strings to variants (argument name and value).
- * Note that the passed map will be modified.
- * @param location is the location in the source file at which the command
- * starts.
- */
- void start(const std::string &name, Variant::mapType &args,
- const SourceLocation &location = SourceLocation{});
-
- /**
- * Function that should be called whenever a new command starts.
- *
- * @param name is the name of the command.
- * @param args is a map from strings to variants (argument name and value).
- * @param location is the location in the source file at which the command
- * starts.
- */
- void start(std::string name,
- const Variant::mapType &args = Variant::mapType{},
- const SourceLocation &location = SourceLocation{});
-
- /**
- * Function called whenever a command ends.
- */
- void end();
-
- /**
- * Function that should be called whenever data is available for the
- * command.
- *
- * @param data is the data that should be passed to the handler.
- * @param field is the field number (the interpretation of this value
- * depends on the format that is being parsed).
- */
- void data(const std::string &data, int field = 0);
-
- /**
- * Returns a reference to the parser context the parser stack is currently
- * working on.
- *
- * @return a reference to the parser context.
- */
- ParserContext &getContext() { return ctx; }
-};
-}
-
-#endif /* _OUSIA_PARSER_STACK_HPP_ */
-
diff --git a/src/core/parser/generic/GenericParser.cpp b/src/core/parser/generic/GenericParser.cpp
deleted file mode 100644
index e69de29..0000000
--- a/src/core/parser/generic/GenericParser.cpp
+++ /dev/null
diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp
new file mode 100644
index 0000000..6ebc549
--- /dev/null
+++ b/src/core/parser/stack/Callbacks.cpp
@@ -0,0 +1,23 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Callbacks.hpp"
+
+namespace ousia {
+}
+
diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp
new file mode 100644
index 0000000..9c61000
--- /dev/null
+++ b/src/core/parser/stack/Callbacks.hpp
@@ -0,0 +1,99 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Callbacks.hpp
+ *
+ * Contains an interface defining the callbacks that can be directed from a
+ * StateHandler to the StateStack, and from the StateStack to
+ * the actual parser.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_PARSER_STACK_CALLBACKS_HPP_
+#define _OUSIA_PARSER_STACK_CALLBACKS_HPP_
+
+#include <string>
+
+#include <core/common/Whitespace.hpp>
+
+namespace ousia {
+namespace parser_stack {
+
+/**
+ * Interface defining a set of callback functions that act as a basis for the
+ * StateStackCallbacks and the ParserCallbacks.
+ */
+class Callbacks {
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~Callbacks() {};
+
+ /**
+ * Sets the whitespace mode that specifies how string data should be
+ * processed.
+ *
+ * @param whitespaceMode specifies one of the three WhitespaceMode constants
+ * PRESERVE, TRIM or COLLAPSE.
+ */
+ virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0;
+
+ /**
+ * Registers the given token as token that should be reported to the handler
+ * using the "token" function.
+ *
+ * @param token is the token string that should be reported.
+ */
+ virtual void registerToken(const std::string &token) = 0;
+
+ /**
+ * Unregisters the given token, it will no longer be reported to the handler
+ * using the "token" function.
+ *
+ * @param token is the token string that should be unregistered.
+ */
+ virtual void unregisterToken(const std::string &token) = 0;
+};
+
+/**
+ * Interface defining the callback functions that can be passed from a
+ * StateStack to the underlying parser.
+ */
+class ParserCallbacks : public Callbacks {
+ /**
+ * Checks whether the given token is supported by the parser. The parser
+ * returns true, if the token is supported, false if this token cannot be
+ * registered. Note that parsers that do not support the registration of
+ * tokens at all should always return "true".
+ *
+ * @param token is the token that should be checked for support.
+ * @return true if the token is generally supported (or the parser does not
+ * support registering tokens at all), false if the token is not supported,
+ * because e.g. it is a reserved token or it interferes with other tokens.
+ */
+ virtual bool supportsToken(const std::string &token) = 0;
+};
+
+}
+}
+
+#endif /* _OUSIA_PARSER_STACK_CALLBACKS_HPP_ */
+
diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp
index 697f9e7..475fe69 100644
--- a/src/core/parser/stack/DocumentHandler.hpp
+++ b/src/core/parser/stack/DocumentHandler.hpp
@@ -26,7 +26,8 @@
#define _OUSIA_DOCUMENT_HANDLER_HPP_
#include <core/common/Variant.hpp>
-#include <core/parser/ParserStack.hpp>
+
+#include "Handler.hpp"
namespace ousia {
@@ -35,11 +36,11 @@ class Rtti;
class DocumentEntity;
class FieldDescriptor;
-class DocumentHandler : public Handler {
+class DocumentHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -54,7 +55,7 @@ public:
using Node::Node;
};
-class DocumentChildHandler : public Handler {
+class DocumentChildHandler : public StaticHandler {
private:
void preamble(Handle<Node> parentNode, std::string &fieldName,
DocumentEntity *&parent, bool &inField);
@@ -68,11 +69,11 @@ private:
public:
using Handler::Handler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
- void data(const std::string &data, int fieldIdx) override;
+ bool data(const Variant &data) override;
static Handler *create(const HandlerData &handlerData)
{
diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp
index 7398812..5e8ea60 100644
--- a/src/core/parser/stack/DomainHandler.hpp
+++ b/src/core/parser/stack/DomainHandler.hpp
@@ -26,18 +26,19 @@
#define _OUSIA_DOMAIN_HANDLER_HPP_
#include <core/common/Variant.hpp>
-#include <core/parser/ParserStack.hpp>
+
+#include "Handler.hpp"
namespace ousia {
// Forward declarations
class Rtti;
-class DomainHandler : public Handler {
+class DomainHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -47,11 +48,11 @@ public:
}
};
-class DomainStructHandler : public Handler {
+class DomainStructHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -61,11 +62,11 @@ public:
}
};
-class DomainAnnotationHandler : public Handler {
+class DomainAnnotationHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -75,11 +76,11 @@ public:
}
};
-class DomainAttributesHandler : public Handler {
+class DomainAttributesHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -89,11 +90,11 @@ public:
}
};
-class DomainFieldHandler : public Handler {
+class DomainFieldHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -103,11 +104,11 @@ public:
}
};
-class DomainFieldRefHandler : public Handler {
+class DomainFieldRefHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -117,11 +118,11 @@ public:
}
};
-class DomainPrimitiveHandler : public Handler {
+class DomainPrimitiveHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -131,11 +132,11 @@ public:
}
};
-class DomainChildHandler : public Handler {
+class DomainChildHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -154,11 +155,11 @@ namespace RttiTypes {
extern const Rtti DomainParent;
}
-class DomainParentHandler : public Handler {
+class DomainParentHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -168,11 +169,11 @@ public:
}
};
-class DomainParentFieldHandler : public Handler {
+class DomainParentFieldHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
@@ -182,11 +183,11 @@ public:
}
};
-class DomainParentFieldRefHandler : public Handler {
+class DomainParentFieldRefHandler : public StaticHandler {
public:
- using Handler::Handler;
+ using StaticHandler::StaticHandler;
- void start(Variant::mapType &args) override;
+ bool start(Variant::mapType &args) override;
void end() override;
diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp
new file mode 100644
index 0000000..a608f7f
--- /dev/null
+++ b/src/core/parser/stack/Handler.cpp
@@ -0,0 +1,252 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/Exceptions.hpp>
+#include <core/common/Logger.hpp>
+#include <core/parser/ParserContext.hpp>
+
+#include "Callbacks.hpp"
+#include "Handler.hpp"
+#include "State.hpp"
+
+namespace ousia {
+namespace parser_stack {
+
+/* Class HandlerData */
+
+HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/
+ const std::string &name, const State &state,
+ const SourceLocation &location)
+ : ctx(ctx),
+ /*callbacks(callbacks),*/
+ name(name),
+ state(state),
+ location(location)
+{
+}
+
+/* Class Handler */
+
+Handler::Handler(const HandlerData &handlerData)
+ : handlerData(handlerData), internalLogger(nullptr)
+{
+}
+
+Handler::~Handler() {}
+
+ParserContext &Handler::context() { return handlerData.ctx; }
+
+ParserScope &Handler::scope() { return handlerData.ctx.getScope(); }
+
+Manager &Handler::manager() { return handlerData.ctx.getManager(); }
+
+Logger &Handler::logger()
+{
+ if (internalLogger != nullptr) {
+ return *internalLogger;
+ }
+ return handlerData.ctx.getLogger();
+}
+
+const SourceLocation &Handler::location() const { return handlerData.location; }
+
+void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode)
+{
+ /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/
+}
+
+void Handler::registerToken(const std::string &token)
+{
+ /*handlerData.callbacks.registerToken(token);*/
+}
+
+void Handler::unregisterToken(const std::string &token)
+{
+ /*handlerData.callbacks.unregisterToken(token);*/
+}
+
+const std::string &Handler::getName() const { return handlerData.name; }
+
+const State &Handler::getState() const { return handlerData.state; }
+
+void Handler::setLogger(Logger &logger) { internalLogger = &logger; }
+
+void Handler::resetLogger() { internalLogger = nullptr; }
+
+const SourceLocation &Handler::getLocation() const { return location(); }
+
+/* Class EmptyHandler */
+
+bool EmptyHandler::start(const Variant::mapType &args)
+{
+ // Just accept anything
+ return true;
+}
+
+void EmptyHandler::end()
+{
+ // Do nothing if a command ends
+}
+
+bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex)
+{
+ // Accept any field
+ return true;
+}
+
+void EmptyHandler::fieldEnd()
+{
+ // Do not handle fields
+}
+
+bool EmptyHandler::annotationStart(const Variant &className,
+ const Variant::mapType &args)
+{
+ // Accept any annotation
+ return true;
+}
+
+bool EmptyHandler::annotationEnd(const Variant &className,
+ const Variant &elementName)
+{
+ // Accept any annotation
+ return true;
+}
+
+bool EmptyHandler::data(const Variant &data)
+{
+ // Support any data
+ return true;
+}
+
+Handler *EmptyHandler::create(const HandlerData &handlerData)
+{
+ return new EmptyHandler(handlerData);
+}
+
+/* Class StaticHandler */
+
+bool StaticHandler::start(const Variant::mapType &args)
+{
+ // Do nothing in the default implementation, accept anything
+ return true;
+}
+
+void StaticHandler::end()
+{
+ // Do nothing here
+}
+
+bool StaticHandler::fieldStart(bool &isDefault, size_t fieldIdx)
+{
+ // Return true if either the default field is requested or the field index
+ // is zero. This simulates that there is exactly one field (a default field)
+ if (fieldIdx == 0) {
+ isDefault = true;
+ return true;
+ }
+ return false;
+}
+
+void StaticHandler::fieldEnd()
+{
+ // Do nothing here
+}
+
+bool StaticHandler::annotationStart(const Variant &className,
+ const Variant::mapType &args)
+{
+ // No annotations supported
+ return false;
+}
+
+bool StaticHandler::annotationEnd(const Variant &className,
+ const Variant &elementName)
+{
+ // No annotations supported
+ return false;
+}
+
+bool StaticHandler::data(const Variant &data)
+{
+ logger().error("Did not expect any data here", data);
+ return false;
+}
+
+/* Class StaticFieldHandler */
+
+StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData,
+ const std::string &argName)
+ : StaticHandler(handlerData), argName(argName), handled(false)
+{
+}
+
+bool StaticFieldHandler::start(const Variant::mapType &args)
+{
+ if (!argName.empty()) {
+ auto it = args.find(argName);
+ if (it != args.end()) {
+ handled = true;
+ doHandle(it->second, args);
+ return true;
+ }
+ }
+
+ this->args = args;
+ return true;
+}
+
+void StaticFieldHandler::end()
+{
+ if (!handled) {
+ if (!argName.empty()) {
+ logger().error(std::string("Required argument \"") + argName +
+ std::string("\" is missing."),
+ location());
+ } else {
+ logger().error("Command requires data, but no data given",
+ location());
+ }
+ }
+}
+
+bool StaticFieldHandler::data(const Variant &data)
+{
+ // Call the doHandle function if this has not been done before
+ if (!handled) {
+ handled = true;
+ doHandle(data, args);
+ return true;
+ }
+
+ // The doHandle function was already called, print an error message
+ logger().error(
+ std::string("Found data, but the corresponding argument \"") + argName +
+ std::string("\" was already specified"),
+ data);
+
+ // Print the location at which the attribute was originally specified
+ auto it = args.find(argName);
+ if (it != args.end()) {
+ logger().note(std::string("Attribute was specified here:"), it->second);
+ }
+ return false;
+}
+}
+}
+
diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp
new file mode 100644
index 0000000..eeaf555
--- /dev/null
+++ b/src/core/parser/stack/Handler.hpp
@@ -0,0 +1,414 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _OUSIA_PARSER_STACK_HANDLER_HPP_
+#define _OUSIA_PARSER_STACK_HANDLER_HPP_
+
+#include <string>
+
+#include <core/common/Location.hpp>
+#include <core/common/Variant.hpp>
+#include <core/common/Whitespace.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class ParserScope;
+class ParserContext;
+class Logger;
+
+namespace parser_stack {
+
+// More forward declarations
+class Callbacks;
+class State;
+
+/**
+ * Class collecting all the data that is being passed to a Handler
+ * instance.
+ */
+class HandlerData {
+public:
+ /**
+ * Reference to the ParserContext instance that should be used to resolve
+ * references to nodes in the Graph.
+ */
+ ParserContext &ctx;
+
+ /**
+ * Reference at an instance of the Callbacks class, used for
+ * modifying the behaviour of the parser (like registering tokens, setting
+ * the data type or changing the whitespace handling mode).
+ */
+ // Callbacks &callbacks;
+
+ /**
+ * Contains the name of the command that is being handled.
+ */
+ std::string name;
+
+ /**
+ * Contains the current state of the state machine.
+ */
+ const State &state;
+
+ /**
+ * Current source code location.
+ */
+ SourceLocation location;
+
+ /**
+ * Constructor of the HandlerData class.
+ *
+ * @param ctx is the parser context the handler should be executed in.
+ * @param callbacks is an instance of Callbacks used to notify
+ * the parser about certain state changes.
+ * @param name is the name of the command that is being handled.
+ * @param state is the state this handler was called for.
+ * @param location is the location at which the handler is created.
+ */
+ HandlerData(ParserContext &ctx,
+ /*Callbacks &callbacks,*/ const std::string &name,
+ const State &state, const SourceLocation &location);
+};
+
+/**
+ * The Handler class provides a context for handling a generic stack element.
+ * It has to be overridden and registered in the StateStack class to form
+ * handlers for concrete XML tags.
+ */
+class Handler {
+private:
+ /**
+ * Structure containing the internal handler data.
+ */
+ const HandlerData handlerData;
+
+ /**
+ * Reference at the current logger. If not nullptr, this will override the
+ * logger from the ParserContext specified in the handlerData.
+ */
+ Logger *internalLogger;
+
+protected:
+ /**
+ * Constructor of the Handler class.
+ *
+ * @param handlerData is a structure containing all data being passed to the
+ * handler.
+ */
+ Handler(const HandlerData &handlerData);
+
+ /**
+ * Returns a reference at the ParserContext.
+ *
+ * @return a reference at the ParserContext.
+ */
+ ParserContext &context();
+
+ /**
+ * Returns a reference at the ParserScope instance.
+ *
+ * @return a reference at the ParserScope instance.
+ */
+ ParserScope &scope();
+
+ /**
+ * Returns a reference at the Manager instance which manages all nodes.
+ *
+ * @return a reference at the Manager instance.
+ */
+ Manager &manager();
+
+ /**
+ * Returns a reference at the Logger instance used for logging error
+ * messages.
+ *
+ * @return a reference at the Logger instance.
+ */
+ Logger &logger();
+
+ /**
+ * Returns the location of the element in the source file, for which this
+ * Handler was created.
+ *
+ * @return the location of the Handler in the source file.
+ */
+ const SourceLocation &location() const;
+
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~Handler();
+
+ /**
+ * Calls the corresponding function in the Callbacks instance. Sets the
+ * whitespace mode that specifies how string data should be processed. The
+ * calls to this function are placed on a stack by the underlying Stack
+ * class.
+ *
+ * @param whitespaceMode specifies one of the three WhitespaceMode constants
+ * PRESERVE, TRIM or COLLAPSE.
+ */
+ void setWhitespaceMode(WhitespaceMode whitespaceMode);
+
+ /**
+ * Calls the corresponding function in the Callbacks instance.
+ * Registers the given token as token that should be reported to the handler
+ * using the "token" function.
+ *
+ * @param token is the token string that should be reported.
+ */
+ void registerToken(const std::string &token);
+
+ /**
+ * Calls the corresponding function in the Callbacks instance.
+ * Unregisters the given token, it will no longer be reported to the handler
+ * using the "token" function.
+ *
+ * @param token is the token string that should be unregistered.
+ */
+ void unregisterToken(const std::string &token);
+
+ /**
+ * Returns the command name for which the handler was created.
+ *
+ * @return a const reference at the command name.
+ */
+ const std::string &getName() const;
+
+ /**
+ * Reference at the State descriptor for which this Handler was created.
+ *
+ * @return a const reference at the constructing State descriptor.
+ */
+ const State &getState() const;
+
+ /**
+ * Sets the internal logger to the given logger instance.
+ *
+ * @param logger is the Logger instance to which the logger should be set.
+ */
+ void setLogger(Logger &logger);
+
+ /**
+ * Resets the logger instance to the logger instance provided in the
+ * ParserContext.
+ */
+ void resetLogger();
+
+ /**
+ * Returns the location of the element in the source file, for which this
+ * Handler was created.
+ *
+ * @return the location of the Handler in the source file.
+ */
+ const SourceLocation &getLocation() const;
+
+ /**
+ * Called when the command that was specified in the constructor is
+ * instantiated.
+ *
+ * @param args is a map from strings to variants (argument name and value).
+ * @return true if the handler was successful in starting the element it
+ * represents, false otherwise.
+ */
+ virtual bool start(const Variant::mapType &args) = 0;
+
+ /**
+ * Called before the command for which this handler is defined ends (is
+ * forever removed from the stack).
+ */
+ virtual void end() = 0;
+
+ /**
+ * Called when a new field starts, while the handler is active. This
+ * function should return true if the field is supported, false otherwise.
+ * No error should be logged if the field cannot be started, the caller will
+ * take care of that (since it is always valid to start a default field,
+ * even though the corresponding structure does not have a field, as long as
+ * no data is fed into the field).
+ *
+ * @param isDefault is set to true if the field that is being started is the
+ * default/tree field. The handler should set the value of this variable to
+ * true if the referenced field is indeed the default field.
+ * @param fieldIdx is the numerical index of the field.
+ */
+ virtual bool fieldStart(bool &isDefault, size_t fieldIdx) = 0;
+
+ /**
+ * Called when a previously opened field ends, while the handler is active.
+ * Note that a "fieldStart" and "fieldEnd" are always called alternately.
+ */
+ virtual void fieldEnd() = 0;
+
+ /**
+ * Called whenever an annotation starts while this handler is active. The
+ * function should return true if starting the annotation was successful,
+ * false otherwise.
+ *
+ * @param className is a string variant containing the name of the
+ * annotation class and the location of the name in the source code.
+ * @param args is a map from strings to variants (argument name and value).
+ * @return true if the mentioned annotation could be started here, false
+ * if an error occurred.
+ */
+ virtual bool annotationStart(const Variant &className,
+ const Variant::mapType &args) = 0;
+
+ /**
+ * Called whenever an annotation ends while this handler is active. The
+ * function should return true if ending the annotation was successful,
+ * false otherwise.
+ *
+ * @param className is a string variant containing the name of the
+ * annotation class and the location of the class name in the source code.
+ * @param elementName is a string variant containing the name of the
+ * element and the location of the element name in the source code.
+ * @return true if the mentioned annotation could be ended here, false if
+ * an error occurred.
+ */
+ virtual bool annotationEnd(const Variant &className,
+ const Variant &elementName) = 0;
+
+ /**
+ * Called whenever raw data (in the form of a string) is available for the
+ * Handler instance. Should return true if the data could be handled, false
+ * otherwise.
+ *
+ * @param data is a string variant containing the character data and its
+ * location.
+ * @return true if the data could be handled, false otherwise.
+ */
+ virtual bool data(const Variant &data) = 0;
+};
+
+/**
+ * HandlerConstructor is a function pointer type used to create concrete
+ * instances of the Handler class.
+ *
+ * @param handlerData is the data that should be passed to the new handler
+ * instance.
+ * @return a newly created handler instance.
+ */
+using HandlerConstructor = Handler *(*)(const HandlerData &handlerData);
+
+/**
+ * The EmptyHandler class is used in case no element handler is specified in
+ * the State descriptor. It just accepts all data and does nothing.
+ */
+class EmptyHandler : public Handler {
+protected:
+ using Handler::Handler;
+
+public:
+ bool start(const Variant::mapType &args) override;
+ void end() override;
+ bool fieldStart(bool &isDefault, size_t fieldIdx) override;
+ void fieldEnd() override;
+ bool annotationStart(const Variant &className,
+ const Variant::mapType &args) override;
+ bool annotationEnd(const Variant &className,
+ const Variant &elementName) override;
+ bool data(const Variant &data) override;
+
+ /**
+ * Creates an instance of the EmptyHandler class.
+ */
+ static Handler *create(const HandlerData &handlerData);
+};
+
+/**
+ * The StaticHandler class is used to handle predefined commands which do
+ * neither support annotations, nor multiple fields. Child classes can decide
+ * whether a single data field should be used.
+ */
+class StaticHandler : public Handler {
+protected:
+ using Handler::Handler;
+
+public:
+ bool start(const Variant::mapType &args) override;
+ void end() override;
+ bool fieldStart(bool &isDefault, size_t fieldIdx) override;
+ void fieldEnd() override;
+ bool annotationStart(const Variant &className,
+ const Variant::mapType &args) override;
+ bool annotationEnd(const Variant &className,
+ const Variant &elementName) override;
+ bool data(const Variant &data) override;
+};
+
+/**
+ * The StaticFieldHandler class is used to handle predefined commands which do
+ * neither support annotations, nor multiple fields. Additionally, it captures a
+ * data entry from a single default field.
+ */
+class StaticFieldHandler : public StaticHandler {
+private:
+ /**
+ * Set to the name of the data argument that should be used instead of the
+ * data field, if no data field is given.
+ */
+ std::string argName;
+
+ /**
+ * Set to true, once the "doHandle" function has been called.
+ */
+ bool handled;
+
+ /**
+ * Map containing the arguments given in the start function.
+ */
+ Variant::mapType args;
+
+protected:
+ /**
+ * Constructor of the StaticFieldHandler class.
+ *
+ * @param handlerData is a structure containing the internal data that
+ * should be stored inside the handler.
+ * @param argName is the name of the data argument that -- if present --
+ * should be used instead of the data field. If empty, data is not captured
+ * from the arguments. If both, data in the data field and the argument, are
+ * given, this results in an error.
+ */
+ StaticFieldHandler(const HandlerData &handlerData,
+ const std::string &argName);
+
+ /**
+ * Function that should be overridden in order to handle the field data and
+ * the other arguments. This function is not called if no data was given.
+ *
+ * @param fieldData is the captured field data.
+ * @param args are the arguments that were given in the "start" function.
+ */
+ virtual void doHandle(const Variant &fieldData,
+ const Variant::mapType &args) = 0;
+
+public:
+ bool start(const Variant::mapType &args) override;
+ void end() override;
+ bool data(const Variant &data) override;
+};
+}
+}
+
+#endif /* _OUSIA_PARSER_STACK_HANDLER_HPP_ */
+
diff --git a/src/core/parser/stack/ImportIncludeHandler.hpp b/src/core/parser/stack/ImportIncludeHandler.hpp
index b0767be..f9abe55 100644
--- a/src/core/parser/stack/ImportIncludeHandler.hpp
+++ b/src/core/parser/stack/ImportIncludeHandler.hpp
@@ -19,6 +19,9 @@
/**
* @file ImportIncludeHandler.hpp
*
+ * Contains the conceptually similar handlers for the "include" and "import"
+ * commands.
+ *
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
@@ -30,43 +33,54 @@
namespace ousia {
-class ImportIncludeHandler : public Handler {
-protected:
- bool srcInArgs = false;
- std::string rel;
- std::string type;
- std::string src;
-
-public:
- using Handler::Handler;
-
- void start(Variant::mapType &args) override;
-
- void data(const std::string &data, int field) override;
-};
-
-class ImportHandler : public ImportIncludeHandler {
+/**
+ * The ImportHandler is responsible for handling the "import" command. An import
+ * creates a reference to a specified file. The specified file is parsed (if
+ * this has not already been done) outside of the context of the current file.
+ * If the specified resource has already been parsed, a reference to the already
+ * parsed file is inserted. Imports are only possible before any other content
+ * has been parsed.
+ */
+class ImportHandler : public StaticFieldHandler {
public:
- using ImportIncludeHandler::ImportIncludeHandler;
-
- void start(Variant::mapType &args) override;
-
- void end() override;
-
+ using StaticFieldHandler::StaticFieldHandler;
+
+ void doHandle(const Variant &fieldData,
+ const Variant::mapType &args) override;
+
+ /**
+ * Creates a new instance of the ImportHandler.
+ *
+ * @param handlerData is the data that is passed to the constructor of the
+ * Handler base class and used there to e.g. access the ParserContext and
+ * the Callbacks instance.
+ */
static Handler *create(const HandlerData &handlerData)
{
return new ImportHandler{handlerData};
}
};
-class IncludeHandler : public ImportIncludeHandler {
+/**
+ * The IncludeHandler is responsible for handling the "include" command. The
+ * included file is parsed in the context of the current file and will change
+ * the content that is currently being parsed. Includes are possible at (almost)
+ * any position in the source file.
+ */
+class IncludeHandler : public StaticFieldHandler {
public:
- using ImportIncludeHandler::ImportIncludeHandler;
-
- void start(Variant::mapType &args) override;
-
- void end() override;
-
+ using StaticFieldHandler::StaticFieldHandler;
+
+ void doHandle(const Variant &fieldData,
+ const Variant::mapType &args) override;
+
+ /**
+ * Creates a new instance of the IncludeHandler.
+ *
+ * @param handlerData is the data that is passed to the constructor of the
+ * Handler base class and used there to e.g. access the ParserContext and
+ * the Callbacks instance.
+ */
static Handler *create(const HandlerData &handlerData)
{
return new IncludeHandler{handlerData};
diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp
new file mode 100644
index 0000000..d84a19c
--- /dev/null
+++ b/src/core/parser/stack/Stack.cpp
@@ -0,0 +1,554 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sstream>
+
+#include <core/common/Logger.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/Exceptions.hpp>
+#include <core/parser/ParserScope.hpp>
+#include <core/parser/ParserContext.hpp>
+
+#include "Handler.hpp"
+#include "Stack.hpp"
+#include "State.hpp"
+
+namespace ousia {
+namespace parser_stack {
+
+/* Class HandlerInfo */
+
+HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {}
+
+/**
+ * Attaches the bookkeeping flags to the given handler. A freshly created info
+ * is marked as valid and not implicit, is outside of any field, has not seen a
+ * default field yet and starts with a field index of zero.
+ */
+HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler)
+ : handler(handler),
+ fieldIdx(0),
+ valid(true),
+ implicit(false),
+ inField(false),
+ inDefaultField(false),
+ inImplicitDefaultField(false),
+ inValidField(false),
+ hadDefaultField(false)
+{
+}
+
+/**
+ * Creates a HandlerInfo that has no handler attached (the handler pointer is
+ * nullptr) and whose flags are set to the given values. The field index starts
+ * at zero and "hadDefaultField" is always initialised to false.
+ */
+HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField,
+ bool inDefaultField, bool inImplicitDefaultField,
+ bool inValidField)
+ : handler(nullptr),
+ fieldIdx(0),
+ valid(valid),
+ implicit(implicit),
+ inField(inField),
+ inDefaultField(inDefaultField),
+ inImplicitDefaultField(inImplicitDefaultField),
+ inValidField(inValidField),
+ hadDefaultField(false)
+{
+}
+
+HandlerInfo::~HandlerInfo()
+{
+ // Intentionally empty. Stack.hpp documents that the destructor is declared
+ // explicitly so that Handler can stay forward declared in the header.
+}
+
+/**
+ * Records that a field has been opened for the handler and advances the field
+ * index that is passed to the next "fieldStart" call of the handler.
+ */
+void HandlerInfo::fieldStart(bool isDefault, bool isImplicit, bool isValid)
+{
+    // An implicitly opened field always counts as default field
+    const bool defaultField = isDefault || isImplicit;
+
+    inField = true;
+    inDefaultField = defaultField;
+    inImplicitDefaultField = isImplicit;
+    inValidField = isValid;
+
+    // Remember (permanently) that a default field has been seen
+    if (defaultField) {
+        hadDefaultField = true;
+    }
+
+    ++fieldIdx;
+}
+
+/**
+ * Records that the currently open field has been closed. Neither the field
+ * index nor the "hadDefaultField" flag are touched.
+ */
+void HandlerInfo::fieldEnd()
+{
+    // All "currently in a field" flags are cleared together
+    inField = inDefaultField = inImplicitDefaultField = inValidField = false;
+}
+
+/**
+ * Stub instance of HandlerInfo containing no handler information. It is
+ * handed out by Stack::currentInfo() and Stack::lastInfo() when the stack does
+ * not contain a corresponding element. The flags are (in constructor order):
+ * valid, implicit, inField, inDefaultField, inImplicitDefaultField (false) and
+ * inValidField.
+ */
+static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true};
+
+/* Helper functions */
+
+/**
+ * Builds the exception that is thrown when a command is encountered that is
+ * not allowed at the current position.
+ *
+ * @param name is the name of the command for which no state transition was
+ * found.
+ * @param expected is a set containing the names of the commands that would
+ * have been allowed instead.
+ * @return a LoggableException carrying the assembled error message.
+ */
+static LoggableException buildInvalidCommandException(
+    const std::string &name, const std::set<std::string> &expected)
+{
+    // Nothing may be nested here at all
+    if (expected.empty()) {
+        return LoggableException{
+            std::string{"No nested elements allowed, but got \""} + name +
+            std::string{"\""}};
+    }
+
+    // List the single alternative or all alternatives, separated by commas
+    std::string msg{"Expected "};
+    msg += (expected.size() == 1 ? "\"" : "one of \"");
+    msg += Utils::join(expected, "\", \"");
+    msg += "\", but got \"";
+    msg += name;
+    msg += "\"";
+    return LoggableException{msg};
+}
+
+/* Class Stack */
+
+/**
+ * Binds the stack to the given parser context and state map. Both are stored
+ * as references in the corresponding members (the map is not copied). If the
+ * scope of the context already contains elements, the initial state is
+ * deduced from it.
+ */
+Stack::Stack(ParserContext &ctx,
+ const std::multimap<std::string, const State *> &states)
+ : ctx(ctx), states(states)
+{
+ // If the scope instance is not empty we need to deduce the current parser
+ // state
+ if (!ctx.getScope().isEmpty()) {
+ deduceState();
+ }
+}
+
+/**
+ * Unwinds all handlers that are still on the stack and reports commands whose
+ * field was never closed.
+ */
+Stack::~Stack()
+{
+    while (!stack.empty()) {
+        HandlerInfo &top = currentInfo();
+
+        // A command that is still inside an explicitly opened field was not
+        // closed in the input. Only report this while the stack is valid and
+        // only for commands that were not created implicitly.
+        const bool unclosed = handlersValid() && top.inField &&
+                              !top.implicit && !top.inImplicitDefaultField;
+        if (unclosed) {
+            logger().error(
+                std::string("Reached end of stream, but command \"") +
+                    top.handler->getName() +
+                    "\" has not ended yet. Command was started here:",
+                top.handler->getLocation());
+        }
+
+        // Pop the handler; endCurrentHandler() also issues the end callbacks
+        endCurrentHandler();
+    }
+}
+
+/**
+ * Deduces the parser state from the type signature of the current scope and
+ * pushes an implicit handler for that state onto the stack.
+ */
+void Stack::deduceState()
+{
+    // Every registered state is a candidate for the deduction
+    std::vector<const State *> candidates;
+    for (const auto &entry : states) {
+        candidates.push_back(entry.second);
+    }
+
+    // Run the deduction on the type signature of the scope; anything but a
+    // single, unique result is an error
+    StateDeductor deductor(ctx.getScope().getStackTypeSignature(), candidates);
+    const std::vector<const State *> possibleStates = deductor.deduce();
+    if (possibleStates.size() != 1U) {
+        throw LoggableException(
+            "Error while including file: Cannot deduce parser state.");
+    }
+    const State &state = *possibleStates.front();
+
+    // Create a handler for the deduced state (falling back to the
+    // EmptyHandler), but do not call its start function
+    HandlerConstructor ctor = EmptyHandler::create;
+    if (state.elementHandler) {
+        ctor = state.elementHandler;
+    }
+    std::shared_ptr<Handler> handler{
+        ctor({ctx, "", state, SourceLocation{}})};
+    stack.emplace_back(handler);
+
+    // Flag the new element as implicit and place it inside a valid,
+    // explicitly started default field
+    HandlerInfo &top = stack.back();
+    top.implicit = true;
+    top.fieldStart(true, false, true);
+}
+
+/**
+ * Collects the names of all registered commands that list the current state
+ * as one of their parent states.
+ */
+std::set<std::string> Stack::expectedCommands()
+{
+    const State &current = currentState();
+
+    std::set<std::string> names;
+    for (auto it = states.begin(); it != states.end(); ++it) {
+        const State *candidate = it->second;
+        if (candidate->parents.count(&current) != 0) {
+            names.insert(it->first);
+        }
+    }
+    return names;
+}
+
+/**
+ * Returns the state of the topmost handler, or States::None if the stack is
+ * empty.
+ */
+const State &Stack::currentState()
+{
+    if (stack.empty()) {
+        return States::None;
+    }
+    return stack.back().handler->getState();
+}
+
+/**
+ * Returns the name of the topmost handler, or an empty string if the stack is
+ * empty.
+ */
+std::string Stack::currentCommandName()
+{
+    if (stack.empty()) {
+        return std::string{};
+    }
+    return stack.back().handler->getName();
+}
+
+/**
+ * Searches the states registered under the given name for one that may follow
+ * the current state, i.e. one that lists the current state or States::All as
+ * parent.
+ */
+const State *Stack::findTargetState(const std::string &name)
+{
+    const State &current = currentState();
+
+    // Several states may be registered under the same command name
+    const auto candidates = states.equal_range(name);
+    for (auto it = candidates.first; it != candidates.second; ++it) {
+        const State *candidate = it->second;
+        const bool reachable = candidate->parents.count(&current) != 0 ||
+                               candidate->parents.count(&States::All) != 0;
+        if (reachable) {
+            return candidate;
+        }
+    }
+
+    // No transition from the current state to a state with this name
+    return nullptr;
+}
+
+/**
+ * Like findTargetState(), but falls back to a state registered under the
+ * wildcard name "*" if no state with the exact name can be reached.
+ */
+const State *Stack::findTargetStateOrWildcard(const std::string &name)
+{
+    const State *exact = findTargetState(name);
+    return exact != nullptr ? exact : findTargetState("*");
+}
+
+/**
+ * Returns the topmost stack element, or the EmptyHandlerInfo stub if the stack
+ * is empty.
+ */
+HandlerInfo &Stack::currentInfo()
+{
+    if (stack.empty()) {
+        return EmptyHandlerInfo;
+    }
+    return stack.back();
+}
+/**
+ * Returns the element directly below the topmost stack element, or the
+ * EmptyHandlerInfo stub if the stack holds fewer than two elements.
+ */
+HandlerInfo &Stack::lastInfo()
+{
+    if (stack.size() < 2U) {
+        return EmptyHandlerInfo;
+    }
+    return stack[stack.size() - 2];
+}
+
+/**
+ * Ends the topmost handler (if any) and removes it from the stack.
+ */
+void Stack::endCurrentHandler()
+{
+    // Nothing to do on an empty stack
+    if (stack.empty()) {
+        return;
+    }
+
+    HandlerInfo &top = stack.back();
+
+    // Callbacks are only issued for elements that are not marked as
+    // "implicit" and only while all handlers on the stack are valid
+    const bool notify = !top.implicit && handlersValid();
+    if (notify) {
+        // Close a field that is still open before ending the handler
+        if (top.inField) {
+            top.handler->fieldEnd();
+            top.fieldEnd();
+        }
+        top.handler->end();
+    }
+
+    // In any case the element leaves the stack
+    stack.pop_back();
+}
+
+/**
+ * Makes sure the current handler is inside a field, implicitly starting a
+ * default field if necessary. The stack itself is never modified here: if
+ * false is returned, the caller is responsible for removing the current
+ * handler (both Stack::command and Stack::data call endCurrentHandler() in
+ * that case).
+ *
+ * @return true if the current handler is inside a field afterwards, false if
+ * the handler already had a default field or no default field could be
+ * started.
+ */
+bool Stack::ensureHandlerIsInField()
+{
+ // If the current handler is not in a field (and actually has a handler)
+ // try to start a default field
+ HandlerInfo &info = currentInfo();
+ if (!info.inField && info.handler != nullptr) {
+ // Abort if the element already had a default field
+ if (info.hadDefaultField) {
+ return false;
+ }
+
+ // Try to start a new default field, abort if this did not work
+ bool isDefault = true;
+ if (!info.handler->fieldStart(isDefault, info.fieldIdx)) {
+ // No field was opened, so there is nothing to close. Do not pop the
+ // handler here either: the callers already do so when false is
+ // returned, popping here as well would remove two handlers.
+ return false;
+ }
+
+ // Mark the field as started
+ info.fieldStart(true, true, true);
+ }
+ return true;
+}
+
+/**
+ * Checks whether every element on the stack is marked as valid.
+ */
+bool Stack::handlersValid()
+{
+    // Walk from the top of the stack downwards; one invalid entry suffices
+    for (size_t i = stack.size(); i > 0; --i) {
+        if (!stack[i - 1].valid) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Shorthand for the logger stored in the parser context.
+ */
+Logger &Stack::logger()
+{
+    return ctx.getLogger();
+}
+
+/**
+ * Handles a new command: searches for a state the command may be placed in,
+ * unrolling the stack where necessary, instantiates the corresponding handler
+ * and calls its "start" function.
+ *
+ * @param name is the (possibly namespaced) name of the command.
+ * @param args are the arguments that were passed to the command.
+ */
+void Stack::command(const Variant &name, const Variant::mapType &args)
+{
+ // Make sure the given identifier is valid (preventing "*" from being
+ // maliciously passed to this function)
+ if (!Utils::isNamespacedIdentifier(name.asString())) {
+ throw LoggableException(std::string("Invalid identifier \"") +
+ name.asString() + std::string("\""),
+ name);
+ }
+
+ State const *lastTargetState = nullptr;
+ Variant::mapType canonicalArgs;
+ while (true) {
+ // Try to find a target state for the given command, if none can be
+ // found and the current command does not have an open field, then end
+ // the current command and try again, otherwise this is an exception
+ const State *targetState = findTargetStateOrWildcard(name.asString());
+ if (targetState == nullptr) {
+ if (!currentInfo().inField) {
+ endCurrentHandler();
+ continue;
+ } else {
+ throw buildInvalidCommandException(name.asString(),
+ expectedCommands());
+ }
+ }
+
+ // Make sure we're currently inside a field
+ if (!ensureHandlerIsInField()) {
+ endCurrentHandler();
+ continue;
+ }
+
+ // Fork the logger. Messages are collected in the fork and are only
+ // committed at the end of the loop, once the command has been placed
+ LoggerFork loggerFork = logger().fork();
+
+ // Canonicalize the arguments (if this has not already been done), allow
+ // additional arguments
+ if (lastTargetState != targetState) {
+ canonicalArgs = args;
+ targetState->arguments.validateMap(canonicalArgs, loggerFork, true);
+ lastTargetState = targetState;
+ }
+
+ // Instantiate the handler and push it onto the stack
+ HandlerConstructor ctor = targetState->elementHandler
+ ? targetState->elementHandler
+ : EmptyHandler::create;
+ std::shared_ptr<Handler> handler{
+ ctor({ctx, name.asString(), *targetState, name.getLocation()})};
+ stack.emplace_back(handler);
+
+ // Fetch the HandlerInfo for the parent element and the current element
+ HandlerInfo &parentInfo = lastInfo();
+ HandlerInfo &info = currentInfo();
+
+ // Call the "start" method of the handler, store the result of the start
+ // method as the validity of the handler -- do not call the start method
+ // if the stack is currently invalid (as this may cause further,
+ // unwanted errors)
+ bool validStack = handlersValid();
+ info.valid = false;
+ if (validStack) {
+ handler->setLogger(loggerFork);
+ try {
+ info.valid = handler->start(canonicalArgs);
+ }
+ catch (const LoggableException &ex) {
+ // Caught by reference to avoid copying the exception object
+ loggerFork.log(ex);
+ }
+ handler->resetLogger();
+ }
+
+ // We started the command within an implicit default field and it is not
+ // valid -- remove both the new handler and the parent field from the
+ // stack
+ if (!info.valid && parentInfo.inImplicitDefaultField) {
+ endCurrentHandler();
+ endCurrentHandler();
+ continue;
+ }
+
+ // If we ended up here, starting the command may or may not have worked,
+ // but after all, we cannot unroll the stack any further. Update the
+ // "valid" flag, commit any potential error messages and return.
+ info.valid = parentInfo.valid && info.valid;
+ loggerFork.commit();
+ return;
+ }
+}
+
+/**
+ * Passes character data to the current handler. If the data cannot be placed
+ * in an implicitly opened default field, the stack is unrolled and the data is
+ * offered to the next handler.
+ *
+ * @param data is the data that was found in the input stream.
+ */
+void Stack::data(const Variant &data)
+{
+ while (true) {
+ // Check whether there is any command the data can be sent to
+ if (stack.empty()) {
+ throw LoggableException("No command here to receive data.");
+ }
+
+ // Fetch the current command handler information
+ HandlerInfo &info = currentInfo();
+
+ // Make sure the current handler has an open field
+ if (!ensureHandlerIsInField()) {
+ endCurrentHandler();
+ continue;
+ }
+
+ // If this field should not get any data, log an error and do not call
+ // the "data" handler
+ if (!info.inValidField) {
+ logger().error("Did not expect any data here", data);
+ }
+
+ if (handlersValid() && info.inValidField) {
+ // Fork the logger and set it as temporary logger for the "data"
+ // method. We only want to keep error messages if this was not an
+ // attempt to place the data in an implicitly opened default field.
+ LoggerFork loggerFork = logger().fork();
+ info.handler->setLogger(loggerFork);
+
+ // Pass the data to the current Handler instance
+ bool valid = false;
+ try {
+ valid = info.handler->data(data);
+ }
+ catch (const LoggableException &ex) {
+ // Caught by reference to avoid copying the exception object
+ loggerFork.log(ex);
+ }
+
+ // Reset the logger instance as soon as possible
+ info.handler->resetLogger();
+
+ // If placing the data here failed and we're currently in an
+ // implicitly opened field, just unroll the stack to the next field
+ // and try again
+ if (!valid && info.inImplicitDefaultField) {
+ endCurrentHandler();
+ continue;
+ }
+
+ // Commit the content of the logger fork. Do not change the valid
+ // flag.
+ loggerFork.commit();
+ }
+
+ // There was no reason to unroll the stack any further, so we are done
+ return;
+ }
+}
+
+/**
+ * Starts a new field for the current command.
+ *
+ * @param isDefault should be set to true if the started field explicitly is
+ * the default field.
+ */
+void Stack::fieldStart(bool isDefault)
+{
+ // Make sure the current handler stack is not empty
+ if (stack.empty()) {
+ throw LoggableException(
+ "No command for which a field could be started");
+ }
+
+ // Fetch the information attached to the current handler. A field cannot be
+ // started while the current handler already is inside a field.
+ HandlerInfo &info = currentInfo();
+ if (info.inField) {
+ logger().error(
+ "Got field start, but there is no command for which to start the "
+ "field.");
+ return;
+ }
+
+ // Copy the isDefault flag to a local variable, the fieldStart method will
+ // write into this variable
+ bool defaultField = isDefault;
+
+ // Do not call the "fieldStart" function if we're in an invalid subtree
+ bool valid = false;
+ if (handlersValid()) {
+ try {
+ valid = info.handler->fieldStart(defaultField, info.fieldIdx);
+ }
+ catch (const LoggableException &ex) {
+ // Caught by reference to avoid copying the exception object
+ logger().log(ex);
+ }
+ if (!valid && !defaultField) {
+ logger().error(
+ std::string("Cannot start a new field here (index ") +
+ std::to_string(info.fieldIdx + 1) +
+ std::string("), field does not exist"));
+ }
+ }
+
+ // Mark the field as started
+ info.fieldStart(defaultField, false, valid);
+}
+
+/**
+ * Ends the field the current command is in. Once the command had a default
+ * field, the command itself is ended as well.
+ */
+void Stack::fieldEnd()
+{
+ // Make sure the current handler stack is not empty
+ if (stack.empty()) {
+ throw LoggableException("No command for which a field could be ended");
+ }
+
+ // Fetch the information attached to the current handler
+ HandlerInfo &info = currentInfo();
+ if (!info.inField) {
+ logger().error(
+ "Got field end, but there is no command for which to end the "
+ "field.");
+ return;
+ }
+
+ // Only continue if the current handler stack is in a valid state, do not
+ // call the fieldEnd function if something went wrong before
+ if (handlersValid()) {
+ try {
+ info.handler->fieldEnd();
+ }
+ catch (const LoggableException &ex) {
+ // Caught by reference to avoid copying the exception object
+ logger().log(ex);
+ }
+ }
+
+ // This command no longer is in a field
+ info.fieldEnd();
+
+ // As soon as this command had a default field, remove it from the stack
+ if (info.hadDefaultField) {
+ endCurrentHandler();
+ }
+}
+
+void Stack::annotationStart(const Variant &className, const Variant &args)
+{
+ // TODO: Not implemented yet, annotation start events are currently ignored
+}
+
+void Stack::annotationEnd(const Variant &className, const Variant &elementName)
+{
+ // TODO: Not implemented yet, annotation end events are currently ignored
+}
+
+void Stack::token(Variant token)
+{
+ // TODO: Not implemented yet, tokens are currently ignored
+}
+}
+}
+
diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp
new file mode 100644
index 0000000..76eefd9
--- /dev/null
+++ b/src/core/parser/stack/Stack.hpp
@@ -0,0 +1,341 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file Stack.hpp
+ *
+ * Helper classes for document or description parsers. Contains the
+ * Stack class, which is a pushdown automaton responsible for
+ * accepting commands in the correct order and calling specified handlers.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_PARSER_STACK_STACK_HPP_
+#define _OUSIA_PARSER_STACK_STACK_HPP_
+
+#include <cstdint>
+
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+#include <core/common/Variant.hpp>
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class ParserContext;
+class Logger;
+
+namespace parser_stack {
+
+// Forward declarations
+class Handler;
+class State;
+
+/**
+ * The HandlerInfo class is used internally by the stack to associate additional
+ * (mutable) data with a handler instance.
+ */
+class HandlerInfo {
+public:
+ /**
+ * Pointer pointing at the actual handler instance.
+ */
+ std::shared_ptr<Handler> handler;
+
+ /**
+ * Next field index to be passed to the "fieldStart" function of the Handler
+ * class. Incremented by each call to HandlerInfo::fieldStart.
+ */
+ size_t fieldIdx;
+
+ /**
+ * Set to true if the handler is valid (which is the case if the "start"
+ * method has returned true). If the handler is invalid, no more calls are
+ * directed at it until it can be removed from the stack.
+ */
+ bool valid : 1;
+
+ /**
+ * Set to true if this is an implicit handler, that was created when the
+ * current stack state was deduced.
+ */
+ bool implicit : 1;
+
+ /**
+ * Set to true if the handler currently is in a field.
+ */
+ bool inField : 1;
+
+ /**
+ * Set to true if the handler currently is in the default field.
+ */
+ bool inDefaultField : 1;
+
+ /**
+ * Set to true if the handler currently is in an implicitly started default
+ * field.
+ */
+ bool inImplicitDefaultField : 1;
+
+ /**
+ * Set to false if this field is only opened pro-forma and does not accept
+ * any data. Otherwise set to true.
+ */
+ bool inValidField : 1;
+
+ /**
+ * Set to true, if the default field was already started.
+ */
+ bool hadDefaultField : 1;
+
+ /**
+ * Default constructor of the HandlerInfo class.
+ */
+ HandlerInfo();
+ /**
+ * Constructor of the HandlerInfo class, allows to set all flags manually.
+ * The created instance has no handler attached. The parameters directly
+ * initialise the flags of the same name.
+ */
+ HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField,
+ bool inImplicitDefaultField, bool inValidField);
+
+ /**
+ * Constructor of the HandlerInfo class, taking a shared_ptr to the handler
+ * to which additional information should be attached.
+ */
+ HandlerInfo(std::shared_ptr<Handler> handler);
+
+ /**
+ * Destructor of the HandlerInfo class (to allow Handler to be forward
+ * declared).
+ */
+ ~HandlerInfo();
+
+ /**
+ * Updates the "field" flags according to a "fieldStart" event.
+ *
+ * @param isDefault is true if the started field is the default field.
+ * @param isImplicit is true if the field was started implicitly; an
+ * implicitly started field is always treated as default field.
+ * @param isValid is true if the field accepts data.
+ */
+ void fieldStart(bool isDefault, bool isImplicit, bool isValid);
+
+ /**
+ * Updates the "field" flags according to a "fieldEnd" event.
+ */
+ void fieldEnd();
+};
+
+/**
+ * The Stack class is a pushdown automaton responsible for turning a command
+ * stream into a tree of Node instances. It does so by following a state
+ * transition graph and creating a set of Handler instances, which are placed
+ * on the stack.
+ */
+class Stack {
+private:
+ /**
+ * Reference at the parser context.
+ */
+ ParserContext &ctx;
+
+ /**
+ * Map containing all registered command names and the corresponding
+ * state descriptors. Stored as a reference, the map is not copied.
+ */
+ const std::multimap<std::string, const State *> &states;
+
+ /**
+ * Internal stack used for managing the currently active Handler instances.
+ */
+ std::vector<HandlerInfo> stack;
+
+ /**
+ * Returns a reference to the Logger instance stored within the context.
+ */
+ Logger &logger();
+
+ /**
+ * Used internally to get all expected command names for the current state.
+ * This function is used to build error messages.
+ *
+ * @return a set of strings containing the names of the expected commands.
+ */
+ std::set<std::string> expectedCommands();
+
+ /**
+ * Returns the targetState for a command with the given name that can be
+ * reached from the current state.
+ *
+ * @param name is the name of the requested command.
+ * @return nullptr if no target state was found, a pointer at the target
+ * state otherwise.
+ */
+ const State *findTargetState(const std::string &name);
+
+ /**
+ * Returns the targetState for a command with the given name that can be
+ * reached from the current state, also including the wildcard "*" state.
+ * The name itself is not validated here; the "command" function checks
+ * that the name is a valid identifier before searching for a state.
+ *
+ * @param name is the name of the requested command.
+ * @return nullptr if no target state was found, a pointer at the target
+ * state otherwise.
+ */
+ const State *findTargetStateOrWildcard(const std::string &name);
+
+ /**
+ * Tries to reconstruct the parser state from the Scope instance of the
+ * ParserContext given in the constructor. This functionality is needed for
+ * including files, as the Parser of the included file needs to be brought
+ * to an equivalent state as the one in the including file.
+ */
+ void deduceState();
+
+ /**
+ * Returns a reference at the current HandlerInfo instance (or a stub
+ * HandlerInfo instance if the stack is empty).
+ */
+ HandlerInfo &currentInfo();
+
+ /**
+ * Returns a reference at the last HandlerInfo instance (or a stub
+ * HandlerInfo instance if the stack has only one element).
+ */
+ HandlerInfo &lastInfo();
+
+ /**
+ * Ends the current handler and removes the corresponding element from the
+ * stack.
+ */
+ void endCurrentHandler();
+
+ /**
+ * Tries to start a default field for the current handler, if currently the
+ * handler is not inside a field and did not have a default field yet.
+ *
+ * @return true if the handler is inside a field, false if no field could
+ * be started.
+ */
+ bool ensureHandlerIsInField();
+
+ /**
+ * Returns true if all handlers on the stack are currently valid, or false
+ * if at least one handler is invalid.
+ *
+ * @return true if all handlers on the stack are valid.
+ */
+ bool handlersValid();
+
+public:
+ /**
+ * Creates a new instance of the Stack class.
+ *
+ * @param ctx is the parser context the parser stack is working on.
+ * @param states is a map containing the command names and pointers at the
+ * corresponding State instances. Only a reference to the map is stored, so
+ * the map has to outlive the Stack instance.
+ */
+ Stack(ParserContext &ctx,
+ const std::multimap<std::string, const State *> &states);
+
+ /**
+ * Destructor of the Stack class.
+ */
+ ~Stack();
+
+ /**
+ * Returns the state the Stack instance currently is in.
+ *
+ * @return the state of the currently active Handler instance or
+ * States::None if no handler is on the stack.
+ */
+ const State &currentState();
+
+ /**
+ * Returns the command name that is currently being handled.
+ *
+ * @return the name of the command currently being handled by the active
+ * Handler instance or an empty string if no handler is currently active.
+ */
+ std::string currentCommandName();
+
+ /**
+ * Function that should be called whenever a new command is reached.
+ *
+ * @param name is the name of the command (including the namespace
+ * separator ':') and its corresponding location. Must be a string variant.
+ * @param args is a map containing the arguments that were passed to the
+ * command.
+ */
+ void command(const Variant &name, const Variant::mapType &args);
+
+ /**
+ * Function that should be called whenever character data is found in the
+ * input stream. May only be called if there currently is a command on the
+ * stack, otherwise a LoggableException is thrown.
+ *
+ * @param data is a string variant containing the data that has been found.
+ */
+ void data(const Variant &data);
+
+ /**
+ * Function that should be called whenever a new field starts. Fields of the
+ * same command may not be separated by calls to data or annotations. Doing
+ * so will result in a LoggableException.
+ *
+ * @param isDefault should be set to true if the started field explicitly
+ * is the default field.
+ */
+ void fieldStart(bool isDefault);
+
+ /**
+ * Function that should be called whenever a field ends. Calling this
+ * function while no command is on the stack results in a
+ * LoggableException. If the current command is not inside a field, an
+ * error is logged instead.
+ */
+ void fieldEnd();
+
+ /**
+ * Function that should be called whenever an annotation starts.
+ *
+ * @param className is the name of the annotation class.
+ * @param args is a map variant containing the arguments that were passed
+ * to the annotation.
+ */
+ void annotationStart(const Variant &className, const Variant &args);
+
+ /**
+ * Function that should be called whenever an annotation ends.
+ *
+ * @param className is the name of the annotation class that was ended.
+ * @param elementName is the name of the annotation that was ended.
+ */
+ void annotationEnd(const Variant &className, const Variant &elementName);
+
+ /**
+ * Function that should be called whenever a previously registered token
+ * is found in the input stream.
+ *
+ * @param token is a string variant containing the token that was
+ * encountered.
+ */
+ void token(Variant token);
+};
+}
+}
+
+#endif /* _OUSIA_PARSER_STACK_STACK_HPP_ */
+
diff --git a/src/core/parser/ParserState.cpp b/src/core/parser/stack/State.cpp
index f635d86..d72f533 100644
--- a/src/core/parser/ParserState.cpp
+++ b/src/core/parser/stack/State.cpp
@@ -16,88 +16,97 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include "ParserState.hpp"
+#include "State.hpp"
namespace ousia {
+namespace parser_stack {
-/* Class ParserState */
+/* Class State */
-ParserState::ParserState() : elementHandler(nullptr) {}
+State::State() : elementHandler(nullptr) {}
-ParserState::ParserState(ParserStateSet parents, Arguments arguments,
+State::State(StateSet parents, Arguments arguments,
RttiSet createdNodeTypes,
- HandlerConstructor elementHandler)
+ HandlerConstructor elementHandler,
+ bool supportsAnnotations)
: parents(parents),
arguments(arguments),
createdNodeTypes(createdNodeTypes),
- elementHandler(elementHandler)
+ elementHandler(elementHandler),
+ supportsAnnotations(supportsAnnotations)
{
}
-ParserState::ParserState(const ParserStateBuilder &builder)
- : ParserState(builder.build())
+State::State(const StateBuilder &builder)
+ : State(builder.build())
{
}
-/* Class ParserStateBuilder */
+/* Class StateBuilder */
-ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state)
+StateBuilder &StateBuilder::copy(const State &state)
{
this->state = state;
return *this;
}
-ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent)
+StateBuilder &StateBuilder::parent(const State *parent)
{
- state.parents = ParserStateSet{parent};
+ state.parents = StateSet{parent};
return *this;
}
-ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents)
+StateBuilder &StateBuilder::parents(const StateSet &parents)
{
state.parents = parents;
return *this;
}
-ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments)
+StateBuilder &StateBuilder::arguments(const Arguments &arguments)
{
state.arguments = arguments;
return *this;
}
-ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type)
+StateBuilder &StateBuilder::createdNodeType(const Rtti *type)
{
state.createdNodeTypes = RttiSet{type};
return *this;
}
-ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types)
+StateBuilder &StateBuilder::createdNodeTypes(const RttiSet &types)
{
state.createdNodeTypes = types;
return *this;
}
-ParserStateBuilder &ParserStateBuilder::elementHandler(
+StateBuilder &StateBuilder::elementHandler(
HandlerConstructor elementHandler)
{
state.elementHandler = elementHandler;
return *this;
}
-const ParserState &ParserStateBuilder::build() const { return state; }
+StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations)
+{
+ state.supportsAnnotations = supportsAnnotations;
+ return *this;
+}
-/* Class ParserStateDeductor */
+const State &StateBuilder::build() const { return state; }
-ParserStateDeductor::ParserStateDeductor(
+/* Class StateDeductor */
+
+StateDeductor::StateDeductor(
std::vector<const Rtti *> signature,
- std::vector<const ParserState *> states)
+ std::vector<const State *> states)
: tbl(signature.size()),
signature(std::move(signature)),
states(std::move(states))
{
}
-bool ParserStateDeductor::isActive(size_t d, const ParserState *s)
+bool StateDeductor::isActive(size_t d, const State *s)
{
// Lookup the "active" state of (d, s), if it was not already set
// (e.second is true) we'll have to calculate it
@@ -123,7 +132,7 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s)
// Check whether any of the parent nodes were active -- either for
// the previous element (if this one is generative) or for the
// current element (assuming this node was not generative)
- for (const ParserState *parent : s->parents) {
+ for (const State *parent : s->parents) {
if ((isGenerative && isActive(d - 1, parent)) ||
isActive(d, parent)) {
res = true;
@@ -136,9 +145,9 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s)
return res;
}
-std::vector<const ParserState *> ParserStateDeductor::deduce()
+std::vector<const State *> StateDeductor::deduce()
{
- std::vector<const ParserState *> res;
+ std::vector<const State *> res;
if (!signature.empty()) {
const size_t D = signature.size();
for (auto s : states) {
@@ -153,9 +162,10 @@ std::vector<const ParserState *> ParserStateDeductor::deduce()
/* Constant initializations */
-namespace ParserStates {
-const ParserState All;
-const ParserState None;
+namespace States {
+const State All;
+const State None;
+}
}
}
diff --git a/src/core/parser/ParserState.hpp b/src/core/parser/stack/State.hpp
index 6487fdd..4766235 100644
--- a/src/core/parser/ParserState.hpp
+++ b/src/core/parser/stack/State.hpp
@@ -17,10 +17,10 @@
*/
/**
- * @file ParserState.hpp
+ * @file State.hpp
*
- * Defines the ParserState class used within the ParserStack pushdown
- * automaton and the ParserStateBuilder class for convenient construction of
+ * Defines the State class used within the ParserStack pushdown
+ * automaton and the StateBuilder class for convenient construction of
* such classes.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
@@ -33,12 +33,14 @@
#include <core/common/Rtti.hpp>
#include <core/common/Argument.hpp>
+#include <core/common/Whitespace.hpp>
namespace ousia {
+namespace parser_stack {
// Forward declarations
-class ParserStateBuilder;
-class ParserState;
+class StateBuilder;
+class State;
class HandlerData;
class Handler;
using HandlerConstructor = Handler *(*)(const HandlerData &handlerData);
@@ -47,17 +49,17 @@ using HandlerConstructor = Handler *(*)(const HandlerData &handlerData);
* Set of pointers of parser states -- used for specifying a set of parent
* states.
*/
-using ParserStateSet = std::unordered_set<const ParserState *>;
+using StateSet = std::unordered_set<const State *>;
/**
- * Class used for the complete specification of a ParserState. Stores possible
+ * Class used for the complete specification of a State. Stores possible
* parent states, state handlers and arguments to be passed to that state.
*/
-struct ParserState {
+struct State {
/**
* Set containing all possible parent states.
*/
- ParserStateSet parents;
+ StateSet parents;
/**
* Descriptor of the arguments that should be passed to the handler.
@@ -66,8 +68,8 @@ struct ParserState {
/**
* Set containing the types of the nodes that may be created in this
- * ParserState. This information is needed for Parsers to reconstruct the
- * current ParserState from a given ParserScope when a file is included.
+ * State. This information is needed for Parsers to reconstruct the
+ * current State from a given ParserScope when a file is included.
*/
RttiSet createdNodeTypes;
@@ -79,109 +81,119 @@ struct ParserState {
HandlerConstructor elementHandler;
/**
+ * Set to true if this handler does support annotations. This is almost
+ * always false (e.g. all description handlers), except for document
+ * element handlers.
+ */
+ bool supportsAnnotations;
+
+ /**
* Default constructor, initializes the handlers with nullptr.
*/
- ParserState();
+ State();
/**
- * Constructor taking values for all fields. Use the ParserStateBuilder
- * class for a more convenient construction of ParserState instances.
+ * Constructor taking values for all fields. Use the StateBuilder
+ * class for a more convenient construction of State instances.
*
* @param parents is a set containing all possible parent states.
* @param arguments is a descriptor of arguments that should be passed to
* the handler.
* @param createdNodeTypes is a set containing the types of the nodes that
- * may be created in this ParserState. This information is needed for
- * Parsers to reconstruct the current ParserState from a given ParserScope
+ * may be created in this State. This information is needed for
+ * Parsers to reconstruct the current State from a given ParserScope
* when a file is included.
* @param elementHandler is a pointer at a function which creates a new
* concrete Handler instance for the elements described by this state. May
* be nullptr in which case no handler instance is created.
+ * @param supportsAnnotations specifies whether annotations are supported
+ * here at all.
*/
- ParserState(ParserStateSet parents, Arguments arguments = Arguments{},
+ State(StateSet parents, Arguments arguments = Arguments{},
RttiSet createdNodeTypes = RttiSet{},
- HandlerConstructor elementHandler = nullptr);
+ HandlerConstructor elementHandler = nullptr,
+ bool supportsAnnotations = false);
/**
- * Creates this ParserState from the given ParserStateBuilder instance.
+ * Creates this State from the given StateBuilder instance.
*/
- ParserState(const ParserStateBuilder &builder);
+ State(const StateBuilder &builder);
};
/**
- * The ParserStateBuilder class is a class used for conveniently building new
- * ParserState instances.
+ * The StateBuilder class is a class used for conveniently building new
+ * State instances.
*/
-class ParserStateBuilder {
+class StateBuilder {
private:
/**
- * ParserState instance that is currently being built by the
- * ParserStateBuilder.
+ * State instance that is currently being built by the
+ * StateBuilder.
*/
- ParserState state;
+ State state;
public:
/**
- * Copies the ParserState instance and uses it as internal state. Overrides
- * all changes made by the ParserStateBuilder.
+ * Copies the State instance and uses it as internal state. Overrides
+ * all changes made by the StateBuilder.
*
* @param state is the state that should be copied.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &copy(const ParserState &state);
+ StateBuilder &copy(const State &state);
/**
* Sets the possible parent states to the single given parent element.
*
- * @param parent is a pointer at the parent ParserState instance that should
+ * @param parent is a pointer at the parent State instance that should
* be the possible parent state.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &parent(const ParserState *parent);
+ StateBuilder &parent(const State *parent);
/**
- * Sets the ParserState instances in the given ParserStateSet as the list of
+ * Sets the State instances in the given StateSet as the list of
* supported parent states.
*
- * @param parents is a set of pointers at ParserState instances that should
+ * @param parents is a set of pointers at State instances that should
* be the possible parent states.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &parents(const ParserStateSet &parents);
+ StateBuilder &parents(const StateSet &parents);
/**
* Sets the arguments that should be passed to the parser state handler to
* those given as argument.
*
* @param arguments is the Arguments instance describing the Arguments that
- * should be parsed to a Handler for this ParserState.
- * @return a reference at this ParserStateBuilder instance for method
+ * should be passed to a Handler for this State.
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &arguments(const Arguments &arguments);
+ StateBuilder &arguments(const Arguments &arguments);
/**
* Sets the Node types this state may produce to the given Rtti descriptor.
*
* @param type is the Rtti descriptor of the Type that may be produced by
* this state.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &createdNodeType(const Rtti *type);
+ StateBuilder &createdNodeType(const Rtti *type);
/**
* Sets the Node types this state may produce to the given Rtti descriptors.
*
* @param types is a set of Rtti descriptors of the Types that may be
* produced by this state.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &createdNodeTypes(const RttiSet &types);
+ StateBuilder &createdNodeTypes(const RttiSet &types);
/**
* Sets the constructor for the element handler. The constructor creates a
@@ -191,31 +203,42 @@ public:
*
* @param elementHandler is the HandlerConstructor that should create a
* new Handler instance.
- * @return a reference at this ParserStateBuilder instance for method
+ * @return a reference at this StateBuilder instance for method
* chaining.
*/
- ParserStateBuilder &elementHandler(HandlerConstructor elementHandler);
+ StateBuilder &elementHandler(HandlerConstructor elementHandler);
/**
- * Returns a reference at the internal ParserState instance that was built
- * using the ParserStateBuilder.
+ * Sets the state of the "supportsAnnotations" flag (default value is
+ * false).
*
- * @return the built ParserState.
+ * @param supportsAnnotations should be set to true, if annotations are
+ * supported for the handlers associated with this document.
+ * @return a reference at this StateBuilder instance for method
+ * chaining.
*/
- const ParserState &build() const;
+ StateBuilder &supportsAnnotations(bool supportsAnnotations);
+
+ /**
+ * Returns a reference at the internal State instance that was built
+ * using the StateBuilder.
+ *
+ * @return the built State.
+ */
+ const State &build() const;
};
/**
- * Class used to deduce the ParserState a Parser is currently in based on the
+ * Class used to deduce the State a Parser is currently in based on the
* types of the Nodes that currently are on the ParserStack. Uses dynamic
* programming in order to solve this problem.
*/
-class ParserStateDeductor {
+class StateDeductor {
public:
/**
* Type containing the dynamic programming table.
*/
- using Table = std::vector<std::unordered_map<const ParserState *, bool>>;
+ using Table = std::vector<std::unordered_map<const State *, bool>>;
private:
/**
@@ -231,7 +254,7 @@ private:
/**
* List of states that should be checked for being active.
*/
- const std::vector<const ParserState *> states;
+ const std::vector<const State *> states;
/**
* Used internally to check whether the given parser state s may have been
@@ -239,20 +262,20 @@ private:
*
* @param d is the signature element.
* @param s is the parser state.
- * @return true if the the given ParserState may have been active.
+ * @return true if the given State may have been active.
*/
- bool isActive(size_t d, const ParserState *s);
+ bool isActive(size_t d, const State *s);
public:
/**
- * Constructor of the ParserStateDeductor class.
+ * Constructor of the StateDeductor class.
*
* @param signature a Node type signature describing the types of the nodes
* which currently reside on e.g. the ParserScope stack.
* @param states is a list of states that should be checked.
*/
- ParserStateDeductor(std::vector<const Rtti *> signature,
- std::vector<const ParserState *> states);
+ StateDeductor(std::vector<const Rtti *> signature,
+ std::vector<const State *> states);
/**
* Selects all active states from the given states. Only considers those
@@ -260,23 +283,24 @@ public:
*
* @return a list of states that may actually have been active.
*/
- std::vector<const ParserState *> deduce();
+ std::vector<const State *> deduce();
};
/**
- * The ParserStates namespace contains all the global state constants used
+ * The States namespace contains all the global state constants used
* in the ParserStack class.
*/
-namespace ParserStates {
+namespace States {
/**
* State representing all states.
*/
-extern const ParserState All;
+extern const State All;
/**
* State representing the initial state.
*/
-extern const ParserState None;
+extern const State None;
+}
}
}
diff --git a/src/formats/osdm/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp
index 4a0430b..4a0430b 100644
--- a/src/formats/osdm/TokenTrie.cpp
+++ b/src/core/parser/utils/TokenTrie.cpp
diff --git a/src/formats/osdm/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp
index 36c2ffa..36c2ffa 100644
--- a/src/formats/osdm/TokenTrie.hpp
+++ b/src/core/parser/utils/TokenTrie.hpp
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index f2cfcd1..3c8177d 100644
--- a/src/formats/osdm/DynamicTokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -22,8 +22,9 @@
#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
-#include "DynamicTokenizer.hpp"
+#include "Tokenizer.hpp"
namespace ousia {
@@ -38,7 +39,7 @@ struct TokenMatch {
/**
* Token that was matched.
*/
- DynamicToken token;
+ Token token;
/**
* Current length of the data within the text handler. The text buffer needs
@@ -102,8 +103,8 @@ public:
* @param textLength is the text buffer length of the previous text token.
* @param textEnd is the current end location of the previous text token.
*/
- TokenLookup(const TokenTrie::Node *node, size_t start,
- size_t textLength, size_t textEnd)
+ TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
+ size_t textEnd)
: node(node), start(start), textLength(textLength), textEnd(textEnd)
{
}
@@ -116,10 +117,10 @@ public:
* @param c is the character that should be appended to the current prefix.
* @param lookups is a list to which new TokenLookup instances are added --
* which could potentially be expanded in the next iteration.
- * @param match is the DynamicToken instance to which the matching token
+ * @param match is the Token instance to which the matching token
* should be written.
* @param tokens is a reference at the internal token list of the
- * DynamicTokenizer.
+ * Tokenizer.
* @param end is the end byte offset of the current character.
* @param sourceId is the source id of this file.
*/
@@ -142,7 +143,7 @@ public:
size_t len = str.size();
if (len > match.token.content.size()) {
match.token =
- DynamicToken{node->type, str, {sourceId, start, end}};
+ Token{node->type, str, {sourceId, start, end}};
match.textLength = textLength;
match.textEnd = textEnd;
}
@@ -155,203 +156,40 @@ public:
}
};
-/* Internal class TextHandlerBase */
-
-/**
- * Base class used for those classes that may be used as TextHandler in the
- * DynamicTokenizer::next function.
- */
-class TextHandlerBase {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- TextHandlerBase() : textStart(0), textEnd(0) {}
-
- /**
- * Transforms the given token into a text token containing the extracted
- * text.
- *
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
- void buildTextToken(TokenMatch &match, SourceId sourceId)
- {
- if (match.hasMatch()) {
- match.token.content =
- std::string{textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, textStart, match.textEnd};
- } else {
- match.token.content = std::string{textBuf.data(), textBuf.size()};
- match.token.location = SourceLocation{sourceId, textStart, textEnd};
- }
- match.token.type = TextToken;
- }
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-};
-
-/* Internal class PreservingTextHandler */
-
-/**
- * The PreservingTextHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/* Internal class TrimmingTextHandler */
-
/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param handler is the WhitespaceHandler containing the collected data.
+ * @param match is the TokenMatch whose token receives the extracted text.
+ * @param sourceId is the source id of the underlying file.
*/
-class TrimmingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/* Internal class CollapsingTextHandler */
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
+static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match,
+ SourceId sourceId)
+{
+ if (match.hasMatch()) {
+ match.token.content =
+ std::string{handler.textBuf.data(), match.textLength};
+ match.token.location =
+ SourceLocation{sourceId, handler.textStart, match.textEnd};
+ } else {
+ match.token.content = handler.toString();
+ match.token.location =
+ SourceLocation{sourceId, handler.textStart, handler.textEnd};
}
-};
+ match.token.type = TextToken;
+}
}
-/* Class DynamicTokenizer */
+/* Class Tokenizer */
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
: whitespaceMode(whitespaceMode), nextTokenTypeId(0)
{
}
template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
+bool Tokenizer::next(CharReader &reader, Token &token)
{
// If we're in the read mode, reset the char reader peek position to the
// current read position
@@ -409,9 +247,8 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
}
// If we found text, emit that text
- if (textHandler.hasText() &&
- (!match.hasMatch() || match.textLength > 0)) {
- textHandler.buildTextToken(match, sourceId);
+ if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
+ buildTextToken(textHandler, match, sourceId);
}
// Move the read/peek cursor to the end of the token, abort if an error
@@ -431,38 +268,38 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
}
token = match.token;
} else {
- token = DynamicToken{};
+ token = Token{};
}
return match.hasMatch();
}
-bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
+bool Tokenizer::read(CharReader &reader, Token &token)
{
switch (whitespaceMode) {
case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, true>(reader, token);
+ return next<PreservingWhitespaceHandler, true>(reader, token);
case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, true>(reader, token);
+ return next<TrimmingWhitespaceHandler, true>(reader, token);
case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, true>(reader, token);
+ return next<CollapsingWhitespaceHandler, true>(reader, token);
}
return false;
}
-bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
+bool Tokenizer::peek(CharReader &reader, Token &token)
{
switch (whitespaceMode) {
case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, false>(reader, token);
+ return next<PreservingWhitespaceHandler, false>(reader, token);
case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, false>(reader, token);
+ return next<TrimmingWhitespaceHandler, false>(reader, token);
case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, false>(reader, token);
+ return next<CollapsingWhitespaceHandler, false>(reader, token);
}
return false;
}
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+TokenTypeId Tokenizer::registerToken(const std::string &token)
{
// Abort if an empty token should be registered
if (token.empty()) {
@@ -493,14 +330,14 @@ TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
// Try to register the token in the trie -- if this fails, remove it
// from the tokens list
if (!trie.registerToken(token, type)) {
- tokens[type] = std::string();
+ tokens[type] = std::string{};
nextTokenTypeId = type;
return EmptyToken;
}
return type;
}
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+bool Tokenizer::unregisterToken(TokenTypeId type)
{
// Unregister the token from the trie, abort if an invalid type is given
if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
@@ -511,7 +348,7 @@ bool DynamicTokenizer::unregisterToken(TokenTypeId type)
return false;
}
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+std::string Tokenizer::getTokenString(TokenTypeId type)
{
if (type < tokens.size()) {
return tokens[type];
@@ -519,26 +356,26 @@ std::string DynamicTokenizer::getTokenString(TokenTypeId type)
return std::string{};
}
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
{
whitespaceMode = mode;
}
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
/* Explicitly instantiate all possible instantiations of the "next" member
function */
-template bool DynamicTokenizer::next<PreservingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<PreservingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
+template bool Tokenizer::next<PreservingWhitespaceHandler, false>(
+ CharReader &reader, Token &token);
+template bool Tokenizer::next<TrimmingWhitespaceHandler, false>(
+ CharReader &reader, Token &token);
+template bool Tokenizer::next<CollapsingWhitespaceHandler, false>(
+ CharReader &reader, Token &token);
+template bool Tokenizer::next<PreservingWhitespaceHandler, true>(
+ CharReader &reader, Token &token);
+template bool Tokenizer::next<TrimmingWhitespaceHandler, true>(
+ CharReader &reader, Token &token);
+template bool Tokenizer::next<CollapsingWhitespaceHandler, true>(
+ CharReader &reader, Token &token);
}
diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp
index 0cac2e8..6b4e116 100644
--- a/src/formats/osdm/DynamicTokenizer.hpp
+++ b/src/core/parser/utils/Tokenizer.hpp
@@ -17,7 +17,7 @@
*/
/**
- * @file DynamicTokenizer.hpp
+ * @file Tokenizer.hpp
*
* Tokenizer that can be reconfigured at runtime used for parsing the plain
* text format.
@@ -33,6 +33,7 @@
#include <vector>
#include <core/common/Location.hpp>
+#include <core/common/Whitespace.hpp>
#include "TokenTrie.hpp"
@@ -42,9 +43,9 @@ namespace ousia {
class CharReader;
/**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
+ * The Token structure describes a token discovered by the Tokenizer.
*/
-struct DynamicToken {
+struct Token {
/**
* Id of the type of this token.
*/
@@ -63,28 +64,28 @@ struct DynamicToken {
/**
* Default constructor.
*/
- DynamicToken() : type(EmptyToken) {}
+ Token() : type(EmptyToken) {}
/**
- * Constructor of the DynamicToken struct.
+ * Constructor of the Token struct.
*
* @param id represents the token type.
* @param content is the string content that has been extracted.
* @param location is the location of the extracted string content in the
* source file.
*/
- DynamicToken(TokenTypeId type, const std::string &content,
+ Token(TokenTypeId type, const std::string &content,
SourceLocation location)
: type(type), content(content), location(location)
{
}
/**
- * Constructor of the DynamicToken struct, only initializes the token type
+ * Constructor of the Token struct, only initializes the token type
*
* @param type is the id corresponding to the type of the token.
*/
- DynamicToken(TokenTypeId type) : type(type) {}
+ Token(TokenTypeId type) : type(type) {}
/**
* The getLocation function allows the tokens to be directly passed as
@@ -96,35 +97,13 @@ struct DynamicToken {
};
/**
- * Enum specifying the whitespace handling of the DynamicTokenizer class when
- * reading non-token text.
- */
-enum class WhitespaceMode {
- /**
- * Preserves all whitespaces as they are found in the source file.
- */
- PRESERVE,
-
- /**
- * Trims whitespace at the beginning and the end of the found text.
- */
- TRIM,
-
- /**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
- COLLAPSE
-};
-
-/**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
+ * The Tokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
* to modify the handling of whitespace characters. Note that the
- * DynamicTokenizer always tries to extract the longest possible token from the
+ * Tokenizer always tries to extract the longest possible token from the
* tokenizer.
*/
-class DynamicTokenizer {
+class Tokenizer {
private:
/**
* Internally used token trie. This object holds all registered tokens.
@@ -161,15 +140,15 @@ private:
* @return false if the end of the stream has been reached, true otherwise.
*/
template <typename TextHandler, bool read>
- bool next(CharReader &reader, DynamicToken &token);
+ bool next(CharReader &reader, Token &token);
public:
/**
- * Constructor of the DynamicTokenizer class.
+ * Constructor of the Tokenizer class.
*
* @param whitespaceMode specifies how whitespace should be handled.
*/
- DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
+ Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* Registers the given string as a token. Returns a const pointer at a
@@ -222,7 +201,7 @@ public:
/**
* Reads a new token from the CharReader and stores it in the given
- * DynamicToken instance.
+ * Token instance.
*
* @param reader is the CharReader instance from which the data should be
* read.
@@ -231,7 +210,7 @@ public:
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(CharReader &reader, DynamicToken &token);
+ bool read(CharReader &reader, Token &token);
/**
* The peek method does not advance the read position of the char reader,
@@ -244,7 +223,7 @@ public:
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool peek(CharReader &reader, DynamicToken &token);
+ bool peek(CharReader &reader, Token &token);
};
}
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
new file mode 100644
index 0000000..4973639
--- /dev/null
+++ b/src/formats/osml/OsmlParser.cpp
@@ -0,0 +1,57 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/generic/ParserStateCallbacks.hpp>
+#include <core/parser/generic/ParserStateStack.hpp>
+
+#include "OsdmParser.hpp"
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+namespace {
+
+/**
+ * The OsdmParserImplementation class contains the actual implementation of the
+ * parsing process and is created in the "doParse" function of the OsdmParser.
+
+ */
+class OsdmParserImplementation : public ParserStateCallbacks {
+private:
+ /**
+ * OsdmStreamParser instance.
+ */
+ OsdmStreamParser parser;
+
+ /**
+ * Instance of the ParserStateStack.
+ */
+ ParserStateStack stack;
+
+public:
+ OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap)
+};
+}
+
+void OsdmParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+ OsdmParserImplementation parser(reader, ctx);
+ parser.parse();
+}
+
+}
diff --git a/src/core/parser/generic/GenericParser.hpp b/src/formats/osml/OsmlParser.hpp
index 4f29f94..37505b4 100644
--- a/src/core/parser/generic/GenericParser.hpp
+++ b/src/formats/osml/OsmlParser.hpp
@@ -17,33 +17,32 @@
*/
/**
- * @file GenericParser.hpp
+ * @file OsdmParser.hpp
*
- * The GenericParser class builds an abstraction layer that separates the
- * underlying document format (e.g. osdm or osdmx) from the actual process of
- * building the document model. It provides a set of genric functions that
- * should be called by the inheriting concrete parser class, e.g. indicating a
- * command with parameters, the start/end of a field or the start/end of an
- * annotation. The GenericParser maintains an internal stack of
- * ParserStateHandlers and relays the commands to the elements of this stack.
+ * Contains the parser of the osdm format, the standard plain-text format used
+ * by Ousía for documents.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_GENERIC_PARSER_HPP_
-#define _OUSIA_GENERIC_PARSER_HPP_
+#ifndef _OUSIA_OSDM_PARSER_HPP_
+#define _OUSIA_OSDM_PARSER_HPP_
-#include <core/parser/Parseer.hpp>
+#include <core/parser/Parser.hpp>
namespace ousia {
-class GenericParser : public Parser {
-
-
-
+/**
+ * OsdmParser is a small wrapper implementing the Parser interface. The actual
+ * parsing is performed with the OsdmStreamParser in conjunction with the
+ * ParserStateStack. Its only member is the protected doParse function, which
+ * overrides the corresponding function of the Parser base class.
+ *
+ * NOTE(review): class name and include guard still use the "Osdm" spelling
+ * although this header is named OsmlParser.hpp -- confirm whether the rename
+ * is still pending.
+ */
+class OsdmParser : public Parser {
+protected:
+ void doParse(CharReader &reader, ParserContext &ctx) override;
};
}
-#endif _OUSIA_GENERIC_PARSER_HPP_
+#endif /* _OUSIA_OSDM_PARSER_HPP_ */
diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index 8cb8caf..0174fa4 100644
--- a/src/formats/osdm/OsdmStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -21,14 +21,14 @@
#include <core/common/Utils.hpp>
#include <core/common/VariantReader.hpp>
-#include "OsdmStreamParser.hpp"
+#include "OsmlStreamParser.hpp"
namespace ousia {
/**
* Plain format default tokenizer.
*/
-class PlainFormatTokens : public DynamicTokenizer {
+class PlainFormatTokens : public Tokenizer {
public:
/**
* Id of the backslash token.
@@ -61,6 +61,21 @@ public:
TokenTypeId FieldEnd;
/**
+ * Id of the default field start token.
+ */
+ TokenTypeId DefaultFieldStart;
+
+ /**
+ * Id of the annotation start token.
+ */
+ TokenTypeId AnnotationStart;
+
+ /**
+ * Id of the annotation end token.
+ */
+ TokenTypeId AnnotationEnd;
+
+ /**
* Registers the plain format tokens in the internal tokenizer.
*/
PlainFormatTokens()
@@ -71,6 +86,9 @@ public:
BlockCommentEnd = registerToken("}%");
FieldStart = registerToken("{");
FieldEnd = registerToken("}");
+ DefaultFieldStart = registerToken("{!");
+ AnnotationStart = registerToken("<\\");
+ AnnotationEnd = registerToken("\\>");
}
};
@@ -160,14 +178,14 @@ public:
}
};
-OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger)
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
: reader(reader), logger(logger), tokenizer(Tokens)
{
// Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true});
+ commands.push(Command{"", Variant::mapType{}, true, true, true, false});
}
-Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
bool hasCharSiceNSSep = false;
@@ -210,7 +228,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
return res;
}
-OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
+OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
{
// Expect a '{' after the command
reader.consumeWhitespace();
@@ -251,7 +269,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
return State::COMMAND;
}
-static bool checkStillInField(const OsdmStreamParser::Command &cmd,
+static bool checkStillInField(const OsmlStreamParser::Command &cmd,
const Variant &endName, Logger &logger)
{
if (cmd.inField && !cmd.inRangeField) {
@@ -264,7 +282,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd,
return false;
}
-OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
+OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
{
// Expect a '{' after the command
if (!reader.expect('{')) {
@@ -327,7 +345,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
return cmd.inRangeField ? State::FIELD_END : State::NONE;
}
-Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
+Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
{
// Parse the arguments using the universal VariantReader
Variant commandArguments;
@@ -353,7 +371,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
return commandArguments;
}
-void OsdmStreamParser::pushCommand(Variant commandName,
+void OsmlStreamParser::pushCommand(Variant commandName,
Variant commandArguments, bool hasRange)
{
// Store the location on the stack
@@ -365,10 +383,11 @@ void OsdmStreamParser::pushCommand(Variant commandName,
commands.pop();
}
commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false});
+ hasRange, false, false, false});
}
-OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
+OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
+ bool isAnnotation)
{
// Parse the commandName as a first identifier
Variant commandName = parseIdentifier(start, true);
@@ -382,6 +401,9 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
Utils::split(commandName.asString(), ':');
const bool isBegin = commandNameComponents[0] == "begin";
const bool isEnd = commandNameComponents[0] == "end";
+
+ // Parse the begin or end command
+ State res = State::COMMAND;
if (isBegin || isEnd) {
if (commandNameComponents.size() > 1) {
logger.error(
@@ -390,35 +412,81 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
commandName);
}
if (isBegin) {
- return parseBeginCommand();
+ res = parseBeginCommand();
} else if (isEnd) {
- return parseEndCommand();
+ res = parseEndCommand();
}
+ } else {
+ // Check whether the next character is a '#', indicating the start of
+ // the command name
+ Variant commandArgName;
+ start = reader.getOffset();
+ if (reader.expect('#')) {
+ commandArgName = parseIdentifier(start);
+ if (commandArgName.asString().empty()) {
+ logger.error("Expected identifier after \"#\"", commandArgName);
+ }
+ }
+
+ // Parse the arguments
+ Variant commandArguments =
+ parseCommandArguments(std::move(commandArgName));
+
+ // Push the command onto the command stack
+ pushCommand(std::move(commandName), std::move(commandArguments), false);
}
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
+ // Check whether a ">" character is the next character that is to be read.
+ // In that case the current command could be an annotation end command!
+ char c;
+ if (reader.fetch(c) && c == '>') {
+ // Ignore the character after a begin or end command
+ if (isBegin || isEnd) {
+ logger.warning(
+ "Ignoring annotation end character \">\" after special "
+ "commands \"begin\" or \"end\". Write \"\\>\" to end a "
+ "\"begin\"/\"end\" enclosed annotation.",
+ reader);
+ return res;
}
- }
- // Parse the arugments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
+ // If this should be an annotation, ignore the character
+ if (isAnnotation) {
+ logger.warning(
+ "Ignoring annotation end character \">\" after annotation "
+ "start command. Write \"\\>\" to end the annotation.",
+ reader);
+ } else {
+ // Make sure no arguments apart from the "name" argument are given
+ // to an annotation end
+ Variant::mapType &map = commands.top().arguments.asMap();
+ if (!map.empty()) {
+ if (map.count("name") == 0 || map.size() > 1U) {
+ logger.error(
+ "An annotation end command may not have any arguments "
+ "other than \"name\"");
+ return res;
+ }
+ }
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), false);
+ // If we got here, this is a valid ANNOTATION_END command, issue it
+ reader.peek(c);
+ reader.consumePeek();
+ return State::ANNOTATION_END;
+ }
+ }
- return State::COMMAND;
+ // If we're starting an annotation, return the command as annotation start
+ // instead of command
+ if (isAnnotation && res == State::COMMAND) {
+ return State::ANNOTATION_START;
+ }
+ return res;
}
-void OsdmStreamParser::parseBlockComment()
+void OsmlStreamParser::parseBlockComment()
{
- DynamicToken token;
+ Token token;
size_t depth = 1;
while (tokenizer.read(reader, token)) {
if (token.type == Tokens.BlockCommentEnd) {
@@ -436,7 +504,7 @@ void OsdmStreamParser::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void OsdmStreamParser::parseLineComment()
+void OsmlStreamParser::parseLineComment()
{
char c;
while (reader.read(c)) {
@@ -446,7 +514,7 @@ void OsdmStreamParser::parseLineComment()
}
}
-bool OsdmStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData(DataHandler &handler)
{
if (!handler.isEmpty()) {
data = handler.toVariant(reader.getSourceId());
@@ -457,7 +525,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler)
return false;
}
-bool OsdmStreamParser::checkIssueFieldStart()
+bool OsmlStreamParser::checkIssueFieldStart()
{
// Fetch the current command, and check whether we're currently inside a
// field of this command
@@ -482,18 +550,41 @@ bool OsdmStreamParser::checkIssueFieldStart()
return false;
}
-OsdmStreamParser::State OsdmStreamParser::parse()
+bool OsmlStreamParser::closeField()
+{
+    // At most two commands are inspected: the one on top of the stack and, if
+    // that one has no open field and is therefore ended, the one below it.
+    // The bottom-most command (representing the whole file) is never touched.
+    int attemptsLeft = 2;
+    while (attemptsLeft-- > 0 && commands.size() > 1) {
+        Command &current = commands.top();
+
+        // Range fields are not closed by this function
+        if (current.inRangeField) {
+            return false;
+        }
+
+        // No open field: this command is finished, continue with its parent
+        if (!current.inField) {
+            commands.pop();
+            continue;
+        }
+
+        // Close the open field; a command whose field was marked as default
+        // field ends together with that field
+        current.inField = false;
+        if (current.inDefaultField) {
+            commands.pop();
+        }
+        return true;
+    }
+    return false;
+}
+
+OsmlStreamParser::State OsmlStreamParser::parse()
{
// Handler for incomming data
DataHandler handler;
// Read tokens until the outer loop should be left
- DynamicToken token;
+ Token token;
while (tokenizer.peek(reader, token)) {
const TokenTypeId type = token.type;
// Special handling for Backslash and Text
- if (type == Tokens.Backslash) {
+ if (type == Tokens.Backslash || type == Tokens.AnnotationStart) {
// Before appending anything to the output data or starting a new
// command, check whether FIELD_START has to be issued, as the
// current command is a command with range
@@ -519,7 +610,8 @@ OsdmStreamParser::State OsdmStreamParser::parse()
}
// Parse the actual command
- State res = parseCommand(token.location.getStart());
+ State res = parseCommand(token.location.getStart(),
+ type == Tokens.AnnotationStart);
switch (res) {
case State::ERROR:
throw LoggableException(
@@ -536,6 +628,14 @@ OsdmStreamParser::State OsdmStreamParser::parse()
// to the data buffer, use the escape character start as start
// location and the peek offset as end location
reader.peek(c); // Peek the previously fetched character
+
+ // If this was an annotation start token, add the parsed < to the
+ // output
+ if (type == Tokens.AnnotationStart) {
+ handler.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
+ }
+
handler.append(c, token.location.getStart(),
reader.getPeekOffset());
reader.consumePeek();
@@ -579,28 +679,37 @@ OsdmStreamParser::State OsdmStreamParser::parse()
}
logger.error(
"Got field start token \"{\", but no command for which to "
- "start the field. Did you mean \"\\{\"?",
+ "start the field. Write \"\\{\" to insert this sequence as "
+ "text.",
token);
} else if (token.type == Tokens.FieldEnd) {
- // Try to end an open field of the current command -- if the current
- // command is not inside an open field, end this command and try to
- // close the next one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- return State::FIELD_END;
- }
- commands.pop();
- } else {
- break;
- }
+ if (closeField()) {
+ return State::FIELD_END;
}
logger.error(
- "Got field end token \"}\", but there is no field to end. Did "
- "you mean \"\\}\"?",
+ "Got field end token \"}\", but there is no field to end. "
+ "Write \"\\}\" to insert this sequence as text.",
token);
+ } else if (token.type == Tokens.DefaultFieldStart) {
+ // Try to start a default field the first time the token is reached
+ Command &topCmd = commands.top();
+ if (!topCmd.inField) {
+ topCmd.inField = true;
+ topCmd.inDefaultField = true;
+ return State::FIELD_START;
+ }
+ logger.error(
+ "Got default field start token \"{!\", but no command for "
+ "which to start the field. Write \"\\{!\" to insert this "
+ "sequence as text",
+ token);
+ } else if (token.type == Tokens.AnnotationEnd) {
+ // We got a single annotation end token "\>" -- simply issue the
+ // ANNOTATION_END event
+ Variant annotationName = Variant::fromString("");
+ annotationName.setLocation(token.location);
+ pushCommand(annotationName, Variant::mapType{}, false);
+ return State::ANNOTATION_END;
} else {
logger.error("Unexpected token \"" + token.content + "\"", token);
}
@@ -627,14 +736,19 @@ OsdmStreamParser::State OsdmStreamParser::parse()
return State::END;
}
-const Variant &OsdmStreamParser::getCommandName()
+const Variant &OsmlStreamParser::getCommandName() const
{
return commands.top().name;
}
-const Variant &OsdmStreamParser::getCommandArguments()
+const Variant &OsmlStreamParser::getCommandArguments() const
{
return commands.top().arguments;
}
+
+bool OsmlStreamParser::inDefaultField() const
+{
+    // Both the range field of a command and a field explicitly flagged as
+    // default field count as "default" field
+    const Command &current = commands.top();
+    return current.inRangeField || current.inDefaultField;
+}
}
diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index 48d8fb7..dc3034c 100644
--- a/src/formats/osdm/OsdmStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -17,23 +17,22 @@
*/
/**
- * @file OsdmStreamParser.hpp
+ * @file OsmlStreamParser.hpp
*
- * Provides classes for low-level classes for reading the TeX-esque osdm
+ * Provides classes for low-level classes for reading the TeX-esque osml
* format. The class provided here does not build any model objects and does not
* implement the Parser interface.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_
-#define _OUSIA_OSDM_STREAM_PARSER_HPP_
+#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
+#define _OUSIA_OSML_STREAM_PARSER_HPP_
#include <stack>
#include <core/common/Variant.hpp>
-
-#include "DynamicTokenizer.hpp"
+#include <core/parser/utils/Tokenizer.hpp>
namespace ousia {
@@ -43,7 +42,7 @@ class Logger;
class DataHandler;
/**
- * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm
+ * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
* format. The parser is constructed around a "parse" function, which reads data
* from the underlying CharReader until a new state is reached and indicates
* this state in a return value. The calling code then has to pull corresponding
@@ -53,10 +52,10 @@ class DataHandler;
* fields, as this would lead to too many consecutive errors) a
* LoggableException is thrown.
*/
-class OsdmStreamParser {
+class OsmlStreamParser {
public:
/**
- * Enum used to indicate which state the OsdmStreamParser class is in
+ * Enum used to indicate which state the OsmlStreamParser class is in
* after calling the "parse" function.
*/
enum class State {
@@ -140,23 +139,35 @@ public:
/**
* Set to true if this is a command with clear begin and end.
*/
- bool hasRange;
+ bool hasRange : 1;
/**
* Set to true if we are currently inside a field of this command.
*/
- bool inField;
+ bool inField : 1;
/**
* Set to true if we are currently in the range field of the command
* (implies inField being set to true).
*/
- bool inRangeField;
+ bool inRangeField : 1;
+
+ /**
+ * Set to true if we are currently in a field that has been explicitly
+ * marked as default field (using the "{!" syntax).
+ */
+ bool inDefaultField : 1;
/**
* Default constructor.
*/
- Command() : hasRange(false), inField(false), inRangeField(false) {}
+ Command()
+ : hasRange(false),
+ inField(false),
+ inRangeField(false),
+ inDefaultField()
+ {
+ }
/**
* Constructor of the Command class.
@@ -169,16 +180,19 @@ public:
* explicit range.
* @param inField is set to true if we currently are inside a field
* of this command.
- * @param inRangeField is set to true if we currently inside the outer
- * field of the command.
+ * @param inRangeField is set to true if we currently are inside the
+ * outer field of a ranged command.
+ * @param inDefaultField is set to true if we currently are in a
+ * specially marked default field.
*/
- Command(Variant name, Variant arguments, bool hasRange, bool inField,
- bool inRangeField)
+ Command(Variant name, Variant arguments, bool hasRange,
+ bool inField, bool inRangeField, bool inDefaultField)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange),
inField(inField),
- inRangeField(inRangeField)
+ inRangeField(inRangeField),
+ inDefaultField(inDefaultField)
{
}
};
@@ -198,7 +212,7 @@ private:
/**
* Tokenizer instance used to read individual tokens from the text.
*/
- DynamicTokenizer tokenizer;
+ Tokenizer tokenizer;
/**
* Stack containing the current commands.
@@ -258,9 +272,11 @@ private:
*
* @param start is the start byte offset of the command (including the
* backslash)
+ * @param isAnnotation if true, the command is not returned as command, but
+ * as annotation start.
* @return the State that results from parsing the command.
*/
- State parseCommand(size_t start);
+ State parseCommand(size_t start, bool isAnnotation);
/**
* Function used internally to parse a block comment.
@@ -290,16 +306,26 @@ private:
*/
bool checkIssueFieldStart();
+ /**
+ * Closes a currently open field. Note that the command will be removed from
+ * the internal command stack if the field that is being closed is a
+ * field marked as default field.
+ *
+ * @return true if the field could be closed, false if there was no field
+ * to close.
+ */
+ bool closeField();
+
public:
/**
- * Constructor of the OsdmStreamParser class. Attaches the new
- * OsdmStreamParser to the given CharReader and Logger instances.
+ * Constructor of the OsmlStreamParser class. Attaches the new
+ * OsmlStreamParser to the given CharReader and Logger instances.
*
* @param reader is the reader instance from which incomming characters
* should be read.
* @param logger is the logger instance to which errors should be written.
*/
- OsdmStreamParser(CharReader &reader, Logger &logger);
+ OsmlStreamParser(CharReader &reader, Logger &logger);
/**
* Continues parsing. Returns one of the states defined in the State enum.
@@ -318,7 +344,7 @@ public:
* @return a reference at a variant containing the data parsed by the
* "parse" function.
*/
- const Variant &getData() { return data; }
+ const Variant &getData() const { return data; }
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -327,7 +353,7 @@ public:
* @return a reference at a variant containing name and location of the
* parsed command.
*/
- const Variant &getCommandName();
+ const Variant &getCommandName() const;
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -336,16 +362,24 @@ public:
* @return a reference at a variant containing arguments given to the
* command.
*/
- const Variant &getCommandArguments();
+ const Variant &getCommandArguments() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been explicitly marked as "default" field (using the "{!"
+ * syntax).
+ */
+ bool inDefaultField() const;
/**
* Returns a reference at the char reader.
*
* @return the last internal token location.
*/
- SourceLocation &getLocation() { return location; }
+ const SourceLocation &getLocation() const { return location; }
};
}
-#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */
+#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */
diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp
new file mode 100644
index 0000000..e37446a
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.cpp
@@ -0,0 +1,144 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/Location.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+
+namespace ousia {
+
+/**
+ * Enum used internally in the state machine of the xml argument parser.
+ *
+ * IN_TAG_NAME:     initial state; left for SEARCH_ATTR at the first
+ *                  whitespace character.
+ * SEARCH_ATTR:     skipping whitespace until the first character of an
+ *                  attribute name is found.
+ * IN_ATTR_NAME:    accumulating the attribute name until whitespace or '='.
+ * HAS_ATTR_NAME:   the name is complete; skipping whitespace while waiting
+ *                  for '='.
+ * HAS_ATTR_EQUALS: '=' has been seen; waiting for the quote that opens the
+ *                  attribute value.
+ * IN_ATTR_DATA:    inside the quoted attribute value, until the closing
+ *                  quote is read.
+ */
+enum class XmlAttributeState {
+ IN_TAG_NAME,
+ SEARCH_ATTR,
+ IN_ATTR_NAME,
+ HAS_ATTR_NAME,
+ HAS_ATTR_EQUALS,
+ IN_ATTR_DATA
+};
+
+/**
+ * Scans the tag starting at byte offset "offs" with a small state machine and
+ * records the source location of the tag name (key "$tag") and of every
+ * attribute value (key: attribute name). Scanning stops at the first '>' that
+ * is not part of an attribute value or when the reader runs out of data.
+ *
+ * @param reader is the reader the tag is read from; only a fork of it is
+ * advanced.
+ * @param offs is the byte offset of the "<" character of the tag.
+ * @return the collected locations; empty if the fork could not be positioned
+ * at offs.
+ */
+std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate(
+    CharReader &reader, size_t offs)
+{
+    std::map<std::string, SourceLocation> res;
+
+    // Fork the reader, we don't want to mess up the XML parsing process
+    CharReaderFork readerFork = reader.fork();
+
+    // Move the read cursor to the start location, abort if this does not work
+    if (offs != readerFork.seek(offs)) {
+        return res;
+    }
+
+    // What follows is a deliberately minimal tag scanner. It does not try to
+    // be a real XML parser -- all we want are byte offsets for pretty error
+    // messages, and the input has already been accepted by expat.
+    XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
+    char c;
+    char prev = 0;     // previously read character, used to detect "/>"
+    char quote = '"';  // quote character that opened the current value
+
+    // A plain string is used as accumulator: resetting a std::stringstream
+    // via str(s) rewinds its put position, so the next "<<" would overwrite
+    // the first character instead of appending to it.
+    std::string attrName;
+    while (readerFork.read(c)) {
+        // Abort at the end of the tag
+        if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
+            // A tag without attributes ("<tag>" or "<tag/>") ends while the
+            // tag name is still being read -- record "$tag" here, excluding
+            // the '/' of a self-closing tag
+            if (state == XmlAttributeState::IN_TAG_NAME) {
+                auto tagEnd = readerFork.getOffset() - 1;
+                if (prev == '/') {
+                    tagEnd--;
+                }
+                res.emplace("$tag", SourceLocation{reader.getSourceId(),
+                                                   offs + 1, tagEnd});
+            }
+            return res;
+        }
+
+        switch (state) {
+            case XmlAttributeState::IN_TAG_NAME:
+                if (Utils::isWhitespace(c)) {
+                    // The tag name ends right before this whitespace
+                    res.emplace("$tag",
+                                SourceLocation{reader.getSourceId(), offs + 1,
+                                               readerFork.getOffset() - 1});
+                    state = XmlAttributeState::SEARCH_ATTR;
+                }
+                break;
+            case XmlAttributeState::SEARCH_ATTR:
+                if (!Utils::isWhitespace(c)) {
+                    state = XmlAttributeState::IN_ATTR_NAME;
+                    attrName.push_back(c);
+                }
+                break;
+            case XmlAttributeState::IN_ATTR_NAME:
+                if (Utils::isWhitespace(c)) {
+                    state = XmlAttributeState::HAS_ATTR_NAME;
+                } else if (c == '=') {
+                    state = XmlAttributeState::HAS_ATTR_EQUALS;
+                } else {
+                    attrName.push_back(c);
+                }
+                break;
+            case XmlAttributeState::HAS_ATTR_NAME:
+                if (!Utils::isWhitespace(c)) {
+                    if (c == '=') {
+                        state = XmlAttributeState::HAS_ATTR_EQUALS;
+                        break;
+                    }
+                    // We expected to see a '=' here! Continue with the
+                    // "HAS_ATTR_EQUALS" state, which contains the error
+                    // recovery for this situation
+                } else {
+                    // Skip whitespace here
+                    break;
+                }
+                // Fallthrough
+            case XmlAttributeState::HAS_ATTR_EQUALS:
+                if (!Utils::isWhitespace(c)) {
+                    if (c == '"' || c == '\'') {
+                        // Start of an attribute value (XML allows both quote
+                        // styles). Remember which quote has to close it and
+                        // store the start offset in the result map
+                        quote = c;
+                        res.emplace(attrName,
+                                    SourceLocation{reader.getSourceId(),
+                                                   readerFork.getOffset()});
+                        state = XmlAttributeState::IN_ATTR_DATA;
+                    } else {
+                        // Not well formed. Assume this character starts a
+                        // new attribute name
+                        attrName.assign(1, c);
+                        state = XmlAttributeState::IN_ATTR_NAME;
+                    }
+                }
+                break;
+            case XmlAttributeState::IN_ATTR_DATA:
+                if (c == quote) {
+                    // We're at the end of the attribute data, set the end
+                    // location
+                    auto it = res.find(attrName);
+                    if (it != res.end()) {
+                        it->second.setEnd(readerFork.getOffset() - 1);
+                    }
+
+                    // Reset the attribute name and restart the search
+                    attrName.clear();
+                    state = XmlAttributeState::SEARCH_ATTR;
+                }
+                break;
+        }
+        prev = c;
+    }
+    return res;
+}
+}
+
diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp
new file mode 100644
index 0000000..f9a3437
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.hpp
@@ -0,0 +1,67 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlAttributeLocator.hpp
+ *
+ * Contains a class used for locating the byte offsets of the attributes given
+ * in a XML tag.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+
+#include <map>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class SourceLocation;
+
+/**
+ * Class containing one static function for locating the byte offsets of the
+ * attributes in an XML tag. These are not retrieved by our xml parser, so we
+ * have to do this manually.
+ */
+class OsxmlAttributeLocator {
+public:
+ /**
+ * Function used to reconstruct the location of the attributes of an XML tag
+ * in the source code. This is necessary, as the xml parser only returns an
+ * offset to the beginning of a tag and not to the position of the individual
+ * arguments.
+ *
+ * @param reader is the char reader from which the character data should be
+ * read. The function reads from a fork of this reader.
+ * @param offs is a byte offset in the xml file pointing at the "<"
+ * character of the tag.
+ * @return a map from attribute keys to the corresponding location
+ * (including range) of the attribute. The location of the tag name, when it
+ * can be determined, is stored under the virtual attribute key "$tag". The
+ * map is empty if the reader could not be positioned at offs.
+ */
+ static std::map<std::string, SourceLocation> locate(CharReader &reader,
+ size_t offs);
+};
+
+}
+
+#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
new file mode 100644
index 0000000..b4aff77
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -0,0 +1,547 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <expat.h>
+
+#include <vector>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Variant.hpp>
+#include <core/common/VariantReader.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+#include "OsxmlEventParser.hpp"
+
+namespace ousia {
+
+/* Class OsxmlEventParser */
+
+/**
+ * Class containing the mutable parsing state of an OsxmlEventParser. The
+ * state is shared with the static expat callback functions in this file,
+ * which access it via OsxmlEventParser::getData().
+ */
+class OsxmlEventParserData {
+public:
+ /**
+ * Contains the current depth of the parsing process. Incremented for each
+ * start tag and decremented for each end tag.
+ */
+ ssize_t depth;
+
+ /**
+ * Set to a value larger than zero if the parser is currently inside an
+ * annotation end tag -- the value represents the depth in which the tag
+ * was opened. Set to -1 otherwise.
+ */
+ ssize_t annotationEndTagDepth;
+
+ /**
+ * Current character data buffer.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Current whitespace buffer (for the trimming whitespace mode)
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Flag indicating whether a whitespace character was present (for the
+ * collapsing whitespace mode).
+ */
+ bool hasWhitespace;
+
+ /**
+ * Current character data start (used as start of the location returned by
+ * getText()).
+ */
+ size_t textStart;
+
+ /**
+ * Current character data end (used as end of the location returned by
+ * getText()).
+ */
+ size_t textEnd;
+
+ /**
+ * Default constructor. Sets the depth to zero, marks the parser as not
+ * being inside an annotation end tag and starts with empty text buffers.
+ */
+ OsxmlEventParserData();
+
+ /**
+ * Increments the depth.
+ */
+ void incrDepth();
+
+ /**
+ * Decrements the depth (never below zero) and resets the
+ * annotationEndTagDepth to -1 once the depth drops below it.
+ */
+ void decrDepth();
+
+ /**
+ * Returns true if we're currently inside an annotation end tag.
+ */
+ bool inAnnotationEndTag();
+
+ /**
+ * Returns true if character data is available.
+ *
+ * @return true if character data is available.
+ */
+ bool hasText();
+
+ /**
+ * Returns a Variant containing the character data and its location. Clears
+ * the text buffers and resets the whitespace state and the text range.
+ *
+ * @param sourceId is the id of the source the text location refers to.
+ * @return a string variant containing the text data and the character
+ * location.
+ */
+ Variant getText(SourceId sourceId);
+};
+
+/* Class GuardedExpatXmlParser */
+
+/**
+ * Wrapper class around the XML_Parser pointer which safely frees it whenever
+ * the scope is left (e.g. because an exception was thrown). The wrapper is the
+ * sole owner of the parser instance and can therefore not be copied.
+ */
+class GuardedExpatXmlParser {
+private:
+    /**
+     * Internal pointer to the XML_Parser instance.
+     */
+    XML_Parser parser;
+
+public:
+    /**
+     * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreate
+     * from the expat library. Throws a LoggableException if the XML parser
+     * cannot be initialized.
+     *
+     * @param encoding is the protocol-defined encoding passed to expat (or
+     * nullptr if expat should determine the encoding by itself).
+     */
+    GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
+    {
+        parser = XML_ParserCreate(encoding);
+        if (!parser) {
+            throw LoggableException{
+                "Internal error: Could not create expat XML parser!"};
+        }
+    }
+
+    /**
+     * Copying is forbidden: two wrappers holding the same XML_Parser would
+     * both call XML_ParserFree on it.
+     */
+    GuardedExpatXmlParser(const GuardedExpatXmlParser &) = delete;
+    GuardedExpatXmlParser &operator=(const GuardedExpatXmlParser &) = delete;
+
+    /**
+     * Destructor of the GuardedExpatXmlParser, frees the XML parser instance.
+     */
+    ~GuardedExpatXmlParser()
+    {
+        if (parser) {
+            XML_ParserFree(parser);
+            parser = nullptr;
+        }
+    }
+
+    /**
+     * Returns the XML_Parser pointer. Note that this overloads the address-of
+     * operator, so "&p" yields the expat handle and not a pointer to the
+     * wrapper itself.
+     */
+    XML_Parser operator&() { return parser; }
+};
+
+/**
+ * Name of the special outer tag used for allowing multiple top-level elements
+ * in an XML file.
+ */
+static const std::string TOP_LEVEL_TAG{"ousia"};
+
+/**
+ * Prefix used to indicate the start of an annotation (note the trailing colon)
+ */
+static const std::string ANNOTATION_START_PREFIX{"a:start:"};
+
+/**
+ * Prefix used to indicate the end of an annotation. There is no trailing colon
+ * here: the start element handler discards a colon that follows the prefix, so
+ * the class name after the prefix may be omitted.
+ */
+static const std::string ANNOTATION_END_PREFIX{"a:end"};
+
+/**
+ * Sets the default location of the logger to the current position of the XML
+ * parser and returns that location.
+ *
+ * @param p is the XML parser instance; its user data must point at the
+ * OsxmlEventParser.
+ * @param len is the length of the string that should be referred to.
+ * @return the SourceLocation that has been set in the logger.
+ */
+static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0)
+{
+    // The OsxmlEventParser instance is stored as user data of the XML parser
+    OsxmlEventParser *self =
+        static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+    // Build the location from the current byte offset in the XML file
+    size_t start = XML_GetCurrentByteIndex(p);
+    SourceLocation current{self->getReader().getSourceId(), start,
+                           start + len};
+
+    // Publish the location to the logger and hand it back to the caller
+    self->getLogger().setDefaultLocation(current);
+    return current;
+}
+
+/**
+ * Callback called by eXpat whenever a start tag is reached. Flushes pending
+ * character data, collects the attributes together with their source locations
+ * and issues a "commandStart", "annotationStart" or "annotationEnd" event,
+ * depending on the name of the tag. The magic top-level tag does not trigger
+ * any event.
+ *
+ * @param ref is the XML_Parser instance (the parser is registered as handler
+ * argument in OsxmlEventParser::parse()).
+ * @param name is the name of the tag.
+ * @param attrs is a nullptr-terminated list of alternating attribute keys and
+ * attribute values.
+ */
+static void xmlStartElementHandler(void *ref, const XML_Char *name,
+                                   const XML_Char **attrs)
+{
+    // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
+    XML_Parser p = static_cast<XML_Parser>(ref);
+    OsxmlEventParser *parser =
+        static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+    // If there is any text data in the buffer, issue that first
+    if (parser->getData().hasText()) {
+        parser->getEvents().data(
+            parser->getData().getText(parser->getReader().getSourceId()));
+    }
+
+    // Read the argument locations -- this is only a stupid and slow hack,
+    // but it is necessary, as expat doesn't give us the byte offset of the
+    // arguments.
+    std::map<std::string, SourceLocation> attributeOffsets =
+        OsxmlAttributeLocator::locate(parser->getReader(),
+                                      XML_GetCurrentByteIndex(p));
+
+    // Update the logger position
+    SourceLocation loc = xmlSyncLoggerPosition(p);
+
+    // Fetch the location of the name, fall back to the location of the tag
+    SourceLocation nameLoc = loc;
+    auto tagIt = attributeOffsets.find("$tag");
+    if (tagIt != attributeOffsets.end()) {
+        nameLoc = tagIt->second;
+    }
+
+    // Increment the current depth
+    parser->getData().incrDepth();
+
+    // Make sure we're currently not inside an annotation end tag -- this would
+    // be highly illegal!
+    if (parser->getData().inAnnotationEndTag()) {
+        parser->getLogger().error(
+            "No tags allowed inside an annotation end tag", nameLoc);
+        return;
+    }
+
+    // Assemble the arguments
+    Variant::mapType args;
+    const XML_Char **attr = attrs;
+    while (*attr) {
+        // Convert the C string to a std::string
+        const std::string key{*(attr++)};
+
+        // Search the location of the key
+        SourceLocation keyLoc;
+        auto keyIt = attributeOffsets.find(key);
+        if (keyIt != attributeOffsets.end()) {
+            keyLoc = keyIt->second;
+        }
+
+        // Parse the string, pass the location of the key
+        std::pair<bool, Variant> value = VariantReader::parseGenericString(
+            *(attr++), parser->getLogger(), keyLoc.getSourceId(),
+            keyLoc.getStart());
+
+        // Set the overall location of the parsed element to the attribute
+        // location
+        value.second.setLocation(keyLoc);
+
+        // Store the key and the parsed value in the map
+        args.emplace(key, value.second);
+    }
+
+    // Fetch the name of the tag, check for special tags
+    std::string nameStr(name);
+    if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) {
+        // We're in the top-level and the magic tag is reached -- just
+        // ignore it and issue a warning for each argument that has been given
+        for (const auto &arg : args) {
+            parser->getLogger().warning(std::string("Ignoring attribute \"") +
+                                            arg.first +
+                                            std::string("\" for magic tag \"") +
+                                            TOP_LEVEL_TAG + std::string("\""),
+                                        arg.second);
+        }
+    } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) {
+        // Assemble a name variant containing the name minus the prefix
+        Variant nameVar =
+            Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size()));
+        nameVar.setLocation(nameLoc);
+
+        // Issue the "annotationStart" event
+        parser->getEvents().annotationStart(nameVar, args);
+    } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) {
+        // Assemble a name variant containing the name minus the prefix
+        nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size());
+
+        // Discard a potentially leading colon
+        if (!nameStr.empty() && nameStr[0] == ':') {
+            nameStr = nameStr.substr(1);
+        }
+
+        // Assemble the variant containing the name and its location
+        Variant nameVar = Variant::fromString(nameStr);
+        nameVar.setLocation(nameLoc);
+
+        // Check whether a "name" attribute was given, all other attributes
+        // are not allowed in an annotation end tag
+        Variant elementName;
+        for (const auto &arg : args) {
+            if (arg.first == "name") {
+                elementName = arg.second;
+            } else {
+                parser->getLogger().warning(
+                    std::string("Ignoring attribute \"") + arg.first +
+                        "\" in annotation end tag",
+                    arg.second);
+            }
+        }
+
+        // Set the annotationEndTagDepth to disallow any further tags to be
+        // opened inside the annotation end tag.
+        parser->getData().annotationEndTagDepth = parser->getData().depth;
+
+        // Issue the "annotationEnd" event -- the second parameter is the
+        // name of the annotation element and not the argument map
+        parser->getEvents().annotationEnd(nameVar, elementName);
+    } else {
+        // Just issue a "commandStart" event in any other case
+        Variant nameVar = Variant::fromString(nameStr);
+        nameVar.setLocation(nameLoc);
+        parser->getEvents().commandStart(nameVar, args);
+    }
+}
+
+/**
+ * Callback called by eXpat whenever an end tag is reached. Decrements the
+ * depth, flushes pending character data and issues the "fieldEnd" event,
+ * unless the tag belongs to an annotation end tag or is the magic top-level
+ * tag.
+ *
+ * @param ref is the XML_Parser instance.
+ * @param name is the name of the tag that ends here.
+ */
+static void xmlEndElementHandler(void *ref, const XML_Char *name)
+{
+    XML_Parser p = static_cast<XML_Parser>(ref);
+    OsxmlEventParser *parser =
+        static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+    // Synchronize the position of the logger with the parser position
+    xmlSyncLoggerPosition(p);
+
+    // The depth is decremented in any case, but no events are issued as long
+    // as we're in an annotation end tag. The check has to happen before the
+    // depth is changed.
+    OsxmlEventParserData &state = parser->getData();
+    const bool insideAnnotationEnd = state.inAnnotationEndTag();
+    state.decrDepth();
+    if (insideAnnotationEnd) {
+        return;
+    }
+
+    // Pending text data belongs to the element that ends here
+    if (state.hasText()) {
+        parser->getEvents().data(
+            state.getText(parser->getReader().getSourceId()));
+    }
+
+    // The special ousia tag never started a field, so it does not end one
+    if (state.depth == 0 && TOP_LEVEL_TAG == name) {
+        return;
+    }
+
+    // Issue the "fieldEnd" event
+    parser->getEvents().fieldEnd();
+}
+
+/**
+ * Callback called by eXpat whenever character data is reached. Appends the
+ * characters to the text buffer according to the current whitespace mode.
+ * Character data inside an annotation end tag is ignored.
+ *
+ * @param ref is the XML_Parser instance.
+ * @param s points at the character data (not null-terminated).
+ * @param len is the number of characters available in s.
+ */
+static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
+{
+    XML_Parser p = static_cast<XML_Parser>(ref);
+    OsxmlEventParser *parser =
+        static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+    OsxmlEventParserData &data = parser->getData();
+
+    // Abort as long as we're in an annotation end tag
+    if (data.inAnnotationEndTag()) {
+        return;
+    }
+
+    // expat reports a signed length, negative values are treated as zero
+    size_t count = 0;
+    if (len > 0) {
+        count = static_cast<size_t>(len);
+    }
+
+    // Synchronize the logger position
+    SourceLocation loc = xmlSyncLoggerPosition(p, count);
+
+    // Feed the characters one by one into the whitespace handler
+    const WhitespaceMode mode = parser->getWhitespaceMode();
+    const size_t first = loc.getStart();
+    for (size_t i = 0; i < count; i++) {
+        size_t pos = first + i;
+        if (mode == WhitespaceMode::PRESERVE) {
+            PreservingWhitespaceHandler::append(s[i], pos, pos + 1,
+                                                data.textBuf, data.textStart,
+                                                data.textEnd);
+        } else if (mode == WhitespaceMode::TRIM) {
+            TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, data.textBuf,
+                                              data.textStart, data.textEnd,
+                                              data.whitespaceBuf);
+        } else if (mode == WhitespaceMode::COLLAPSE) {
+            CollapsingWhitespaceHandler::append(s[i], pos, pos + 1,
+                                                data.textBuf, data.textStart,
+                                                data.textEnd,
+                                                data.hasWhitespace);
+        }
+    }
+}
+
+/* Class OsxmlEvents */
+
+// Out-of-line definition of the virtual destructor declared in the header
+OsxmlEvents::~OsxmlEvents()
+{
+}
+
+/* Class OsxmlEventParserData */
+
+// Starts outside of any element, outside of an annotation end tag and with
+// empty text buffers
+OsxmlEventParserData::OsxmlEventParserData()
+    : depth(0),
+      annotationEndTagDepth(-1),
+      hasWhitespace(false),
+      textStart(0),
+      textEnd(0)
+{
+}
+
+void OsxmlEventParserData::incrDepth()
+{
+    ++depth;
+}
+
+void OsxmlEventParserData::decrDepth()
+{
+    // The depth never drops below zero
+    depth = (depth > 0) ? depth - 1 : depth;
+
+    // The annotation end tag is left once the depth is below the depth in
+    // which the tag was opened
+    if (annotationEndTagDepth > depth) {
+        annotationEndTagDepth = -1;
+    }
+}
+
+bool OsxmlEventParserData::inAnnotationEndTag()
+{
+    // No annotation end tag has been opened
+    if (annotationEndTagDepth <= 0) {
+        return false;
+    }
+    return annotationEndTagDepth <= depth;
+}
+
+bool OsxmlEventParserData::hasText()
+{
+    return textBuf.size() > 0;
+}
+
+Variant OsxmlEventParserData::getText(SourceId sourceId)
+{
+    // Wrap the buffered characters and their location in a string variant
+    Variant text =
+        Variant::fromString(std::string(textBuf.begin(), textBuf.end()));
+    text.setLocation({sourceId, textStart, textEnd});
+
+    // Start over with empty buffers for the next chunk of character data
+    textStart = 0;
+    textEnd = 0;
+    hasWhitespace = false;
+    whitespaceBuf.clear();
+    textBuf.clear();
+
+    return text;
+}
+
+/* Class OsxmlEventParser */
+
+// Stores the given references, whitespace is trimmed unless another mode is
+// set via setWhitespaceMode()
+OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
+                                   Logger &logger)
+    : reader(reader),
+      events(events),
+      logger(logger),
+      whitespaceMode(WhitespaceMode::TRIM),
+      data(new OsxmlEventParserData)
+{
+}
+
+// Defined in this file because the header only forward-declares
+// OsxmlEventParserData, which is held in a unique_ptr
+OsxmlEventParser::~OsxmlEventParser()
+{
+}
+
+void OsxmlEventParser::parse()
+{
+    // Size of the chunks in which the input is handed to expat
+    constexpr size_t CHUNK_SIZE = 64 * 1024;
+
+    // Create the parser object, it is freed automatically when this function
+    // is left
+    GuardedExpatXmlParser xml{"UTF-8"};
+
+    // Reset the depth
+    data->depth = 0;
+
+    // The callbacks receive the XML_Parser as argument and fetch this
+    // instance from its user data
+    XML_SetUserData(&xml, this);
+    XML_UseParserAsHandlerArg(&xml);
+
+    // Register the callback functions
+    XML_SetStartElementHandler(&xml, xmlStartElementHandler);
+    XML_SetEndElementHandler(&xml, xmlEndElementHandler);
+    XML_SetCharacterDataHandler(&xml, xmlCharacterDataHandler);
+
+    // Feed data into expat until the reader is exhausted. The last call to
+    // XML_ParseBuffer passes zero bytes and marks the end of the document.
+    size_t bytesRead;
+    do {
+        // Fetch a buffer from expat for the input data
+        char *chunk = static_cast<char *>(XML_GetBuffer(&xml, CHUNK_SIZE));
+        if (chunk == nullptr) {
+            throw OusiaException{"Internal error: XML parser out of memory!"};
+        }
+
+        // Read into the buffer
+        bytesRead = reader.readRaw(chunk, CHUNK_SIZE);
+
+        // Parse the data and handle any XML error as exception
+        const bool isFinal = (bytesRead == 0);
+        if (!XML_ParseBuffer(&xml, bytesRead, isFinal)) {
+            throw LoggableException{
+                "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&xml))},
+                xmlSyncLoggerPosition(&xml)};
+        }
+    } while (bytesRead != 0);
+}
+
+// The whitespace mode is read by the character data handler for each chunk of
+// character data
+void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
+{
+    (*this).whitespaceMode = whitespaceMode;
+}
+
+WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
+{
+    return this->whitespaceMode;
+}
+
+// Accessors used by the static expat callback functions
+
+CharReader &OsxmlEventParser::getReader() const
+{
+    return this->reader;
+}
+
+Logger &OsxmlEventParser::getLogger() const
+{
+    return this->logger;
+}
+
+OsxmlEvents &OsxmlEventParser::getEvents() const
+{
+    return this->events;
+}
+
+OsxmlEventParserData &OsxmlEventParser::getData() const
+{
+    return *(this->data);
+}
+}
+
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
new file mode 100644
index 0000000..aa20ea9
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -0,0 +1,215 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlEventParser.hpp
+ *
+ * The OsxmlEventParser class is responsible for parsing an XML file and calling
+ * the corresponding event handler functions if an XML item is found. Event
+ * handling is performed using a listener interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OSXML_EVENT_PARSER_HPP_
+#define _OSXML_EVENT_PARSER_HPP_
+
+#include <memory>
+#include <string>
+
+#include <core/common/Whitespace.hpp>
+
+namespace ousia {
+
+// Forward declarations (CharReader is referenced by OsxmlEventParser below
+// and must be declared for this header to be self-contained)
+class CharReader;
+class Logger;
+class Variant;
+class OsxmlEventParserData;
+
+/**
+ * Interface which defines the callback functions which are called by the
+ * OsxmlEventParser whenever an event occurs.
+ */
+class OsxmlEvents {
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~OsxmlEvents();
+
+ /**
+ * Called whenever a command starts. Note that this implicitly always starts
+ * the default field of the command.
+ *
+ * @param name is a string variant containing name and location of the
+ * command.
+ * @param args is a map variant containing the arguments that were given
+ * to the command.
+ */
+ virtual void commandStart(Variant name, Variant args) = 0;
+
+ /**
+ * Called whenever an annotation starts. Note that this implicitly always
+ * starts the default field of the annotation.
+ *
+ * @param name is a string variant containing the name of the annotation
+ * class and the location of the annotation definition.
+ * @param args is a map variant containing the arguments that were given
+ * to the annotation definition.
+ */
+ virtual void annotationStart(Variant name, Variant args) = 0;
+
+ /**
+ * Called whenever the range of an annotation ends. The callee must
+ * disambiguate the actual annotation that is finished here.
+ *
+ * @param name is a string variant containing the name of the annotation
+ * class that should end here. May be empty, if no class name has been
+ * specified in the annotation end tag.
+ * @param elementName is the name of the annotation element that should be
+ * ended here. May be empty (or nullptr), if no elementName has been
+ * specified at the end of the annotation.
+ */
+ virtual void annotationEnd(Variant name, Variant elementName) = 0;
+
+ /**
+ * Called whenever the default field which was implicitly started by
+ * commandStart or annotationStart ends. Note that this does not end the
+ * range of an annotation, but the default field of the annotation. To
+ * signal the end of the annotation range, the annotationEnd method will be
+ * invoked.
+ */
+ virtual void fieldEnd() = 0;
+
+ /**
+ * Called whenever data is found. Whitespace in the data is handled
+ * according to the whitespace mode of the parser. The data is passed as a
+ * string variant which also carries the location of the text.
+ *
+ * @param data is the string variant containing the character data that
+ * should be passed to the handler.
+ */
+ virtual void data(Variant data) = 0;
+};
+
+/**
+ * The OsxmlEventParser class is a wrapper around eXpat which implements the
+ * specialities of the osxml formats class (like annotation ranges). It notifies
+ * a specified event handler whenever a command, annotation or data has been
+ * reached.
+ */
+class OsxmlEventParser {
+private:
+ /**
+ * Reference at the internal CharReader instance.
+ */
+ CharReader &reader;
+
+ /**
+ * Set of callback functions to be called whenever an event is triggered.
+ */
+ OsxmlEvents &events;
+
+ /**
+ * Reference at the Logger object to which error messages or warnings should
+ * be logged.
+ */
+ Logger &logger;
+
+ /**
+ * Current whitespace mode. Initialized to WhitespaceMode::TRIM by the
+ * constructor.
+ */
+ WhitespaceMode whitespaceMode;
+
+ /**
+ * Data to be used by the internal functions.
+ */
+ std::unique_ptr<OsxmlEventParserData> data;
+
+public:
+ /**
+ * Constructor of the OsxmlEventParser. Takes a reference at the OsxmlEvents
+ * of which the callback functions are called.
+ *
+ * @param reader is a reference to the CharReader instance from which the
+ * XML should be read.
+ * @param events is a reference at an instance of the OsxmlEvents class. All
+ * events are forwarded to this class.
+ * @param logger is the Logger instance to which log messages should be
+ * written.
+ */
+ OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger);
+
+ /**
+ * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type)
+ */
+ ~OsxmlEventParser();
+
+ /**
+ * Performs the actual parsing. Reads the XML using eXpat and calls the
+ * callbacks in the event listener instance whenever something interesting
+ * happens. Throws a LoggableException if eXpat reports an XML error and an
+ * OusiaException if eXpat cannot provide an input buffer.
+ */
+ void parse();
+
+ /**
+ * Sets the whitespace handling mode.
+ *
+ * @param whitespaceMode defines how whitespace in the data should be
+ * handled.
+ */
+ void setWhitespaceMode(WhitespaceMode whitespaceMode);
+
+ /**
+ * Returns the current whitespace handling mode.
+ *
+ * @return the currently set whitespace handling mode.
+ */
+ WhitespaceMode getWhitespaceMode() const;
+
+ /**
+ * Returns the internal CharReader reference.
+ *
+ * @return the CharReader reference.
+ */
+ CharReader &getReader() const;
+
+ /**
+ * Returns the internal Logger reference.
+ *
+ * @return the internal Logger reference.
+ */
+ Logger &getLogger() const;
+
+ /**
+ * Returns the internal OsxmlEvents reference.
+ *
+ * @return the internal OsxmlEvents reference.
+ */
+ OsxmlEvents &getEvents() const;
+
+ /**
+ * Returns a reference at the internal data.
+ *
+ * @return the OsxmlEventParserData instance owned by this parser.
+ */
+ OsxmlEventParserData &getData() const;
+};
+}
+
+#endif /* _OSXML_EVENT_PARSER_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
new file mode 100644
index 0000000..869c76a
--- /dev/null
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -0,0 +1,238 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+#include <expat.h>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/VariantReader.hpp>
+#include <core/parser/ParserScope.hpp>
+#include <core/parser/ParserStack.hpp>
+#include <core/parser/stack/DocumentHandler.hpp>
+#include <core/parser/stack/DomainHandler.hpp>
+#include <core/parser/stack/ImportIncludeHandler.hpp>
+#include <core/parser/stack/TypesystemHandler.hpp>
+#include <core/model/Document.hpp>
+#include <core/model/Domain.hpp>
+#include <core/model/Typesystem.hpp>
+
+#include "XmlParser.hpp"
+
+namespace ousia {
+
+namespace ParserStates {
+/* Document states: "Document" has the "None" state as its only parent, while
+ * "DocumentChild" may follow a "Document" or another "DocumentChild". */
+static const ParserState Document =
+ ParserStateBuilder()
+ .parent(&None)
+ .createdNodeType(&RttiTypes::Document)
+ .elementHandler(DocumentHandler::create)
+ .arguments({Argument::String("name", "")});
+
+static const ParserState DocumentChild =
+ ParserStateBuilder()
+ .parents({&Document, &DocumentChild})
+ .createdNodeTypes({&RttiTypes::StructureNode,
+ &RttiTypes::AnnotationEntity,
+ &RttiTypes::DocumentField})
+ .elementHandler(DocumentChildHandler::create);
+
+/* Domain states: "Domain" has the "None" and "Document" states as parents,
+ * all other domain states are (transitive) children of "Domain". */
+static const ParserState Domain = ParserStateBuilder()
+ .parents({&None, &Document})
+ .createdNodeType(&RttiTypes::Domain)
+ .elementHandler(DomainHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState DomainStruct =
+ ParserStateBuilder()
+ .parent(&Domain)
+ .createdNodeType(&RttiTypes::StructuredClass)
+ .elementHandler(DomainStructHandler::create)
+ .arguments({Argument::String("name"),
+ Argument::Cardinality("cardinality", Cardinality::any()),
+ Argument::Bool("isRoot", false),
+ Argument::Bool("transparent", false),
+ Argument::String("isa", "")});
+
+static const ParserState DomainAnnotation =
+ ParserStateBuilder()
+ .parent(&Domain)
+ .createdNodeType(&RttiTypes::AnnotationClass)
+ .elementHandler(DomainAnnotationHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState DomainAttributes =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::StructType)
+ .elementHandler(DomainAttributesHandler::create)
+ .arguments({});
+
+static const ParserState DomainAttribute =
+ ParserStateBuilder()
+ .parent(&DomainAttributes)
+ .elementHandler(TypesystemStructFieldHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("default", Variant::fromObject(nullptr))});
+
+static const ParserState DomainField =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainFieldHandler::create)
+ .arguments({Argument::String("name", ""),
+ Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false)});
+
+static const ParserState DomainFieldRef =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainFieldRefHandler::create)
+ .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
+
+static const ParserState DomainStructPrimitive =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainPrimitiveHandler::create)
+ .arguments(
+ {Argument::String("name", ""), Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false), Argument::String("type")});
+
+static const ParserState DomainStructChild =
+ ParserStateBuilder()
+ .parent(&DomainField)
+ .elementHandler(DomainChildHandler::create)
+ .arguments({Argument::String("ref")});
+
+static const ParserState DomainStructParent =
+ ParserStateBuilder()
+ .parent(&DomainStruct)
+ .createdNodeType(&RttiTypes::DomainParent)
+ .elementHandler(DomainParentHandler::create)
+ .arguments({Argument::String("ref")});
+
+static const ParserState DomainStructParentField =
+ ParserStateBuilder()
+ .parent(&DomainStructParent)
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainParentFieldHandler::create)
+ .arguments({Argument::String("name", ""),
+ Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false)});
+
+static const ParserState DomainStructParentFieldRef =
+ ParserStateBuilder()
+ .parent(&DomainStructParent)
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainParentFieldRefHandler::create)
+ .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
+
+/* Typesystem states: "Typesystem" has the "None" and "Domain" states as
+ * parents, all other typesystem states are children of "Typesystem". */
+static const ParserState Typesystem =
+ ParserStateBuilder()
+ .parents({&None, &Domain})
+ .createdNodeType(&RttiTypes::Typesystem)
+ .elementHandler(TypesystemHandler::create)
+ .arguments({Argument::String("name", "")});
+
+static const ParserState TypesystemEnum =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::EnumType)
+ .elementHandler(TypesystemEnumHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState TypesystemEnumEntry =
+ ParserStateBuilder()
+ .parent(&TypesystemEnum)
+ .elementHandler(TypesystemEnumEntryHandler::create)
+ .arguments({});
+
+static const ParserState TypesystemStruct =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::StructType)
+ .elementHandler(TypesystemStructHandler::create)
+ .arguments({Argument::String("name"), Argument::String("parent", "")});
+
+static const ParserState TypesystemStructField =
+ ParserStateBuilder()
+ .parent(&TypesystemStruct)
+ .elementHandler(TypesystemStructFieldHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("default", Variant::fromObject(nullptr))});
+
+static const ParserState TypesystemConstant =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::Constant)
+ .elementHandler(TypesystemConstantHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("value")});
+
+/* Special states for import and include: "Import" has the "Document",
+ * "Typesystem" and "Domain" states as parents, "Include" has the "All" state
+ * as parent. The XmlStates map defined below associates tag names with the
+ * states; some names ("struct", "field", "fieldRef") map to several states. */
+static const ParserState Import =
+ ParserStateBuilder()
+ .parents({&Document, &Typesystem, &Domain})
+ .elementHandler(ImportHandler::create)
+ .arguments({Argument::String("rel", ""), Argument::String("type", ""),
+ Argument::String("src", "")});
+
+static const ParserState Include =
+ ParserStateBuilder()
+ .parent(&All)
+ .elementHandler(IncludeHandler::create)
+ .arguments({Argument::String("rel", ""), Argument::String("type", ""),
+ Argument::String("src", "")});
+
+static const std::multimap<std::string, const ParserState *> XmlStates{
+ {"document", &Document},
+ {"*", &DocumentChild},
+ {"domain", &Domain},
+ {"struct", &DomainStruct},
+ {"annotation", &DomainAnnotation},
+ {"attributes", &DomainAttributes},
+ {"attribute", &DomainAttribute},
+ {"field", &DomainField},
+ {"fieldRef", &DomainFieldRef},
+ {"primitive", &DomainStructPrimitive},
+ {"childRef", &DomainStructChild},
+ {"parentRef", &DomainStructParent},
+ {"field", &DomainStructParentField},
+ {"fieldRef", &DomainStructParentFieldRef},
+ {"typesystem", &Typesystem},
+ {"enum", &TypesystemEnum},
+ {"entry", &TypesystemEnumEntry},
+ {"struct", &TypesystemStruct},
+ {"field", &TypesystemStructField},
+ {"constant", &TypesystemConstant},
+ {"import", &Import},
+ {"include", &Include}};
+}
+
+
+}
+
diff --git a/src/plugins/xml/XmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp
index c8b6302..281a49c 100644
--- a/src/plugins/xml/XmlParser.hpp
+++ b/src/formats/osxml/OsxmlParser.hpp
@@ -25,18 +25,18 @@
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_XML_PARSER_HPP_
-#define _OUSIA_XML_PARSER_HPP_
+#ifndef _OUSIA_OSXML_PARSER_HPP_
+#define _OUSIA_OSXML_PARSER_HPP_
#include <core/parser/Parser.hpp>
namespace ousia {
/**
- * The XmlParser class implements parsing the various types of Ousía XML
- * documents using the expat stream XML parser.
+ * The OsxmlParser class implements parsing the various types of Ousía XML
+ * documents using the OsxmlEventParser and Stack classes.
*/
-class XmlParser : public Parser {
+class OsxmlParser : public Parser {
protected:
/**
* Parses the given input stream as XML file and returns the parsed
@@ -51,5 +51,5 @@ protected:
}
-#endif /* _OUSIA_XML_PARSER_HPP_ */
+#endif /* _OUSIA_OSXML_PARSER_HPP_ */
diff --git a/src/core/CodeTokenizer.cpp b/src/plugins/css/CodeTokenizer.cpp
index d65c514..d65c514 100644
--- a/src/core/CodeTokenizer.cpp
+++ b/src/plugins/css/CodeTokenizer.cpp
diff --git a/src/core/CodeTokenizer.hpp b/src/plugins/css/CodeTokenizer.hpp
index 154f949..154f949 100644
--- a/src/core/CodeTokenizer.hpp
+++ b/src/plugins/css/CodeTokenizer.hpp
diff --git a/src/core/Tokenizer.cpp b/src/plugins/css/Tokenizer.cpp
index ab4735a..ab4735a 100644
--- a/src/core/Tokenizer.cpp
+++ b/src/plugins/css/Tokenizer.cpp
diff --git a/src/core/Tokenizer.hpp b/src/plugins/css/Tokenizer.hpp
index 50e458c..50e458c 100644
--- a/src/core/Tokenizer.hpp
+++ b/src/plugins/css/Tokenizer.hpp
diff --git a/src/plugins/xml/XmlParser.cpp b/src/plugins/xml/XmlParser.cpp
deleted file mode 100644
index 6dfad49..0000000
--- a/src/plugins/xml/XmlParser.cpp
+++ /dev/null
@@ -1,575 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <iostream>
-#include <map>
-#include <sstream>
-#include <vector>
-
-#include <expat.h>
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Utils.hpp>
-#include <core/common/VariantReader.hpp>
-#include <core/parser/ParserScope.hpp>
-#include <core/parser/ParserStack.hpp>
-#include <core/parser/stack/DocumentHandler.hpp>
-#include <core/parser/stack/DomainHandler.hpp>
-#include <core/parser/stack/ImportIncludeHandler.hpp>
-#include <core/parser/stack/TypesystemHandler.hpp>
-#include <core/model/Document.hpp>
-#include <core/model/Domain.hpp>
-#include <core/model/Typesystem.hpp>
-
-#include "XmlParser.hpp"
-
-namespace ousia {
-
-namespace ParserStates {
-/* Document states */
-static const ParserState Document =
- ParserStateBuilder()
- .parent(&None)
- .createdNodeType(&RttiTypes::Document)
- .elementHandler(DocumentHandler::create)
- .arguments({Argument::String("name", "")});
-
-static const ParserState DocumentChild =
- ParserStateBuilder()
- .parents({&Document, &DocumentChild})
- .createdNodeTypes({&RttiTypes::StructureNode,
- &RttiTypes::AnnotationEntity,
- &RttiTypes::DocumentField})
- .elementHandler(DocumentChildHandler::create);
-
-/* Domain states */
-static const ParserState Domain = ParserStateBuilder()
- .parents({&None, &Document})
- .createdNodeType(&RttiTypes::Domain)
- .elementHandler(DomainHandler::create)
- .arguments({Argument::String("name")});
-
-static const ParserState DomainStruct =
- ParserStateBuilder()
- .parent(&Domain)
- .createdNodeType(&RttiTypes::StructuredClass)
- .elementHandler(DomainStructHandler::create)
- .arguments({Argument::String("name"),
- Argument::Cardinality("cardinality", Cardinality::any()),
- Argument::Bool("isRoot", false),
- Argument::Bool("transparent", false),
- Argument::String("isa", "")});
-
-static const ParserState DomainAnnotation =
- ParserStateBuilder()
- .parent(&Domain)
- .createdNodeType(&RttiTypes::AnnotationClass)
- .elementHandler(DomainAnnotationHandler::create)
- .arguments({Argument::String("name")});
-
-static const ParserState DomainAttributes =
- ParserStateBuilder()
- .parents({&DomainStruct, &DomainAnnotation})
- .createdNodeType(&RttiTypes::StructType)
- .elementHandler(DomainAttributesHandler::create)
- .arguments({});
-
-static const ParserState DomainAttribute =
- ParserStateBuilder()
- .parent(&DomainAttributes)
- .elementHandler(TypesystemStructFieldHandler::create)
- .arguments({Argument::String("name"), Argument::String("type"),
- Argument::Any("default", Variant::fromObject(nullptr))});
-
-static const ParserState DomainField =
- ParserStateBuilder()
- .parents({&DomainStruct, &DomainAnnotation})
- .createdNodeType(&RttiTypes::FieldDescriptor)
- .elementHandler(DomainFieldHandler::create)
- .arguments({Argument::String("name", ""),
- Argument::Bool("isSubtree", false),
- Argument::Bool("optional", false)});
-
-static const ParserState DomainFieldRef =
- ParserStateBuilder()
- .parents({&DomainStruct, &DomainAnnotation})
- .createdNodeType(&RttiTypes::FieldDescriptor)
- .elementHandler(DomainFieldRefHandler::create)
- .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
-
-static const ParserState DomainStructPrimitive =
- ParserStateBuilder()
- .parents({&DomainStruct, &DomainAnnotation})
- .createdNodeType(&RttiTypes::FieldDescriptor)
- .elementHandler(DomainPrimitiveHandler::create)
- .arguments(
- {Argument::String("name", ""), Argument::Bool("isSubtree", false),
- Argument::Bool("optional", false), Argument::String("type")});
-
-static const ParserState DomainStructChild =
- ParserStateBuilder()
- .parent(&DomainField)
- .elementHandler(DomainChildHandler::create)
- .arguments({Argument::String("ref")});
-
-static const ParserState DomainStructParent =
- ParserStateBuilder()
- .parent(&DomainStruct)
- .createdNodeType(&RttiTypes::DomainParent)
- .elementHandler(DomainParentHandler::create)
- .arguments({Argument::String("ref")});
-
-static const ParserState DomainStructParentField =
- ParserStateBuilder()
- .parent(&DomainStructParent)
- .createdNodeType(&RttiTypes::FieldDescriptor)
- .elementHandler(DomainParentFieldHandler::create)
- .arguments({Argument::String("name", ""),
- Argument::Bool("isSubtree", false),
- Argument::Bool("optional", false)});
-
-static const ParserState DomainStructParentFieldRef =
- ParserStateBuilder()
- .parent(&DomainStructParent)
- .createdNodeType(&RttiTypes::FieldDescriptor)
- .elementHandler(DomainParentFieldRefHandler::create)
- .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
-
-/* Typesystem states */
-static const ParserState Typesystem =
- ParserStateBuilder()
- .parents({&None, &Domain})
- .createdNodeType(&RttiTypes::Typesystem)
- .elementHandler(TypesystemHandler::create)
- .arguments({Argument::String("name", "")});
-
-static const ParserState TypesystemEnum =
- ParserStateBuilder()
- .parent(&Typesystem)
- .createdNodeType(&RttiTypes::EnumType)
- .elementHandler(TypesystemEnumHandler::create)
- .arguments({Argument::String("name")});
-
-static const ParserState TypesystemEnumEntry =
- ParserStateBuilder()
- .parent(&TypesystemEnum)
- .elementHandler(TypesystemEnumEntryHandler::create)
- .arguments({});
-
-static const ParserState TypesystemStruct =
- ParserStateBuilder()
- .parent(&Typesystem)
- .createdNodeType(&RttiTypes::StructType)
- .elementHandler(TypesystemStructHandler::create)
- .arguments({Argument::String("name"), Argument::String("parent", "")});
-
-static const ParserState TypesystemStructField =
- ParserStateBuilder()
- .parent(&TypesystemStruct)
- .elementHandler(TypesystemStructFieldHandler::create)
- .arguments({Argument::String("name"), Argument::String("type"),
- Argument::Any("default", Variant::fromObject(nullptr))});
-
-static const ParserState TypesystemConstant =
- ParserStateBuilder()
- .parent(&Typesystem)
- .createdNodeType(&RttiTypes::Constant)
- .elementHandler(TypesystemConstantHandler::create)
- .arguments({Argument::String("name"), Argument::String("type"),
- Argument::Any("value")});
-
-/* Special states for import and include */
-static const ParserState Import =
- ParserStateBuilder()
- .parents({&Document, &Typesystem, &Domain})
- .elementHandler(ImportHandler::create)
- .arguments({Argument::String("rel", ""), Argument::String("type", ""),
- Argument::String("src", "")});
-
-static const ParserState Include =
- ParserStateBuilder()
- .parent(&All)
- .elementHandler(IncludeHandler::create)
- .arguments({Argument::String("rel", ""), Argument::String("type", ""),
- Argument::String("src", "")});
-
-static const std::multimap<std::string, const ParserState *> XmlStates{
- {"document", &Document},
- {"*", &DocumentChild},
- {"domain", &Domain},
- {"struct", &DomainStruct},
- {"annotation", &DomainAnnotation},
- {"attributes", &DomainAttributes},
- {"attribute", &DomainAttribute},
- {"field", &DomainField},
- {"fieldRef", &DomainFieldRef},
- {"primitive", &DomainStructPrimitive},
- {"childRef", &DomainStructChild},
- {"parentRef", &DomainStructParent},
- {"field", &DomainStructParentField},
- {"fieldRef", &DomainStructParentFieldRef},
- {"typesystem", &Typesystem},
- {"enum", &TypesystemEnum},
- {"entry", &TypesystemEnumEntry},
- {"struct", &TypesystemStruct},
- {"field", &TypesystemStructField},
- {"constant", &TypesystemConstant},
- {"import", &Import},
- {"include", &Include}};
-}
-
-/**
- * Structure containing the private data that is being passed to the
- * XML-Handlers.
- */
-struct XMLUserData {
- /**
- * Current element nesting depth within the XML file
- */
- size_t depth;
-
- /**
- * Pointer to the ParserStack instance.
- */
- ParserStack *stack;
-
- /**
- * Pointer to the CharReader instance.
- */
- CharReader *reader;
-
- /**
- * Constructor of the XMLUserData struct.
- *
- * @param stack is a pointer to the ParserStack instance.
- * @param reader is a pointer to the CharReader instance.
- */
- XMLUserData(ParserStack *stack, CharReader *reader)
- : depth(0), stack(stack), reader(reader)
- {
- }
-};
-
-/**
- * Wrapper class around the XML_Parser pointer which safely frees it whenever
- * the scope is left (e.g. because an exception was thrown).
- */
-class ScopedExpatXmlParser {
-private:
- /**
- * Internal pointer to the XML_Parser instance.
- */
- XML_Parser parser;
-
-public:
- /**
- * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreate
- * from the expat library. Throws a parser exception if the XML parser
- * cannot be initialized.
- *
- * @param encoding is the protocol-defined encoding passed to expat (or
- * nullptr if expat should determine the encoding by itself).
- */
- ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
- {
- parser = XML_ParserCreate(encoding);
- if (!parser) {
- throw LoggableException{
- "Internal error: Could not create expat XML parser!"};
- }
- }
-
- /**
- * Destructor of the ScopedExpatXmlParser, frees the XML parser instance.
- */
- ~ScopedExpatXmlParser()
- {
- if (parser) {
- XML_ParserFree(parser);
- parser = nullptr;
- }
- }
-
- /**
- * Returns the XML_Parser pointer.
- */
- XML_Parser operator&() { return parser; }
-};
-
-/* Adapter Expat -> ParserStack */
-
-static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0)
-{
- // Fetch the parser stack and the associated user data
- XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p));
- ParserStack *stack = userData->stack;
-
- // Fetch the current location in the XML file
- size_t offs = XML_GetCurrentByteIndex(p);
-
- // Build the source location and use it as the default location of the
- // logger instance belonging to the current parser context, then hand
- // it back to the caller
- SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len};
- stack->getContext().getLogger().setDefaultLocation(loc);
- return loc;
-}
-
-enum class XMLAttributeState {
- IN_TAG_NAME,
- SEARCH_ATTR,
- IN_ATTR_NAME,
- HAS_ATTR_NAME,
- HAS_ATTR_EQUALS,
- IN_ATTR_DATA
-};
-
-static std::map<std::string, SourceLocation> reconstructXMLAttributeOffsets(
- CharReader &reader, SourceLocation location)
-{
- std::map<std::string, SourceLocation> res;
-
- // Fork the reader, we don't want to mess up the XML parsing process, do we?
- CharReaderFork readerFork = reader.fork();
-
- // Move the read cursor to the start location, abort if this does not work
- size_t offs = location.getStart();
- if (!location.isValid() || offs != readerFork.seek(offs)) {
- return res;
- }
-
- // Now all we need to do is to implement one half of an XML parser. As this
- // is inherently complicated we'll totally fail at it. Don't care. All we
- // want to get is those darn offsets for pretty error messages... (and we
- // can assume the XML is valid as it was already read by expat)
- XMLAttributeState state = XMLAttributeState::IN_TAG_NAME;
- char c;
- std::stringstream attrName;
- while (readerFork.read(c)) {
- // Abort at the end of the tag
- if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) {
- return res;
- }
-
- // One state machine to rule them all, one state machine to find them,
- // One state machine to bring them all and in the darkness bind them
- // (the byte offsets)
- switch (state) {
- case XMLAttributeState::IN_TAG_NAME:
- if (Utils::isWhitespace(c)) {
- state = XMLAttributeState::SEARCH_ATTR;
- }
- break;
- case XMLAttributeState::SEARCH_ATTR:
- if (!Utils::isWhitespace(c)) {
- state = XMLAttributeState::IN_ATTR_NAME;
- attrName << c;
- }
- break;
- case XMLAttributeState::IN_ATTR_NAME:
- if (Utils::isWhitespace(c)) {
- state = XMLAttributeState::HAS_ATTR_NAME;
- } else if (c == '=') {
- state = XMLAttributeState::HAS_ATTR_EQUALS;
- } else {
- attrName << c;
- }
- break;
- case XMLAttributeState::HAS_ATTR_NAME:
- if (!Utils::isWhitespace(c)) {
- if (c == '=') {
- state = XMLAttributeState::HAS_ATTR_EQUALS;
- break;
- }
- // Well, this is a strange XML file... We expected to
- // see a '=' here! Try to continue with the
- // "HAS_ATTR_EQUALS" state as this state will hopefully
- // include some error recovery
- } else {
- // Skip whitespace here
- break;
- }
- // Fallthrough
- case XMLAttributeState::HAS_ATTR_EQUALS:
- if (!Utils::isWhitespace(c)) {
- if (c == '"') {
- // Here we are! We have found the beginning of an
- // attribute. Let's quickly lock the current offset away
- // in the result map
- res.emplace(attrName.str(),
- SourceLocation{reader.getSourceId(),
- readerFork.getOffset()});
- attrName.str(std::string{});
- state = XMLAttributeState::IN_ATTR_DATA;
- } else {
- // No, this XML file is not well formed. Assume we're in
- // an attribute name once again
- attrName.str(std::string{&c, 1});
- state = XMLAttributeState::IN_ATTR_NAME;
- }
- }
- break;
- case XMLAttributeState::IN_ATTR_DATA:
- if (c == '"') {
- // We're at the end of the attribute data, start anew
- state = XMLAttributeState::SEARCH_ATTR;
- }
- break;
- }
- }
- return res;
-}
-
-static void xmlStartElementHandler(void *p, const XML_Char *name,
- const XML_Char **attrs)
-{
- XML_Parser parser = static_cast<XML_Parser>(p);
- XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p));
- ParserStack *stack = userData->stack;
-
- SourceLocation loc = syncLoggerPosition(parser);
-
- // Read the argument locations -- this is only a stupid and slow hack,
- // but it is necessary, as expat doesn't give us the byte offset of the
- // arguments.
- std::map<std::string, SourceLocation> offs =
- reconstructXMLAttributeOffsets(*userData->reader, loc);
-
- // Assemble the arguments
- Variant::mapType args;
-
- const XML_Char **attr = attrs;
- while (*attr) {
- // Convert the C string to a std::string
- const std::string key{*(attr++)};
-
- // Search the location of the key
- SourceLocation keyLoc;
- auto it = offs.find(key);
- if (it != offs.end()) {
- keyLoc = it->second;
- }
-
- // Parse the string, pass the location of the key
- std::pair<bool, Variant> value = VariantReader::parseGenericString(
- *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(),
- keyLoc.getStart());
- args.emplace(key, value.second);
- }
-
- // Call the start function
- std::string nameStr(name);
- if (nameStr != "ousia" || userData->depth > 0) {
- stack->start(std::string(name), args, loc);
- }
-
- // Increment the current depth
- userData->depth++;
-}
-
-static void xmlEndElementHandler(void *p, const XML_Char *name)
-{
- XML_Parser parser = static_cast<XML_Parser>(p);
- XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p));
- ParserStack *stack = userData->stack;
-
- syncLoggerPosition(parser);
-
- // Decrement the current depth
- userData->depth--;
-
- // Call the end function
- std::string nameStr(name);
- if (nameStr != "ousia" || userData->depth > 0) {
- stack->end();
- }
-}
-
-static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len)
-{
- XML_Parser parser = static_cast<XML_Parser>(p);
- XMLUserData *userData = static_cast<XMLUserData *>(XML_GetUserData(p));
- ParserStack *stack = userData->stack;
-
- size_t ulen = len > 0 ? static_cast<size_t>(len) : 0;
- syncLoggerPosition(parser, ulen);
- const std::string data = Utils::trim(std::string{s, ulen});
- if (!data.empty()) {
- stack->data(data);
- }
-}
-
-/* Class XmlParser */
-
-void XmlParser::doParse(CharReader &reader, ParserContext &ctx)
-{
- // Create the parser object
- ScopedExpatXmlParser p{"UTF-8"};
-
- // Create the parser stack instance, if we're starting on a non-empty scope,
- // try to deduce the parser state
- ParserStack stack(ctx, ParserStates::XmlStates);
- if (!ctx.getScope().isEmpty()) {
- if (!stack.deduceState()) {
- return;
- }
- }
-
- // Pass the reference to the ParserStack to the XML handler
- XMLUserData data(&stack, &reader);
- XML_SetUserData(&p, &data);
- XML_UseParserAsHandlerArg(&p);
-
- // Set the callback functions
- XML_SetStartElementHandler(&p, xmlStartElementHandler);
- XML_SetEndElementHandler(&p, xmlEndElementHandler);
- XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler);
-
- // Feed data into expat while there is data to process
- constexpr size_t BUFFER_SIZE = 64 * 1024;
- while (true) {
- // Fetch a buffer from expat for the input data
- char *buf = static_cast<char *>(XML_GetBuffer(&p, BUFFER_SIZE));
- if (!buf) {
- throw LoggableException{
- "Internal error: XML parser out of memory!"};
- }
-
- // Read into the buffer
- size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE);
-
- // Parse the data and handle any XML error
- if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) {
- // Fetch the xml parser byte offset
- size_t offs = XML_GetCurrentByteIndex(&p);
-
- // Throw a corresponding exception
- XML_Error code = XML_GetErrorCode(&p);
- std::string msg = std::string{XML_ErrorString(code)};
- throw LoggableException{"XML: " + msg,
- SourceLocation{ctx.getSourceId(), offs}};
- }
-
- // Abort once there are no more bytes in the stream
- if (bytesRead == 0) {
- break;
- }
- }
-}
-}
-