summary refs log tree commit diff
path: root/src/formats
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats')
-rw-r--r-- src/formats/osdm/DynamicTokenizer.cpp 544
-rw-r--r-- src/formats/osdm/DynamicTokenizer.hpp 252
-rw-r--r-- src/formats/osdm/TokenTrie.cpp 119
-rw-r--r-- src/formats/osdm/TokenTrie.hpp 150
-rw-r--r-- src/formats/osml/OsmlParser.cpp 57
-rw-r--r-- src/formats/osml/OsmlParser.hpp 48
-rw-r--r-- src/formats/osml/OsmlStreamParser.cpp (renamed from src/formats/osdm/OsdmStreamParser.cpp) 226
-rw-r--r-- src/formats/osml/OsmlStreamParser.hpp (renamed from src/formats/osdm/OsdmStreamParser.hpp) 90
-rw-r--r-- src/formats/osxml/OsxmlAttributeLocator.cpp 144
-rw-r--r-- src/formats/osxml/OsxmlAttributeLocator.hpp 67
-rw-r--r-- src/formats/osxml/OsxmlEventParser.cpp 547
-rw-r--r-- src/formats/osxml/OsxmlEventParser.hpp 215
-rw-r--r-- src/formats/osxml/OsxmlParser.cpp 238
-rw-r--r-- src/formats/osxml/OsxmlParser.hpp 55
14 files changed, 1603 insertions, 1149 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp
deleted file mode 100644
index f2cfcd1..0000000
--- a/src/formats/osdm/DynamicTokenizer.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <memory>
-#include <vector>
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Exceptions.hpp>
-#include <core/common/Utils.hpp>
-
-#include "DynamicTokenizer.hpp"
-
-namespace ousia {
-
-namespace {
-
-/* Internal class TokenMatch */
-
-/**
- * Contains information about a matching token.
- */
-struct TokenMatch {
- /**
- * Token that was matched.
- */
- DynamicToken token;
-
- /**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
- */
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
-
- /**
- * Constructor of the TokenMatch class.
- */
- TokenMatch() : textLength(0), textEnd(0) {}
-
- /**
- * Returns true if this TokenMatch instance actually represents a match.
- */
- bool hasMatch() { return token.type != EmptyToken; }
-};
-
-/* Internal class TokenLookup */
-
-/**
- * The TokenLookup class is used to represent a thread in a running token
- * lookup.
- */
-class TokenLookup {
-private:
- /**
- * Current node within the token trie.
- */
- TokenTrie::Node const *node;
-
- /**
- * Start offset within the source file.
- */
- size_t start;
-
- /**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
- */
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
-
-public:
- /**
- * Constructor of the TokenLookup class.
- *
- * @param node is the current node.
- * @param start is the start position.
- * @param textLength is the text buffer length of the previous text token.
- * @param textEnd is the current end location of the previous text token.
- */
- TokenLookup(const TokenTrie::Node *node, size_t start,
- size_t textLength, size_t textEnd)
- : node(node), start(start), textLength(textLength), textEnd(textEnd)
- {
- }
-
- /**
- * Tries to extend the current path in the token trie with the given
- * character. If a complete token is matched, stores this match in the
- * given match instance (in case it is longer than any previous match).
- *
- * @param c is the character that should be appended to the current prefix.
- * @param lookups is a list to which new TokenLookup instances are added --
- * which could potentially be expanded in the next iteration.
- * @param match is the TokenMatch instance to which the matching token
- * should be written.
- * @param tokens is a reference at the internal token list of the
- * DynamicTokenizer.
- * @param end is the end byte offset of the current character.
- * @param sourceId is the source id of this file.
- */
- void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
- const std::vector<std::string> &tokens, SourceOffset end,
- SourceId sourceId)
- {
- // Check whether we can continue the current token path with the given
- // character without visiting an already visited node
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- return;
- }
-
- // Check whether the new node represents a complete token and whether it
- // is longer than the current token. If yes, replace the current token.
- node = it->second.get();
- if (node->type != EmptyToken) {
- const std::string &str = tokens[node->type];
- size_t len = str.size();
- if (len > match.token.content.size()) {
- match.token =
- DynamicToken{node->type, str, {sourceId, start, end}};
- match.textLength = textLength;
- match.textEnd = textEnd;
- }
- }
-
- // If this state can possibly be advanced, store it in the states list.
- if (!node->children.empty()) {
- lookups.emplace_back(*this);
- }
- }
-};
-
-/* Internal class TextHandlerBase */
-
-/**
- * Base class used for those classes that may be used as TextHandler in the
- * DynamicTokenizer::next function.
- */
-class TextHandlerBase {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- TextHandlerBase() : textStart(0), textEnd(0) {}
-
- /**
- * Transforms the given token into a text token containing the extracted
- * text.
- *
- * @param match is the match whose token is rewritten as the text token.
- * @param sourceId is the source id of the underlying file.
- */
- void buildTextToken(TokenMatch &match, SourceId sourceId)
- {
- if (match.hasMatch()) {
- match.token.content =
- std::string{textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, textStart, match.textEnd};
- } else {
- match.token.content = std::string{textBuf.data(), textBuf.size()};
- match.token.location = SourceLocation{sourceId, textStart, textEnd};
- }
- match.token.type = TextToken;
- }
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-};
-
-/* Internal class PreservingTextHandler */
-
-/**
- * The PreservingTextHandler class preserves all characters unmodified,
- * including whitespace characters.
- */
-class PreservingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/* Internal class TrimmingTextHandler */
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the
- * beginning and the end of a text section but leaves all other characters
- * unmodified, including whitespace characters.
- */
-class TrimmingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/* Internal class CollapsingTextHandler */
-
-/**
- * The CollapsingTextHandler trims whitespace at the beginning and end of the
- * text and reduces multiple whitespace characters to a single blank.
- */
-class CollapsingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-}
-
-/* Class DynamicTokenizer */
-
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
-{
-}
-
-template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
-{
- // If we're in the read mode, reset the char reader peek position to the
- // current read position
- if (read) {
- reader.resetPeek();
- }
-
- // Prepare the lookups in the token trie
- const TokenTrie::Node *root = trie.getRoot();
- TokenMatch match;
- std::vector<TokenLookup> lookups;
- std::vector<TokenLookup> nextLookups;
-
- // Instantiate the text handler
- TextHandler textHandler;
-
- // Peek characters from the reader and try to advance the current token tree
- // cursor
- char c;
- size_t charStart = reader.getPeekOffset();
- const SourceId sourceId = reader.getSourceId();
- while (reader.peek(c)) {
- const size_t charEnd = reader.getPeekOffset();
- const size_t textLength = textHandler.textBuf.size();
- const size_t textEnd = textHandler.textEnd;
-
- // If we do not have a match yet, start a new lookup from the root
- if (!match.hasMatch()) {
- TokenLookup{root, charStart, textLength, textEnd}.advance(
- c, nextLookups, match, tokens, charEnd, sourceId);
- }
-
- // Try to advance all other lookups with the new character
- for (TokenLookup &lookup : lookups) {
- lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
- }
-
- // We have found a token and there are no more states to advance or the
- // text handler has found something -- abort to return the new token
- if (match.hasMatch()) {
- if ((nextLookups.empty() || textHandler.hasText())) {
- break;
- }
- } else {
- // Record all incoming characters
- textHandler.append(c, charStart, charEnd);
- }
-
- // Swap the lookups and the nextLookups list
- lookups = std::move(nextLookups);
- nextLookups.clear();
-
- // Advance the offset
- charStart = charEnd;
- }
-
- // If we found text, emit that text
- if (textHandler.hasText() &&
- (!match.hasMatch() || match.textLength > 0)) {
- textHandler.buildTextToken(match, sourceId);
- }
-
- // Move the read/peek cursor to the end of the token, abort if an error
- // happens while doing so
- if (match.hasMatch()) {
- // Make sure we have a valid location
- if (match.token.location.getEnd() == InvalidSourceOffset) {
- throw OusiaException{"Token end position offset out of range"};
- }
-
- // Seek to the end of the current token
- const size_t end = match.token.location.getEnd();
- if (read) {
- reader.seek(end);
- } else {
- reader.seekPeekCursor(end);
- }
- token = match.token;
- } else {
- token = DynamicToken{};
- }
- return match.hasMatch();
-}
-
-bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
-{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, true>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, true>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, true>(reader, token);
- }
- return false;
-}
-
-bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
-{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, false>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, false>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, false>(reader, token);
- }
- return false;
-}
-
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
-{
- // Abort if an empty token should be registered
- if (token.empty()) {
- return EmptyToken;
- }
-
- // Search for a new slot in the tokens list
- TokenTypeId type = EmptyToken;
- for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
- if (tokens[i].empty()) {
- tokens[i] = token;
- type = i;
- break;
- }
- }
-
- // No existing slot was found, add a new one -- make sure we do not
- // override the special token type handles
- if (type == EmptyToken) {
- type = tokens.size();
- if (type == TextToken || type == EmptyToken) {
- throw OusiaException{"Token type ids depleted!"};
- }
- tokens.emplace_back(token);
- }
- nextTokenTypeId = type + 1;
-
- // Try to register the token in the trie -- if this fails, remove it
- // from the tokens list
- if (!trie.registerToken(token, type)) {
- tokens[type] = std::string();
- nextTokenTypeId = type;
- return EmptyToken;
- }
- return type;
-}
-
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
-{
- // Unregister the token from the trie, abort if an invalid type is given
- if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
- tokens[type] = std::string{};
- nextTokenTypeId = type;
- return true;
- }
- return false;
-}
-
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
-{
- if (type < tokens.size()) {
- return tokens[type];
- }
- return std::string{};
-}
-
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
-{
- whitespaceMode = mode;
-}
-
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
-
-/* Explicitly instantiate all possible instantiations of the "next" member
- function */
-template bool DynamicTokenizer::next<PreservingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<PreservingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-}
-
diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp
deleted file mode 100644
index 0cac2e8..0000000
--- a/src/formats/osdm/DynamicTokenizer.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file DynamicTokenizer.hpp
- *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-
-#include <set>
-#include <string>
-#include <vector>
-
-#include <core/common/Location.hpp>
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-// Forward declarations
-class CharReader;
-
-/**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
- */
-struct DynamicToken {
- /**
- * Id of the type of this token.
- */
- TokenTypeId type;
-
- /**
- * String that was matched.
- */
- std::string content;
-
- /**
- * Location from which the string was extracted.
- */
- SourceLocation location;
-
- /**
- * Default constructor.
- */
- DynamicToken() : type(EmptyToken) {}
-
- /**
- * Constructor of the DynamicToken struct.
- *
- * @param type represents the token type.
- * @param content is the string content that has been extracted.
- * @param location is the location of the extracted string content in the
- * source file.
- */
- DynamicToken(TokenTypeId type, const std::string &content,
- SourceLocation location)
- : type(type), content(content), location(location)
- {
- }
-
- /**
- * Constructor of the DynamicToken struct, only initializes the token type
- *
- * @param type is the id corresponding to the type of the token.
- */
- DynamicToken(TokenTypeId type) : type(type) {}
-
- /**
- * The getLocation function allows the tokens to be directly passed as
- * parameter to Logger or LoggableException instances.
- *
- * @return a reference at the location field
- */
- const SourceLocation &getLocation() const { return location; }
-};
-
-/**
- * Enum specifying the whitespace handling of the DynamicTokenizer class when
- * reading non-token text.
- */
-enum class WhitespaceMode {
- /**
- * Preserves all whitespaces as they are found in the source file.
- */
- PRESERVE,
-
- /**
- * Trims whitespace at the beginning and the end of the found text.
- */
- TRIM,
-
- /**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
- COLLAPSE
-};
-
-/**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * DynamicTokenizer always tries to extract the longest possible token from the
- * tokenizer.
- */
-class DynamicTokenizer {
-private:
- /**
- * Internally used token trie. This object holds all registered tokens.
- */
- TokenTrie trie;
-
- /**
- * Flag defining whether whitespaces should be preserved or not.
- */
- WhitespaceMode whitespaceMode;
-
- /**
- * Vector containing all registered token types.
- */
- std::vector<std::string> tokens;
-
- /**
- * Next index in the tokens list where to search for a new token id.
- */
- size_t nextTokenTypeId;
-
- /**
- * Templated function used internally to read the current token. The
- * function is templated in order to force code generation for all six
- * combinations of whitespace modes and reading/peeking.
- *
- * @tparam TextHandler is the type to be used for the textHandler instance.
- * @tparam read specifies whether the function should start from and advance
- * the read pointer of the char reader.
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is the token structure into which the token information
- * should be written.
- * @return false if the end of the stream has been reached, true otherwise.
- */
- template <typename TextHandler, bool read>
- bool next(CharReader &reader, DynamicToken &token);
-
-public:
- /**
- * Constructor of the DynamicTokenizer class.
- *
- * @param whitespaceMode specifies how whitespace should be handled.
- */
- DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
-
- /**
- * Registers the given string as a token. Returns a TokenTypeId that
- * will be used to reference the newly created token.
- *
- * @param token is the token string that should be registered.
- * @return a unique identifier for the registered token or EmptyToken if
- * an error occurred.
- */
- TokenTypeId registerToken(const std::string &token);
-
- /**
- * Unregisters the token belonging to the given TokenTypeId.
- *
- * @param type is the token type that should be unregistered.
- * The TokenTypeId must have been returned by a previous call to
- * registerToken.
- * @return true if the operation was successful, false otherwise (e.g.
- * because the given TokenTypeId was already unregistered).
- */
- bool unregisterToken(TokenTypeId type);
-
- /**
- * Returns the token string that was registered under the given
- * TokenTypeId, or an empty string if an invalid TokenTypeId is
- * given.
- *
- * @param type is the TokenTypeId for which
- * the corresponding token string should
- * be returned.
- * @return the registered token string or an empty string if the given type
- * was invalid.
- */
- std::string getTokenString(TokenTypeId type);
-
- /**
- * Sets the whitespace mode.
- *
- * @param mode defines how whitespace should be treated in text
- * tokens.
- */
- void setWhitespaceMode(WhitespaceMode mode);
-
- /**
- * Returns the current value of the whitespace mode.
- *
- * @return the whitespace mode.
- */
- WhitespaceMode getWhitespaceMode();
-
- /**
- * Reads a new token from the CharReader and stores it in the given
- * DynamicToken instance.
- *
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is a reference at the token instance into which the Token
- * information should be written.
- * @return true if a token could be read, false if the end of the stream
- * has been reached.
- */
- bool read(CharReader &reader, DynamicToken &token);
-
- /**
- * The peek method does not advance the read position of the char reader,
- * but reads the next token from the current char reader peek position.
- *
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is a reference at the token instance into which the Token
- * information should be written.
- * @return true if a token could be read, false if the end of the stream
- * has been reached.
- */
- bool peek(CharReader &reader, DynamicToken &token);
-};
-}
-
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
-
diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp
deleted file mode 100644
index 4a0430b..0000000
--- a/src/formats/osdm/TokenTrie.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-/* Class TokenTrie::Node */
-
-TokenTrie::Node::Node() : type(EmptyToken) {}
-
-/* Class TokenTrie */
-
-bool TokenTrie::registerToken(const std::string &token,
- TokenTypeId type) noexcept
-{
- // Abort if the token is empty -- this would taint the root node
- if (token.empty()) {
- return false;
- }
-
- // Iterate over each character in the given string and insert them as
- // (new) nodes
- Node *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- // Insert a new node if this one does not exist
- const char c = token[i];
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- it = node->children.emplace(c, std::make_shared<Node>()).first;
- }
- node = it->second.get();
- }
-
- // If the resulting node already has a type set, we're screwed.
- if (node->type != EmptyToken) {
- return false;
- }
-
- // Otherwise just set the type to the given type.
- node->type = type;
- return true;
-}
-
-bool TokenTrie::unregisterToken(const std::string &token) noexcept
-{
- // We cannot remove empty tokens as we need to access the first character
- // upfront
- if (token.empty()) {
- return false;
- }
-
- // First pass -- search the node in the path that can be deleted
- Node *subtreeRoot = &root;
- char subtreeKey = token[0];
- Node *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- // Go to the next node, abort if the tree ends unexpectedly
- auto it = node->children.find(token[i]);
- if (it == node->children.end()) {
- return false;
- }
-
- // Reset the subtree handler if this node has another type
- node = it->second.get();
- if ((node->type != EmptyToken || node->children.size() > 1) &&
- (i + 1 != token.size())) {
- subtreeRoot = node;
- subtreeKey = token[i + 1];
- }
- }
-
- // If the node type is already EmptyToken, we cannot do anything here
- if (node->type == EmptyToken) {
- return false;
- }
-
- // If the target node has children, we cannot delete the subtree. Set the
- // type to EmptyToken instead
- if (!node->children.empty()) {
- node->type = EmptyToken;
- return true;
- }
-
- // If we end up here, we can safely delete the complete subtree
- subtreeRoot->children.erase(subtreeKey);
- return true;
-}
-
-TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
-{
- Node const *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- const char c = token[i];
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- return EmptyToken;
- }
- node = it->second.get();
- }
- return node->type;
-}
-}
-
diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp
deleted file mode 100644
index 36c2ffa..0000000
--- a/src/formats/osdm/TokenTrie.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file TokenTrie.hpp
- *
- * Class representing a token trie that can be updated dynamically.
- *
- * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_TOKEN_TRIE_HPP_
-#define _OUSIA_TOKEN_TRIE_HPP_
-
-#include <cstdint>
-#include <memory>
-#include <limits>
-#include <unordered_map>
-
-namespace ousia {
-
-/**
- * The TokenTypeId is used to give each token type a unique id.
- */
-using TokenTypeId = uint32_t;
-
-/**
- * Token which is not a token.
- */
-constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
-
-/**
- * Token which represents a text token.
- */
-constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
-
-/**
- * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
- * the longest consecutive token in the text. This is equivalent to a prefix
- * trie.
- *
- * A token trie is a construct that structures all special tokens a Tokenizer
- * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
- * three. Then the token tree would look like this:
- *
- * \code{*.txt}
- *        ~ (0)
- *       /     \
- *    a (2)   b (0)
- *      |       |
- *    a (0)   a (0)
- *      |       |
- *    b (1)   c (3)
- * \endcode
- *
- * Where the number indicates the corresponding token descriptor identifier.
- */
-class TokenTrie {
-public:
- /**
- * Structure used to build the node tree.
- */
- struct Node {
- /**
- * Type used for the child map.
- */
- using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
-
- /**
- * Map from single characters at the corresponding child nodes.
- */
- ChildMap children;
-
- /**
- * Id of the token type attached to this node. Set to EmptyToken if
- * no token is attached to this node.
- */
- TokenTypeId type;
-
- /**
- * Default constructor, initializes the type with EmptyToken.
- */
- Node();
- };
-
-private:
- /**
- * Root node of the internal token tree.
- */
- Node root;
-
-public:
- /**
- * Registers a token containing the given string. Returns false if the
- * token already exists, true otherwise.
- *
- * @param token is the character sequence that should be registered as
- * token.
- * @param type is the descriptor that should be set for this token.
- * @return true if the operation is successful, false otherwise.
- */
- bool registerToken(const std::string &token, TokenTypeId type) noexcept;
-
- /**
- * Unregisters the token from the token tree. Returns true if the token was
- * unregistered successfully, false otherwise.
- *
- * @param token is the character sequence that should be unregistered.
- * @return true if the operation was successful, false otherwise.
- */
- bool unregisterToken(const std::string &token) noexcept;
-
- /**
- * Returns the type id of the given token if it exists within the TokenTrie.
- * This function is mostly intended for debugging and unit testing.
- *
- * @param token is the character sequence that should be searched.
- * @return the attached token type id or EmptyToken if the given token is
- * not found.
- */
- TokenTypeId hasToken(const std::string &token) const noexcept;
-
- /**
- * Returns a reference at the root node to be used for traversing the token
- * tree.
- *
- * @return a reference at the root node.
- */
- const Node *getRoot() const noexcept { return &root; }
-};
-}
-
-#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
-
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
new file mode 100644
index 0000000..4973639
--- /dev/null
+++ b/src/formats/osml/OsmlParser.cpp
@@ -0,0 +1,57 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/generic/ParserStateCallbacks.hpp>
+#include <core/parser/generic/ParserStateStack.hpp>
+
+#include "OsdmParser.hpp"
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+namespace {
+
+/**
+ * The OsdmParserImplementation class contains the actual implementation of the
+ * parsing process and is created in the "doParse" function of the OsdmParser.
+
+ */
+class OsdmParserImplementation : public ParserStateCallbacks {
+private:
+ /**
+ * OsdmStreamParser instance.
+ */
+ OsdmStreamParser parser;
+
+ /**
+ * Instance of the ParserStateStack.
+ */
+ ParserStateStack stack;
+
+public:
+ OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap)
+};
+}
+
+void OsdmParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+ OsdmParserImplementation parser(reader, ctx);
+ parser.parse();
+}
+
+}
diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp
new file mode 100644
index 0000000..37505b4
--- /dev/null
+++ b/src/formats/osml/OsmlParser.hpp
@@ -0,0 +1,48 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsdmParser.hpp
+ *
+ * Contains the parser of the osdm format, the standard plain-text format used
+ * by Ousía for documents.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_PARSER_HPP_
+#define _OUSIA_OSDM_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * OsdmParser is a small wrapper implementing the Parser interface. The actual
+ * parsing is performed with the OsdmStreamParser in conjunction with the
+ * ParserStateStack.
+ */
+class OsdmParser : public Parser {
+protected:
+ void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSDM_PARSER_HPP_ */
+
diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index 8cb8caf..0174fa4 100644
--- a/src/formats/osdm/OsdmStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -21,14 +21,14 @@
#include <core/common/Utils.hpp>
#include <core/common/VariantReader.hpp>
-#include "OsdmStreamParser.hpp"
+#include "OsmlStreamParser.hpp"
namespace ousia {
/**
* Plain format default tokenizer.
*/
-class PlainFormatTokens : public DynamicTokenizer {
+class PlainFormatTokens : public Tokenizer {
public:
/**
* Id of the backslash token.
@@ -61,6 +61,21 @@ public:
TokenTypeId FieldEnd;
/**
+ * Id of the default field start token.
+ */
+ TokenTypeId DefaultFieldStart;
+
+ /**
+ * Id of the annotation start token.
+ */
+ TokenTypeId AnnotationStart;
+
+ /**
+ * Id of the annotation end token.
+ */
+ TokenTypeId AnnotationEnd;
+
+ /**
* Registers the plain format tokens in the internal tokenizer.
*/
PlainFormatTokens()
@@ -71,6 +86,9 @@ public:
BlockCommentEnd = registerToken("}%");
FieldStart = registerToken("{");
FieldEnd = registerToken("}");
+ DefaultFieldStart = registerToken("{!");
+ AnnotationStart = registerToken("<\\");
+ AnnotationEnd = registerToken("\\>");
}
};
@@ -160,14 +178,14 @@ public:
}
};
-OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger)
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
: reader(reader), logger(logger), tokenizer(Tokens)
{
// Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true});
+ commands.push(Command{"", Variant::mapType{}, true, true, true, false});
}
-Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
bool hasCharSiceNSSep = false;
@@ -210,7 +228,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
return res;
}
-OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
+OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
{
// Expect a '{' after the command
reader.consumeWhitespace();
@@ -251,7 +269,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
return State::COMMAND;
}
-static bool checkStillInField(const OsdmStreamParser::Command &cmd,
+static bool checkStillInField(const OsmlStreamParser::Command &cmd,
const Variant &endName, Logger &logger)
{
if (cmd.inField && !cmd.inRangeField) {
@@ -264,7 +282,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd,
return false;
}
-OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
+OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
{
// Expect a '{' after the command
if (!reader.expect('{')) {
@@ -327,7 +345,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
return cmd.inRangeField ? State::FIELD_END : State::NONE;
}
-Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
+Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
{
// Parse the arguments using the universal VariantReader
Variant commandArguments;
@@ -353,7 +371,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
return commandArguments;
}
-void OsdmStreamParser::pushCommand(Variant commandName,
+void OsmlStreamParser::pushCommand(Variant commandName,
Variant commandArguments, bool hasRange)
{
// Store the location on the stack
@@ -365,10 +383,11 @@ void OsdmStreamParser::pushCommand(Variant commandName,
commands.pop();
}
commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false});
+ hasRange, false, false, false});
}
-OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
+OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
+ bool isAnnotation)
{
// Parse the commandName as a first identifier
Variant commandName = parseIdentifier(start, true);
@@ -382,6 +401,9 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
Utils::split(commandName.asString(), ':');
const bool isBegin = commandNameComponents[0] == "begin";
const bool isEnd = commandNameComponents[0] == "end";
+
+ // Parse the begin or end command
+ State res = State::COMMAND;
if (isBegin || isEnd) {
if (commandNameComponents.size() > 1) {
logger.error(
@@ -390,35 +412,81 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
commandName);
}
if (isBegin) {
- return parseBeginCommand();
+ res = parseBeginCommand();
} else if (isEnd) {
- return parseEndCommand();
+ res = parseEndCommand();
}
+ } else {
+ // Check whether the next character is a '#', indicating the start of
+ // the command name
+ Variant commandArgName;
+ start = reader.getOffset();
+ if (reader.expect('#')) {
+ commandArgName = parseIdentifier(start);
+ if (commandArgName.asString().empty()) {
+ logger.error("Expected identifier after \"#\"", commandArgName);
+ }
+ }
+
+ // Parse the arguments
+ Variant commandArguments =
+ parseCommandArguments(std::move(commandArgName));
+
+ // Push the command onto the command stack
+ pushCommand(std::move(commandName), std::move(commandArguments), false);
}
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
+ // Check whether a ">" character is the next character that is to be read.
+ // In that case the current command could be an annotation end command!
+ char c;
+ if (reader.fetch(c) && c == '>') {
+ // Ignore the character after a begin or end command
+ if (isBegin || isEnd) {
+ logger.warning(
+ "Ignoring annotation end character \">\" after special "
+ "commands \"begin\" or \"end\". Write \"\\>\" to end a "
+ "\"begin\"/\"end\" enclosed annotation.",
+ reader);
+ return res;
}
- }
- // Parse the arugments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
+ // If this should be an annotation, ignore the character
+ if (isAnnotation) {
+ logger.warning(
+ "Ignoring annotation end character \">\" after annotation "
+ "start command. Write \"\\>\" to end the annotation.",
+ reader);
+ } else {
+ // Make sure no arguments apart from the "name" argument are given
+ // to an annotation end
+ Variant::mapType &map = commands.top().arguments.asMap();
+ if (!map.empty()) {
+ if (map.count("name") == 0 || map.size() > 1U) {
+ logger.error(
+ "An annotation end command may not have any arguments "
+ "other than \"name\"");
+ return res;
+ }
+ }
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), false);
+ // If we got here, this is a valid ANNOTATION_END command, issue it
+ reader.peek(c);
+ reader.consumePeek();
+ return State::ANNOTATION_END;
+ }
+ }
- return State::COMMAND;
+ // If we're starting an annotation, return the command as annotation start
+ // instead of command
+ if (isAnnotation && res == State::COMMAND) {
+ return State::ANNOTATION_START;
+ }
+ return res;
}
-void OsdmStreamParser::parseBlockComment()
+void OsmlStreamParser::parseBlockComment()
{
- DynamicToken token;
+ Token token;
size_t depth = 1;
while (tokenizer.read(reader, token)) {
if (token.type == Tokens.BlockCommentEnd) {
@@ -436,7 +504,7 @@ void OsdmStreamParser::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void OsdmStreamParser::parseLineComment()
+void OsmlStreamParser::parseLineComment()
{
char c;
while (reader.read(c)) {
@@ -446,7 +514,7 @@ void OsdmStreamParser::parseLineComment()
}
}
-bool OsdmStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData(DataHandler &handler)
{
if (!handler.isEmpty()) {
data = handler.toVariant(reader.getSourceId());
@@ -457,7 +525,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler)
return false;
}
-bool OsdmStreamParser::checkIssueFieldStart()
+bool OsmlStreamParser::checkIssueFieldStart()
{
// Fetch the current command, and check whether we're currently inside a
// field of this command
@@ -482,18 +550,41 @@ bool OsdmStreamParser::checkIssueFieldStart()
return false;
}
-OsdmStreamParser::State OsdmStreamParser::parse()
+bool OsmlStreamParser::closeField()
+{
+ // Try to end an open field of the current command -- if the current command
+ // is not inside an open field, end this command and try to close the next
+ // one
+ for (int i = 0; i < 2 && commands.size() > 1; i++) {
+ Command &cmd = commands.top();
+ if (!cmd.inRangeField) {
+ if (cmd.inField) {
+ cmd.inField = false;
+ if (cmd.inDefaultField) {
+ commands.pop();
+ }
+ return true;
+ }
+ commands.pop();
+ } else {
+ return false;
+ }
+ }
+ return false;
+}
+
+OsmlStreamParser::State OsmlStreamParser::parse()
{
// Handler for incomming data
DataHandler handler;
// Read tokens until the outer loop should be left
- DynamicToken token;
+ Token token;
while (tokenizer.peek(reader, token)) {
const TokenTypeId type = token.type;
// Special handling for Backslash and Text
- if (type == Tokens.Backslash) {
+ if (type == Tokens.Backslash || type == Tokens.AnnotationStart) {
// Before appending anything to the output data or starting a new
// command, check whether FIELD_START has to be issued, as the
// current command is a command with range
@@ -519,7 +610,8 @@ OsdmStreamParser::State OsdmStreamParser::parse()
}
// Parse the actual command
- State res = parseCommand(token.location.getStart());
+ State res = parseCommand(token.location.getStart(),
+ type == Tokens.AnnotationStart);
switch (res) {
case State::ERROR:
throw LoggableException(
@@ -536,6 +628,14 @@ OsdmStreamParser::State OsdmStreamParser::parse()
// to the data buffer, use the escape character start as start
// location and the peek offset as end location
reader.peek(c); // Peek the previously fetched character
+
+ // If this was an annotation start token, add the parsed < to the
+ // output
+ if (type == Tokens.AnnotationStart) {
+ handler.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
+ }
+
handler.append(c, token.location.getStart(),
reader.getPeekOffset());
reader.consumePeek();
@@ -579,28 +679,37 @@ OsdmStreamParser::State OsdmStreamParser::parse()
}
logger.error(
"Got field start token \"{\", but no command for which to "
- "start the field. Did you mean \"\\{\"?",
+ "start the field. Write \"\\{\" to insert this sequence as "
+ "text.",
token);
} else if (token.type == Tokens.FieldEnd) {
- // Try to end an open field of the current command -- if the current
- // command is not inside an open field, end this command and try to
- // close the next one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- return State::FIELD_END;
- }
- commands.pop();
- } else {
- break;
- }
+ if (closeField()) {
+ return State::FIELD_END;
}
logger.error(
- "Got field end token \"}\", but there is no field to end. Did "
- "you mean \"\\}\"?",
+ "Got field end token \"}\", but there is no field to end. "
+ "Write \"\\}\" to insert this sequence as text.",
token);
+ } else if (token.type == Tokens.DefaultFieldStart) {
+ // Try to start a default field the first time the token is reached
+ Command &topCmd = commands.top();
+ if (!topCmd.inField) {
+ topCmd.inField = true;
+ topCmd.inDefaultField = true;
+ return State::FIELD_START;
+ }
+ logger.error(
+ "Got default field start token \"{!\", but no command for "
+ "which to start the field. Write \"\\{!\" to insert this "
+ "sequence as text",
+ token);
+ } else if (token.type == Tokens.AnnotationEnd) {
+ // We got a single annotation end token "\>" -- simply issue the
+ // ANNOTATION_END event
+ Variant annotationName = Variant::fromString("");
+ annotationName.setLocation(token.location);
+ pushCommand(annotationName, Variant::mapType{}, false);
+ return State::ANNOTATION_END;
} else {
logger.error("Unexpected token \"" + token.content + "\"", token);
}
@@ -627,14 +736,19 @@ OsdmStreamParser::State OsdmStreamParser::parse()
return State::END;
}
-const Variant &OsdmStreamParser::getCommandName()
+const Variant &OsmlStreamParser::getCommandName() const
{
return commands.top().name;
}
-const Variant &OsdmStreamParser::getCommandArguments()
+const Variant &OsmlStreamParser::getCommandArguments() const
{
return commands.top().arguments;
}
+
+bool OsmlStreamParser::inDefaultField() const
+{
+ return commands.top().inRangeField || commands.top().inDefaultField;
+}
}
diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index 48d8fb7..dc3034c 100644
--- a/src/formats/osdm/OsdmStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -17,23 +17,22 @@
*/
/**
- * @file OsdmStreamParser.hpp
+ * @file OsmlStreamParser.hpp
*
- * Provides classes for low-level classes for reading the TeX-esque osdm
+ * Provides classes for low-level classes for reading the TeX-esque osml
* format. The class provided here does not build any model objects and does not
* implement the Parser interface.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_
-#define _OUSIA_OSDM_STREAM_PARSER_HPP_
+#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
+#define _OUSIA_OSML_STREAM_PARSER_HPP_
#include <stack>
#include <core/common/Variant.hpp>
-
-#include "DynamicTokenizer.hpp"
+#include <core/parser/utils/Tokenizer.hpp>
namespace ousia {
@@ -43,7 +42,7 @@ class Logger;
class DataHandler;
/**
- * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm
+ * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
* format. The parser is constructed around a "parse" function, which reads data
* from the underlying CharReader until a new state is reached and indicates
* this state in a return value. The calling code then has to pull corresponding
@@ -53,10 +52,10 @@ class DataHandler;
* fields, as this would lead to too many consecutive errors) a
* LoggableException is thrown.
*/
-class OsdmStreamParser {
+class OsmlStreamParser {
public:
/**
- * Enum used to indicate which state the OsdmStreamParser class is in
+ * Enum used to indicate which state the OsmlStreamParser class is in
* after calling the "parse" function.
*/
enum class State {
@@ -140,23 +139,35 @@ public:
/**
* Set to true if this is a command with clear begin and end.
*/
- bool hasRange;
+ bool hasRange : 1;
/**
* Set to true if we are currently inside a field of this command.
*/
- bool inField;
+ bool inField : 1;
/**
* Set to true if we are currently in the range field of the command
* (implies inField being set to true).
*/
- bool inRangeField;
+ bool inRangeField : 1;
+
+ /**
+ * Set to true if we are currently in a field that has been especially
+ * marked as default field (using the "{!" syntax).
+ */
+ bool inDefaultField : 1;
/**
* Default constructor.
*/
- Command() : hasRange(false), inField(false), inRangeField(false) {}
+ Command()
+ : hasRange(false),
+ inField(false),
+ inRangeField(false),
+ inDefaultField()
+ {
+ }
/**
* Constructor of the Command class.
@@ -169,16 +180,19 @@ public:
* explicit range.
* @param inField is set to true if we currently are inside a field
* of this command.
- * @param inRangeField is set to true if we currently inside the outer
- * field of the command.
+ * @param inRangeField is set to true if we currently are inside the
+ * outer field of a ranged command.
+ * @param inDefaultField is set to true if we currently are in a
+ * specially marked default field.
*/
- Command(Variant name, Variant arguments, bool hasRange, bool inField,
- bool inRangeField)
+ Command(Variant name, Variant arguments, bool hasRange,
+ bool inField, bool inRangeField, bool inDefaultField)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange),
inField(inField),
- inRangeField(inRangeField)
+ inRangeField(inRangeField),
+ inDefaultField(inDefaultField)
{
}
};
@@ -198,7 +212,7 @@ private:
/**
* Tokenizer instance used to read individual tokens from the text.
*/
- DynamicTokenizer tokenizer;
+ Tokenizer tokenizer;
/**
* Stack containing the current commands.
@@ -258,9 +272,11 @@ private:
*
* @param start is the start byte offset of the command (including the
* backslash)
+ * @param isAnnotation if true, the command is not returned as command, but
+ * as annotation start.
* @return true if a command was actuall parsed, false otherwise.
*/
- State parseCommand(size_t start);
+ State parseCommand(size_t start, bool isAnnotation);
/**
* Function used internally to parse a block comment.
@@ -290,16 +306,26 @@ private:
*/
bool checkIssueFieldStart();
+ /**
+ * Closes a currently open field. Note that the command will be removed from
+ * the internal command stack if the field that is being closed is a
+ * field marked as default field.
+ *
+ * @return true if the field could be closed, false if there was no field
+ * to close.
+ */
+ bool closeField();
+
public:
/**
- * Constructor of the OsdmStreamParser class. Attaches the new
- * OsdmStreamParser to the given CharReader and Logger instances.
+ * Constructor of the OsmlStreamParser class. Attaches the new
+ * OsmlStreamParser to the given CharReader and Logger instances.
*
* @param reader is the reader instance from which incomming characters
* should be read.
* @param logger is the logger instance to which errors should be written.
*/
- OsdmStreamParser(CharReader &reader, Logger &logger);
+ OsmlStreamParser(CharReader &reader, Logger &logger);
/**
* Continues parsing. Returns one of the states defined in the State enum.
@@ -318,7 +344,7 @@ public:
* @return a reference at a variant containing the data parsed by the
* "parse" function.
*/
- const Variant &getData() { return data; }
+ const Variant &getData() const { return data; }
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -327,7 +353,7 @@ public:
* @return a reference at a variant containing name and location of the
* parsed command.
*/
- const Variant &getCommandName();
+ const Variant &getCommandName() const;
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -336,16 +362,24 @@ public:
* @return a reference at a variant containing arguments given to the
* command.
*/
- const Variant &getCommandArguments();
+ const Variant &getCommandArguments() const;
+
+ /**
+ * Returns true if the current field is the "default" field. This is true if
+ * the parser either is in the outer range of a range command or inside a
+ * field that has been especially marked as "default" field (using the "{!"
+ * syntax).
+ */
+ bool inDefaultField() const;
/**
* Returns a reference at the char reader.
*
* @return the last internal token location.
*/
- SourceLocation &getLocation() { return location; }
+ const SourceLocation &getLocation() const { return location; }
};
}
-#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */
+#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */
diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp
new file mode 100644
index 0000000..e37446a
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.cpp
@@ -0,0 +1,144 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/Location.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+
+namespace ousia {
+
+/**
+ * Enum used internally in the statemachine of the xml argument parser.
+ */
+enum class XmlAttributeState {
+ IN_TAG_NAME,
+ SEARCH_ATTR,
+ IN_ATTR_NAME,
+ HAS_ATTR_NAME,
+ HAS_ATTR_EQUALS,
+ IN_ATTR_DATA
+};
+
+std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate(
+ CharReader &reader, size_t offs)
+{
+ std::map<std::string, SourceLocation> res;
+
+ // Fork the reader, we don't want to mess up the XML parsing process, do we?
+ CharReaderFork readerFork = reader.fork();
+
+ // Move the read cursor to the start location, abort if this does not work
+ if (offs != readerFork.seek(offs)) {
+ return res;
+ }
+
+ // Now all we need to do is to implement one half of an XML parser. As this
+ // is inherently complicated we'll totally fail at it. Don't care. All we
+ // want to get is those darn offsets for pretty error messages... (and we
+ // can assume the XML is valid as it was already read by expat)
+ XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
+ char c;
+ std::stringstream attrName;
+ while (readerFork.read(c)) {
+ // Abort at the end of the tag
+ if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
+ return res;
+ }
+
+ // One state machine to rule them all, one state machine to find them,
+ // One state machine to bring them all and in the darkness bind them
+ // (the byte offsets)
+ switch (state) {
+ case XmlAttributeState::IN_TAG_NAME:
+ if (Utils::isWhitespace(c)) {
+ res.emplace("$tag",
+ SourceLocation{reader.getSourceId(), offs + 1,
+ readerFork.getOffset() - 1});
+ state = XmlAttributeState::SEARCH_ATTR;
+ }
+ break;
+ case XmlAttributeState::SEARCH_ATTR:
+ if (!Utils::isWhitespace(c)) {
+ state = XmlAttributeState::IN_ATTR_NAME;
+ attrName << c;
+ }
+ break;
+ case XmlAttributeState::IN_ATTR_NAME:
+ if (Utils::isWhitespace(c)) {
+ state = XmlAttributeState::HAS_ATTR_NAME;
+ } else if (c == '=') {
+ state = XmlAttributeState::HAS_ATTR_EQUALS;
+ } else {
+ attrName << c;
+ }
+ break;
+ case XmlAttributeState::HAS_ATTR_NAME:
+ if (!Utils::isWhitespace(c)) {
+ if (c == '=') {
+ state = XmlAttributeState::HAS_ATTR_EQUALS;
+ break;
+ }
+ // Well, this is a strange XML file... We expected to
+ // see a '=' here! Try to continue with the
+ // "HAS_ATTR_EQUALS" state as this state will hopefully
+ // include some error recovery
+ } else {
+ // Skip whitespace here
+ break;
+ }
+ // Fallthrough
+ case XmlAttributeState::HAS_ATTR_EQUALS:
+ if (!Utils::isWhitespace(c)) {
+ if (c == '"') {
+ // Here we are! We have found the beginning of an
+ // attribute. Let's quickly lock the current offset away
+ // in the result map
+ res.emplace(attrName.str(),
+ SourceLocation{reader.getSourceId(),
+ readerFork.getOffset()});
+ state = XmlAttributeState::IN_ATTR_DATA;
+ } else {
+ // No, this XML file is not well formed. Assume we're in
+ // an attribute name once again
+ attrName.str(std::string{&c, 1});
+ state = XmlAttributeState::IN_ATTR_NAME;
+ }
+ }
+ break;
+ case XmlAttributeState::IN_ATTR_DATA:
+ if (c == '"') {
+ // We're at the end of the attribute data, set the end
+ // location
+ auto it = res.find(attrName.str());
+ if (it != res.end()) {
+ it->second.setEnd(readerFork.getOffset() - 1);
+ }
+
+ // Reset the attribute name and restart the search
+ attrName.str(std::string{});
+ state = XmlAttributeState::SEARCH_ATTR;
+ }
+ break;
+ }
+ }
+ return res;
+}
+}
+
diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp
new file mode 100644
index 0000000..f9a3437
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.hpp
@@ -0,0 +1,67 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlAttributeLocator.hpp
+ *
+ * Contains a class used for locating the byte offsets of the attributes given
+ * in a XML tag.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+
+#include <map>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class SourceLocation;
+
+/**
+ * Class containing one static function for locating the byte offsets of the
+ * attributes in a XML tag. These are not retrieved by our xml parser, so we have
+ * to do this manually.
+ */
+class OsxmlAttributeLocator {
+public:
+ /**
+ * Function used to reconstruct the location of the attributes of a XML tag
+ * in the source code. This is necessary, as the xml parser only returns an
+ * offset to the beginning of a tag and not to the position of the individual
+ * arguments.
+ *
+ * @param reader is the char reader from which the character data should be
+ * read.
+ * @param offs is a byte offset in the xml file pointing at the "<"
+ * character of the tag.
+ * @return a map from attribute keys to the corresponding location
+ * (including range) of the attribute. Also contains the location of the
+ * tagname in the form of the virtual attribute "$tag".
+ */
+ static std::map<std::string, SourceLocation> locate(CharReader &reader,
+ size_t offs);
+};
+
+}
+
+#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
new file mode 100644
index 0000000..b4aff77
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -0,0 +1,547 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <expat.h>
+
+#include <vector>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Variant.hpp>
+#include <core/common/VariantReader.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+#include "OsxmlEventParser.hpp"
+
+namespace ousia {
+
+/* Class OsxmlEventParserData */
+
+/**
+ * Class containing data used by the internal functions.
+ */
+class OsxmlEventParserData {
+public:
+ /**
+ * Contains the current depth of the parsing process.
+ */
+ ssize_t depth;
+
+ /**
+ * Set to a value larger than zero if the parser is currently inside an
+ * annotation end tag -- the value represents the depth in which the tag
+ * was opened. Set to -1 otherwise.
+ */
+ ssize_t annotationEndTagDepth;
+
+ /**
+ * Current character data buffer.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Current whitespace buffer (for the trimming whitespace mode)
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Flag indicating whether a whitespace character was present (for the
+ * collapsing whitespace mode).
+ */
+ bool hasWhitespace;
+
+ /**
+ * Current character data start.
+ */
+ size_t textStart;
+
+ /**
+ * Current character data end.
+ */
+ size_t textEnd;
+
+ /**
+ * Default constructor.
+ */
+ OsxmlEventParserData();
+
+ /**
+ * Increments the depth.
+ */
+ void incrDepth();
+
+ /**
+ * Decrement the depth and reset the annotationEndTagDepth flag.
+ */
+ void decrDepth();
+
+ /**
+ * Returns true if we're currently inside an annotation end tag.
+ */
+ bool inAnnotationEndTag();
+
+ /**
+ * Returns true if character data is available.
+ *
+ * @return true if character data is available.
+ */
+ bool hasText();
+
+ /**
+ * Returns the character data and its location, resets the text buffers.
+ *
+ * @param sourceId is the source id stored in the location of the variant.
+ * @return a string variant containing the text data and its location.
+ */
+ Variant getText(SourceId sourceId);
+};
+
+/* Class GuardedExpatXmlParser */
+
+/**
+ * Wrapper class around the XML_Parser pointer which safely frees it whenever
+ * the scope is left (e.g. because an exception was thrown). Not copyable.
+ */
+class GuardedExpatXmlParser {
+private:
+ /**
+ * Internal pointer to the XML_Parser instance.
+ */
+ XML_Parser parser;
+
+public:
+ /**
+ * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreate
+ * from the expat library. Throws a LoggableException if the XML parser
+ * cannot be initialized.
+ *
+ * @param encoding is the protocol-defined encoding passed to expat (or
+ * nullptr if expat should determine the encoding by itself).
+ */
+ GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
+ {
+ parser = XML_ParserCreate(encoding);
+ if (!parser) {
+ throw LoggableException{
+ "Internal error: Could not create expat XML parser!"};
+ }
+ }
+
+ /**
+ * Copying is forbidden, a copy would free the same XML_Parser a second time.
+ */
+ GuardedExpatXmlParser(const GuardedExpatXmlParser &) = delete;
+ GuardedExpatXmlParser &operator=(const GuardedExpatXmlParser &) = delete;
+
+ /**
+ * Destructor of the GuardedExpatXmlParser, frees the XML parser instance.
+ */
+ ~GuardedExpatXmlParser() { if (parser) { XML_ParserFree(parser); } }
+
+ /**
+ * Returns the XML_Parser pointer.
+ */
+ XML_Parser operator&() { return parser; }
+};
+
+/**
+ * Name of the special outer tag used for allowing multiple top-level elements
+ * in an xml file.
+ */
+static const std::string TOP_LEVEL_TAG{"ousia"};
+
+/**
+ * Prefix used to indicate the start of an annoation (note the trailing colon)
+ */
+static const std::string ANNOTATION_START_PREFIX{"a:start:"};
+
+/**
+ * Prefix used to indicate the end of an annotation.
+ */
+static const std::string ANNOTATION_END_PREFIX{"a:end"};
+
+/**
+ * Makes the current position of the xml parser the default location of the
+ * logger instance.
+ *
+ * @param p is a pointer at the xml parser instance.
+ * @param len is the length of the string that should be referred to.
+ * @return the SourceLocation that has been set in the logger.
+ */
+static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0)
+{
+ // The OsxmlEventParser instance is stored as user data of the xml parser
+ OsxmlEventParser *eventParser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // Build a location which starts at the current byte offset of the xml
+ // parser and spans "len" bytes
+ size_t start = XML_GetCurrentByteIndex(p);
+ SourceLocation location{eventParser->getReader().getSourceId(), start,
+ start + len};
+
+ // Use it as default location of the logger and hand it back to the caller
+ eventParser->getLogger().setDefaultLocation(location);
+ return location;
+}
+
+/**
+ * Callback called by eXpat whenever a start tag is reached. Issues pending text
+ * first and then the event matching the tag (command or annotation start/end).
+ */
+static void xmlStartElementHandler(void *ref, const XML_Char *name,
+ const XML_Char **attrs)
+{
+ // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
+ XML_Parser p = static_cast<XML_Parser>(ref);
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // If there is any text data in the buffer, issue that first
+ if (parser->getData().hasText()) {
+ parser->getEvents().data(
+ parser->getData().getText(parser->getReader().getSourceId()));
+ }
+
+ // Read the argument locations -- this is only a stupid and slow hack,
+ // but it is necessary, as expat doesn't give us the byte offset of the
+ // arguments.
+ std::map<std::string, SourceLocation> attributeOffsets =
+ OsxmlAttributeLocator::locate(parser->getReader(),
+ XML_GetCurrentByteIndex(p));
+
+ // Update the logger position
+ SourceLocation loc = xmlSyncLoggerPosition(p);
+
+ // Fetch the location of the name
+ SourceLocation nameLoc = loc;
+ auto it = attributeOffsets.find("$tag");
+ if (it != attributeOffsets.end()) {
+ nameLoc = it->second;
+ }
+ // Increment the current depth
+ parser->getData().incrDepth();
+
+ // Make sure we're currently not inside an annotation end tag -- this would
+ // be highly illegal!
+ if (parser->getData().inAnnotationEndTag()) {
+ parser->getLogger().error(
+ "No tags allowed inside an annotation end tag", nameLoc);
+ return;
+ }
+
+ // Assemble the arguments
+ Variant::mapType args;
+ const XML_Char **attr = attrs;
+ while (*attr) {
+ // Convert the C string to a std::string
+ const std::string key{*(attr++)};
+
+ // Search the location of the key
+ SourceLocation keyLoc;
+ auto keyIt = attributeOffsets.find(key);
+ if (keyIt != attributeOffsets.end()) {
+ keyLoc = keyIt->second;
+ }
+
+ // Parse the string, pass the location of the key
+ std::pair<bool, Variant> value = VariantReader::parseGenericString(
+ *(attr++), parser->getLogger(), keyLoc.getSourceId(),
+ keyLoc.getStart());
+
+ // The parsed element is located at the position of the attribute
+ value.second.setLocation(keyLoc);
+
+ // Store the keys in the map
+ args.emplace(key, value.second);
+ }
+
+ // Fetch the name of the tag, check for special tags
+ std::string nameStr(name);
+ if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) {
+ // We're in the top-level and the magic tag is reached -- just
+ // ignore it and issue a warning for each argument that has been given
+ for (const auto &arg : args) {
+ parser->getLogger().warning(std::string("Ignoring attribute \"") +
+ arg.first +
+ std::string("\" for magic tag \"") +
+ TOP_LEVEL_TAG + std::string("\""),
+ arg.second);
+ }
+ } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) {
+ // Assemble a name variant containing the name minus the prefix
+ Variant nameVar =
+ Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size()));
+ nameVar.setLocation(nameLoc);
+
+ // Issue the "annotationStart" event
+ parser->getEvents().annotationStart(nameVar, args);
+ } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) {
+ // Assemble a name variant containing the name minus the prefix
+ nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size());
+
+ // Discard a potentially leading colon
+ if (!nameStr.empty() && nameStr[0] == ':') {
+ nameStr = nameStr.substr(1);
+ }
+
+ // Assemble the variant containing the name and its location
+ Variant nameVar = Variant::fromString(nameStr);
+ nameVar.setLocation(nameLoc);
+
+ // Check whether a "name" attribute was given
+ Variant elementName;
+ for (const auto &arg : args) {
+ if (arg.first == "name") {
+ elementName = arg.second;
+ } else {
+ parser->getLogger().warning(
+ std::string("Ignoring attribute \"") + arg.first +
+ "\" in annotation end tag",
+ arg.second);
+ }
+ }
+
+ // Set the annotationEndTagDepth to disallow any further tags to be
+ // opened inside the annotation end tag.
+ parser->getData().annotationEndTagDepth = parser->getData().depth;
+
+ // Issue the "annotationEnd" event, pass the element name and not "args"
+ parser->getEvents().annotationEnd(nameVar, elementName);
+ } else {
+ // Just issue a "commandStart" event in any other case
+ Variant nameVar = Variant::fromString(nameStr);
+ nameVar.setLocation(nameLoc);
+ parser->getEvents().commandStart(nameVar, args);
+ }
+}
+
+static void xmlEndElementHandler(void *ref, const XML_Char *name)
+{
+ // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
+ XML_Parser p = static_cast<XML_Parser>(ref);
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // Synchronize the position of the logger with the position of the xml parser
+ xmlSyncLoggerPosition(p);
+
+ // Abort as long as we're in an annotation end tag
+ if (parser->getData().inAnnotationEndTag()) {
+ parser->getData().decrDepth();
+ return;
+ }
+
+ // Decrement the current depth
+ parser->getData().decrDepth();
+
+ // If there is any text data in the buffer, issue that first
+ if (parser->getData().hasText()) {
+ parser->getEvents().data(
+ parser->getData().getText(parser->getReader().getSourceId()));
+ }
+
+ // Abort if the special ousia tag ends here
+ std::string nameStr{name};
+ if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) {
+ return;
+ }
+
+ // Issue the "fieldEnd" event
+ parser->getEvents().fieldEnd();
+}
+
+static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
+{
+ // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
+ XML_Parser p = static_cast<XML_Parser>(ref);
+ OsxmlEventParser *parser =
+ static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+ // Abort as long as we're in an annotation end tag
+ if (parser->getData().inAnnotationEndTag()) {
+ return;
+ }
+
+ // Convert the signed (smell the 90's C library here?) length to an unsigned
+ // value, negative lengths are treated as zero
+ size_t ulen = len > 0 ? static_cast<size_t>(len) : 0;
+
+ // Synchronize the logger position
+ SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
+
+ // Fetch some variables for convenience
+ const WhitespaceMode mode = parser->getWhitespaceMode();
+ OsxmlEventParserData &data = parser->getData();
+ std::vector<char> &textBuf = data.textBuf;
+ std::vector<char> &whitespaceBuf = data.whitespaceBuf;
+ bool &hasWhitespace = data.hasWhitespace;
+ size_t &textStart = data.textStart;
+ size_t &textEnd = data.textEnd;
+
+ size_t pos = loc.getStart();
+ for (size_t i = 0; i < ulen; i++, pos++) {
+ switch (mode) {
+ case WhitespaceMode::PRESERVE:
+ PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd);
+ break;
+ case WhitespaceMode::TRIM:
+ TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd,
+ whitespaceBuf);
+ break;
+ case WhitespaceMode::COLLAPSE:
+ CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
+ textStart, textEnd,
+ hasWhitespace);
+ break;
+ }
+ }
+}
+
+/* Class OsxmlEvents */
+
+OsxmlEvents::~OsxmlEvents() {}
+
+/* Class OsxmlEventParserData */
+
+OsxmlEventParserData::OsxmlEventParserData()
+ : depth(0),
+ annotationEndTagDepth(-1),
+ hasWhitespace(false),
+ textStart(0),
+ textEnd(0)
+{
+}
+
+void OsxmlEventParserData::incrDepth() { depth++; }
+
+void OsxmlEventParserData::decrDepth()
+{
+ // Never let the depth drop below the top level
+ depth = (depth > 0) ? depth - 1 : depth;
+
+ // Clear the marker once the depth is below the annotation end tag depth
+ const bool leftEndTag = annotationEndTagDepth > depth;
+ annotationEndTagDepth = leftEndTag ? -1 : annotationEndTagDepth;
+}
+
+bool OsxmlEventParserData::inAnnotationEndTag()
+{
+ return !(annotationEndTagDepth <= 0 || depth < annotationEndTagDepth);
+}
+
+bool OsxmlEventParserData::hasText() { return !textBuf.empty(); }
+
+Variant OsxmlEventParserData::getText(SourceId sourceId)
+{
+ // Copy the buffered characters into a string variant and attach the range
+ // covered by them
+ Variant text = Variant::fromString(std::string(textBuf.begin(), textBuf.end()));
+ text.setLocation({sourceId, textStart, textEnd});
+
+ // Forget the buffered text and whitespace, the next character data starts
+ // with a clean state
+ hasWhitespace = false;
+ textStart = 0;
+ textEnd = 0;
+ whitespaceBuf.clear();
+ textBuf.clear();
+
+ return text;
+}
+
+/* Class OsxmlEventParser */
+
+OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
+ Logger &logger)
+ : reader(reader),
+ events(events),
+ logger(logger),
+ whitespaceMode(WhitespaceMode::TRIM),
+ data(new OsxmlEventParserData())
+{
+}
+
+OsxmlEventParser::~OsxmlEventParser() {}
+
+void OsxmlEventParser::parse()
+{
+ // The guard frees the expat parser again on every way out of this function
+ GuardedExpatXmlParser guard{"UTF-8"};
+ XML_Parser xml = &guard;
+
+ // Start at the top level again
+ data->depth = 0;
+
+ // The handlers receive the expat parser itself as argument, this instance
+ // is reachable from there as user data
+ XML_SetUserData(xml, this);
+ XML_UseParserAsHandlerArg(xml);
+
+ // Register the callbacks for tags and character data
+ XML_SetStartElementHandler(xml, xmlStartElementHandler);
+ XML_SetEndElementHandler(xml, xmlEndElementHandler);
+ XML_SetCharacterDataHandler(xml, xmlCharacterDataHandler);
+
+ // Pump the input through expat chunk by chunk, the empty chunk read at the
+ // end of the stream is passed on as final chunk
+ constexpr size_t BUFFER_SIZE = 64 * 1024;
+ bool isFinal = false;
+ while (!isFinal) {
+ // Let expat provide the memory the next chunk is read into
+ char *chunk = static_cast<char *>(XML_GetBuffer(xml, BUFFER_SIZE));
+ if (chunk == nullptr) {
+ throw OusiaException{"Internal error: XML parser out of memory!"};
+ }
+
+ // Fill the chunk, reading nothing means that the stream is exhausted
+ const size_t bytesRead = reader.readRaw(chunk, BUFFER_SIZE);
+ isFinal = (bytesRead == 0);
+
+ // Hand the chunk to expat, an XML error is turned into an exception
+ if (!XML_ParseBuffer(xml, bytesRead, isFinal)) {
+ throw LoggableException{
+ "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(xml))},
+ xmlSyncLoggerPosition(xml)};
+ }
+ }
+}
+
+void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
+{
+ this->whitespaceMode = whitespaceMode;
+}
+
+WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
+{
+ return whitespaceMode;
+}
+
+CharReader &OsxmlEventParser::getReader() const { return reader; }
+
+Logger &OsxmlEventParser::getLogger() const { return logger; }
+
+OsxmlEvents &OsxmlEventParser::getEvents() const { return events; }
+
+OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; }
+}
+
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
new file mode 100644
index 0000000..aa20ea9
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -0,0 +1,215 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlEventParser.hpp
+ *
+ * The OsxmlEventParser class is responsible for parsing an XML file and calling
+ * the corresponding event handler functions if an XML item is found. Event
+ * handling is performed using a listener interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OSXML_EVENT_PARSER_HPP_
+#define _OSXML_EVENT_PARSER_HPP_
+
+#include <memory>
+#include <string>
+
+#include <core/common/Whitespace.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class Logger;
+class Variant;
+class OsxmlEventParserData;
+
+/**
+ * Interface which defines the callback functions which are called by the
+ * OsxmlEventParser whenever an event occurs.
+ */
+class OsxmlEvents {
+public:
+ /**
+ * Virtual destructor.
+ */
+ virtual ~OsxmlEvents();
+
+ /**
+ * Called whenever a command starts. Note that this implicitly always starts
+ * the default field of the command.
+ *
+ * @param name is a string variant containing name and location of the
+ * command.
+ * @param args is a map variant containing the arguments that were given
+ * to the command.
+ */
+ virtual void commandStart(Variant name, Variant args) = 0;
+
+ /**
+ * Called whenever an annotation starts. Note that this implicitly always
+ * starts the default field of the annotation.
+ *
+ * @param name is a string variant containing the name of the annotation
+ * class and the location of the annotation definition.
+ * @param args is a map variant containing the arguments that were given
+ * to the annotation definition.
+ */
+ virtual void annotationStart(Variant name, Variant args) = 0;
+
+ /**
+ * Called whenever the range of an annotation ends. The callee must
+ * disambiguate the actual annotation that is finished here.
+ *
+ * @param name is a string variant containing the name of the annotation
+ * class that should end here. May be empty (or nullptr), if no class name
+ * has been specified at the end of the annotation.
+ * @param elementName is the name of the annotation element that should be
+ * ended here. May be empty (or nullptr), if no elementName has been
+ * specified at the end of the annotation.
+ */
+ virtual void annotationEnd(Variant name, Variant elementName) = 0;
+
+ /**
+ * Called whenever the default field which was implicitly started by
+ * commandStart or annotationStart ends. Note that this does not end the
+ * range of an annotation, but the default field of the annotation. To
+ * signal the end of the annotation, the annotationEnd method will be
+ * invoked.
+ */
+ virtual void fieldEnd() = 0;
+
+ /**
+ * Called whenever data is found. Whitespace data is handled as specified
+ * and the data has been parsed to the specified variant type. This function
+ * is not called if the parsing failed, the parser prints an error message
+ * instead.
+ *
+ * @param data is the already parsed data that should be passed to the
+ * handler.
+ */
+ virtual void data(Variant data) = 0;
+};
+
+/**
+ * The OsxmlEventParser class is a wrapper around eXpat which implements the
+ * specialities of the osxml formats class (like annotation ranges). It notifies
+ * a specified event handler whenever a command, annotation or data has been
+ * reached.
+ */
+class OsxmlEventParser {
+private:
+ /**
+ * Reference at the internal CharReader instance.
+ */
+ CharReader &reader;
+
+ /**
+ * Set of callback functions to be called whenever an event is triggered.
+ */
+ OsxmlEvents &events;
+
+ /**
+ * Reference at the Logger object to which error messages or warnings should
+ * be logged.
+ */
+ Logger &logger;
+
+ /**
+ * Current whitespace mode.
+ */
+ WhitespaceMode whitespaceMode;
+
+ /**
+ * Data to be used by the internal functions.
+ */
+ std::unique_ptr<OsxmlEventParserData> data;
+
+public:
+ /**
+ * Constructor of the OsxmlEventParser. Takes a reference at the OsxmlEvents
+ * of which the callback functions are called.
+ *
+ * @param reader is a reference to the CharReader instance from which the
+ * XML should be read.
+ * @param events is a reference at an instance of the OsxmlEvents class. All
+ * events are forwarded to this class.
+ * @param logger is the Logger instance to which log messages should be
+ * written.
+ */
+ OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger);
+
+ /**
+ * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type)
+ */
+ ~OsxmlEventParser();
+
+ /**
+ * Performs the actual parsing. Reads the XML using eXpat and calls the
+ * callbacks in the event listener instance whenever something interesting
+ * happens.
+ */
+ void parse();
+
+ /**
+ * Sets the whitespace handling mode.
+ *
+ * @param whitespaceMode defines how whitespace in the data should be
+ * handled.
+ */
+ void setWhitespaceMode(WhitespaceMode whitespaceMode);
+
+ /**
+ * Returns the current whitespace handling mode.
+ *
+ * @return the currently set whitespace handling mode.
+ */
+ WhitespaceMode getWhitespaceMode() const;
+
+ /**
+ * Returns the internal CharReader reference.
+ *
+ * @return the CharReader reference.
+ */
+ CharReader &getReader() const;
+
+ /**
+ * Returns the internal Logger reference.
+ *
+ * @return the internal Logger reference.
+ */
+ Logger &getLogger() const;
+
+ /**
+ * Returns the internal OsxmlEvents reference.
+ *
+ * @return the internal OsxmlEvents reference.
+ */
+ OsxmlEvents &getEvents() const;
+
+ /**
+ * Returns a reference at the internal data.
+ */
+ OsxmlEventParserData &getData() const;
+};
+}
+
+#endif /* _OSXML_EVENT_PARSER_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
new file mode 100644
index 0000000..869c76a
--- /dev/null
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -0,0 +1,238 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+#include <expat.h>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/VariantReader.hpp>
+#include <core/parser/ParserScope.hpp>
+#include <core/parser/ParserStack.hpp>
+#include <core/parser/stack/DocumentHandler.hpp>
+#include <core/parser/stack/DomainHandler.hpp>
+#include <core/parser/stack/ImportIncludeHandler.hpp>
+#include <core/parser/stack/TypesystemHandler.hpp>
+#include <core/model/Document.hpp>
+#include <core/model/Domain.hpp>
+#include <core/model/Typesystem.hpp>
+
+#include "XmlParser.hpp"
+
+namespace ousia {
+
+namespace ParserStates {
+/* Document states */
+static const ParserState Document =
+ ParserStateBuilder()
+ .parent(&None)
+ .createdNodeType(&RttiTypes::Document)
+ .elementHandler(DocumentHandler::create)
+ .arguments({Argument::String("name", "")});
+
+static const ParserState DocumentChild =
+ ParserStateBuilder()
+ .parents({&Document, &DocumentChild})
+ .createdNodeTypes({&RttiTypes::StructureNode,
+ &RttiTypes::AnnotationEntity,
+ &RttiTypes::DocumentField})
+ .elementHandler(DocumentChildHandler::create);
+
+/* Domain states */
+static const ParserState Domain = ParserStateBuilder()
+ .parents({&None, &Document})
+ .createdNodeType(&RttiTypes::Domain)
+ .elementHandler(DomainHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState DomainStruct =
+ ParserStateBuilder()
+ .parent(&Domain)
+ .createdNodeType(&RttiTypes::StructuredClass)
+ .elementHandler(DomainStructHandler::create)
+ .arguments({Argument::String("name"),
+ Argument::Cardinality("cardinality", Cardinality::any()),
+ Argument::Bool("isRoot", false),
+ Argument::Bool("transparent", false),
+ Argument::String("isa", "")});
+
+static const ParserState DomainAnnotation =
+ ParserStateBuilder()
+ .parent(&Domain)
+ .createdNodeType(&RttiTypes::AnnotationClass)
+ .elementHandler(DomainAnnotationHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState DomainAttributes =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::StructType)
+ .elementHandler(DomainAttributesHandler::create)
+ .arguments({});
+
+static const ParserState DomainAttribute =
+ ParserStateBuilder()
+ .parent(&DomainAttributes)
+ .elementHandler(TypesystemStructFieldHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("default", Variant::fromObject(nullptr))});
+
+static const ParserState DomainField =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainFieldHandler::create)
+ .arguments({Argument::String("name", ""),
+ Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false)});
+
+static const ParserState DomainFieldRef =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainFieldRefHandler::create)
+ .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
+
+static const ParserState DomainStructPrimitive =
+ ParserStateBuilder()
+ .parents({&DomainStruct, &DomainAnnotation})
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainPrimitiveHandler::create)
+ .arguments(
+ {Argument::String("name", ""), Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false), Argument::String("type")});
+
+static const ParserState DomainStructChild =
+ ParserStateBuilder()
+ .parent(&DomainField)
+ .elementHandler(DomainChildHandler::create)
+ .arguments({Argument::String("ref")});
+
+static const ParserState DomainStructParent =
+ ParserStateBuilder()
+ .parent(&DomainStruct)
+ .createdNodeType(&RttiTypes::DomainParent)
+ .elementHandler(DomainParentHandler::create)
+ .arguments({Argument::String("ref")});
+
+static const ParserState DomainStructParentField =
+ ParserStateBuilder()
+ .parent(&DomainStructParent)
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainParentFieldHandler::create)
+ .arguments({Argument::String("name", ""),
+ Argument::Bool("isSubtree", false),
+ Argument::Bool("optional", false)});
+
+static const ParserState DomainStructParentFieldRef =
+ ParserStateBuilder()
+ .parent(&DomainStructParent)
+ .createdNodeType(&RttiTypes::FieldDescriptor)
+ .elementHandler(DomainParentFieldRefHandler::create)
+ .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)});
+
+/* Typesystem states */
+static const ParserState Typesystem =
+ ParserStateBuilder()
+ .parents({&None, &Domain})
+ .createdNodeType(&RttiTypes::Typesystem)
+ .elementHandler(TypesystemHandler::create)
+ .arguments({Argument::String("name", "")});
+
+static const ParserState TypesystemEnum =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::EnumType)
+ .elementHandler(TypesystemEnumHandler::create)
+ .arguments({Argument::String("name")});
+
+static const ParserState TypesystemEnumEntry =
+ ParserStateBuilder()
+ .parent(&TypesystemEnum)
+ .elementHandler(TypesystemEnumEntryHandler::create)
+ .arguments({});
+
+static const ParserState TypesystemStruct =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::StructType)
+ .elementHandler(TypesystemStructHandler::create)
+ .arguments({Argument::String("name"), Argument::String("parent", "")});
+
+static const ParserState TypesystemStructField =
+ ParserStateBuilder()
+ .parent(&TypesystemStruct)
+ .elementHandler(TypesystemStructFieldHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("default", Variant::fromObject(nullptr))});
+
+static const ParserState TypesystemConstant =
+ ParserStateBuilder()
+ .parent(&Typesystem)
+ .createdNodeType(&RttiTypes::Constant)
+ .elementHandler(TypesystemConstantHandler::create)
+ .arguments({Argument::String("name"), Argument::String("type"),
+ Argument::Any("value")});
+
+/* Special states for import and include */
+static const ParserState Import =
+ ParserStateBuilder()
+ .parents({&Document, &Typesystem, &Domain})
+ .elementHandler(ImportHandler::create)
+ .arguments({Argument::String("rel", ""), Argument::String("type", ""),
+ Argument::String("src", "")});
+
+static const ParserState Include =
+ ParserStateBuilder()
+ .parent(&All)
+ .elementHandler(IncludeHandler::create)
+ .arguments({Argument::String("rel", ""), Argument::String("type", ""),
+ Argument::String("src", "")});
+
+static const std::multimap<std::string, const ParserState *> XmlStates{
+ {"document", &Document},
+ {"*", &DocumentChild},
+ {"domain", &Domain},
+ {"struct", &DomainStruct},
+ {"annotation", &DomainAnnotation},
+ {"attributes", &DomainAttributes},
+ {"attribute", &DomainAttribute},
+ {"field", &DomainField},
+ {"fieldRef", &DomainFieldRef},
+ {"primitive", &DomainStructPrimitive},
+ {"childRef", &DomainStructChild},
+ {"parentRef", &DomainStructParent},
+ {"field", &DomainStructParentField},
+ {"fieldRef", &DomainStructParentFieldRef},
+ {"typesystem", &Typesystem},
+ {"enum", &TypesystemEnum},
+ {"entry", &TypesystemEnumEntry},
+ {"struct", &TypesystemStruct},
+ {"field", &TypesystemStructField},
+ {"constant", &TypesystemConstant},
+ {"import", &Import},
+ {"include", &Include}};
+}
+
+
+}
+
diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp
new file mode 100644
index 0000000..281a49c
--- /dev/null
+++ b/src/formats/osxml/OsxmlParser.hpp
@@ -0,0 +1,55 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlParser.hpp
+ *
+ * Contains the parser responsible for reading Ousía XML Documents (extension
+ * oxd) and Ousía XML Modules (extension oxm).
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_PARSER_HPP_
+#define _OUSIA_OSXML_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * The OsxmlParser class implements parsing the various types of Ousía XML
+ * documents using the OsxmlEventParser and Stack classes.
+ */
+class OsxmlParser : public Parser {
+protected:
+ /**
+ * Parses the given input stream as XML file. The function itself returns
+ * nothing (NOTE(review): results presumably end up in ctx -- confirm).
+ *
+ * @param reader is the CharReader from which the input should be read.
+ * @param ctx is a reference to the ParserContext instance that should be
+ * used.
+ */
+ void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSXML_PARSER_HPP_ */
+