diff options
author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2015-02-15 21:56:04 +0100 |
---|---|---|
committer | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2015-02-15 21:56:04 +0100 |
commit | d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch) | |
tree | 8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/formats | |
parent | 40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff) | |
parent | 4f2872d9968aec93bebff90d1238347c8a364949 (diff) |
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/formats')
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.cpp | 544 | ||||
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.hpp | 252 | ||||
-rw-r--r-- | src/formats/osdm/TokenTrie.cpp | 119 | ||||
-rw-r--r-- | src/formats/osdm/TokenTrie.hpp | 150 | ||||
-rw-r--r-- | src/formats/osml/OsmlParser.cpp | 57 | ||||
-rw-r--r-- | src/formats/osml/OsmlParser.hpp | 48 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp (renamed from src/formats/osdm/OsdmStreamParser.cpp) | 226 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp (renamed from src/formats/osdm/OsdmStreamParser.hpp) | 90 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.cpp | 144 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlAttributeLocator.hpp | 67 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 547 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 217 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlParser.cpp | 98 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlParser.hpp | 55 |
14 files changed, 1465 insertions, 1149 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp deleted file mode 100644 index f2cfcd1..0000000 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <memory> -#include <vector> - -#include <core/common/CharReader.hpp> -#include <core/common/Exceptions.hpp> -#include <core/common/Utils.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { - /** - * Token that was matched. - */ - DynamicToken token; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - - /** - * Constructor of the TokenMatch class. - */ - TokenMatch() : textLength(0), textEnd(0) {} - - /** - * Returns true if this TokenMatch instance actually represents a match. - */ - bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: - /** - * Current node within the token trie. - */ - TokenTrie::Node const *node; - - /** - * Start offset within the source file. - */ - size_t start; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - -public: - /** - * Constructor of the TokenLookup class. - * - * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. - */ - TokenLookup(const TokenTrie::Node *node, size_t start, - size_t textLength, size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) - { - } - - /** - * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). - * - * @param c is the character that should be appended to the current prefix. - * @param lookups is a list to which new TokeLookup instances are added -- - * which could potentially be expanded in the next iteration. - * @param match is the DynamicToken instance to which the matching token - * should be written. - * @param tokens is a reference at the internal token list of the - * DynamicTokenizer. - * @param end is the end byte offset of the current character. - * @param sourceId is the source if of this file. - */ - void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, - const std::vector<std::string> &tokens, SourceOffset end, - SourceId sourceId) - { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node - auto it = node->children.find(c); - if (it == node->children.end()) { - return; - } - - // Check whether the new node represents a complete token a whether it - // is longer than the current token. If yes, replace the current token. - node = it->second.get(); - if (node->type != EmptyToken) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - DynamicToken{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } - } - - // If this state can possibly be advanced, store it in the states list. - if (!node->children.empty()) { - lookups.emplace_back(*this); - } - } -}; - -/* Internal class TextHandlerBase */ - -/** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. - */ -class TextHandlerBase { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - TextHandlerBase() : textStart(0), textEnd(0) {} - - /** - * Transforms the given token into a text token containing the extracted - * text. - * - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ - void buildTextToken(TokenMatch &match, SourceId sourceId) - { - if (match.hasMatch()) { - match.token.content = - std::string{textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, textStart, match.textEnd}; - } else { - match.token.content = std::string{textBuf.data(), textBuf.size()}; - match.token.location = SourceLocation{sourceId, textStart, textEnd}; - } - match.token.type = TextToken; - } - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template <typename TextHandler, bool read> -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ - // If we're in the read mode, reset the char reader peek position to the - // current read position - if (read) { - reader.resetPeek(); - } - - // Prepare the lookups in the token trie - const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; - std::vector<TokenLookup> lookups; - std::vector<TokenLookup> nextLookups; - - // Instantiate the text handler - TextHandler textHandler; - - // Peek characters from the reader and try to advance the current token tree - // cursor - char c; - size_t charStart = reader.getPeekOffset(); - const SourceId sourceId = reader.getSourceId(); - while (reader.peek(c)) { - const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; - - // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); - } - - // Try to advance all other lookups with the new character - for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); - } - - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { - break; - } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); - } - - // Swap the lookups and the nextLookups list - lookups = std::move(nextLookups); - nextLookups.clear(); - - // Advance the offset - charStart = charEnd; - } - - // If we found text, emit that text - if (textHandler.hasText() && - (!match.hasMatch() || match.textLength > 0)) { - textHandler.buildTextToken(match, sourceId); - } - - // Move the read/peek cursor to the end of the token, abort if an error - // happens while doing so - if (match.hasMatch()) { - // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { - throw OusiaException{"Token end position offset out of range"}; - } - - // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); - if (read) { - reader.seek(end); - } else { - reader.seekPeekCursor(end); - } - token = match.token; - } else { - token = DynamicToken{}; - } - return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, true>(reader, token); - case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, true>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, true>(reader, token); - } - return false; -} - -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, false>(reader, token); - case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, false>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, false>(reader, token); - } - return false; -} - -TokenTypeId DynamicTokenizer::registerToken(const std::string &token) -{ - // Abort if an empty token should be registered - if (token.empty()) { - return EmptyToken; - } - - // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; - type = i; - break; - } - } - - // No existing slot was found, add a new one -- make sure we do not - // override the special token type handles - if (type == EmptyToken) { - type = tokens.size(); - if (type == TextToken || type == EmptyToken) { - throw OusiaException{"Token type ids depleted!"}; - } - tokens.emplace_back(token); - } - nextTokenTypeId = type + 1; - - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list - if (!trie.registerToken(token, type)) { - tokens[type] = std::string(); - nextTokenTypeId = type; - return EmptyToken; - } - return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ - // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return true; - } - return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ - whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member - function */ -template bool DynamicTokenizer::next<PreservingTextHandler, false>( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, false>( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, false>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<PreservingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, true>( - CharReader &reader,DynamicToken &token); -} - diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp deleted file mode 100644 index 0cac2e8..0000000 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include <set> -#include <string> -#include <vector> - -#include <core/common/Location.hpp> - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - DynamicToken() : type(EmptyToken) {} - - /** - * Constructor of the DynamicToken struct. - * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - DynamicToken(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the DynamicToken struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - DynamicToken(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { - /** - * Preserves all whitespaces as they are found in the source file. - */ - PRESERVE, - - /** - * Trims whitespace at the beginning and the end of the found text. - */ - TRIM, - - /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ - COLLAPSE -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: - /** - * Internally used token trie. This object holds all registered tokens. - */ - TokenTrie trie; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - WhitespaceMode whitespaceMode; - - /** - * Vector containing all registered token types. - */ - std::vector<std::string> tokens; - - /** - * Next index in the tokens list where to search for a new token id. - */ - size_t nextTokenTypeId; - - /** - * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. - * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is the token structure into which the token information - * should be written. - * @return false if the end of the stream has been reached, true otherwise. - */ - template <typename TextHandler, bool read> - bool next(CharReader &reader, DynamicToken &token); - -public: - /** - * Constructor of the DynamicTokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. - */ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - - /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. - * - * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if - * an error occured. - */ - TokenTypeId registerToken(const std::string &token); - - /** - * Unregisters the token belonging to the given TokenTypeId. - * - * @param type is the token type that should be unregistered. The - *TokenTypeId - * must have been returned by registerToken. - * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). - */ - bool unregisterToken(TokenTypeId type); - - /** - * Returns the token that was registered under the given TokenTypeId id or - *an - * empty string if an invalid TokenTypeId id is given. - * - * @param type is the TokenTypeId id for which the corresponding token - *string - * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenTypeId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. - */ - WhitespaceMode getWhitespaceMode(); - - /** - * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool read(CharReader &reader, DynamicToken &token); - - /** - * The peek method does not advance the read position of the char reader, - * but reads the next token from the current char reader peek position. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/formats/osdm/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::make_shared<Node>()).first; - } - node = it->second.get(); - } - - // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { - return false; - } - - // Otherwise just set the type to the given type. - node->type = type; - return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another type - node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { - return false; - } - - // If the target node has children, we cannot delete the subtree. Set the - // type to EmptyToken instead - if (!node->children.empty()) { - node->type = EmptyToken; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return EmptyToken; - } - node = it->second.get(); - } - return node->type; -} -} - diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/formats/osdm/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include <cstdint> -#include <memory> -#include <limits> -#include <unordered_map> - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. - */ -constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - * ~ (0) - * / \ - * a (2) b (0) - * | | - * a (0) a (0) - * | | - * b (1) c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenTypeId type; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param type is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, TokenTypeId type) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. - * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - TokenTypeId hasToken(const std::string &token) const noexcept; - - /** - * Returns a reference at the root node to be used for traversing the token - * tree. - * - * @return a reference at the root node. - */ - const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ - diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/generic/ParserStateCallbacks.hpp> +#include <core/parser/generic/ParserStateStack.hpp> + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. + + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: + /** + * OsdmStreamParser instance. + */ + OsdmStreamParser parser; + + /** + * Instance of the ParserStateStack. + */ + ParserStateStack stack; + +public: + OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsdmParserImplementation parser(reader, ctx); + parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. + */ +class OsdmParser : public Parser { +protected: + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 8cb8caf..0174fa4 100644 --- a/src/formats/osdm/OsdmStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -21,14 +21,14 @@ #include <core/common/Utils.hpp> #include <core/common/VariantReader.hpp> -#include "OsdmStreamParser.hpp" +#include "OsmlStreamParser.hpp" namespace ousia { /** * Plain format default tokenizer. */ -class PlainFormatTokens : public DynamicTokenizer { +class PlainFormatTokens : public Tokenizer { public: /** * Id of the backslash token. @@ -61,6 +61,21 @@ public: TokenTypeId FieldEnd; /** + * Id of the default field start token. + */ + TokenTypeId DefaultFieldStart; + + /** + * Id of the annotation start token. + */ + TokenTypeId AnnotationStart; + + /** + * Id of the annotation end token. + */ + TokenTypeId AnnotationEnd; + + /** * Registers the plain format tokens in the internal tokenizer. */ PlainFormatTokens() @@ -71,6 +86,9 @@ public: BlockCommentEnd = registerToken("}%"); FieldStart = registerToken("{"); FieldEnd = registerToken("}"); + DefaultFieldStart = registerToken("{!"); + AnnotationStart = registerToken("<\\"); + AnnotationEnd = registerToken("\\>"); } }; @@ -160,14 +178,14 @@ public: } }; -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); + commands.push(Command{"", Variant::mapType{}, true, true, true, false}); } -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; bool hasCharSiceNSSep = false; @@ -210,7 +228,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) return res; } -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); @@ -251,7 +269,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() return State::COMMAND; } -static bool checkStillInField(const OsdmStreamParser::Command &cmd, +static bool checkStillInField(const OsmlStreamParser::Command &cmd, const Variant &endName, Logger &logger) { if (cmd.inField && !cmd.inRangeField) { @@ -264,7 +282,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd, return false; } -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +OsmlStreamParser::State OsmlStreamParser::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { @@ -327,7 +345,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand() return cmd.inRangeField ? State::FIELD_END : State::NONE; } -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; @@ -353,7 +371,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) return commandArguments; } -void OsdmStreamParser::pushCommand(Variant commandName, +void OsmlStreamParser::pushCommand(Variant commandName, Variant commandArguments, bool hasRange) { // Store the location on the stack @@ -365,10 +383,11 @@ void OsdmStreamParser::pushCommand(Variant commandName, commands.pop(); } commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); + hasRange, false, false, false}); } -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, + bool isAnnotation) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); @@ -382,6 +401,9 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) Utils::split(commandName.asString(), ':'); const bool isBegin = commandNameComponents[0] == "begin"; const bool isEnd = commandNameComponents[0] == "end"; + + // Parse the begin or end command + State res = State::COMMAND; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( @@ -390,35 +412,81 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) commandName); } if (isBegin) { - return parseBeginCommand(); + res = parseBeginCommand(); } else if (isEnd) { - return parseEndCommand(); + res = parseEndCommand(); } + } else { + // Check whether the next character is a '#', indicating the start of + // the command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + // Parse the arugments + Variant commandArguments = + parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); } - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); + // Check whether a ">" character is the next character that is to be read. + // In that case the current command could be an annotation end command! + char c; + if (reader.fetch(c) && c == '>') { + // Ignore the character after a begin or end command + if (isBegin || isEnd) { + logger.warning( + "Ignoring annotation end character \">\" after special " + "commands \"begin\" or \"end\". Write \"\\>\" to end a " + "\"begin\"/\"end\" enclosed annotation.", + reader); + return res; } - } - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + // If this should be an annoation, ignore the character + if (isAnnotation) { + logger.warning( + "Ignoring annotation end character \">\" after annotation " + "start command. Write \"\\>\" to end the annotation.", + reader); + } else { + // Make sure no arguments apart from the "name" argument are given + // to an annotation end + Variant::mapType &map = commands.top().arguments.asMap(); + if (!map.empty()) { + if (map.count("name") == 0 || map.size() > 1U) { + logger.error( + "An annotation end command may not have any arguments " + "other than \"name\""); + return res; + } + } - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); + // If we got here, this is a valid ANNOTATION_END command, issue it + reader.peek(c); + reader.consumePeek(); + return State::ANNOTATION_END; + } + } - return State::COMMAND; + // If we're starting an annotation, return the command as annotation start + // instead of command + if (isAnnotation && res == State::COMMAND) { + return State::ANNOTATION_START; + } + return res; } -void OsdmStreamParser::parseBlockComment() +void OsmlStreamParser::parseBlockComment() { - DynamicToken token; + Token token; size_t depth = 1; while (tokenizer.read(reader, token)) { if (token.type == Tokens.BlockCommentEnd) { @@ -436,7 +504,7 @@ void OsdmStreamParser::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void OsdmStreamParser::parseLineComment() +void OsmlStreamParser::parseLineComment() { char c; while (reader.read(c)) { @@ -446,7 +514,7 @@ void OsdmStreamParser::parseLineComment() } } -bool OsdmStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData(DataHandler &handler) { if (!handler.isEmpty()) { data = handler.toVariant(reader.getSourceId()); @@ -457,7 +525,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler) return false; } -bool OsdmStreamParser::checkIssueFieldStart() +bool OsmlStreamParser::checkIssueFieldStart() { // Fetch the current command, and check whether we're currently inside a // field of this command @@ -482,18 +550,41 @@ bool OsdmStreamParser::checkIssueFieldStart() return false; } -OsdmStreamParser::State OsdmStreamParser::parse() +bool OsmlStreamParser::closeField() +{ + // Try to end an open field of the current command -- if the current command + // is not inside an open field, end this command and try to close the next + // one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + if (cmd.inDefaultField) { + commands.pop(); + } + return true; + } + commands.pop(); + } else { + return false; + } + } + return false; +} + +OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data DataHandler handler; // Read tokens until the outer loop should be left - DynamicToken token; + Token token; while (tokenizer.peek(reader, token)) { const TokenTypeId type = token.type; // Special handling for Backslash and Text - if (type == Tokens.Backslash) { + if (type == Tokens.Backslash || type == Tokens.AnnotationStart) { // Before appending anything to the output data or starting a new // command, check whether FIELD_START has to be issued, as the // current command is a command with range @@ -519,7 +610,8 @@ OsdmStreamParser::State OsdmStreamParser::parse() } // Parse the actual command - State res = parseCommand(token.location.getStart()); + State res = parseCommand(token.location.getStart(), + type == Tokens.AnnotationStart); switch (res) { case State::ERROR: throw LoggableException( @@ -536,6 +628,14 @@ OsdmStreamParser::State OsdmStreamParser::parse() // to the data buffer, use the escape character start as start // location and the peek offset as end location reader.peek(c); // Peek the previously fetched character + + // If this was an annotation start token, add the parsed < to the + // output + if (type == Tokens.AnnotationStart) { + handler.append('<', token.location.getStart(), + token.location.getStart() + 1); + } + handler.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); @@ -579,28 +679,37 @@ OsdmStreamParser::State OsdmStreamParser::parse() } logger.error( "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", + "start the field. Write \"\\{\" to insert this sequence as " + "text.", token); } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } + if (closeField()) { + return State::FIELD_END; } logger.error( - "Got field end token \"}\", but there is no field to end. Did " - "you mean \"\\}\"?", + "Got field end token \"}\", but there is no field to end. " + "Write \"\\}\" to insert this sequence as text.", token); + } else if (token.type == Tokens.DefaultFieldStart) { + // Try to start a default field the first time the token is reached + Command &topCmd = commands.top(); + if (!topCmd.inField) { + topCmd.inField = true; + topCmd.inDefaultField = true; + return State::FIELD_START; + } + logger.error( + "Got default field start token \"{!\", but no command for " + "which to start the field. Write \"\\{!\" to insert this " + "sequence as text", + token); + } else if (token.type == Tokens.AnnotationEnd) { + // We got a single annotation end token "\>" -- simply issue the + // ANNOTATION_END event + Variant annotationName = Variant::fromString(""); + annotationName.setLocation(token.location); + pushCommand(annotationName, Variant::mapType{}, false); + return State::ANNOTATION_END; } else { logger.error("Unexpected token \"" + token.content + "\"", token); } @@ -627,14 +736,19 @@ OsdmStreamParser::State OsdmStreamParser::parse() return State::END; } -const Variant &OsdmStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; } -const Variant &OsdmStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() const { return commands.top().arguments; } + +bool OsmlStreamParser::inDefaultField() const +{ + return commands.top().inRangeField || commands.top().inDefaultField; +} } diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 48d8fb7..dc3034c 100644 --- a/src/formats/osdm/OsdmStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -17,23 +17,22 @@ */ /** - * @file OsdmStreamParser.hpp + * @file OsmlStreamParser.hpp * - * Provides classes for low-level classes for reading the TeX-esque osdm + * Provides classes for low-level classes for reading the TeX-esque osml * format. The class provided here does not build any model objects and does not * implement the Parser interface. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ +#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ +#define _OUSIA_OSML_STREAM_PARSER_HPP_ #include <stack> #include <core/common/Variant.hpp> - -#include "DynamicTokenizer.hpp" +#include <core/parser/utils/Tokenizer.hpp> namespace ousia { @@ -43,7 +42,7 @@ class Logger; class DataHandler; /** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding @@ -53,10 +52,10 @@ class DataHandler; * fields, as this would lead to too many consecutive errors) a * LoggableException is thrown. */ -class OsdmStreamParser { +class OsmlStreamParser { public: /** - * Enum used to indicate which state the OsdmStreamParser class is in + * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ enum class State { @@ -140,23 +139,35 @@ public: /** * Set to true if this is a command with clear begin and end. */ - bool hasRange; + bool hasRange : 1; /** * Set to true if we are currently inside a field of this command. */ - bool inField; + bool inField : 1; /** * Set to true if we are currently in the range field of the command * (implies inField being set to true). */ - bool inRangeField; + bool inRangeField : 1; + + /** + * Set to true if we are currently in a field that has been especially + * marked as default field (using the "|") syntax. + */ + bool inDefaultField : 1; /** * Default constructor. */ - Command() : hasRange(false), inField(false), inRangeField(false) {} + Command() + : hasRange(false), + inField(false), + inRangeField(false), + inDefaultField() + { + } /** * Constructor of the Command class. @@ -169,16 +180,19 @@ public: * explicit range. * @param inField is set to true if we currently are inside a field * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. + * @param inRangeField is set to true if we currently are inside the + * outer field of a ranged command. + * @param inDefaultField is set to true if we currently are in a + * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField) + Command(Variant name, Variant arguments, bool hasRange, + bool inField, bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), inField(inField), - inRangeField(inRangeField) + inRangeField(inRangeField), + inDefaultField(inDefaultField) { } }; @@ -198,7 +212,7 @@ private: /** * Tokenizer instance used to read individual tokens from the text. */ - DynamicTokenizer tokenizer; + Tokenizer tokenizer; /** * Stack containing the current commands. @@ -258,9 +272,11 @@ private: * * @param start is the start byte offset of the command (including the * backslash) + * @param isAnnotation if true, the command is not returned as command, but + * as annotation start. * @return true if a command was actuall parsed, false otherwise. */ - State parseCommand(size_t start); + State parseCommand(size_t start, bool isAnnotation); /** * Function used internally to parse a block comment. @@ -290,16 +306,26 @@ private: */ bool checkIssueFieldStart(); + /** + * Closes a currently open field. Note that the command will be removed from + * the internal command stack if the field that is being closed is a + * field marked as default field. + * + * @return true if the field could be closed, false if there was no field + * to close. + */ + bool closeField(); + public: /** - * Constructor of the OsdmStreamParser class. Attaches the new - * OsdmStreamParser to the given CharReader and Logger instances. + * Constructor of the OsmlStreamParser class. Attaches the new + * OsmlStreamParser to the given CharReader and Logger instances. * * @param reader is the reader instance from which incomming characters * should be read. * @param logger is the logger instance to which errors should be written. */ - OsdmStreamParser(CharReader &reader, Logger &logger); + OsmlStreamParser(CharReader &reader, Logger &logger); /** * Continues parsing. Returns one of the states defined in the State enum. @@ -318,7 +344,7 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() { return data; } + const Variant &getData() const { return data; } /** * Returns a reference at the internally stored command name. Only valid if @@ -327,7 +353,7 @@ public: * @return a reference at a variant containing name and location of the * parsed command. */ - const Variant &getCommandName(); + const Variant &getCommandName() const; /** * Returns a reference at the internally stored command name. Only valid if @@ -336,16 +362,24 @@ public: * @return a reference at a variant containing arguments given to the * command. */ - const Variant &getCommandArguments(); + const Variant &getCommandArguments() const; + + /** + * Returns true if the current field is the "default" field. This is true if + * the parser either is in the outer range of a range command or inside a + * field that has been especially marked as "default" field (using the "|" + * syntax). + */ + bool inDefaultField() const; /** * Returns a reference at the char reader. * * @return the last internal token location. */ - SourceLocation &getLocation() { return location; } + const SourceLocation &getLocation() const { return location; } }; } -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ +#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */ diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp new file mode 100644 index 0000000..e37446a --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.cpp @@ -0,0 +1,144 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/Location.hpp> +#include <core/common/CharReader.hpp> +#include <core/common/Utils.hpp> + +#include "OsxmlAttributeLocator.hpp" + +namespace ousia { + +/** + * Enum used internally in the statemachine of the xml argument parser. + */ +enum class XmlAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate( + CharReader &reader, size_t offs) +{ + std::map<std::string, SourceLocation> res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + if (offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... (and we + // can assume the XML is valid as it was already read by expat) + XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XmlAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + res.emplace("$tag", + SourceLocation{reader.getSourceId(), offs + 1, + readerFork.getOffset() - 1}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + case XmlAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XmlAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XmlAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XmlAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XmlAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XmlAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + state = XmlAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XmlAttributeState::IN_ATTR_NAME; + } + } + break; + case XmlAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, set the end + // location + auto it = res.find(attrName.str()); + if (it != res.end()) { + it->second.setEnd(readerFork.getOffset() - 1); + } + + // Reset the attribute name and restart the search + attrName.str(std::string{}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} +} + diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp new file mode 100644 index 0000000..f9a3437 --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.hpp @@ -0,0 +1,67 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlAttributeLocator.hpp + * + * Contains a class used for locating the byte offsets of the attributes given + * in a XML tag. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ +#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ + +#include <map> + +namespace ousia { + +// Forward declarations +class CharReader; +class SourceLocation; + +/** + * Class containing one static function for locating the byte offsets of the + * attributes in a XML tag. This are not retrieved by our xml parser, so we have + * to do this manually. + */ +class OsxmlAttributeLocator { +public: + /** + * Function used to reconstruct the location of the attributes of a XML tag + * in the source code. This is necessary, as the xml parser only returns an + * offset to the begining of a tag and not to the position of the individual + * arguments. + * + * @param reader is the char reader from which the character data should be + * read. + * @param offs is a byte offset in the xml file pointing at the "<" + * character of the tag. + * @return a map from attribute keys to the corresponding location + * (including range) of the atribute. Also contains the location of the + * tagname in the form of the virtual attribute "$tag". + */ + static std::map<std::string, SourceLocation> locate(CharReader &reader, + size_t offs); +}; + +} + +#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */ + diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp new file mode 100644 index 0000000..7404960 --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -0,0 +1,547 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <expat.h> + +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Variant.hpp> +#include <core/common/VariantReader.hpp> +#include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> + +#include "OsxmlAttributeLocator.hpp" +#include "OsxmlEventParser.hpp" + +namespace ousia { + +/* Class OsxmlEventParser */ + +/** + * Class containing data used by the internal functions. + */ +class OsxmlEventParserData { +public: + /** + * Contains the current depth of the parsing process. + */ + ssize_t depth; + + /** + * Set to a value larger or equal to zero if the parser is currently inside + * an annotation end tag -- the value represents the depth in which the + * tag was opened. + */ + ssize_t annotationEndTagDepth; + + /** + * Current character data buffer. + */ + std::vector<char> textBuf; + + /** + * Current whitespace buffer (for the trimming whitspace mode) + */ + std::vector<char> whitespaceBuf; + + /** + * Flag indicating whether a whitespace character was present (for the + * collapsing whitespace mode). + */ + bool hasWhitespace; + + /** + * Current character data start. + */ + size_t textStart; + + /** + * Current character data end. + */ + size_t textEnd; + + /** + * Default constructor. + */ + OsxmlEventParserData(); + + /** + * Increments the depth. + */ + void incrDepth(); + + /** + * Decrement the depth and reset the annotationEndTagDepth flag. + */ + void decrDepth(); + + /** + * Returns true if we're currently inside an end tag. + */ + bool inAnnotationEndTag(); + + /** + * Returns true if character data is available. + * + * @return true if character data is available. + */ + bool hasText(); + + /** + * Returns a Variant containing the character data and its location. + * + * @return a string variant containing the text data and the character + * location. + */ + Variant getText(SourceId sourceId); +}; + +/* Class GuardedExpatXmlParser */ + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class GuardedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS + * from the expat library. Throws a parser exception if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destuctor of the GuardedExpatXmlParser, frees the XML parser instance. + */ + ~GuardedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/** + * Name of the special outer tag used for allowing multiple top-level elements + * in an xml file. + */ +static const std::string TOP_LEVEL_TAG{"ousia"}; + +/** + * Prefix used to indicate the start of an annoation (note the trailing colon) + */ +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; + +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; + +/** + * Synchronizes the position of the xml parser with the default location of the + * logger instance. + * + * @param p is a pointer at the xml parser instance. + * @param len is the length of the string that should be refered to. + * @return the SourceLocation that has been set in the logger. + */ +static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the OsxmlEventParser instance + OsxmlEventParser *parser = + static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + + // Fetch the current location in the XML file and set the default location + // in the logger + size_t offs = XML_GetCurrentByteIndex(p); + SourceLocation loc = + SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; + parser->getLogger().setDefaultLocation(loc); + + // Return the fetched location + return loc; +} + +/** + * Callback called by eXpat whenever a start handler is reached. + */ +static void xmlStartElementHandler(void *ref, const XML_Char *name, + const XML_Char **attrs) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast<XML_Parser>(ref); + OsxmlEventParser *parser = + static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map<std::string, SourceLocation> attributeOffsets = + OsxmlAttributeLocator::locate(parser->getReader(), + XML_GetCurrentByteIndex(p)); + + // Update the logger position + SourceLocation loc = xmlSyncLoggerPosition(p); + + // Fetch the location of the name + SourceLocation nameLoc = loc; + auto it = attributeOffsets.find("$tag"); + if (it != attributeOffsets.end()) { + nameLoc = it->second; + } + // Increment the current depth + parser->getData().incrDepth(); + + // Make sure we're currently not inside an annotation end tag -- this would + // be highly illegal! + if (parser->getData().inAnnotationEndTag()) { + parser->getLogger().error( + "No tags allowed inside an annotation end tag", nameLoc); + return; + } + + // Assemble the arguments + Variant::mapType args; + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = attributeOffsets.find(key); + if (it != attributeOffsets.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair<bool, Variant> value = VariantReader::parseGenericString( + *(attr++), parser->getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + + // Set the overall location of the parsed element to the attribute + // location + value.second.setLocation(keyLoc); + + // Store the keys in the map + args.emplace(key, value.second).second; + } + + // Fetch the name of the tag, check for special tags + std::string nameStr(name); + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { + // We're in the top-level and the magic tag is reached -- just + // ignore it and issue a warning for each argument that has been given + for (const auto &arg : args) { + parser->getLogger().warning(std::string("Ignoring attribute \"") + + arg.first + + std::string("\" for magic tag \"") + + TOP_LEVEL_TAG + std::string("\""), + arg.second); + } + } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + Variant nameVar = + Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); + nameVar.setLocation(nameLoc); + + // Issue the "annotationStart" event + parser->getEvents().annotationStart(nameVar, args); + } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size()); + + // Discard a potentially leading colon + if (!nameStr.empty() && nameStr[0] == ':') { + nameStr = nameStr.substr(1); + } + + // Assemble the variant containing the name and its location + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + + // Check whether a "name" attribute was given + Variant elementName; + for (const auto &arg : args) { + if (arg.first == "name") { + elementName = arg.second; + } else { + parser->getLogger().warning( + std::string("Ignoring attribute \"") + arg.first + + "\" in annotation end tag", + arg.second); + } + } + + // Set the annotationEndTagDepth to disallow any further tags to be + // opened inside the annotation end tag. + parser->getData().annotationEndTagDepth = parser->getData().depth; + + // Issue the "annotationEnd" event + parser->getEvents().annotationEnd(nameVar, args); + } else { + // Just issue a "commandStart" event in any other case + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + parser->getEvents().command(nameVar, args); + } +} + +static void xmlEndElementHandler(void *ref, const XML_Char *name) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast<XML_Parser>(ref); + OsxmlEventParser *parser = + static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + + // Synchronize the position of the logger with teh position + xmlSyncLoggerPosition(p); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + parser->getData().decrDepth(); + return; + } + + // Decrement the current depth + parser->getData().decrDepth(); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } + + // Abort if the special ousia tag ends here + std::string nameStr{name}; + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { + return; + } + + // Issue the "fieldEnd" event + parser->getEvents().fieldEnd(); +} + +static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast<XML_Parser>(ref); + OsxmlEventParser *parser = + static_cast<OsxmlEventParser *>(XML_GetUserData(p)); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + return; + } + + // Convert the signed (smell the 90's C library here?) length to an usigned + // value + size_t ulen = len > 0 ? static_cast<size_t>(len) : 0; + + // Synchronize the logger position + SourceLocation loc = xmlSyncLoggerPosition(p, ulen); + + // Fetch some variables for convenience + const WhitespaceMode mode = parser->getWhitespaceMode(); + OsxmlEventParserData &data = parser->getData(); + std::vector<char> &textBuf = data.textBuf; + std::vector<char> &whitespaceBuf = data.whitespaceBuf; + bool &hasWhitespace = data.hasWhitespace; + size_t &textStart = data.textStart; + size_t &textEnd = data.textEnd; + + size_t pos = loc.getStart(); + for (size_t i = 0; i < ulen; i++, pos++) { + switch (mode) { + case WhitespaceMode::PRESERVE: + PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd); + break; + case WhitespaceMode::TRIM: + TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + whitespaceBuf); + break; + case WhitespaceMode::COLLAPSE: + CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + hasWhitespace); + break; + } + } +} + +/* Class OsxmlEvents */ + +OsxmlEvents::~OsxmlEvents() {} + +/* Class OsxmlEventParser */ + +OsxmlEventParserData::OsxmlEventParserData() + : depth(0), + annotationEndTagDepth(-1), + hasWhitespace(false), + textStart(0), + textEnd(0) +{ +} + +void OsxmlEventParserData::incrDepth() { depth++; } + +void OsxmlEventParserData::decrDepth() +{ + if (depth > 0) { + depth--; + } + if (depth < annotationEndTagDepth) { + annotationEndTagDepth = -1; + } +} + +bool OsxmlEventParserData::inAnnotationEndTag() +{ + return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); +} + +bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } + +Variant OsxmlEventParserData::getText(SourceId sourceId) +{ + // Create a variant containing the string data and the location + Variant var = + Variant::fromString(std::string{textBuf.data(), textBuf.size()}); + var.setLocation({sourceId, textStart, textEnd}); + + // Reset the text buffers + textBuf.clear(); + whitespaceBuf.clear(); + hasWhitespace = false; + textStart = 0; + textEnd = 0; + + // Return the variant + return var; +} + +/* Class OsxmlEventParser */ + +OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, + Logger &logger) + : reader(reader), + events(events), + logger(logger), + whitespaceMode(WhitespaceMode::TRIM), + data(new OsxmlEventParserData()) +{ +} + +OsxmlEventParser::~OsxmlEventParser() {} + +void OsxmlEventParser::parse() +{ + // Create the parser object + GuardedExpatXmlParser p{"UTF-8"}; + + // Reset the depth + data->depth = 0; + + // Pass the reference to this parser instance to the XML handler + XML_SetUserData(&p, this); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast<char *>(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw OusiaException{"Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error as exception + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + throw LoggableException{ + "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, + xmlSyncLoggerPosition(&p)}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} + +void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + this->whitespaceMode = whitespaceMode; +} + +WhitespaceMode OsxmlEventParser::getWhitespaceMode() const +{ + return whitespaceMode; +} + +CharReader &OsxmlEventParser::getReader() const { return reader; } + +Logger &OsxmlEventParser::getLogger() const { return logger; } + +OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } + +OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } +} + diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp new file mode 100644 index 0000000..e39245f --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -0,0 +1,217 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlEventParser.hpp + * + * The OsxmlEventParser class is responsible for parsing an XML file and calling + * the corresponding event handler functions if an XML item is found. Event + * handling is performed using a listener interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OSXML_EVENT_PARSER_HPP_ +#define _OSXML_EVENT_PARSER_HPP_ + +#include <memory> +#include <string> + +#include <core/common/Whitespace.hpp> + +namespace ousia { + +// Forward declarations +class Logger; +class Variant; +class OsxmlEventParserData; + +/** + * Interface which defines the callback functions which are called by the + * OsxmlEventParser whenever an event occurs. + */ +class OsxmlEvents { +public: + /** + * Virtual destructor. + */ + virtual ~OsxmlEvents(); + + /** + * Called whenever a command starts. Note that this implicitly always starts + * the default field of the command. + * + * @param name is a string variant containing name and location of the + * command. + * @param args is a map containing the arguments that were given to the + * command. + */ + virtual void command(const Variant &name, const Variant::mapType &args) = 0; + + /** + * Called whenever an annotation starts. Note that this implicitly always + * starts the default field of the annotation. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the annotation definition. + * @param args is a map variant containing the arguments that were given + * to the annotation definition. + */ + virtual void annotationStart(const Variant &className, + const Variant::mapType &args) = 0; + + /** + * Called whenever the range of an annotation ends. The callee must + * disambiguate the actual annotation that is finished here. + * + * @param className is a string variant containing the name of the + * annotation class that should end here. May be empty (or nullptr), if no + * elementName has been specified at the end of the annotation. + * @param elementName is the name of the annotation element that should be + * ended here. May be empty (or nullptr), if no elementName has been + * specified at the end of the annotation. + */ + virtual void annotationEnd(const Variant &className, + const Variant &elementName) = 0; + + /** + * Called whenever the default field which was implicitly started by + * commandStart or annotationStart ends. Note that this does not end the + * range of an annotation, but the default field of the annotation. To + * signal the end of the annotation this, the annotationEnd method will be + * invoked. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever data is found. Whitespace data is handled as specified + * and the data has been parsed to the specified variant type. This function + * is not called if the parsing failed, the parser prints an error message + * instead. + * + * @param data is the already parsed data that should be passed to the + * handler. + */ + virtual void data(const Variant &data) = 0; +}; + +/** + * The OsxmlEventParser class is a wrapper around eXpat which implements the + * specialities of the osxml formats class (like annotation ranges). It notifies + * a specified event handler whenever a command, annotation or data has been + * reached. + */ +class OsxmlEventParser { +private: + /** + * Reference at the internal CharReader instance. + */ + CharReader &reader; + + /** + * Set of callback functions to be called whenever an event is triggered. + */ + OsxmlEvents &events; + + /** + * Reference at the Logger object to which error messages or warnings should + * be logged. + */ + Logger &logger; + + /** + * Current whitespace mode. + */ + WhitespaceMode whitespaceMode; + + /** + * Data to be used by the internal functions. + */ + std::unique_ptr<OsxmlEventParserData> data; + +public: + /** + * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents + * of which the callback functions are called. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param events is a refence at an instance of the OsxmlEvents class. All + * events are forwarded to this class. + * @param logger is the Logger instance to which log messages should be + * written. + */ + OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + + /** + * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type) + */ + ~OsxmlEventParser(); + + /** + * Performs the actual parsing. Reads the XML using eXpat and calles the + * callbacks in the event listener instance whenever something interesting + * happens. + */ + void parse(); + + /** + * Sets the whitespace handling mode. + * + * @param whitespaceMode defines how whitespace in the data should be + * handled. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Returns the current whitespace handling mode. + * + * @return the currently set whitespace handling mode. + */ + WhitespaceMode getWhitespaceMode() const; + + /** + * Returns the internal CharReader reference. + * + * @return the CharReader reference. + */ + CharReader &getReader() const; + + /** + * Returns the internal Logger reference. + * + * @return the internal Logger reference. + */ + Logger &getLogger() const; + + /** + * Returns the internal OsxmlEvents reference. + * + * @return the internal OsxmlEvents reference. + */ + OsxmlEvents &getEvents() const; + + /** + * Returns a reference at the internal data. + */ + OsxmlEventParserData &getData() const; +}; +} + +#endif /* _OSXML_EVENT_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp new file mode 100644 index 0000000..c216855 --- /dev/null +++ b/src/formats/osxml/OsxmlParser.cpp @@ -0,0 +1,98 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/parser/stack/GenericParserStates.hpp> +#include <core/parser/stack/Stack.hpp> +#include <core/parser/ParserContext.hpp> + +#include "OsxmlEventParser.hpp" +#include "OsxmlParser.hpp" + +namespace ousia { + +using namespace parser_stack; + +/** + * Class containing the actual OsxmlParser implementation. + */ +class OsxmlParserImplementation : public OsxmlEvents { +private: + /** + * Actual xml parser -- converts the xml stream into a set of events. + */ + OsxmlEventParser parser; + + /** + * Pushdown automaton responsible for converting the xml events into an + * actual Node tree. + */ + Stack stack; + +public: + /** + * Constructor of the OsxmlParserImplementation class. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. + */ + OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) + : parser(reader, *this, ctx.getLogger()), + stack(ctx, GenericParserStates) + { + } + + /** + * Starts the actual parsing process. + */ + void parse() { parser.parse(); } + + void command(const Variant &name, const Variant::mapType &args) override + { + stack.command(name, args); + stack.fieldStart(true); + } + + void annotationStart(const Variant &name, + const Variant::mapType &args) override + { + stack.annotationStart(name, args); + stack.fieldStart(true); + } + + void annotationEnd(const Variant &className, + const Variant &elementName) override + { + stack.annotationEnd(className, elementName); + } + + void fieldEnd() override { stack.fieldEnd(); } + + void data(const Variant &data) override { stack.data(data); } +}; + +/* Class OsxmlParser */ + +void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsxmlParserImplementation impl(reader, ctx); + impl.parse(); +} +} + diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp new file mode 100644 index 0000000..0fbf83c --- /dev/null +++ b/src/formats/osxml/OsxmlParser.hpp @@ -0,0 +1,55 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsxmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_PARSER_HPP_ +#define _OUSIA_OSXML_PARSER_HPP_ + +#include <core/parser/Parser.hpp> + +namespace ousia { + +/** + * The OsxmlParser class implements parsing the various types of Ousía XML + * documents using the OsxmlEventParser and Stack classes. + */ +class OsxmlParser : public Parser { +protected: + /** + * Parses the given input stream as XML file and returns the parsed + * top-level node. + * + * @param reader is the CharReader from which the input should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. + */ + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSXML_PARSER_HPP_ */ + |