Merge branch 'master' of somweyr.de:ousia

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-15 21:56:04 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-15 21:56:04 +0100
commit: d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch)
tree: 8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/formats
parent: 40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff)
parent: 4f2872d9968aec93bebff90d1238347c8a364949 (diff)
14 files changed, 1465 insertions, 1149 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp
deleted file mode 100644
index f2cfcd1..0000000
--- a/src/formats/osdm/DynamicTokenizer.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
-    Ousía
-    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <memory>
-#include <vector>
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Exceptions.hpp>
-#include <core/common/Utils.hpp>
-
-#include "DynamicTokenizer.hpp"
-
-namespace ousia {
-
-namespace {
-
-/* Internal class TokenMatch */
-
-/**
- * Contains information about a matching token.
- */
-struct TokenMatch {
-	/**
-	 * Token that was matched.
-	 */
-	DynamicToken token;
-
-	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
-	 */
-	size_t textLength;
-
-	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
-	 */
-	size_t textEnd;
-
-	/**
-	 * Constructor of the TokenMatch class.
-	 */
-	TokenMatch() : textLength(0), textEnd(0) {}
-
-	/**
-	 * Returns true if this TokenMatch instance actually represents a match.
-	 */
-	bool hasMatch() { return token.type != EmptyToken; }
-};
-
-/* Internal class TokenLookup */
-
-/**
- * The TokenLookup class is used to represent a thread in a running token
- * lookup.
- */
-class TokenLookup {
-private:
-	/**
-	 * Current node within the token trie.
-	 */
-	TokenTrie::Node const *node;
-
-	/**
-	 * Start offset within the source file.
-	 */
-	size_t start;
-
-	/**
-	 * Current length of the data within the text handler. The text buffer needs
-	 * to be trimmed to this length if this token matches.
-	 */
-	size_t textLength;
-
-	/**
-	 * End location of the current text handler. This location needs to be used
-	 * for the text token that is emitted before the actual token.
-	 */
-	size_t textEnd;
-
-public:
-	/**
-	 * Constructor of the TokenLookup class.
-	 *
-	 * @param node is the current node.
-	 * @param start is the start position.
-	 * @param textLength is the text buffer length of the previous text token.
-	 * @param textEnd is the current end location of the previous text token.
-	 */
-	TokenLookup(const TokenTrie::Node *node, size_t start,
-	            size_t textLength, size_t textEnd)
-	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
-	{
-	}
-
-	/**
-	 * Tries to extend the current path in the token trie with the given
-	 * character. If a complete token is matched, stores it in the given
-	 * match (in case it is longer than the previously matched token).
-	 *
-	 * @param c is the character that should be appended to the current prefix.
-	 * @param lookups is a list to which new TokenLookup instances are added --
-	 * which could potentially be expanded in the next iteration.
-	 * @param match is the TokenMatch instance to which the matching token
-	 * should be written.
-	 * @param tokens is a reference at the internal token list of the
-	 * DynamicTokenizer.
-	 * @param end is the end byte offset of the current character.
-	 * @param sourceId is the source id of this file.
-	 */
-	void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
-	             const std::vector<std::string> &tokens, SourceOffset end,
-	             SourceId sourceId)
-	{
-		// Check whether we can continue the current token path with the given
-		// character, abort if the current node has no matching child
-		auto it = node->children.find(c);
-		if (it == node->children.end()) {
-			return;
-		}
-
-		// Check whether the new node represents a complete token and whether it
-		// is longer than the current token. If yes, replace the current token.
-		node = it->second.get();
-		if (node->type != EmptyToken) {
-			const std::string &str = tokens[node->type];
-			size_t len = str.size();
-			if (len > match.token.content.size()) {
-				match.token =
-				    DynamicToken{node->type, str, {sourceId, start, end}};
-				match.textLength = textLength;
-				match.textEnd = textEnd;
-			}
-		}
-
-		// If this state can possibly be advanced, store it in the states list.
-		if (!node->children.empty()) {
-			lookups.emplace_back(*this);
-		}
-	}
-};
-
-/* Internal class TextHandlerBase */
-
-/**
- * Base class used for those classes that may be used as TextHandler in the
- * DynamicTokenizer::next function.
- */
-class TextHandlerBase {
-public:
-	/**
-	 * Start position of the extracted text.
-	 */
-	size_t textStart;
-
-	/**
-	 * End position of the extracted text.
-	 */
-	size_t textEnd;
-
-	/**
-	 * Buffer containing the extracted text.
-	 */
-	std::vector<char> textBuf;
-
-	/**
-	 * Constructor of the TextHandlerBase base class. Initializes the start and
-	 * end position with zeros.
-	 */
-	TextHandlerBase() : textStart(0), textEnd(0) {}
-
-	/**
-	 * Transforms the given token into a text token containing the extracted
-	 * text.
-	 *
-	 * @param match is the match whose token is turned into the text token.
-	 * @param sourceId is the source id of the underlying file.
-	 */
-	void buildTextToken(TokenMatch &match, SourceId sourceId)
-	{
-		if (match.hasMatch()) {
-			match.token.content =
-			    std::string{textBuf.data(), match.textLength};
-			match.token.location =
-			    SourceLocation{sourceId, textStart, match.textEnd};
-		} else {
-			match.token.content = std::string{textBuf.data(), textBuf.size()};
-			match.token.location = SourceLocation{sourceId, textStart, textEnd};
-		}
-		match.token.type = TextToken;
-	}
-
-	/**
-	 * Returns true if this whitespace handler has found any text and a text
-	 * token could be emitted.
-	 *
-	 * @return true if the internal data buffer is non-empty.
-	 */
-	bool hasText() { return !textBuf.empty(); }
-};
-
-/* Internal class PreservingTextHandler */
-
-/**
- * The PreservingTextHandler class preserves all characters unmodified,
- * including whitespace characters.
- */
-class PreservingTextHandler : public TextHandlerBase {
-public:
-	using TextHandlerBase::TextHandlerBase;
-
-	/**
-	 * Appends the given character to the internal text buffer, does not
-	 * eliminate whitespace.
-	 *
-	 * @param c is the character that should be appended to the internal buffer.
-	 * @param start is the start byte offset of the given character.
-	 * @param end is the end byte offset of the given character.
-	 */
-	void append(char c, size_t start, size_t end)
-	{
-		if (textBuf.empty()) {
-			textStart = start;
-		}
-		textEnd = end;
-		textBuf.push_back(c);
-	}
-};
-
-/* Internal class TrimmingTextHandler */
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the
- * beginning and the end of a text section but leaves all other characters
- * unmodified, including whitespace characters.
- */
-class TrimmingTextHandler : public TextHandlerBase {
-public:
-	using TextHandlerBase::TextHandlerBase;
-
-	/**
-	 * Buffer used internally to temporarily store all whitespace characters.
-	 * They are only added to the output buffer if another non-whitespace
-	 * character is reached.
-	 */
-	std::vector<char> whitespaceBuf;
-
-	/**
-	 * Appends the given character to the internal text buffer, eliminates
-	 * whitespace characters at the begin and end of the text.
-	 *
-	 * @param c is the character that should be appended to the internal buffer.
-	 * @param start is the start byte offset of the given character.
-	 * @param end is the end byte offset of the given character.
-	 */
-	void append(char c, size_t start, size_t end)
-	{
-		// Handle whitespace characters
-		if (Utils::isWhitespace(c)) {
-			if (!textBuf.empty()) {
-				whitespaceBuf.push_back(c);
-			}
-			return;
-		}
-
-		// Set the start and end offset correctly
-		if (textBuf.empty()) {
-			textStart = start;
-		}
-		textEnd = end;
-
-		// Store the character
-		if (!whitespaceBuf.empty()) {
-			textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
-			               whitespaceBuf.end());
-			whitespaceBuf.clear();
-		}
-		textBuf.push_back(c);
-	}
-};
-
-/* Internal class CollapsingTextHandler */
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduces multiple whitespace characters to a single blank.
- */
-class CollapsingTextHandler : public TextHandlerBase {
-public:
-	using TextHandlerBase::TextHandlerBase;
-
-	/**
-	 * Flag set to true if a whitespace character was reached.
-	 */
-	bool hasWhitespace = false;
-
-	/**
-	 * Appends the given character to the internal text buffer, eliminates
-	 * redundant whitespace characters.
-	 *
-	 * @param c is the character that should be appended to the internal buffer.
-	 * @param start is the start byte offset of the given character.
-	 * @param end is the end byte offset of the given character.
-	 */
-	void append(char c, size_t start, size_t end)
-	{
-		// Handle whitespace characters
-		if (Utils::isWhitespace(c)) {
-			if (!textBuf.empty()) {
-				hasWhitespace = true;
-			}
-			return;
-		}
-
-		// Set the start and end offset correctly
-		if (textBuf.empty()) {
-			textStart = start;
-		}
-		textEnd = end;
-
-		// Store the character
-		if (hasWhitespace) {
-			textBuf.push_back(' ');
-			hasWhitespace = false;
-		}
-		textBuf.push_back(c);
-	}
-};
-}
-
-/* Class DynamicTokenizer */
-
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
-    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
-{
-}
-
-template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
-{
-	// If we're in the read mode, reset the char reader peek position to the
-	// current read position
-	if (read) {
-		reader.resetPeek();
-	}
-
-	// Prepare the lookups in the token trie
-	const TokenTrie::Node *root = trie.getRoot();
-	TokenMatch match;
-	std::vector<TokenLookup> lookups;
-	std::vector<TokenLookup> nextLookups;
-
-	// Instantiate the text handler
-	TextHandler textHandler;
-
-	// Peek characters from the reader and try to advance the current token tree
-	// cursor
-	char c;
-	size_t charStart = reader.getPeekOffset();
-	const SourceId sourceId = reader.getSourceId();
-	while (reader.peek(c)) {
-		const size_t charEnd = reader.getPeekOffset();
-		const size_t textLength = textHandler.textBuf.size();
-		const size_t textEnd = textHandler.textEnd;
-
-		// If we do not have a match yet, start a new lookup from the root
-		if (!match.hasMatch()) {
-			TokenLookup{root, charStart, textLength, textEnd}.advance(
-			    c, nextLookups, match, tokens, charEnd, sourceId);
-		}
-
-		// Try to advance all other lookups with the new character
-		for (TokenLookup &lookup : lookups) {
-			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
-		}
-
-		// We have found a token and there are no more states to advance or the
-		// text handler has found something -- abort to return the new token
-		if (match.hasMatch()) {
-			if ((nextLookups.empty() || textHandler.hasText())) {
-				break;
-			}
-		} else {
-			// Record all incoming characters
-			textHandler.append(c, charStart, charEnd);
-		}
-
-		// Swap the lookups and the nextLookups list
-		lookups = std::move(nextLookups);
-		nextLookups.clear();
-
-		// Advance the offset
-		charStart = charEnd;
-	}
-
-	// If we found text, emit that text
-	if (textHandler.hasText() &&
-	    (!match.hasMatch() || match.textLength > 0)) {
-		textHandler.buildTextToken(match, sourceId);
-	}
-
-	// Move the read/peek cursor to the end of the token, abort if an error
-	// happens while doing so
-	if (match.hasMatch()) {
-		// Make sure we have a valid location
-		if (match.token.location.getEnd() == InvalidSourceOffset) {
-			throw OusiaException{"Token end position offset out of range"};
-		}
-
-		// Seek to the end of the current token
-		const size_t end = match.token.location.getEnd();
-		if (read) {
-			reader.seek(end);
-		} else {
-			reader.seekPeekCursor(end);
-		}
-		token = match.token;
-	} else {
-		token = DynamicToken{};
-	}
-	return match.hasMatch();
-}
-
-bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
-{
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, true>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, true>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, true>(reader, token);
-	}
-	return false;
-}
-
-bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
-{
-	switch (whitespaceMode) {
-		case WhitespaceMode::PRESERVE:
-			return next<PreservingTextHandler, false>(reader, token);
-		case WhitespaceMode::TRIM:
-			return next<TrimmingTextHandler, false>(reader, token);
-		case WhitespaceMode::COLLAPSE:
-			return next<CollapsingTextHandler, false>(reader, token);
-	}
-	return false;
-}
-
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
-{
-	// Abort if an empty token should be registered
-	if (token.empty()) {
-		return EmptyToken;
-	}
-
-	// Search for a new slot in the tokens list
-	TokenTypeId type = EmptyToken;
-	for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
-		if (tokens[i].empty()) {
-			tokens[i] = token;
-			type = i;
-			break;
-		}
-	}
-
-	// No existing slot was found, add a new one -- make sure we do not
-	// override the special token type handles
-	if (type == EmptyToken) {
-		type = tokens.size();
-		if (type == TextToken || type == EmptyToken) {
-			throw OusiaException{"Token type ids depleted!"};
-		}
-		tokens.emplace_back(token);
-	}
-	nextTokenTypeId = type + 1;
-
-	// Try to register the token in the trie -- if this fails, remove it
-	// from the tokens list
-	if (!trie.registerToken(token, type)) {
-		tokens[type] = std::string();
-		nextTokenTypeId = type;
-		return EmptyToken;
-	}
-	return type;
-}
-
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
-{
-	// Unregister the token from the trie, abort if an invalid type is given
-	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
-		tokens[type] = std::string{};
-		nextTokenTypeId = type;
-		return true;
-	}
-	return false;
-}
-
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
-{
-	if (type < tokens.size()) {
-		return tokens[type];
-	}
-	return std::string{};
-}
-
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
-{
-	whitespaceMode = mode;
-}
-
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
-
-/* Explicitly instantiate all possible instantiations of the "next" member
-   function */
-template bool DynamicTokenizer::next<PreservingTextHandler, false>(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
-    CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
-    CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<PreservingTextHandler, true>(
-    CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
-    CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
-    CharReader &reader,DynamicToken &token);
-}
-
diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp
deleted file mode 100644
index 0cac2e8..0000000
--- a/src/formats/osdm/DynamicTokenizer.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
-    Ousía
-    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file DynamicTokenizer.hpp
- *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-
-#include <set>
-#include <string>
-#include <vector>
-
-#include <core/common/Location.hpp>
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-// Forward declarations
-class CharReader;
-
-/**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
- */
-struct DynamicToken {
-	/**
-	 * Id of the type of this token.
-	 */
-	TokenTypeId type;
-
-	/**
-	 * String that was matched.
-	 */
-	std::string content;
-
-	/**
-	 * Location from which the string was extracted.
-	 */
-	SourceLocation location;
-
-	/**
-	 * Default constructor.
-	 */
-	DynamicToken() : type(EmptyToken) {}
-
-	/**
-	 * Constructor of the DynamicToken struct.
-	 *
-	 * @param type represents the token type.
-	 * @param content is the string content that has been extracted.
-	 * @param location is the location of the extracted string content in the
-	 * source file.
-	 */
-	DynamicToken(TokenTypeId type, const std::string &content,
-	             SourceLocation location)
-	    : type(type), content(content), location(location)
-	{
-	}
-
-	/**
-	 * Constructor of the DynamicToken struct, only initializes the token type
-	 *
-	 * @param type is the id corresponding to the type of the token.
-	 */
-	DynamicToken(TokenTypeId type) : type(type) {}
-
-	/**
-	 * The getLocation function allows the tokens to be directly passed as
-	 * parameter to Logger or LoggableException instances.
-	 *
-	 * @return a reference at the location field
-	 */
-	const SourceLocation &getLocation() const { return location; }
-};
-
-/**
- * Enum specifying the whitespace handling of the DynamicTokenizer class when
- * reading non-token text.
- */
-enum class WhitespaceMode {
-	/**
-     * Preserves all whitespaces as they are found in the source file.
-     */
-	PRESERVE,
-
-	/**
-     * Trims whitespace at the beginning and the end of the found text.
-     */
-	TRIM,
-
-	/**
-     * Whitespaces are trimmed and collapsed, multiple whitespace characters
-     * are replaced by a single space character.
-     */
-	COLLAPSE
-};
-
-/**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * DynamicTokenizer always tries to extract the longest possible token from the
- * tokenizer.
- */
-class DynamicTokenizer {
-private:
-	/**
-	 * Internally used token trie. This object holds all registered tokens.
-	 */
-	TokenTrie trie;
-
-	/**
-	 * Mode defining how whitespace characters in text are handled.
-	 */
-	WhitespaceMode whitespaceMode;
-
-	/**
-	 * Vector containing all registered token types.
-	 */
-	std::vector<std::string> tokens;
-
-	/**
-	 * Next index in the tokens list where to search for a new token id.
-	 */
-	size_t nextTokenTypeId;
-
-	/**
-	 * Templated function used internally to read the current token. The
-	 * function is templated in order to force code generation for all six
-	 * combinations of whitespace modes and reading/peeking.
-	 *
-	 * @tparam TextHandler is the type to be used for the textHandler instance.
-	 * @tparam read specifies whether the function should start from and advance
-	 * the read pointer of the char reader.
-	 * @param reader is the CharReader instance from which the data should be
-	 * read.
-	 * @param token is the token structure into which the token information
-	 * should be written.
-	 * @return false if the end of the stream has been reached, true otherwise.
-	 */
-	template <typename TextHandler, bool read>
-	bool next(CharReader &reader, DynamicToken &token);
-
-public:
-	/**
-	 * Constructor of the DynamicTokenizer class.
-	 *
-	 * @param whitespaceMode specifies how whitespace should be handled.
-	 */
-	DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
-
-	/**
-	 * Registers the given string as a token. Returns the TokenTypeId that
-	 * will be used to reference the newly created token.
-	 *
-	 * @param token is the token string that should be registered.
-	 * @return a unique identifier for the registered token or EmptyToken if
-	 * an error occurred.
-	 */
-	TokenTypeId registerToken(const std::string &token);
-
-	/**
-	 * Unregisters the token belonging to the given TokenTypeId.
-	 *
-	 * @param type is the token type that should be unregistered. The given
-	 * TokenTypeId must have been returned by a previous call to
-	 * registerToken.
-	 * @return true if the operation was successful, false otherwise (e.g.
-	 * because the given TokenTypeId was already unregistered).
-	 */
-	bool unregisterToken(TokenTypeId type);
-
-	/**
-	 * Returns the token string that was registered under the given
-	 * TokenTypeId, or an empty string if an invalid TokenTypeId is
-	 * given.
-	 *
-	 * @param type is the TokenTypeId for which the
-	 * corresponding token string
-	 * should be returned.
-	 * @return the registered token string or an empty string if the given type
-	 * was invalid.
-	 */
-	std::string getTokenString(TokenTypeId type);
-
-	/**
-	 * Sets the whitespace mode.
-	 *
-	 * @param mode defines how whitespace should be treated in text
-	 * tokens.
-	 */
-	void setWhitespaceMode(WhitespaceMode mode);
-
-	/**
-	 * Returns the current value of the whitespace mode.
-	 *
-	 * @return the whitespace mode.
-	 */
-	WhitespaceMode getWhitespaceMode();
-
-	/**
-	 * Reads a new token from the CharReader and stores it in the given
-	 * DynamicToken instance.
-	 *
-	 * @param reader is the CharReader instance from which the data should be
-	 * read.
-	 * @param token is a reference at the token instance into which the Token
-	 * information should be written.
-	 * @return true if a token could be read, false if the end of the stream
-	 * has been reached.
-	 */
-	bool read(CharReader &reader, DynamicToken &token);
-
-	/**
-	 * The peek method does not advance the read position of the char reader,
-	 * but reads the next token from the current char reader peek position.
-	 *
-	 * @param reader is the CharReader instance from which the data should be
-	 * read.
-	 * @param token is a reference at the token instance into which the Token
-	 * information should be written.
-	 * @return true if a token could be read, false if the end of the stream
-	 * has been reached.
-	 */
-	bool peek(CharReader &reader, DynamicToken &token);
-};
-}
-
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
-
diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp
deleted file mode 100644
index 4a0430b..0000000
--- a/src/formats/osdm/TokenTrie.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-    Ousía
-    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-/* Class TokenTrie::Node */
-
-TokenTrie::Node::Node() : type(EmptyToken) {}
-
-/* Class TokenTrie */
-
-bool TokenTrie::registerToken(const std::string &token,
-                              TokenTypeId type) noexcept
-{
-	// Abort if the token is empty -- this would taint the root node
-	if (token.empty()) {
-		return false;
-	}
-
-	// Iterate over each character in the given string and insert them as
-	// (new) nodes
-	Node *node = &root;
-	for (size_t i = 0; i < token.size(); i++) {
-		// Insert a new node if this one does not exist
-		const char c = token[i];
-		auto it = node->children.find(c);
-		if (it == node->children.end()) {
-			it = node->children.emplace(c, std::make_shared<Node>()).first;
-		}
-		node = it->second.get();
-	}
-
-	// If the resulting node already has a type set, we're screwed.
-	if (node->type != EmptyToken) {
-		return false;
-	}
-
-	// Otherwise just set the type to the given type.
-	node->type = type;
-	return true;
-}
-
-bool TokenTrie::unregisterToken(const std::string &token) noexcept
-{
-	// We cannot remove empty tokens as we need to access the first character
-	// upfront
-	if (token.empty()) {
-		return false;
-	}
-
-	// First pass -- search the node in the path that can be deleted
-	Node *subtreeRoot = &root;
-	char subtreeKey = token[0];
-	Node *node = &root;
-	for (size_t i = 0; i < token.size(); i++) {
-		// Go to the next node, abort if the tree ends unexpectedly
-		auto it = node->children.find(token[i]);
-		if (it == node->children.end()) {
-			return false;
-		}
-
-		// Reset the subtree handler if this node has another type
-		node = it->second.get();
-		if ((node->type != EmptyToken || node->children.size() > 1) &&
-		    (i + 1 != token.size())) {
-			subtreeRoot = node;
-			subtreeKey = token[i + 1];
-		}
-	}
-
-	// If the node type is already EmptyToken, we cannot do anything here
-	if (node->type == EmptyToken) {
-		return false;
-	}
-
-	// If the target node has children, we cannot delete the subtree. Set the
-	// type to EmptyToken instead
-	if (!node->children.empty()) {
-		node->type = EmptyToken;
-		return true;
-	}
-
-	// If we end up here, we can safely delete the complete subtree
-	subtreeRoot->children.erase(subtreeKey);
-	return true;
-}
-
-TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
-{
-	Node const *node = &root;
-	for (size_t i = 0; i < token.size(); i++) {
-		const char c = token[i];
-		auto it = node->children.find(c);
-		if (it == node->children.end()) {
-			return EmptyToken;
-		}
-		node = it->second.get();
-	}
-	return node->type;
-}
-}
-
diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp
deleted file mode 100644
index 36c2ffa..0000000
--- a/src/formats/osdm/TokenTrie.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
-    Ousía
-    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file TokenTrie.hpp
- *
- * Class representing a token trie that can be updated dynamically.
- *
- * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_TOKEN_TRIE_HPP_
-#define _OUSIA_TOKEN_TRIE_HPP_
-
-#include <cstdint>
-#include <memory>
-#include <limits>
-#include <unordered_map>
-
-namespace ousia {
-
-/**
- * The TokenTypeId is used to give each token type a unique id.
- */
-using TokenTypeId = uint32_t;
-
-/**
- * Token which is not a token.
- */
-constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
-
-/**
- * Token which represents a text token.
- */
-constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
-
-/**
- * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
- * the longest consecutive token in the text. This is equivalent to a prefix
- * trie.
- *
- * A token trie is a construct that structures all special tokens a Tokenizer
- * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
- * three. Then the token tree would look like this:
- *
- * \code{*.txt}
- *        ~ (0)
- *       /     \
- *      a (2)  b (0)
- *      |      |
- *      a (0)  a (0)
- *      |      |
- *      b (1)  c (3)
- * \endcode
- *
- * Where the number indicates the corresponding token descriptor identifier.
- */
-class TokenTrie {
-public:
-	/**
-	 * Structure used to build the node tree.
-	 */
-	struct Node {
-		/**
-		 * Type used for the child map.
-		 */
-		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
-
-		/**
-		 * Map from single characters to the corresponding child nodes.
-		 */
-		ChildMap children;
-
-		/**
-		 * Id of the token type attached to this node. Set to EmptyToken if
-		 * no token is attached to this node.
-		 */
-		TokenTypeId type;
-
-		/**
-		 * Default constructor, initializes the type with EmptyToken.
-		 */
-		Node();
-	};
-
-private:
-	/**
-	 * Root node of the internal token tree.
-	 */
-	Node root;
-
-public:
-	/**
-	 * Registers a token containing the given string. Returns false if the
-	 * token already exists, true otherwise.
-	 *
-	 * @param token is the character sequence that should be registered as
-	 * token.
-	 * @param type is the descriptor that should be set for this token.
-	 * @return true if the operation is successful, false otherwise.
-	 */
-	bool registerToken(const std::string &token, TokenTypeId type) noexcept;
-
-	/**
-	 * Unregisters the token from the token tree. Returns true if the token was
-	 * unregistered successfully, false otherwise.
-	 *
-	 * @param token is the character sequence that should be unregistered.
-	 * @return true if the operation was successful, false otherwise.
-	 */
-	bool unregisterToken(const std::string &token) noexcept;
-
-	/**
-	 * Looks up the given token within the TokenTrie. This function is
-	 * mostly intended for debugging and unit testing.
-	 *
-	 * @param token is the character sequence that should be searched.
-	 * @return the attached token type id or EmptyToken if the given token is
-	 * not found.
-	 */
-	TokenTypeId hasToken(const std::string &token) const noexcept;
-
-	/**
-	 * Returns a reference at the root node to be used for traversing the token
-	 * tree.
-	 *
-	 * @return a reference at the root node.
-	 */
-	const Node *getRoot() const noexcept { return &root; }
-};
-}
-
-#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
-
diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp
new file mode 100644
index 0000000..4973639
--- /dev/null
+++ b/src/formats/osml/OsmlParser.cpp
@@ -0,0 +1,57 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/generic/ParserStateCallbacks.hpp>
+#include <core/parser/generic/ParserStateStack.hpp>
+
+#include "OsdmParser.hpp"
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
+namespace {
+
+/**
+ * The OsdmParserImplementation class contains the actual implementation of the
+ * parsing process and is created in the "doParse" function of the OsdmParser.
+ 
+ */
+class OsdmParserImplementation : public ParserStateCallbacks {
+private:
+	/**
+	 * OsdmStreamParser instance.
+	 */
+	OsdmStreamParser parser;
+
+	/**
+	 * Instance of the ParserStateStack.
+	 */
+	ParserStateStack stack;
+
+public:
+	OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap)
+};
+}
+
+void OsdmParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+	OsdmParserImplementation parser(reader, ctx);
+	parser.parse();
+}
+
+}
diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp
new file mode 100644
index 0000000..37505b4
--- /dev/null
+++ b/src/formats/osml/OsmlParser.hpp
@@ -0,0 +1,48 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsmlParser.hpp
+ *
+ * Contains the parser of the osml format, the standard plain-text format used
+ * by Ousía for documents.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_PARSER_HPP_
+#define _OUSIA_OSDM_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * OsdmParser is a small wrapper implementing the Parser interface. The actual
+ * parsing is performed with the OsdmStreamParser in conjunction with the
+ * ParserStateStack.
+ */
+class OsdmParser : public Parser {
+protected:
+	void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSDM_PARSER_HPP_ */
+
diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index 8cb8caf..0174fa4 100644
--- a/src/formats/osdm/OsdmStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -21,14 +21,14 @@
 #include <core/common/Utils.hpp>
 #include <core/common/VariantReader.hpp>
 
-#include "OsdmStreamParser.hpp"
+#include "OsmlStreamParser.hpp"
 
 namespace ousia {
 
 /**
  * Plain format default tokenizer.
  */
-class PlainFormatTokens : public DynamicTokenizer {
+class PlainFormatTokens : public Tokenizer {
 public:
 	/**
 	 * Id of the backslash token.
@@ -61,6 +61,21 @@ public:
 	TokenTypeId FieldEnd;
 
 	/**
+	 * Id of the default field start token.
+	 */
+	TokenTypeId DefaultFieldStart;
+
+	/**
+	 * Id of the annotation start token.
+	 */
+	TokenTypeId AnnotationStart;
+
+	/**
+	 * Id of the annotation end token.
+	 */
+	TokenTypeId AnnotationEnd;
+
+	/**
 	 * Registers the plain format tokens in the internal tokenizer.
 	 */
 	PlainFormatTokens()
@@ -71,6 +86,9 @@ public:
 		BlockCommentEnd = registerToken("}%");
 		FieldStart = registerToken("{");
 		FieldEnd = registerToken("}");
+		DefaultFieldStart = registerToken("{!");
+		AnnotationStart = registerToken("<\\");
+		AnnotationEnd = registerToken("\\>");
 	}
 };
 
@@ -160,14 +178,14 @@ public:
 	}
 };
 
-OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger)
+OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
     : reader(reader), logger(logger), tokenizer(Tokens)
 {
 	// Place an intial command representing the complete file on the stack
-	commands.push(Command{"", Variant::mapType{}, true, true, true});
+	commands.push(Command{"", Variant::mapType{}, true, true, true, false});
 }
 
-Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
+Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
 {
 	bool first = true;
 	bool hasCharSiceNSSep = false;
@@ -210,7 +228,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
 	return res;
 }
 
-OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
+OsmlStreamParser::State OsmlStreamParser::parseBeginCommand()
 {
 	// Expect a '{' after the command
 	reader.consumeWhitespace();
@@ -251,7 +269,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
 	return State::COMMAND;
 }
 
-static bool checkStillInField(const OsdmStreamParser::Command &cmd,
+static bool checkStillInField(const OsmlStreamParser::Command &cmd,
                               const Variant &endName, Logger &logger)
 {
 	if (cmd.inField && !cmd.inRangeField) {
@@ -264,7 +282,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd,
 	return false;
 }
 
-OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
+OsmlStreamParser::State OsmlStreamParser::parseEndCommand()
 {
 	// Expect a '{' after the command
 	if (!reader.expect('{')) {
@@ -327,7 +345,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
 	return cmd.inRangeField ? State::FIELD_END : State::NONE;
 }
 
-Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
+Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName)
 {
 	// Parse the arguments using the universal VariantReader
 	Variant commandArguments;
@@ -353,7 +371,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
 	return commandArguments;
 }
 
-void OsdmStreamParser::pushCommand(Variant commandName,
+void OsmlStreamParser::pushCommand(Variant commandName,
                                    Variant commandArguments, bool hasRange)
 {
 	// Store the location on the stack
@@ -365,10 +383,11 @@ void OsdmStreamParser::pushCommand(Variant commandName,
 		commands.pop();
 	}
 	commands.push(Command{std::move(commandName), std::move(commandArguments),
-	                      hasRange, false, false});
+	                      hasRange, false, false, false});
 }
 
-OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
+OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start,
+                                                       bool isAnnotation)
 {
 	// Parse the commandName as a first identifier
 	Variant commandName = parseIdentifier(start, true);
@@ -382,6 +401,9 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
 	    Utils::split(commandName.asString(), ':');
 	const bool isBegin = commandNameComponents[0] == "begin";
 	const bool isEnd = commandNameComponents[0] == "end";
+
+	// Parse the begin or end command
+	State res = State::COMMAND;
 	if (isBegin || isEnd) {
 		if (commandNameComponents.size() > 1) {
 			logger.error(
@@ -390,35 +412,81 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
 			    commandName);
 		}
 		if (isBegin) {
-			return parseBeginCommand();
+			res = parseBeginCommand();
 		} else if (isEnd) {
-			return parseEndCommand();
+			res = parseEndCommand();
 		}
+	} else {
+		// Check whether the next character is a '#', indicating the start of
+		// the command name
+		Variant commandArgName;
+		start = reader.getOffset();
+		if (reader.expect('#')) {
+			commandArgName = parseIdentifier(start);
+			if (commandArgName.asString().empty()) {
+				logger.error("Expected identifier after \"#\"", commandArgName);
+			}
+		}
+
+		// Parse the arguments
+		Variant commandArguments =
+		    parseCommandArguments(std::move(commandArgName));
+
+		// Push the command onto the command stack
+		pushCommand(std::move(commandName), std::move(commandArguments), false);
 	}
 
-	// Check whether the next character is a '#', indicating the start of the
-	// command name
-	Variant commandArgName;
-	start = reader.getOffset();
-	if (reader.expect('#')) {
-		commandArgName = parseIdentifier(start);
-		if (commandArgName.asString().empty()) {
-			logger.error("Expected identifier after \"#\"", commandArgName);
+	// Check whether a ">" character is the next character that is to be read.
+	// In that case the current command could be an annotation end command!
+	char c;
+	if (reader.fetch(c) && c == '>') {
+		// Ignore the character after a begin or end command
+		if (isBegin || isEnd) {
+			logger.warning(
+			    "Ignoring annotation end character \">\" after special "
+			    "commands \"begin\" or \"end\". Write \"\\>\" to end a "
+			    "\"begin\"/\"end\" enclosed annotation.",
+			    reader);
+			return res;
 		}
-	}
 
-	// Parse the arugments
-	Variant commandArguments = parseCommandArguments(std::move(commandArgName));
+		// If this should be an annotation, ignore the character
+		if (isAnnotation) {
+			logger.warning(
+			    "Ignoring annotation end character \">\" after annotation "
+			    "start command. Write \"\\>\" to end the annotation.",
+			    reader);
+		} else {
+			// Make sure no arguments apart from the "name" argument are given
+			// to an annotation end
+			Variant::mapType &map = commands.top().arguments.asMap();
+			if (!map.empty()) {
+				if (map.count("name") == 0 || map.size() > 1U) {
+					logger.error(
+					    "An annotation end command may not have any arguments "
+					    "other than \"name\"");
+					return res;
+				}
+			}
 
-	// Push the command onto the command stack
-	pushCommand(std::move(commandName), std::move(commandArguments), false);
+			// If we got here, this is a valid ANNOTATION_END command, issue it
+			reader.peek(c);
+			reader.consumePeek();
+			return State::ANNOTATION_END;
+		}
+	}
 
-	return State::COMMAND;
+	// If we're starting an annotation, return the command as annotation start
+	// instead of command
+	if (isAnnotation && res == State::COMMAND) {
+		return State::ANNOTATION_START;
+	}
+	return res;
 }
 
-void OsdmStreamParser::parseBlockComment()
+void OsmlStreamParser::parseBlockComment()
 {
-	DynamicToken token;
+	Token token;
 	size_t depth = 1;
 	while (tokenizer.read(reader, token)) {
 		if (token.type == Tokens.BlockCommentEnd) {
@@ -436,7 +504,7 @@ void OsdmStreamParser::parseBlockComment()
 	logger.error("File ended while being in a block comment", reader);
 }
 
-void OsdmStreamParser::parseLineComment()
+void OsmlStreamParser::parseLineComment()
 {
 	char c;
 	while (reader.read(c)) {
@@ -446,7 +514,7 @@ void OsdmStreamParser::parseLineComment()
 	}
 }
 
-bool OsdmStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData(DataHandler &handler)
 {
 	if (!handler.isEmpty()) {
 		data = handler.toVariant(reader.getSourceId());
@@ -457,7 +525,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler)
 	return false;
 }
 
-bool OsdmStreamParser::checkIssueFieldStart()
+bool OsmlStreamParser::checkIssueFieldStart()
 {
 	// Fetch the current command, and check whether we're currently inside a
 	// field of this command
@@ -482,18 +550,41 @@ bool OsdmStreamParser::checkIssueFieldStart()
 	return false;
 }
 
-OsdmStreamParser::State OsdmStreamParser::parse()
+bool OsmlStreamParser::closeField()
+{
+	// Try to end an open field of the current command -- if the current command
+	// is not inside an open field, end this command and try to close the next
+	// one
+	for (int i = 0; i < 2 && commands.size() > 1; i++) {
+		Command &cmd = commands.top();
+		if (!cmd.inRangeField) {
+			if (cmd.inField) {
+				cmd.inField = false;
+				if (cmd.inDefaultField) {
+					commands.pop();
+				}
+				return true;
+			}
+			commands.pop();
+		} else {
+			return false;
+		}
+	}
+	return false;
+}
+
+OsmlStreamParser::State OsmlStreamParser::parse()
 {
 	// Handler for incomming data
 	DataHandler handler;
 
 	// Read tokens until the outer loop should be left
-	DynamicToken token;
+	Token token;
 	while (tokenizer.peek(reader, token)) {
 		const TokenTypeId type = token.type;
 
 		// Special handling for Backslash and Text
-		if (type == Tokens.Backslash) {
+		if (type == Tokens.Backslash || type == Tokens.AnnotationStart) {
 			// Before appending anything to the output data or starting a new
 			// command, check whether FIELD_START has to be issued, as the
 			// current command is a command with range
@@ -519,7 +610,8 @@ OsdmStreamParser::State OsdmStreamParser::parse()
 				}
 
 				// Parse the actual command
-				State res = parseCommand(token.location.getStart());
+				State res = parseCommand(token.location.getStart(),
+				                         type == Tokens.AnnotationStart);
 				switch (res) {
 					case State::ERROR:
 						throw LoggableException(
@@ -536,6 +628,14 @@ OsdmStreamParser::State OsdmStreamParser::parse()
 			// to the data buffer, use the escape character start as start
 			// location and the peek offset as end location
 			reader.peek(c);  // Peek the previously fetched character
+
+			// If this was an annotation start token, add the parsed < to the
+			// output
+			if (type == Tokens.AnnotationStart) {
+				handler.append('<', token.location.getStart(),
+				               token.location.getStart() + 1);
+			}
+
 			handler.append(c, token.location.getStart(),
 			               reader.getPeekOffset());
 			reader.consumePeek();
@@ -579,28 +679,37 @@ OsdmStreamParser::State OsdmStreamParser::parse()
 			}
 			logger.error(
 			    "Got field start token \"{\", but no command for which to "
-			    "start the field. Did you mean \"\\{\"?",
+			    "start the field. Write \"\\{\" to insert this sequence as "
+			    "text.",
 			    token);
 		} else if (token.type == Tokens.FieldEnd) {
-			// Try to end an open field of the current command -- if the current
-			// command is not inside an open field, end this command and try to
-			// close the next one
-			for (int i = 0; i < 2 && commands.size() > 1; i++) {
-				Command &cmd = commands.top();
-				if (!cmd.inRangeField) {
-					if (cmd.inField) {
-						cmd.inField = false;
-						return State::FIELD_END;
-					}
-					commands.pop();
-				} else {
-					break;
-				}
+			if (closeField()) {
+				return State::FIELD_END;
 			}
 			logger.error(
-			    "Got field end token \"}\", but there is no field to end. Did "
-			    "you mean \"\\}\"?",
+			    "Got field end token \"}\", but there is no field to end. "
+			    "Write \"\\}\" to insert this sequence as text.",
 			    token);
+		} else if (token.type == Tokens.DefaultFieldStart) {
+			// Try to start a default field the first time the token is reached
+			Command &topCmd = commands.top();
+			if (!topCmd.inField) {
+				topCmd.inField = true;
+				topCmd.inDefaultField = true;
+				return State::FIELD_START;
+			}
+			logger.error(
+			    "Got default field start token \"{!\", but no command for "
+			    "which to start the field. Write \"\\{!\" to insert this "
+			    "sequence as text",
+			    token);
+		} else if (token.type == Tokens.AnnotationEnd) {
+			// We got a single annotation end token "\>" -- simply issue the
+			// ANNOTATION_END event
+			Variant annotationName = Variant::fromString("");
+			annotationName.setLocation(token.location);
+			pushCommand(annotationName, Variant::mapType{}, false);
+			return State::ANNOTATION_END;
 		} else {
 			logger.error("Unexpected token \"" + token.content + "\"", token);
 		}
@@ -627,14 +736,19 @@ OsdmStreamParser::State OsdmStreamParser::parse()
 	return State::END;
 }
 
-const Variant &OsdmStreamParser::getCommandName()
+const Variant &OsmlStreamParser::getCommandName() const
 {
 	return commands.top().name;
 }
 
-const Variant &OsdmStreamParser::getCommandArguments()
+const Variant &OsmlStreamParser::getCommandArguments() const
 {
 	return commands.top().arguments;
 }
+
+bool OsmlStreamParser::inDefaultField() const
+{
+	return commands.top().inRangeField || commands.top().inDefaultField;
+}
 }
 
diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index 48d8fb7..dc3034c 100644
--- a/src/formats/osdm/OsdmStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -17,23 +17,22 @@
 */
 
 /**
- * @file OsdmStreamParser.hpp
+ * @file OsmlStreamParser.hpp
  *
- * Provides classes for low-level classes for reading the TeX-esque osdm
+ * Provides classes for low-level classes for reading the TeX-esque osml
  * format. The class provided here does not build any model objects and does not
  * implement the Parser interface.
  *
  * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
  */
 
-#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_
-#define _OUSIA_OSDM_STREAM_PARSER_HPP_
+#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
+#define _OUSIA_OSML_STREAM_PARSER_HPP_
 
 #include <stack>
 
 #include <core/common/Variant.hpp>
-
-#include "DynamicTokenizer.hpp"
+#include <core/parser/utils/Tokenizer.hpp>
 
 namespace ousia {
 
@@ -43,7 +42,7 @@ class Logger;
 class DataHandler;
 
 /**
- * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm
+ * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
  * format. The parser is constructed around a "parse" function, which reads data
  * from the underlying CharReader until a new state is reached and indicates
  * this state in a return value. The calling code then has to pull corresponding
@@ -53,10 +52,10 @@ class DataHandler;
  * fields, as this would lead to too many consecutive errors) a
  * LoggableException is thrown.
  */
-class OsdmStreamParser {
+class OsmlStreamParser {
 public:
 	/**
-	 * Enum used to indicate which state the OsdmStreamParser class is in
+	 * Enum used to indicate which state the OsmlStreamParser class is in
 	 * after calling the "parse" function.
 	 */
 	enum class State {
@@ -140,23 +139,35 @@ public:
 		/**
 		 * Set to true if this is a command with clear begin and end.
 		 */
-		bool hasRange;
+		bool hasRange : 1;
 
 		/**
 		 * Set to true if we are currently inside a field of this command.
 		 */
-		bool inField;
+		bool inField : 1;
 
 		/**
 		 * Set to true if we are currently in the range field of the command
 		 * (implies inField being set to true).
 		 */
-		bool inRangeField;
+		bool inRangeField : 1;
+
+		/**
+		 * Set to true if we are currently in a field that has been especially
+		 * marked as default field (using the "{!" syntax).
+		 */
+		bool inDefaultField : 1;
 
 		/**
 		 * Default constructor.
 		 */
-		Command() : hasRange(false), inField(false), inRangeField(false) {}
+		Command()
+		    : hasRange(false),
+		      inField(false),
+		      inRangeField(false),
+		      inDefaultField()
+		{
+		}
 
 		/**
 		 * Constructor of the Command class.
@@ -169,16 +180,19 @@ public:
 		 * explicit range.
 		 * @param inField is set to true if we currently are inside a field
 		 * of this command.
-		 * @param inRangeField is set to true if we currently inside the outer
-		 * field of the command.
+		 * @param inRangeField is set to true if we currently are inside the
+		 * outer field of a ranged command.
+		 * @param inDefaultField is set to true if we currently are in a
+		 * specially marked default field.
 		 */
-		Command(Variant name, Variant arguments, bool hasRange, bool inField,
-		        bool inRangeField)
+		Command(Variant name, Variant arguments, bool hasRange,
+		        bool inField, bool inRangeField, bool inDefaultField)
 		    : name(std::move(name)),
 		      arguments(std::move(arguments)),
 		      hasRange(hasRange),
 		      inField(inField),
-		      inRangeField(inRangeField)
+		      inRangeField(inRangeField),
+		      inDefaultField(inDefaultField)
 		{
 		}
 	};
@@ -198,7 +212,7 @@ private:
 	/**
 	 * Tokenizer instance used to read individual tokens from the text.
 	 */
-	DynamicTokenizer tokenizer;
+	Tokenizer tokenizer;
 
 	/**
 	 * Stack containing the current commands.
@@ -258,9 +272,11 @@ private:
 	 *
 	 * @param start is the start byte offset of the command (including the
 	 * backslash)
+	 * @param isAnnotation if true, the command is not returned as command, but
+	 * as annotation start.
 	 * @return true if a command was actuall parsed, false otherwise.
 	 */
-	State parseCommand(size_t start);
+	State parseCommand(size_t start, bool isAnnotation);
 
 	/**
 	 * Function used internally to parse a block comment.
@@ -290,16 +306,26 @@ private:
 	 */
 	bool checkIssueFieldStart();
 
+	/**
+	 * Closes a currently open field. Note that the command will be removed from
+	 * the internal command stack if the field that is being closed is a
+	 * field marked as default field.
+	 *
+	 * @return true if the field could be closed, false if there was no field
+	 * to close.
+	 */
+	bool closeField();
+
 public:
 	/**
-	 * Constructor of the OsdmStreamParser class. Attaches the new
-	 * OsdmStreamParser to the given CharReader and Logger instances.
+	 * Constructor of the OsmlStreamParser class. Attaches the new
+	 * OsmlStreamParser to the given CharReader and Logger instances.
 	 *
 	 * @param reader is the reader instance from which incomming characters
 	 * should be read.
 	 * @param logger is the logger instance to which errors should be written.
 	 */
-	OsdmStreamParser(CharReader &reader, Logger &logger);
+	OsmlStreamParser(CharReader &reader, Logger &logger);
 
 	/**
 	 * Continues parsing. Returns one of the states defined in the State enum.
@@ -318,7 +344,7 @@ public:
 	 * @return a reference at a variant containing the data parsed by the
 	 * "parse" function.
 	 */
-	const Variant &getData() { return data; }
+	const Variant &getData() const { return data; }
 
 	/**
 	 * Returns a reference at the internally stored command name. Only valid if
@@ -327,7 +353,7 @@ public:
 	 * @return a reference at a variant containing name and location of the
 	 * parsed command.
 	 */
-	const Variant &getCommandName();
+	const Variant &getCommandName() const;
 
 	/**
 	 * Returns a reference at the internally stored command name. Only valid if
@@ -336,16 +362,24 @@ public:
 	 * @return a reference at a variant containing arguments given to the
 	 * command.
 	 */
-	const Variant &getCommandArguments();
+	const Variant &getCommandArguments() const;
+
+	/**
+	 * Returns true if the current field is the "default" field. This is true if
+	 * the parser either is in the outer range of a range command or inside a
+	 * field that has been especially marked as "default" field (using the "{!"
+	 * syntax).
+	 */
+	bool inDefaultField() const;
 
 	/**
 	 * Returns a reference at the char reader.
 	 *
 	 * @return the last internal token location.
 	 */
-	SourceLocation &getLocation() { return location; }
+	const SourceLocation &getLocation() const { return location; }
 };
 }
 
-#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */
+#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */
 
diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp
new file mode 100644
index 0000000..e37446a
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.cpp
@@ -0,0 +1,144 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/Location.hpp>
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+
+namespace ousia {
+
+/**
+ * Enum used internally in the statemachine of the xml argument parser.
+ */
+enum class XmlAttributeState {
+	IN_TAG_NAME,
+	SEARCH_ATTR,
+	IN_ATTR_NAME,
+	HAS_ATTR_NAME,
+	HAS_ATTR_EQUALS,
+	IN_ATTR_DATA
+};
+
+std::map<std::string, SourceLocation> OsxmlAttributeLocator::locate(
+    CharReader &reader, size_t offs)
+{
+	std::map<std::string, SourceLocation> res;
+
+	// Fork the reader, we don't want to mess up the XML parsing process, do we?
+	CharReaderFork readerFork = reader.fork();
+
+	// Move the read cursor to the start location, abort if this does not work
+	if (offs != readerFork.seek(offs)) {
+		return res;
+	}
+
+	// Now all we need to do is to implement one half of an XML parser. As this
+	// is inherently complicated we'll totally fail at it. Don't care. All we
+	// want to get is those darn offsets for pretty error messages... (and we
+	// can assume the XML is valid as it was already read by expat)
+	XmlAttributeState state = XmlAttributeState::IN_TAG_NAME;
+	char c;
+	std::stringstream attrName;
+	while (readerFork.read(c)) {
+		// Abort at the end of the tag
+		if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) {
+			return res;
+		}
+
+		// One state machine to rule them all, one state machine to find them,
+		// One state machine to bring them all and in the darkness bind them
+		// (the byte offsets)
+		switch (state) {
+			case XmlAttributeState::IN_TAG_NAME:
+				if (Utils::isWhitespace(c)) {
+					res.emplace("$tag",
+					            SourceLocation{reader.getSourceId(), offs + 1,
+					                           readerFork.getOffset() - 1});
+					state = XmlAttributeState::SEARCH_ATTR;
+				}
+				break;
+			case XmlAttributeState::SEARCH_ATTR:
+				if (!Utils::isWhitespace(c)) {
+					state = XmlAttributeState::IN_ATTR_NAME;
+					attrName << c;
+				}
+				break;
+			case XmlAttributeState::IN_ATTR_NAME:
+				if (Utils::isWhitespace(c)) {
+					state = XmlAttributeState::HAS_ATTR_NAME;
+				} else if (c == '=') {
+					state = XmlAttributeState::HAS_ATTR_EQUALS;
+				} else {
+					attrName << c;
+				}
+				break;
+			case XmlAttributeState::HAS_ATTR_NAME:
+				if (!Utils::isWhitespace(c)) {
+					if (c == '=') {
+						state = XmlAttributeState::HAS_ATTR_EQUALS;
+						break;
+					}
+					// Well, this is a strange XML file... We expected to
+					// see a '=' here! Try to continue with the
+					// "HAS_ATTR_EQUALS" state as this state will hopefully
+					// include some error recovery
+				} else {
+					// Skip whitespace here
+					break;
+				}
+			// Fallthrough
+			case XmlAttributeState::HAS_ATTR_EQUALS:
+				if (!Utils::isWhitespace(c)) {
+					if (c == '"') {
+						// Here we are! We have found the beginning of an
+						// attribute. Let's quickly lock the current offset away
+						// in the result map
+						res.emplace(attrName.str(),
+						            SourceLocation{reader.getSourceId(),
+						                           readerFork.getOffset()});
+						state = XmlAttributeState::IN_ATTR_DATA;
+					} else {
+						// No, this XML file is not well formed. Assume we're in
+						// an attribute name once again
+						attrName.str(std::string{&c, 1});
+						state = XmlAttributeState::IN_ATTR_NAME;
+					}
+				}
+				break;
+			case XmlAttributeState::IN_ATTR_DATA:
+				if (c == '"') {
+					// We're at the end of the attribute data, set the end
+					// location
+					auto it = res.find(attrName.str());
+					if (it != res.end()) {
+						it->second.setEnd(readerFork.getOffset() - 1);
+					}
+
+					// Reset the attribute name and restart the search
+					attrName.str(std::string{});
+					state = XmlAttributeState::SEARCH_ATTR;
+				}
+				break;
+		}
+	}
+	return res;
+}
+}
+
diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp
new file mode 100644
index 0000000..f9a3437
--- /dev/null
+++ b/src/formats/osxml/OsxmlAttributeLocator.hpp
@@ -0,0 +1,67 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlAttributeLocator.hpp
+ *
+ * Contains a class used for locating the byte offsets of the attributes given
+ * in an XML tag.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_
+
+#include <map>
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class SourceLocation;
+
+/**
+ * Class containing one static function for locating the byte offsets of the
+ * attributes in an XML tag. These are not retrieved by our XML parser, so we
+ * have to do this manually.
+ */
+class OsxmlAttributeLocator {
+public:
+	/**
+	 * Function used to reconstruct the location of the attributes of an XML tag
+	 * in the source code. This is necessary, as the XML parser only returns an
+	 * offset to the beginning of a tag and not to the position of the individual
+	 * arguments.
+	 *
+	 * @param reader is the char reader from which the character data should be
+	 * read.
+	 * @param offs is a byte offset in the xml file pointing at the "<"
+	 * character of the tag.
+	 * @return a map from attribute keys to the corresponding location
+	 * (including range) of the attribute. Also contains the location of the
+	 * tagname in the form of the virtual attribute "$tag".
+	 */
+	static std::map<std::string, SourceLocation> locate(CharReader &reader,
+	                                                    size_t offs);
+};
+
+}
+
+#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
new file mode 100644
index 0000000..7404960
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -0,0 +1,547 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <expat.h>
+
+#include <vector>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Variant.hpp>
+#include <core/common/VariantReader.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/WhitespaceHandler.hpp>
+
+#include "OsxmlAttributeLocator.hpp"
+#include "OsxmlEventParser.hpp"
+
+namespace ousia {
+
+/* Class OsxmlEventParserData */
+
+/**
+ * Class containing data used by the internal functions.
+ */
+class OsxmlEventParserData {
+public:
+	/**
+	 * Contains the current depth of the parsing process.
+	 */
+	ssize_t depth;
+
+	/**
+	 * Set to the depth in which an annotation end tag was opened while the
+	 * parser is inside such a tag, set to -1 if the parser is currently not
+	 * inside an annotation end tag.
+	 */
+	ssize_t annotationEndTagDepth;
+
+	/**
+	 * Current character data buffer.
+	 */
+	std::vector<char> textBuf;
+
+	/**
+	 * Current whitespace buffer (for the trimming whitespace mode)
+	 */
+	std::vector<char> whitespaceBuf;
+
+	/**
+	 * Flag indicating whether a whitespace character was present (for the
+	 * collapsing whitespace mode).
+	 */
+	bool hasWhitespace;
+
+	/**
+	 * Current character data start.
+	 */
+	size_t textStart;
+
+	/**
+	 * Current character data end.
+	 */
+	size_t textEnd;
+
+	/**
+	 * Default constructor.
+	 */
+	OsxmlEventParserData();
+
+	/**
+	 * Increments the depth.
+	 */
+	void incrDepth();
+
+	/**
+	 * Decrements the depth, resets annotationEndTagDepth once we are below it.
+	 */
+	void decrDepth();
+
+	/**
+	 * Returns true if we're currently inside an annotation end tag.
+	 */
+	bool inAnnotationEndTag();
+
+	/**
+	 * Returns true if character data is available.
+	 *
+	 * @return true if character data is available.
+	 */
+	bool hasText();
+
+	/**
+	 * Returns the character data and its location, resets the text buffers.
+	 *
+	 * @return a string variant containing the text data, its location refers
+	 * to the source given by sourceId.
+	 */
+	Variant getText(SourceId sourceId);
+};
+
+/* Class GuardedExpatXmlParser */
+
+/**
+ * Wrapper class around the XML_Parser pointer which safely frees it whenever
+ * the scope is left (e.g. because an exception was thrown). Not copyable.
+ */
+class GuardedExpatXmlParser {
+private:
+	/**
+	 * Internal pointer to the XML_Parser instance.
+	 */
+	XML_Parser parser;
+
+public:
+	/**
+	 * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreate
+	 * from the expat library. Throws a LoggableException if the XML parser
+	 * cannot be initialized.
+	 *
+	 * @param encoding is the protocol-defined encoding passed to expat (or
+	 * nullptr if expat should determine the encoding by itself).
+	 */
+	explicit GuardedExpatXmlParser(const XML_Char *encoding)
+	    : parser(XML_ParserCreate(encoding))
+	{
+		if (!parser) {
+			throw LoggableException{
+			    "Internal error: Could not create expat XML parser!"};
+		}
+	}
+
+	/**
+	 * Copying is disabled, a copy would free the same parser a second time.
+	 */
+	GuardedExpatXmlParser(const GuardedExpatXmlParser &) = delete;
+	GuardedExpatXmlParser &operator=(const GuardedExpatXmlParser &) = delete;
+
+	/**
+	 * Destructor, frees the parser (non-null once the constructor succeeded).
+	 */
+	~GuardedExpatXmlParser() { XML_ParserFree(parser); }
+
+	/**
+	 * Returns the XML_Parser pointer.
+	 */
+	XML_Parser operator&() { return parser; }
+};
+
+/**
+ * Name of the special outer tag used for allowing multiple top-level elements
+ * in an xml file.
+ */
+static const std::string TOP_LEVEL_TAG{"ousia"};
+
+/**
+ * Prefix used to indicate the start of an annotation (note the trailing colon)
+ */
+static const std::string ANNOTATION_START_PREFIX{"a:start:"};
+
+/**
+ * Prefix used to indicate the end of an annotation (no trailing colon).
+ */
+static const std::string ANNOTATION_END_PREFIX{"a:end"};
+
+/**
+ * Makes the current byte position of the xml parser the default location of
+ * the logger instance.
+ *
+ * @param p is a pointer at the xml parser instance.
+ * @param len is the length of the string that should be referred to.
+ * @return the SourceLocation that has been set in the logger.
+ */
+static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0)
+{
+	// The OsxmlEventParser instance is stored as user data of the xml parser
+	OsxmlEventParser *const eventParser =
+	    static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+	// Assemble the location spanning "len" bytes, starting at the current
+	// byte index of the xml parser
+	const size_t start = XML_GetCurrentByteIndex(p);
+	SourceLocation location{eventParser->getReader().getSourceId(), start,
+	                        start + len};
+
+	// Set it as default location of the logger and pass it to the caller
+	eventParser->getLogger().setDefaultLocation(location);
+	return location;
+}
+
+/**
+ * Callback called by eXpat whenever a start tag is reached.
+ *
+ * @param ref is the XML_Parser instance (see XML_UseParserAsHandlerArg).
+ * @param name is the name of the tag that has been opened.
+ * @param attrs is a nullptr terminated list of key/value C string pairs.
+ */
+static void xmlStartElementHandler(void *ref, const XML_Char *name,
+                                   const XML_Char **attrs)
+{
+	// Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser
+	XML_Parser p = static_cast<XML_Parser>(ref);
+	OsxmlEventParser *parser =
+	    static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+
+	// If there is any text data in the buffer, issue that first
+	if (parser->getData().hasText()) {
+		parser->getEvents().data(
+		    parser->getData().getText(parser->getReader().getSourceId()));
+	}
+
+	// Read the argument locations -- this is only a stupid and slow hack, but
+	// it is necessary, as expat doesn't give us their byte offsets.
+	std::map<std::string, SourceLocation> attributeOffsets =
+	    OsxmlAttributeLocator::locate(parser->getReader(),
+	                                  XML_GetCurrentByteIndex(p));
+
+	// Update the logger position
+	SourceLocation loc = xmlSyncLoggerPosition(p);
+
+	// Fetch the location of the name
+	SourceLocation nameLoc = loc;
+	auto it = attributeOffsets.find("$tag");
+	if (it != attributeOffsets.end()) {
+		nameLoc = it->second;
+	}
+	// Increment the current depth
+	parser->getData().incrDepth();
+
+	// No tags may be opened inside an annotation end tag, log an error
+	if (parser->getData().inAnnotationEndTag()) {
+		parser->getLogger().error(
+		    "No tags allowed inside an annotation end tag", nameLoc);
+		return;
+	}
+
+	// Assemble the arguments
+	Variant::mapType args;
+	const XML_Char **attr = attrs;
+	while (*attr) {
+		// Convert the C string to a std::string
+		const std::string key{*(attr++)};
+
+		// Search the location of the key
+		SourceLocation keyLoc;
+		auto it = attributeOffsets.find(key);
+		if (it != attributeOffsets.end()) {
+			keyLoc = it->second;
+		}
+
+		// Parse the string, pass the location of the key
+		std::pair<bool, Variant> value = VariantReader::parseGenericString(
+		    *(attr++), parser->getLogger(), keyLoc.getSourceId(),
+		    keyLoc.getStart());
+
+		// Set the location of the parsed value to the attribute location
+		value.second.setLocation(keyLoc);
+
+		// Store the keys in the map
+		args.emplace(key, value.second);
+	}
+
+	// Fetch the name of the tag, check for special tags
+	std::string nameStr(name);
+	if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) {
+		// We're in the top-level and the magic tag is reached -- just
+		// ignore it and issue a warning for each argument that has been given
+		for (const auto &arg : args) {
+			parser->getLogger().warning(std::string("Ignoring attribute \"") +
+			                                arg.first +
+			                                std::string("\" for magic tag \"") +
+			                                TOP_LEVEL_TAG + std::string("\""),
+			                            arg.second);
+		}
+	} else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) {
+		// Assemble a name variant containing the name minus the prefix
+		Variant nameVar =
+		    Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size()));
+		nameVar.setLocation(nameLoc);
+
+		// Issue the "annotationStart" event
+		parser->getEvents().annotationStart(nameVar, args);
+	} else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) {
+		// Assemble a name variant containing the name minus the prefix
+		nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size());
+
+		// Discard a potentially leading colon
+		if (!nameStr.empty() && nameStr[0] == ':') {
+			nameStr = nameStr.substr(1);
+		}
+
+		// Assemble the variant containing the name and its location
+		Variant nameVar = Variant::fromString(nameStr);
+		nameVar.setLocation(nameLoc);
+
+		// Check whether a "name" attribute was given
+		Variant elementName;
+		for (const auto &arg : args) {
+			if (arg.first == "name") {
+				elementName = arg.second;
+			} else {
+				parser->getLogger().warning(
+				    std::string("Ignoring attribute \"") + arg.first +
+				        "\" in annotation end tag",
+				    arg.second);
+			}
+		}
+
+		// Remember the depth, no tags may be opened inside the end tag
+		parser->getData().annotationEndTagDepth = parser->getData().depth;
+
+		// Issue the "annotationEnd" event, pass the element name (if given)
+		parser->getEvents().annotationEnd(nameVar, elementName);
+	} else {
+		// Just issue a "command" event in any other case
+		Variant nameVar = Variant::fromString(nameStr);
+		nameVar.setLocation(nameLoc);
+		parser->getEvents().command(nameVar, args);
+	}
+}
+
+static void xmlEndElementHandler(void *ref, const XML_Char *name)
+{
+	// The handler argument is the XML_Parser, its user data the event parser
+	XML_Parser p = static_cast<XML_Parser>(ref);
+	OsxmlEventParser &eventParser =
+	    *static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+	OsxmlEventParserData &data = eventParser.getData();
+
+	// Let the logger point at the end tag
+	xmlSyncLoggerPosition(p);
+
+	// The depth is decremented in any case, yet no events are issued for end
+	// tags that are reached while we are inside an annotation end tag
+	const bool wasInAnnotationEndTag = data.inAnnotationEndTag();
+	data.decrDepth();
+	if (wasInAnnotationEndTag) {
+		return;
+	}
+
+	// Pending text data has to be issued before the field is closed
+	if (data.hasText()) {
+		const SourceId sourceId = eventParser.getReader().getSourceId();
+		eventParser.getEvents().data(data.getText(sourceId));
+	}
+
+	// The end of the magic top-level tag does not close any field
+	const bool isTopLevelTag = (TOP_LEVEL_TAG == name);
+	if (isTopLevelTag && data.depth == 0) {
+		return;
+	}
+
+	// In any other case issue the "fieldEnd" event
+	eventParser.getEvents().fieldEnd();
+}
+
+/**
+ * Callback called by eXpat whenever character data is reached.
+ */
+static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
+{
+	// The handler argument is the XML_Parser, its user data the event parser
+	XML_Parser p = static_cast<XML_Parser>(ref);
+	OsxmlEventParser &eventParser =
+	    *static_cast<OsxmlEventParser *>(XML_GetUserData(p));
+	OsxmlEventParserData &d = eventParser.getData();
+
+	// Character data inside an annotation end tag is silently dropped
+	if (d.inAnnotationEndTag()) {
+		return;
+	}
+
+	// expat reports the length as signed integer, clamp negative values to
+	// zero before converting to an unsigned count
+	const size_t count = (len <= 0) ? 0 : static_cast<size_t>(len);
+
+	// Let the logger point at the character data, the start of the returned
+	// location is the source offset of the first character
+	const size_t offs = xmlSyncLoggerPosition(p, count).getStart();
+
+	// Feed each character including its source range into the whitespace
+	// handler that corresponds to the current whitespace mode
+	const WhitespaceMode mode = eventParser.getWhitespaceMode();
+	for (size_t i = 0; i < count; i++) {
+		const size_t start = offs + i;
+		const size_t end = start + 1;
+		switch (mode) {
+			case WhitespaceMode::PRESERVE:
+				PreservingWhitespaceHandler::append(
+				    s[i], start, end, d.textBuf, d.textStart, d.textEnd);
+				break;
+			case WhitespaceMode::TRIM:
+				TrimmingWhitespaceHandler::append(
+				    s[i], start, end, d.textBuf, d.textStart, d.textEnd,
+				    d.whitespaceBuf);
+				break;
+			case WhitespaceMode::COLLAPSE:
+				CollapsingWhitespaceHandler::append(
+				    s[i], start, end, d.textBuf, d.textStart, d.textEnd,
+				    d.hasWhitespace);
+				break;
+		}
+	}
+}
+
+/* Class OsxmlEvents */
+
+OsxmlEvents::~OsxmlEvents() = default;
+
+/* Class OsxmlEventParserData */
+
+OsxmlEventParserData::OsxmlEventParserData()
+    : depth(0),
+      annotationEndTagDepth(-1),  // -1: not inside an annotation end tag
+      hasWhitespace(false),
+      textStart(0),
+      textEnd(0)
+{
+}
+
+void OsxmlEventParserData::incrDepth() { ++depth; }
+
+void OsxmlEventParserData::decrDepth()
+{
+	// The depth never drops below zero
+	depth -= (depth > 0) ? 1 : 0;
+	// Once we are below the annotation end tag, we have left it
+	if (annotationEndTagDepth > depth) {
+		annotationEndTagDepth = -1;
+	}
+}
+
+bool OsxmlEventParserData::inAnnotationEndTag()
+{
+	return !(annotationEndTagDepth <= 0 || depth < annotationEndTagDepth);
+}
+
+bool OsxmlEventParserData::hasText() { return textBuf.size() > 0; }
+
+Variant OsxmlEventParserData::getText(SourceId sourceId)
+{
+	// Copy the buffered characters into a string variant and attach the
+	// range of the text within the given source
+	Variant result =
+	    Variant::fromString(std::string(textBuf.begin(), textBuf.end()));
+	result.setLocation({sourceId, textStart, textEnd});
+
+	// The buffered text has been consumed, start over with empty buffers
+	hasWhitespace = false;
+	textStart = 0;
+	textEnd = 0;
+	whitespaceBuf.clear();
+	textBuf.clear();
+
+	return result;
+}
+
+/* Class OsxmlEventParser */
+
+OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
+                                   Logger &logger)
+    : reader(reader),
+      events(events),
+      logger(logger),
+      whitespaceMode(WhitespaceMode::TRIM),  // trimming is the initial mode
+      data(new OsxmlEventParserData())
+{
+}
+
+OsxmlEventParser::~OsxmlEventParser() = default;
+
+void OsxmlEventParser::parse()
+{
+	// Size of the chunks in which the input is handed to expat
+	constexpr size_t BUFFER_SIZE = 64 * 1024;
+
+	// The guard frees the expat instance as soon as this scope is left
+	GuardedExpatXmlParser guard{"UTF-8"};
+	XML_Parser xml = &guard;
+
+	// Start at the top level
+	data->depth = 0;
+
+	// Make the callbacks receive the XML_Parser as first argument, this
+	// instance is available to them as user data
+	XML_SetUserData(xml, this);
+	XML_UseParserAsHandlerArg(xml);
+
+	// Register the callback functions
+	XML_SetStartElementHandler(xml, xmlStartElementHandler);
+	XML_SetEndElementHandler(xml, xmlEndElementHandler);
+	XML_SetCharacterDataHandler(xml, xmlCharacterDataHandler);
+
+	// Feed the input chunk by chunk, an empty chunk marks the end of the
+	// stream and tells expat to finish the document
+	size_t bytesRead = 0;
+	do {
+		// Let expat provide the memory the next chunk is read into
+		void *buf = XML_GetBuffer(xml, BUFFER_SIZE);
+		if (buf == nullptr) {
+			throw OusiaException{"Internal error: XML parser out of memory!"};
+		}
+		bytesRead = reader.readRaw(static_cast<char *>(buf), BUFFER_SIZE);
+
+		// Any XML error is turned into an exception at the current position
+		const bool isFinal = (bytesRead == 0);
+		if (!XML_ParseBuffer(xml, bytesRead, isFinal)) {
+			throw LoggableException{
+			    "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(xml))},
+			    xmlSyncLoggerPosition(xml)};
+		}
+	} while (bytesRead != 0);
+}
+
+void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
+{
+	this->whitespaceMode = whitespaceMode;  // parameter shadows the member
+}
+
+WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
+{
+	return whitespaceMode;
+}
+
+CharReader &OsxmlEventParser::getReader() const { return reader; }
+
+Logger &OsxmlEventParser::getLogger() const { return logger; }
+
+OsxmlEvents &OsxmlEventParser::getEvents() const { return events; }
+
+OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; }
+}
+
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
new file mode 100644
index 0000000..e39245f
--- /dev/null
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -0,0 +1,217 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlEventParser.hpp
+ *
+ * The OsxmlEventParser class is responsible for parsing an XML file and calling
+ * the corresponding event handler functions if an XML item is found. Event
+ * handling is performed using a listener interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OSXML_EVENT_PARSER_HPP_
+#define _OSXML_EVENT_PARSER_HPP_
+
+#include <memory>
+#include <string>
+
+#include <core/common/Whitespace.hpp>
+
+namespace ousia {
+
+// Forward declarations
+class Logger;
+class Variant;
+class OsxmlEventParserData;
+
+/**
+ * Interface which defines the callback functions which are called by the
+ * OsxmlEventParser whenever an event occurs.
+ */
+class OsxmlEvents {
+public:
+	/**
+	 * Virtual destructor.
+	 */
+	virtual ~OsxmlEvents();
+
+	/**
+	 * Called whenever a command starts. Note that this implicitly always starts
+	 * the default field of the command.
+	 *
+	 * @param name is a string variant containing name and location of the
+	 * command.
+	 * @param args is a map containing the arguments that were given to the
+	 * command.
+	 */
+	virtual void command(const Variant &name, const Variant::mapType &args) = 0;
+
+	/**
+	 * Called whenever an annotation starts. Note that this implicitly always
+	 * starts the default field of the annotation.
+	 *
+	 * @param className is a string variant containing the name of the
+	 * annotation class and the location of the annotation definition.
+	 * @param args is a map variant containing the arguments that were given
+	 * to the annotation definition.
+	 */
+	virtual void annotationStart(const Variant &className,
+	                             const Variant::mapType &args) = 0;
+
+	/**
+	 * Called whenever the range of an annotation ends. The callee must
+	 * disambiguate the actual annotation that is finished here.
+	 *
+	 * @param className is a string variant containing the name of the
+	 * annotation class that should end here. May be empty (or nullptr), if no
+	 * class name has been specified at the end of the annotation.
+	 * @param elementName is the name of the annotation element that should be
+	 * ended here. May be empty (or nullptr), if no elementName has been
+	 * specified at the end of the annotation.
+	 */
+	virtual void annotationEnd(const Variant &className,
+	                           const Variant &elementName) = 0;
+
+	/**
+	 * Called whenever the default field which was implicitly started by
+	 * command or annotationStart ends. Note that this does not end the
+	 * range of an annotation, but the default field of the annotation. To
+	 * signal the end of the annotation range, the annotationEnd method will be
+	 * invoked.
+	 */
+	virtual void fieldEnd() = 0;
+
+	/**
+	 * Called whenever data is found. Whitespace data is handled as specified
+	 * and the data has been parsed to the specified variant type. This function
+	 * is not called if the parsing failed, the parser prints an error message
+	 * instead.
+	 *
+	 * @param data is the already parsed data that should be passed to the
+	 * handler.
+	 */
+	virtual void data(const Variant &data) = 0;
+};
+
+/**
+ * The OsxmlEventParser class is a wrapper around eXpat which implements the
+ * specialities of the osxml formats class (like annotation ranges). It notifies
+ * a specified event handler whenever a command, annotation or data has been
+ * reached.
+ */
+class OsxmlEventParser {
+private:
+	/**
+	 * Reference at the internal CharReader instance.
+	 */
+	CharReader &reader;
+
+	/**
+	 * Set of callback functions to be called whenever an event is triggered.
+	 */
+	OsxmlEvents &events;
+
+	/**
+	 * Reference at the Logger object to which error messages or warnings should
+	 * be logged.
+	 */
+	Logger &logger;
+
+	/**
+	 * Current whitespace mode.
+	 */
+	WhitespaceMode whitespaceMode;
+
+	/**
+	 * Data to be used by the internal functions.
+	 */
+	std::unique_ptr<OsxmlEventParserData> data;
+
+public:
+	/**
+	 * Constructor of the OsxmlEventParser. Takes a reference at the OsxmlEvents
+	 * of which the callback functions are called.
+	 *
+	 * @param reader is a reference to the CharReader instance from which the
+	 * XML should be read.
+	 * @param events is a reference at an instance of the OsxmlEvents class. All
+	 * events are forwarded to this class.
+	 * @param logger is the Logger instance to which log messages should be
+	 * written.
+	 */
+	OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger);
+
+	/**
+	 * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type)
+	 */
+	~OsxmlEventParser();
+
+	/**
+	 * Performs the actual parsing. Reads the XML using eXpat and calls the
+	 * callbacks in the event listener instance whenever something interesting
+	 * happens.
+	 */
+	void parse();
+
+	/**
+	 * Sets the whitespace handling mode.
+	 *
+	 * @param whitespaceMode defines how whitespace in the data should be
+	 * handled.
+	 */
+	void setWhitespaceMode(WhitespaceMode whitespaceMode);
+
+	/**
+	 * Returns the current whitespace handling mode.
+	 *
+	 * @return the currently set whitespace handling mode.
+	 */
+	WhitespaceMode getWhitespaceMode() const;
+
+	/**
+	 * Returns the internal CharReader reference.
+	 *
+	 * @return the CharReader reference.
+	 */
+	CharReader &getReader() const;
+
+	/**
+	 * Returns the internal Logger reference.
+	 *
+	 * @return the internal Logger reference.
+	 */
+	Logger &getLogger() const;
+
+	/**
+	 * Returns the internal OsxmlEvents reference.
+	 *
+	 * @return the internal OsxmlEvents reference.
+	 */
+	OsxmlEvents &getEvents() const;
+
+	/**
+	 * Returns a reference at the internal data.
+	 */
+	OsxmlEventParserData &getData() const;
+};
+}
+
+#endif /* _OSXML_EVENT_PARSER_HPP_ */
+
diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp
new file mode 100644
index 0000000..c216855
--- /dev/null
+++ b/src/formats/osxml/OsxmlParser.cpp
@@ -0,0 +1,98 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/parser/stack/GenericParserStates.hpp>
+#include <core/parser/stack/Stack.hpp>
+#include <core/parser/ParserContext.hpp>
+
+#include "OsxmlEventParser.hpp"
+#include "OsxmlParser.hpp"
+
+namespace ousia {
+
+using namespace parser_stack;
+
+/**
+ * Class containing the actual OsxmlParser implementation.
+ */
+class OsxmlParserImplementation : public OsxmlEvents {
+private:
+	/**
+	 * Actual xml parser -- converts the xml stream into a set of events.
+	 */
+	OsxmlEventParser parser;
+
+	/**
+	 * Pushdown automaton responsible for converting the xml events into an
+	 * actual Node tree.
+	 */
+	Stack stack;
+
+public:
+	/**
+	 * Constructor of the OsxmlParserImplementation class.
+	 *
+	 * @param reader is a reference to the CharReader instance from which the
+	 * XML should be read.
+	 * @param ctx is a reference to the ParserContext instance that should be
+	 * used.
+	 */
+	OsxmlParserImplementation(CharReader &reader, ParserContext &ctx)
+	    : parser(reader, *this, ctx.getLogger()),
+	      stack(ctx, GenericParserStates)
+	{
+	}
+
+	/**
+	 * Starts the actual parsing process.
+	 */
+	void parse() { parser.parse(); }
+
+	void command(const Variant &name, const Variant::mapType &args) override
+	{
+		stack.command(name, args);
+		stack.fieldStart(true);  // NOTE(review): true presumably = default field
+	}
+
+	void annotationStart(const Variant &name,
+	                     const Variant::mapType &args) override
+	{
+		stack.annotationStart(name, args);
+		stack.fieldStart(true);  // NOTE(review): true presumably = default field
+	}
+
+	void annotationEnd(const Variant &className,
+	                   const Variant &elementName) override
+	{
+		stack.annotationEnd(className, elementName);
+	}
+
+	void fieldEnd() override { stack.fieldEnd(); }
+
+	void data(const Variant &data) override { stack.data(data); }
+};
+
+/* Class OsxmlParser */
+
+void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx)
+{
+	// The implementation instance only lives for this single parser run
+	OsxmlParserImplementation{reader, ctx}.parse();
+}
+}
+
diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp
new file mode 100644
index 0000000..0fbf83c
--- /dev/null
+++ b/src/formats/osxml/OsxmlParser.hpp
@@ -0,0 +1,55 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsxmlParser.hpp
+ *
+ * Contains the parser responsible for reading Ousía XML Documents (extension
+ * oxd) and Ousía XML Modules (extension oxm).
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSXML_PARSER_HPP_
+#define _OUSIA_OSXML_PARSER_HPP_
+
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+
+/**
+ * The OsxmlParser class implements parsing the various types of Ousía XML
+ * documents using the OsxmlEventParser and Stack classes.
+ */
+class OsxmlParser : public Parser {
+protected:
+	/**
+	 * Parses the given input stream as XML file. Does not return a value, the
+	 * given ParserContext is handed to the parser stack.
+	 *
+	 * @param reader is the CharReader from which the input should be read.
+	 * @param ctx is a reference to the ParserContext instance that should be
+	 * used.
+	 */
+	void doParse(CharReader &reader, ParserContext &ctx) override;
+};
+
+}
+
+#endif /* _OUSIA_OSXML_PARSER_HPP_ */
+
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2015-02-15 21:56:04 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2015-02-15 21:56:04 +0100
commit	d2f99e4b43ed93ef0fa8e138e0c3afc79775b77c (patch)
tree	8e7cdb894b7036b3ca01499ee9432d2e62930477 /src/formats
parent	40f7df390f00f85c17bd0e6527ec4ba19cbce4fc (diff)
parent	4f2872d9968aec93bebff90d1238347c8a364949 (diff)