summaryrefslogtreecommitdiff
path: root/src/formats
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-10 16:42:20 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-10 16:42:20 +0100
commit6352751f87a4acdd9a32baf4f1133819678d06b0 (patch)
tree23f1764605227d25ee9861fcf23e2dc381bdc187 /src/formats
parente85e174028de377cee9f6dfcccbe418d185cc7b4 (diff)
Renamed some files and changed folder structure
Diffstat (limited to 'src/formats')
-rw-r--r--src/formats/osdm/DynamicTokenizer.cpp544
-rw-r--r--src/formats/osdm/DynamicTokenizer.hpp252
-rw-r--r--src/formats/osdm/OsdmStreamParser.cpp640
-rw-r--r--src/formats/osdm/OsdmStreamParser.hpp351
-rw-r--r--src/formats/osdm/TokenTrie.cpp119
-rw-r--r--src/formats/osdm/TokenTrie.hpp150
6 files changed, 2056 insertions, 0 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp
new file mode 100644
index 0000000..f2cfcd1
--- /dev/null
+++ b/src/formats/osdm/DynamicTokenizer.cpp
@@ -0,0 +1,544 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <memory>
+#include <vector>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Exceptions.hpp>
+#include <core/common/Utils.hpp>
+
+#include "DynamicTokenizer.hpp"
+
+namespace ousia {
+
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+ /**
+ * Token that was matched.
+ */
+ DynamicToken token;
+
+ /**
+ * Current length of the data within the text handler. The text buffer needs
+ * to be trimmed to this length if this token matches.
+ */
+ size_t textLength;
+
+ /**
+ * End location of the current text handler. This location needs to be used
+ * for the text token that is emitted before the actual token.
+ */
+ size_t textEnd;
+
+ /**
+ * Constructor of the TokenMatch class.
+ */
+ TokenMatch() : textLength(0), textEnd(0) {}
+
+ /**
+ * Returns true if this TokenMatch instance actually represents a match.
+ */
+ bool hasMatch() { return token.type != EmptyToken; }
+};
+
/* Internal class TokenLookup */

/**
 * The TokenLookup class is used to represent a thread in a running token
 * lookup. Each instance tracks one partial match: a position in the token
 * trie plus the location bookkeeping needed to emit a preceding text token.
 */
class TokenLookup {
private:
    /**
     * Current node within the token trie.
     */
    TokenTrie::Node const *node;

    /**
     * Start offset within the source file.
     */
    size_t start;

    /**
     * Current length of the data within the text handler. The text buffer
     * needs to be trimmed to this length if this token matches.
     */
    size_t textLength;

    /**
     * End location of the current text handler. This location needs to be
     * used for the text token that is emitted before the actual token.
     */
    size_t textEnd;

public:
    /**
     * Constructor of the TokenLookup class.
     *
     * @param node is the current node.
     * @param start is the start position.
     * @param textLength is the text buffer length of the previous text token.
     * @param textEnd is the current end location of the previous text token.
     */
    TokenLookup(const TokenTrie::Node *node, size_t start,
                size_t textLength, size_t textEnd)
        : node(node), start(start), textLength(textLength), textEnd(textEnd)
    {
    }

    /**
     * Tries to extend the current path in the token trie with the given
     * character. If a complete token is matched, stores this match in the
     * given match reference (in case it is longer than any previous token).
     *
     * @param c is the character that should be appended to the current prefix.
     * @param lookups is a list to which new TokenLookup instances are added --
     * which could potentially be expanded in the next iteration.
     * @param match is the DynamicToken instance to which the matching token
     * should be written.
     * @param tokens is a reference at the internal token list of the
     * DynamicTokenizer.
     * @param end is the end byte offset of the current character.
     * @param sourceId is the source id of this file.
     */
    void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
                 const std::vector<std::string> &tokens, SourceOffset end,
                 SourceId sourceId)
    {
        // Check whether we can continue the current token path with the given
        // character without visiting an already visited node
        auto it = node->children.find(c);
        if (it == node->children.end()) {
            return;
        }

        // Check whether the new node represents a complete token and whether
        // it is longer than the current token. If yes, replace the current
        // token. Note that "longest match wins" is implemented by comparing
        // against the content length of the previously recorded match.
        node = it->second.get();
        if (node->type != EmptyToken) {
            const std::string &str = tokens[node->type];
            size_t len = str.size();
            if (len > match.token.content.size()) {
                match.token =
                    DynamicToken{node->type, str, {sourceId, start, end}};
                match.textLength = textLength;
                match.textEnd = textEnd;
            }
        }

        // If this state can possibly be advanced, store it in the states list.
        if (!node->children.empty()) {
            lookups.emplace_back(*this);
        }
    }
};
+
+/* Internal class TextHandlerBase */
+
+/**
+ * Base class used for those classes that may be used as TextHandler in the
+ * DynamicTokenizer::next function.
+ */
+class TextHandlerBase {
+public:
+ /**
+ * Start position of the extracted text.
+ */
+ size_t textStart;
+
+ /**
+ * End position of the extracted text.
+ */
+ size_t textEnd;
+
+ /**
+ * Buffer containing the extracted text.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Constructor of the TextHandlerBase base class. Initializes the start and
+ * end position with zeros.
+ */
+ TextHandlerBase() : textStart(0), textEnd(0) {}
+
+ /**
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param token is the output token to which the text should be written.
+ * @param sourceId is the source id of the underlying file.
+ */
+ void buildTextToken(TokenMatch &match, SourceId sourceId)
+ {
+ if (match.hasMatch()) {
+ match.token.content =
+ std::string{textBuf.data(), match.textLength};
+ match.token.location =
+ SourceLocation{sourceId, textStart, match.textEnd};
+ } else {
+ match.token.content = std::string{textBuf.data(), textBuf.size()};
+ match.token.location = SourceLocation{sourceId, textStart, textEnd};
+ }
+ match.token.type = TextToken;
+ }
+
+ /**
+ * Returns true if this whitespace handler has found any text and a text
+ * token could be emitted.
+ *
+ * @return true if the internal data buffer is non-empty.
+ */
+ bool hasText() { return !textBuf.empty(); }
+};
+
+/* Internal class PreservingTextHandler */
+
+/**
+ * The PreservingTextHandler class preserves all characters unmodified,
+ * including whitepace characters.
+ */
+class PreservingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
+ /**
+ * Appends the given character to the internal text buffer, does not
+ * eliminate whitespace.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class TrimmingTextHandler */
+
+/**
+ * The TrimmingTextHandler class trims all whitespace characters at the begin
+ * and the end of a text section but leaves all other characters unmodified,
+ * including whitepace characters.
+ */
+class TrimmingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
+ /**
+ * Buffer used internally to temporarily store all whitespace characters.
+ * They are only added to the output buffer if another non-whitespace
+ * character is reached.
+ */
+ std::vector<char> whitespaceBuf;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * whitespace characters at the begin and end of the text.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ whitespaceBuf.push_back(c);
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (!whitespaceBuf.empty()) {
+ textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+ whitespaceBuf.end());
+ whitespaceBuf.clear();
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class CollapsingTextHandler */
+
+/**
+ * The CollapsingTextHandler trims characters at the beginning and end of the
+ * text and reduced multiple whitespace characters to a single blank.
+ */
+class CollapsingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
+ /**
+ * Flag set to true if a whitespace character was reached.
+ */
+ bool hasWhitespace = false;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * redundant whitespace characters.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ hasWhitespace = true;
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (hasWhitespace) {
+ textBuf.push_back(' ');
+ hasWhitespace = false;
+ }
+ textBuf.push_back(c);
+ }
+};
+}
+
/* Class DynamicTokenizer */

// Constructs the tokenizer with the given whitespace handling mode. Token
// type ids are handed out starting at index zero of the tokens list.
DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
    : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
{
}
+
// Main tokenizer loop; see the declaration in DynamicTokenizer.hpp for the
// full contract. The TextHandler template parameter selects the whitespace
// handling policy, the "read" parameter selects whether the read cursor or
// only the peek cursor of the CharReader is advanced.
template <typename TextHandler, bool read>
bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
{
    // If we're in the read mode, reset the char reader peek position to the
    // current read position
    if (read) {
        reader.resetPeek();
    }

    // Prepare the lookups in the token trie. "lookups" holds the partial
    // matches being advanced this iteration, "nextLookups" collects the
    // survivors for the next iteration.
    const TokenTrie::Node *root = trie.getRoot();
    TokenMatch match;
    std::vector<TokenLookup> lookups;
    std::vector<TokenLookup> nextLookups;

    // Instantiate the text handler
    TextHandler textHandler;

    // Peek characters from the reader and try to advance the current token
    // tree cursor
    char c;
    size_t charStart = reader.getPeekOffset();
    const SourceId sourceId = reader.getSourceId();
    while (reader.peek(c)) {
        const size_t charEnd = reader.getPeekOffset();
        const size_t textLength = textHandler.textBuf.size();
        const size_t textEnd = textHandler.textEnd;

        // If we do not have a match yet, start a new lookup from the root
        if (!match.hasMatch()) {
            TokenLookup{root, charStart, textLength, textEnd}.advance(
                c, nextLookups, match, tokens, charEnd, sourceId);
        }

        // Try to advance all other lookups with the new character
        for (TokenLookup &lookup : lookups) {
            lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
        }

        // We have found a token and there are no more states to advance or the
        // text handler has found something -- abort to return the new token
        if (match.hasMatch()) {
            if ((nextLookups.empty() || textHandler.hasText())) {
                break;
            }
        } else {
            // Record all incoming characters
            textHandler.append(c, charStart, charEnd);
        }

        // Swap the lookups and the nextLookups list; clear() puts the
        // moved-from vector back into a well-defined empty state
        lookups = std::move(nextLookups);
        nextLookups.clear();

        // Advance the offset
        charStart = charEnd;
    }

    // If we found text, emit that text instead of (or before) the matched
    // token -- unless the matched token starts at the very beginning of the
    // text buffer (match.textLength == 0)
    if (textHandler.hasText() &&
        (!match.hasMatch() || match.textLength > 0)) {
        textHandler.buildTextToken(match, sourceId);
    }

    // Move the read/peek cursor to the end of the token, abort if an error
    // happens while doing so
    if (match.hasMatch()) {
        // Make sure we have a valid location
        if (match.token.location.getEnd() == InvalidSourceOffset) {
            throw OusiaException{"Token end position offset out of range"};
        }

        // Seek to the end of the current token
        const size_t end = match.token.location.getEnd();
        if (read) {
            reader.seek(end);
        } else {
            reader.seekPeekCursor(end);
        }
        token = match.token;
    } else {
        token = DynamicToken{};
    }
    return match.hasMatch();
}
+
+bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, true>(reader, token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, true>(reader, token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, true>(reader, token);
+ }
+ return false;
+}
+
+bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, false>(reader, token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, false>(reader, token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, false>(reader, token);
+ }
+ return false;
+}
+
TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
{
    // Abort if an empty token should be registered
    if (token.empty()) {
        return EmptyToken;
    }

    // Search for a free slot in the tokens list, starting at the slot most
    // recently freed by unregisterToken (nextTokenTypeId)
    TokenTypeId type = EmptyToken;
    for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
        if (tokens[i].empty()) {
            tokens[i] = token;
            type = i;
            break;
        }
    }

    // No existing slot was found, add a new one -- make sure we do not
    // override the special token type handles (TextToken and EmptyToken;
    // presumably the largest TokenTypeId values -- confirm against their
    // declarations)
    if (type == EmptyToken) {
        type = tokens.size();
        if (type == TextToken || type == EmptyToken) {
            throw OusiaException{"Token type ids depleted!"};
        }
        tokens.emplace_back(token);
    }
    nextTokenTypeId = type + 1;

    // Try to register the token in the trie -- if this fails, remove it
    // from the tokens list again and report failure via EmptyToken
    if (!trie.registerToken(token, type)) {
        tokens[type] = std::string();
        nextTokenTypeId = type;
        return EmptyToken;
    }
    return type;
}
+
+bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+{
+ // Unregister the token from the trie, abort if an invalid type is given
+ if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+ tokens[type] = std::string{};
+ nextTokenTypeId = type;
+ return true;
+ }
+ return false;
+}
+
+std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+{
+ if (type < tokens.size()) {
+ return tokens[type];
+ }
+ return std::string{};
+}
+
// Sets the whitespace handling mode; takes effect for the next call to
// "read" or "peek".
void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
{
    whitespaceMode = mode;
}
+
+WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+
+/* Explicitly instantiate all possible instantiations of the "next" member
+ function */
+template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
+ CharReader &reader,DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingTextHandler, true>(
+ CharReader &reader,DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
+ CharReader &reader,DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
+ CharReader &reader,DynamicToken &token);
+}
+
diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp
new file mode 100644
index 0000000..0cac2e8
--- /dev/null
+++ b/src/formats/osdm/DynamicTokenizer.hpp
@@ -0,0 +1,252 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file DynamicTokenizer.hpp
+ *
+ * Tokenizer that can be reconfigured at runtime used for parsing the plain
+ * text format.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <core/common/Location.hpp>
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
/**
 * The DynamicToken structure describes a token discovered by the Tokenizer.
 */
struct DynamicToken {
    /**
     * Id of the type of this token.
     */
    TokenTypeId type;

    /**
     * String that was matched.
     */
    std::string content;

    /**
     * Location from which the string was extracted.
     */
    SourceLocation location;

    /**
     * Default constructor, initializes the type with EmptyToken.
     */
    DynamicToken() : type(EmptyToken) {}

    /**
     * Constructor of the DynamicToken struct.
     *
     * @param type is the id corresponding to the type of the token.
     * @param content is the string content that has been extracted.
     * @param location is the location of the extracted string content in the
     * source file.
     */
    DynamicToken(TokenTypeId type, const std::string &content,
                 SourceLocation location)
        : type(type), content(content), location(location)
    {
    }

    /**
     * Constructor of the DynamicToken struct, only initializes the token type.
     *
     * @param type is the id corresponding to the type of the token.
     */
    DynamicToken(TokenTypeId type) : type(type) {}

    /**
     * The getLocation function allows the tokens to be directly passed as
     * parameter to Logger or LoggableException instances.
     *
     * @return a reference at the location field
     */
    const SourceLocation &getLocation() const { return location; }
};
+
/**
 * Enum specifying the whitespace handling of the DynamicTokenizer class when
 * reading non-token text.
 */
enum class WhitespaceMode {
    /**
     * Preserves all whitespace as it is found in the source file.
     */
    PRESERVE,

    /**
     * Trims whitespace at the beginning and the end of the found text, inner
     * whitespace is left unmodified.
     */
    TRIM,

    /**
     * Whitespace is trimmed and collapsed: runs of whitespace characters
     * are replaced by a single space character.
     */
    COLLAPSE
};
+
/**
 * The DynamicTokenizer is used to extract tokens and chunks of text from a
 * CharReader. It allows to register and unregister tokens while parsing and
 * to modify the handling of whitespace characters. Note that the
 * DynamicTokenizer always tries to extract the longest possible token from the
 * tokenizer.
 */
class DynamicTokenizer {
private:
    /**
     * Internally used token trie. This object holds all registered tokens.
     */
    TokenTrie trie;

    /**
     * Flag defining whether whitespaces should be preserved or not.
     */
    WhitespaceMode whitespaceMode;

    /**
     * Vector containing all registered token types. The index into this
     * vector is the TokenTypeId; empty entries are free slots.
     */
    std::vector<std::string> tokens;

    /**
     * Next index in the tokens list where to search for a new token id.
     */
    size_t nextTokenTypeId;

    /**
     * Templated function used internally to read the current token. The
     * function is templated in order to force code generation for all six
     * combinations of whitespace modes and reading/peeking.
     *
     * @tparam TextHandler is the type to be used for the textHandler instance.
     * @tparam read specifies whether the function should start from and advance
     * the read pointer of the char reader.
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is the token structure into which the token information
     * should be written.
     * @return false if the end of the stream has been reached, true otherwise.
     */
    template <typename TextHandler, bool read>
    bool next(CharReader &reader, DynamicToken &token);

public:
    /**
     * Constructor of the DynamicTokenizer class.
     *
     * @param whitespaceMode specifies how whitespace should be handled.
     */
    DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);

    /**
     * Registers the given string as a token. Returns a unique TokenTypeId
     * that will be used to reference the newly created token.
     *
     * @param token is the token string that should be registered.
     * @return a unique identifier for the registered token or EmptyToken if
     * an error occured.
     */
    TokenTypeId registerToken(const std::string &token);

    /**
     * Unregisters the token belonging to the given TokenTypeId.
     *
     * @param type is the token type that should be unregistered. The
     * TokenTypeId must have been returned by registerToken.
     * @return true if the operation was successful, false otherwise (e.g.
     * because the token was already unregistered).
     */
    bool unregisterToken(TokenTypeId type);

    /**
     * Returns the token that was registered under the given TokenTypeId id or
     * an empty string if an invalid TokenTypeId id is given.
     *
     * @param type is the TokenTypeId id for which the corresponding token
     * string should be returned.
     * @return the registered token string or an empty string if the given type
     * was invalid.
     */
    std::string getTokenString(TokenTypeId type);

    /**
     * Sets the whitespace mode.
     *
     * @param mode defines how whitespace should be treated in text tokens.
     */
    void setWhitespaceMode(WhitespaceMode mode);

    /**
     * Returns the current value of the whitespace mode.
     *
     * @return the whitespace mode.
     */
    WhitespaceMode getWhitespaceMode();

    /**
     * Reads a new token from the CharReader and stores it in the given
     * DynamicToken instance.
     *
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is a reference at the token instance into which the Token
     * information should be written.
     * @return true if a token could be read, false if the end of the stream
     * has been reached.
     */
    bool read(CharReader &reader, DynamicToken &token);

    /**
     * The peek method does not advance the read position of the char reader,
     * but reads the next token from the current char reader peek position.
     *
     * @param reader is the CharReader instance from which the data should be
     * read.
     * @param token is a reference at the token instance into which the Token
     * information should be written.
     * @return true if a token could be read, false if the end of the stream
     * has been reached.
     */
    bool peek(CharReader &reader, DynamicToken &token);
};
+}
+
+#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
+
diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osdm/OsdmStreamParser.cpp
new file mode 100644
index 0000000..8cb8caf
--- /dev/null
+++ b/src/formats/osdm/OsdmStreamParser.cpp
@@ -0,0 +1,640 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Logger.hpp>
+#include <core/common/Utils.hpp>
+#include <core/common/VariantReader.hpp>
+
+#include "OsdmStreamParser.hpp"
+
+namespace ousia {
+
/**
 * Plain format default tokenizer. Registers the fixed set of structural
 * tokens used by the osdm format. Note that the token ids are assigned by
 * registerToken in registration order.
 */
class PlainFormatTokens : public DynamicTokenizer {
public:
    /**
     * Id of the backslash token.
     */
    TokenTypeId Backslash;

    /**
     * Id of the line comment token.
     */
    TokenTypeId LineComment;

    /**
     * Id of the block comment start token.
     */
    TokenTypeId BlockCommentStart;

    /**
     * Id of the block comment end token.
     */
    TokenTypeId BlockCommentEnd;

    /**
     * Id of the field start token.
     */
    TokenTypeId FieldStart;

    /**
     * Id of the field end token.
     */
    TokenTypeId FieldEnd;

    /**
     * Registers the plain format tokens in the internal tokenizer. Note that
     * "%" is a prefix of "%{" -- the tokenizer's longest-match behaviour is
     * what disambiguates the two.
     */
    PlainFormatTokens()
    {
        Backslash = registerToken("\\");
        LineComment = registerToken("%");
        BlockCommentStart = registerToken("%{");
        BlockCommentEnd = registerToken("}%");
        FieldStart = registerToken("{");
        FieldEnd = registerToken("}");
    }
};

// Shared default token table; copied into each parser's tokenizer instance.
static const PlainFormatTokens Tokens;
+
+/**
+ * Class used internally to collect data issued via "DATA" event.
+ */
+class DataHandler {
+private:
+ /**
+ * Internal character buffer.
+ */
+ std::vector<char> buf;
+
+ /**
+ * Start location of the character data.
+ */
+ SourceOffset start;
+
+ /**
+ * End location of the character data.
+ */
+ SourceOffset end;
+
+public:
+ /**
+ * Default constructor, initializes start and end with zeros.
+ */
+ DataHandler() : start(0), end(0) {}
+
+ /**
+ * Returns true if the internal buffer is empty.
+ *
+ * @return true if no characters were added to the internal buffer, false
+ * otherwise.
+ */
+ bool isEmpty() { return buf.empty(); }
+
+ /**
+ * Appends a single character to the internal buffer.
+ *
+ * @param c is the character that should be added to the internal buffer.
+ * @param charStart is the start position of the character.
+ * @param charEnd is the end position of the character.
+ */
+ void append(char c, SourceOffset charStart, SourceOffset charEnd)
+ {
+ if (isEmpty()) {
+ start = charStart;
+ }
+ buf.push_back(c);
+ end = charEnd;
+ }
+
+ /**
+ * Appends a string to the internal buffer.
+ *
+ * @param s is the string that should be added to the internal buffer.
+ * @param stringStart is the start position of the string.
+ * @param stringEnd is the end position of the string.
+ */
+ void append(const std::string &s, SourceOffset stringStart,
+ SourceOffset stringEnd)
+ {
+ if (isEmpty()) {
+ start = stringStart;
+ }
+ std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
+ end = stringEnd;
+ }
+
+ /**
+ * Converts the internal buffer to a variant with attached location
+ * information.
+ *
+ * @param sourceId is the source id which is needed for building the
+ * location information.
+ * @return a Variant with the internal buffer content as string and
+ * the correct start and end location.
+ */
+ Variant toVariant(SourceId sourceId)
+ {
+ Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
+ res.setLocation({sourceId, start, end});
+ return res;
+ }
+};
+
// Constructs the parser; the tokenizer is initialized as a copy of the shared
// default token table.
OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger)
    : reader(reader), logger(logger), tokenizer(Tokens)
{
    // Place an initial command representing the complete file on the stack.
    // NOTE(review): the three boolean flags presumably correspond to
    // hasRange/inField/inRangeField -- confirm against the Command definition.
    commands.push(Command{"", Variant::mapType{}, true, true, true});
}
+
// Reads an identifier starting at the given offset. If allowNSSep is set,
// ':' characters surrounded by identifier characters are accepted as
// namespace separators. Returns a string Variant carrying the identifier's
// source location.
Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
    // "first" is true while no character has been consumed yet;
    // "hasCharSiceNSSep" [sic] is true if at least one identifier character
    // followed the most recent namespace separator
    bool first = true;
    bool hasCharSiceNSSep = false;
    std::vector<char> identifier;
    size_t end = reader.getPeekOffset();
    char c, c2;
    while (reader.peek(c)) {
        // Abort if this character is not a valid identifier character
        if ((first && Utils::isIdentifierStartCharacter(c)) ||
            (!first && Utils::isIdentifierCharacter(c))) {
            identifier.push_back(c);
        } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
                   Utils::isIdentifierStartCharacter(c2)) {
            // A ':' is only consumed as a namespace separator if it is
            // preceded and followed by identifier characters (fetchPeek
            // presumably looks ahead without consuming -- confirm against
            // CharReader)
            identifier.push_back(c);
        } else {
            // A ':' that cannot extend the identifier while separators are
            // allowed indicates a misplaced namespace separator
            if (c == ':' && allowNSSep) {
                logger.error(
                    "Expected character before and after namespace separator "
                    "\":\"",
                    reader);
            }
            reader.resetPeek();
            break;
        }

        // This is no longer the first character
        first = false;

        // Advance the hasCharSiceNSSep flag
        hasCharSiceNSSep = allowNSSep && (c != ':');

        end = reader.getPeekOffset();
        reader.consumePeek();
    }

    // Return the identifier at its location
    Variant res =
        Variant::fromString(std::string(identifier.data(), identifier.size()));
    res.setLocation({reader.getSourceId(), start, end});
    return res;
}
+
// Parses a "\begin{name#arg}[args]" construct and pushes the corresponding
// range command onto the command stack.
OsdmStreamParser::State OsdmStreamParser::parseBeginCommand()
{
    // Expect a '{' after the command
    reader.consumeWhitespace();
    if (!reader.expect('{')) {
        logger.error("Expected \"{\" after \\begin", reader);
        return State::NONE;
    }

    // Parse the name of the command that should be opened
    Variant commandName = parseIdentifier(reader.getOffset(), true);
    if (commandName.asString().empty()) {
        logger.error("Expected identifier", commandName);
        return State::ERROR;
    }

    // Check whether the next character is a '#', indicating the start of the
    // "name" argument shorthand
    Variant commandArgName;
    SourceOffset start = reader.getOffset();
    if (reader.expect('#')) {
        commandArgName = parseIdentifier(start);
        if (commandArgName.asString().empty()) {
            logger.error("Expected identifier after \"#\"", commandArgName);
        }
    }

    // The braced command name must be closed before the argument list
    if (!reader.expect('}')) {
        logger.error("Expected \"}\"", reader);
        return State::ERROR;
    }

    // Parse the arguments
    Variant commandArguments = parseCommandArguments(std::move(commandArgName));

    // Push the command onto the command stack
    pushCommand(std::move(commandName), std::move(commandArguments), true);

    return State::COMMAND;
}
+
+static bool checkStillInField(const OsdmStreamParser::Command &cmd,
+ const Variant &endName, Logger &logger)
+{
+ if (cmd.inField && !cmd.inRangeField) {
+ logger.error(std::string("\\end in open field of command \"") +
+ cmd.name.asString() + std::string("\""),
+ endName);
+ logger.note(std::string("Open command started here:"), cmd.name);
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Handles the special "\end" command: reads the "{name}" part, unrolls the
+ * command stack up to the innermost range command and ends it.
+ */
+OsdmStreamParser::State OsdmStreamParser::parseEndCommand()
+{
+	// Expect a '{' after the command
+	if (!reader.expect('{')) {
+		logger.error("Expected \"{\" after \\end", reader);
+		return State::NONE;
+	}
+
+	// Fetch the name of the command that should be ended here
+	Variant name = parseIdentifier(reader.getOffset(), true);
+
+	// Make sure the given command name is not empty
+	if (name.asString().empty()) {
+		logger.error("Expected identifier", name);
+		return State::ERROR;
+	}
+
+	// Make sure the command name is terminated with a '}'
+	if (!reader.expect('}')) {
+		logger.error("Expected \"}\"", reader);
+		return State::ERROR;
+	}
+
+	// Unroll the command stack up to the last range command, aborting if an
+	// explicitly opened field is crossed
+	while (!commands.top().hasRange) {
+		if (checkStillInField(commands.top(), name, logger)) {
+			return State::ERROR;
+		}
+		commands.pop();
+	}
+
+	// Make sure we're not in an open field of this command
+	if (checkStillInField(commands.top(), name, logger)) {
+		return State::ERROR;
+	}
+
+	// Special error message if the top-level command is reached
+	if (commands.size() == 1) {
+		logger.error(std::string("Cannot end command \"") + name.asString() +
+		                 std::string("\" here, no command open"),
+		             name);
+		return State::ERROR;
+	}
+
+	// Inform the user about command name mismatches. Fixed: the two names
+	// were previously swapped in the message -- "name" is the command the
+	// user tries to end, "cmd.name" is the command that is actually open.
+	const Command &cmd = commands.top();
+	if (cmd.name.asString() != name.asString()) {
+		logger.error(std::string("Trying to end command \"") +
+		                 name.asString() +
+		                 std::string("\", but open command is \"") +
+		                 cmd.name.asString() + std::string("\""),
+		             name);
+		logger.note("Last command was opened here:", cmd.name);
+		return State::ERROR;
+	}
+
+	// Read "inRangeField" before popping the stack -- "cmd" references the
+	// top element and dangles once it is removed (fixes use-after-pop UB).
+	const bool wasInRangeField = cmd.inRangeField;
+
+	// Set the location to the location of the command that was ended, then
+	// end the current command
+	location = name.getLocation();
+	commands.pop();
+	return wasInRangeField ? State::FIELD_END : State::NONE;
+}
+
+/**
+ * Parses the "[...]" argument map of a command (if present) and merges the
+ * optional "#name" shorthand into it under the key "name".
+ *
+ * @param commandArgName is the name parsed from the "#" shorthand, or a
+ * non-string variant if no shorthand was given.
+ * @return a map variant containing the command arguments.
+ */
+Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName)
+{
+	// Parse the arguments using the universal VariantReader
+	Variant commandArguments;
+	if (reader.expect('[')) {
+		auto res = VariantReader::parseObject(reader, logger, ']');
+		commandArguments = res.second;
+	} else {
+		commandArguments = Variant::mapType{};
+	}
+
+	// Insert the parsed name, make sure "name" was not specified in the
+	// arguments. Note: "commandArgName" is intentionally copied (not moved)
+	// into the map -- it is still needed for the error notes below, and its
+	// state after a failed emplace with a moved argument is unspecified
+	// (fixes a use-after-move).
+	if (commandArgName.isString()) {
+		auto res = commandArguments.asMap().emplace("name", commandArgName);
+		if (!res.second) {
+			logger.error("Name argument specified multiple times",
+			             SourceLocation{}, MessageMode::NO_CONTEXT);
+			logger.note("First occurrence is here: ", commandArgName);
+			logger.note("Second occurrence is here: ", res.first->second);
+		}
+	}
+	return commandArguments;
+}
+
+/**
+ * Places a freshly parsed command on top of the command stack, implicitly
+ * ending all commands that do not have an open field.
+ */
+void OsdmStreamParser::pushCommand(Variant commandName,
+                                   Variant commandArguments, bool hasRange)
+{
+	// Remember the location of the command that is being pushed
+	location = commandName.getLocation();
+
+	// Commands without an open field are implicitly ended by the new
+	// command -- drop them from the stack first
+	while (!commands.top().inField) {
+		commands.pop();
+	}
+
+	// Store the new command on top of the stack; it starts outside of any
+	// field
+	Command cmd{std::move(commandName), std::move(commandArguments), hasRange,
+	            false, false};
+	commands.push(std::move(cmd));
+}
+
+/**
+ * Parses a single "\command" construct: the (possibly namespaced) command
+ * name, the optional "#name" shorthand and the optional "[...]" arguments.
+ * Dispatches to the special handlers for "\begin" and "\end".
+ *
+ * @param start is the byte offset of the command (including the backslash).
+ * @return the parser state the caller should report.
+ */
+OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start)
+{
+	// Parse the commandName as a first identifier
+	Variant commandName = parseIdentifier(start, true);
+	if (commandName.asString().empty()) {
+		logger.error("Empty command name", reader);
+		return State::NONE;
+	}
+
+	// Handle the special "begin" and "end" commands -- split() always
+	// returns at least one component for a non-empty string
+	const auto commandNameComponents =
+	    Utils::split(commandName.asString(), ':');
+	const bool isBegin = commandNameComponents[0] == "begin";
+	const bool isEnd = commandNameComponents[0] == "end";
+	if (isBegin || isEnd) {
+		if (commandNameComponents.size() > 1) {
+			logger.error(
+			    "Special commands \"\\begin\" and \"\\end\" may not contain a "
+			    "namespace separator \":\"",
+			    commandName);
+		}
+		if (isBegin) {
+			return parseBeginCommand();
+		} else if (isEnd) {
+			return parseEndCommand();
+		}
+	}
+
+	// Check whether the next character is a '#', indicating the start of the
+	// command name
+	Variant commandArgName;
+	start = reader.getOffset();
+	if (reader.expect('#')) {
+		commandArgName = parseIdentifier(start);
+		if (commandArgName.asString().empty()) {
+			logger.error("Expected identifier after \"#\"", commandArgName);
+		}
+	}
+
+	// Parse the arguments
+	Variant commandArguments = parseCommandArguments(std::move(commandArgName));
+
+	// Push the command onto the command stack -- this is not a range command
+	pushCommand(std::move(commandName), std::move(commandArguments), false);
+
+	return State::COMMAND;
+}
+
+/**
+ * Skips a (possibly nested) block comment. Assumes the opening token has
+ * already been consumed.
+ */
+void OsdmStreamParser::parseBlockComment()
+{
+	// Track the nesting depth and consume tokens until the comment that
+	// started this call is closed
+	DynamicToken token;
+	size_t depth = 1;
+	while (tokenizer.read(reader, token)) {
+		if (token.type == Tokens.BlockCommentStart) {
+			depth++;
+		} else if (token.type == Tokens.BlockCommentEnd) {
+			if (--depth == 0) {
+				return;
+			}
+		}
+	}
+
+	// Issue an error if the file ends while we are in a block comment
+	logger.error("File ended while being in a block comment", reader);
+}
+
+/**
+ * Skips a line comment by consuming all characters up to and including the
+ * next linebreak (or until the end of the stream).
+ */
+void OsdmStreamParser::parseLineComment()
+{
+	for (char c; reader.read(c);) {
+		if (c == '\n') {
+			break;
+		}
+	}
+}
+
+/**
+ * Converts any pending character data to a string variant and tells the
+ * caller whether a DATA state should be issued.
+ */
+bool OsdmStreamParser::checkIssueData(DataHandler &handler)
+{
+	// Nothing collected -- nothing to issue
+	if (handler.isEmpty()) {
+		return false;
+	}
+
+	// Convert the buffered characters (including their source location) and
+	// reset the peek cursor, as the pending token will be read again
+	data = handler.toVariant(reader.getSourceId());
+	location = data.getLocation();
+	reader.resetPeek();
+	return true;
+}
+
+/**
+ * Called before content is appended -- decides whether a range command
+ * implicitly enters its field (FIELD_START must be issued) or whether
+ * fieldless commands have to be implicitly ended.
+ */
+bool OsdmStreamParser::checkIssueFieldStart()
+{
+	// Nothing to do if we already are inside a field of the current command
+	Command &cmd = commands.top();
+	if (cmd.inField) {
+		return false;
+	}
+
+	// A range command implicitly enters its range field as soon as content
+	// arrives -- the caller has to issue a FIELD_START event
+	if (cmd.hasRange) {
+		cmd.inField = true;
+		cmd.inRangeField = true;
+		reader.resetPeek();
+		return true;
+	}
+
+	// Otherwise the content belongs to some outer command -- unroll the
+	// command stack until a command with an open field is reached
+	while (!commands.top().inField) {
+		commands.pop();
+	}
+	return false;
+}
+
+/**
+ * Main parsing routine -- reads tokens from the underlying tokenizer until
+ * one of the states of the State enum is reached and returns that state.
+ */
+OsdmStreamParser::State OsdmStreamParser::parse()
+{
+	// Handler for incoming data
+	DataHandler handler;
+
+	// Read tokens until the outer loop should be left
+	DynamicToken token;
+	while (tokenizer.peek(reader, token)) {
+		const TokenTypeId type = token.type;
+
+		// Special handling for Backslash and Text
+		if (type == Tokens.Backslash) {
+			// Before appending anything to the output data or starting a new
+			// command, check whether FIELD_START has to be issued, as the
+			// current command is a command with range
+			if (checkIssueFieldStart()) {
+				location = token.location;
+				return State::FIELD_START;
+			}
+
+			// Check whether a command starts now, without advancing the peek
+			// cursor
+			char c;
+			if (!reader.fetchPeek(c)) {
+				logger.error("Trailing backslash at the end of the file.",
+				             token);
+				return State::END;
+			}
+
+			// Try to parse a command
+			if (Utils::isIdentifierStartCharacter(c)) {
+				// Make sure to issue any data before it is too late
+				if (checkIssueData(handler)) {
+					return State::DATA;
+				}
+
+				// Parse the actual command
+				State res = parseCommand(token.location.getStart());
+				switch (res) {
+					case State::ERROR:
+						throw LoggableException(
+						    "Last error was irrecoverable, ending parsing "
+						    "process");
+					case State::NONE:
+						continue;
+					default:
+						return res;
+				}
+			}
+
+			// This was not a special character, just append the given character
+			// to the data buffer, use the escape character start as start
+			// location and the peek offset as end location
+			reader.peek(c);  // Peek the previously fetched character
+			handler.append(c, token.location.getStart(),
+			               reader.getPeekOffset());
+			reader.consumePeek();
+			continue;
+		} else if (type == TextToken) {
+			// Check whether FIELD_START has to be issued before appending text
+			if (checkIssueFieldStart()) {
+				location = token.location;
+				return State::FIELD_START;
+			}
+
+			// Append the text to the data handler
+			handler.append(token.content, token.location.getStart(),
+			               token.location.getEnd());
+
+			reader.consumePeek();
+			continue;
+		}
+
+		// A non-text token was reached, make sure all pending data commands
+		// have been issued
+		if (checkIssueData(handler)) {
+			return State::DATA;
+		}
+
+		// We will handle the token now, consume the peeked characters
+		reader.consumePeek();
+
+		// Update the location to the current token location
+		location = token.location;
+
+		if (token.type == Tokens.LineComment) {
+			parseLineComment();
+		} else if (token.type == Tokens.BlockCommentStart) {
+			parseBlockComment();
+		} else if (token.type == Tokens.FieldStart) {
+			// An explicit "{" opens a field of the current command -- but
+			// only one field may be open at a time
+			Command &cmd = commands.top();
+			if (!cmd.inField) {
+				cmd.inField = true;
+				return State::FIELD_START;
+			}
+			logger.error(
+			    "Got field start token \"{\", but no command for which to "
+			    "start the field. Did you mean \"\\{\"?",
+			    token);
+		} else if (token.type == Tokens.FieldEnd) {
+			// Try to end an open field of the current command -- if the current
+			// command is not inside an open field, end this command and try to
+			// close the next one. Range fields cannot be closed with "}",
+			// so the loop stops (and errors) when one is reached.
+			for (int i = 0; i < 2 && commands.size() > 1; i++) {
+				Command &cmd = commands.top();
+				if (!cmd.inRangeField) {
+					if (cmd.inField) {
+						cmd.inField = false;
+						return State::FIELD_END;
+					}
+					commands.pop();
+				} else {
+					break;
+				}
+			}
+			logger.error(
+			    "Got field end token \"}\", but there is no field to end. Did "
+			    "you mean \"\\}\"?",
+			    token);
+		} else {
+			logger.error("Unexpected token \"" + token.content + "\"", token);
+		}
+	}
+
+	// Issue available data
+	if (checkIssueData(handler)) {
+		return State::DATA;
+	}
+
+	// Make sure all open commands and fields have been ended at the end of the
+	// stream
+	while (commands.size() > 1) {
+		Command &cmd = commands.top();
+		if (cmd.inField || cmd.hasRange) {
+			logger.error("Reached end of stream, but command \"" +
+			                 cmd.name.asString() + "\" has not been ended",
+			             cmd.name);
+		}
+		commands.pop();
+	}
+
+	location = SourceLocation{reader.getSourceId(), reader.getOffset()};
+	return State::END;
+}
+
+// Returns the name of the command on top of the command stack -- only valid
+// after "parse" returned State::COMMAND.
+const Variant &OsdmStreamParser::getCommandName()
+{
+	return commands.top().name;
+}
+
+// Returns the arguments of the command on top of the command stack -- only
+// valid after "parse" returned State::COMMAND.
+const Variant &OsdmStreamParser::getCommandArguments()
+{
+	return commands.top().arguments;
+}
+}
+
diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osdm/OsdmStreamParser.hpp
new file mode 100644
index 0000000..48d8fb7
--- /dev/null
+++ b/src/formats/osdm/OsdmStreamParser.hpp
@@ -0,0 +1,351 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file OsdmStreamParser.hpp
+ *
+ * Provides low-level classes for reading the TeX-esque osdm format. The
+ * class provided here does not build any model objects and does not
+ * implement the Parser interface.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_
+#define _OUSIA_OSDM_STREAM_PARSER_HPP_
+
+#include <stack>
+
+#include <core/common/Variant.hpp>
+
+#include "DynamicTokenizer.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+class Logger;
+class DataHandler;
+
+/**
+ * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm
+ * format. The parser is constructed around a "parse" function, which reads data
+ * from the underlying CharReader until a new state is reached and indicates
+ * this state in a return value. The calling code then has to pull corresponding
+ * data from the stream reader. The reader makes sure the incoming file is
+ * syntactically valid and tries to recover from most errors. If an error is
+ * irrecoverable (this is the case for errors with wrong nesting of commands or
+ * fields, as this would lead to too many consecutive errors) a
+ * LoggableException is thrown.
+ */
+class OsdmStreamParser {
+public:
+	/**
+	 * Enum used to indicate which state the OsdmStreamParser class is in
+	 * after calling the "parse" function.
+	 */
+	enum class State {
+		/**
+		 * State returned if a fully featured command has been read. A command
+		 * consists of the command name and its arguments (which optionally
+		 * includes the name).
+		 */
+		COMMAND,
+
+		/**
+		 * State returned if data is given. The reader must decide which field
+		 * or command this should be routed to. Trailing or leading whitespace
+		 * has been removed. Only called if the data is non-empty.
+		 */
+		DATA,
+
+		/**
+		 * A user-defined entity has been found. The entity sequence is stored
+		 * in the command name.
+		 */
+		ENTITY,
+
+		/**
+		 * State returned if an annotation was started. An annotation consists
+		 * of the command name and its arguments (which optionally include the
+		 * name).
+		 */
+		ANNOTATION_START,
+
+		/**
+		 * State returned if an annotation ends. The reader indicates which
+		 * annotation ends.
+		 */
+		ANNOTATION_END,
+
+		/**
+		 * State returned if a new field started. The reader assures that the
+		 * current field ends before a new field is started and that the field
+		 * is not started if data has been given outside of a field. The
+		 * field number is set to the current field index.
+		 */
+		FIELD_START,
+
+		/**
+		 * State returned if the current field ends. The reader assures that a
+		 * field was actually open.
+		 */
+		FIELD_END,
+
+		/**
+		 * The end of the stream has been reached.
+		 */
+		END,
+
+		/**
+		 * Returned from internal functions if nothing should be done.
+		 */
+		NONE,
+
+		/**
+		 * Returned from internal functions to indicate irrecoverable errors.
+		 */
+		ERROR
+	};
+
+	/**
+	 * Entry used for the command stack.
+	 */
+	struct Command {
+		/**
+		 * Name and location of the current command.
+		 */
+		Variant name;
+
+		/**
+		 * Arguments that were passed to the command.
+		 */
+		Variant arguments;
+
+		/**
+		 * Set to true if this is a command with clear begin and end.
+		 */
+		bool hasRange;
+
+		/**
+		 * Set to true if we are currently inside a field of this command.
+		 */
+		bool inField;
+
+		/**
+		 * Set to true if we are currently in the range field of the command
+		 * (implies inField being set to true).
+		 */
+		bool inRangeField;
+
+		/**
+		 * Default constructor.
+		 */
+		Command() : hasRange(false), inField(false), inRangeField(false) {}
+
+		/**
+		 * Constructor of the Command class.
+		 *
+		 * @param name is a string variant with name and location of the
+		 * command.
+		 * @param arguments is a map variant with the arguments given to the
+		 * command.
+		 * @param hasRange should be set to true if this is a command with
+		 * explicit range.
+		 * @param inField is set to true if we currently are inside a field
+		 * of this command.
+		 * @param inRangeField is set to true if we are currently inside the
+		 * outer field of the command.
+		 */
+		Command(Variant name, Variant arguments, bool hasRange, bool inField,
+		        bool inRangeField)
+		    : name(std::move(name)),
+		      arguments(std::move(arguments)),
+		      hasRange(hasRange),
+		      inField(inField),
+		      inRangeField(inRangeField)
+		{
+		}
+	};
+
+private:
+	/**
+	 * Reference to the CharReader instance from which the incoming bytes are
+	 * read.
+	 */
+	CharReader &reader;
+
+	/**
+	 * Reference at the logger instance to which all error messages are sent.
+	 */
+	Logger &logger;
+
+	/**
+	 * Tokenizer instance used to read individual tokens from the text.
+	 */
+	DynamicTokenizer tokenizer;
+
+	/**
+	 * Stack containing the current commands.
+	 */
+	std::stack<Command> commands;
+
+	/**
+	 * Variant containing the data that has been read (always is a string,
+	 * contains the exact location of the data in the source file).
+	 */
+	Variant data;
+
+	/**
+	 * Contains the location of the last token.
+	 */
+	SourceLocation location;
+
+	/**
+	 * Contains the field index of the current command.
+	 */
+	size_t fieldIdx;
+
+	/**
+	 * Function used internally to parse an identifier.
+	 *
+	 * @param start is the start byte offset of the identifier (including the
+	 * backslash).
+	 * @param allowNSSep should be set to true if the namespace separator is
+	 * allowed in the identifier name. Issues error if the namespace separator
+	 * is placed incorrectly.
+	 */
+	Variant parseIdentifier(size_t start, bool allowNSSep = false);
+
+	/**
+	 * Function used internally to handle the special "\begin" command.
+	 */
+	State parseBeginCommand();
+
+	/**
+	 * Function used internally to handle the special "\end" command.
+	 */
+	State parseEndCommand();
+
+	/**
+	 * Pushes the parsed command onto the command stack.
+	 */
+	void pushCommand(Variant commandName, Variant commandArguments,
+	                 bool hasRange);
+
+	/**
+	 * Parses the command arguments.
+	 */
+	Variant parseCommandArguments(Variant commandArgName);
+
+	/**
+	 * Function used internally to parse a command.
+	 *
+	 * @param start is the start byte offset of the command (including the
+	 * backslash)
+	 * @return the state the parser is in after the command was parsed.
+	 */
+	State parseCommand(size_t start);
+
+	/**
+	 * Function used internally to parse a block comment.
+	 */
+	void parseBlockComment();
+
+	/**
+	 * Function used internally to parse a generic comment.
+	 */
+	void parseLineComment();
+
+	/**
+	 * Checks whether there is any data pending to be issued, if yes, issues it.
+	 *
+	 * @param handler is the data handler that contains the data that may be
+	 * returned to the user.
+	 * @return true if there was any data and DATA should be returned by the
+	 * parse function, false otherwise.
+	 */
+	bool checkIssueData(DataHandler &handler);
+
+	/**
+	 * Called before any data is appended to the internal data handler. Checks
+	 * whether a new field should be started or implicitly ended.
+	 *
+	 * @return true if FIELD_START should be returned by the parse function.
+	 */
+	bool checkIssueFieldStart();
+
+public:
+	/**
+	 * Constructor of the OsdmStreamParser class. Attaches the new
+	 * OsdmStreamParser to the given CharReader and Logger instances.
+	 *
+	 * @param reader is the reader instance from which incoming characters
+	 * should be read.
+	 * @param logger is the logger instance to which errors should be written.
+	 */
+	OsdmStreamParser(CharReader &reader, Logger &logger);
+
+	/**
+	 * Continues parsing. Returns one of the states defined in the State enum.
+	 * Callers should stop once the State::END state is reached. Use the getter
+	 * functions to get more information about the current state, such as the
+	 * command name or the data or the current field index.
+	 *
+	 * @return the new state the parser has reached.
+	 */
+	State parse();
+
+	/**
+	 * Returns a reference at the internally stored data. Only valid if
+	 * State::DATA was returned by the "parse" function.
+	 *
+	 * @return a reference at a variant containing the data parsed by the
+	 * "parse" function.
+	 */
+	const Variant &getData() { return data; }
+
+	/**
+	 * Returns a reference at the internally stored command name. Only valid if
+	 * State::COMMAND was returned by the "parse" function.
+	 *
+	 * @return a reference at a variant containing name and location of the
+	 * parsed command.
+	 */
+	const Variant &getCommandName();
+
+	/**
+	 * Returns a reference at the internally stored command arguments. Only
+	 * valid if State::COMMAND was returned by the "parse" function.
+	 *
+	 * @return a reference at a variant containing arguments given to the
+	 * command.
+	 */
+	const Variant &getCommandArguments();
+
+	/**
+	 * Returns a reference at the location of the last token that was handled
+	 * by the parser.
+	 *
+	 * @return the last internal token location.
+	 */
+	SourceLocation &getLocation() { return location; }
+};
+}
+
+#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */
+
diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp
new file mode 100644
index 0000000..4a0430b
--- /dev/null
+++ b/src/formats/osdm/TokenTrie.cpp
@@ -0,0 +1,119 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "TokenTrie.hpp"
+
+namespace ousia {
+
+/* Class DynamicTokenTree::Node */
+
+// Default constructor -- a fresh node carries no token (type == EmptyToken)
+TokenTrie::Node::Node() : type(EmptyToken) {}
+
+/* Class DynamicTokenTree */
+
+/**
+ * Registers the given token string in the trie, creating intermediate nodes
+ * as needed. Fails for the empty string and for duplicate registrations.
+ */
+bool TokenTrie::registerToken(const std::string &token,
+                              TokenTypeId type) noexcept
+{
+	// Reject the empty token -- it would assign a type to the root node
+	if (token.empty()) {
+		return false;
+	}
+
+	// Walk the trie along the token characters, creating missing nodes on
+	// the fly
+	Node *node = &root;
+	for (char c : token) {
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			it = node->children.emplace(c, std::make_shared<Node>()).first;
+		}
+		node = it->second.get();
+	}
+
+	// A token with this exact character sequence is already registered
+	if (node->type != EmptyToken) {
+		return false;
+	}
+
+	// Attach the given type to the final node
+	node->type = type;
+	return true;
+}
+
+/**
+ * Removes the given token from the trie, pruning the deepest subtree that is
+ * used by this token alone.
+ */
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
+{
+	// We cannot remove empty tokens as we need to access the first character
+	// upfront
+	if (token.empty()) {
+		return false;
+	}
+
+	// First pass -- search the deepest node in the path below which the
+	// remaining chain of nodes is used exclusively by this token and can
+	// therefore be deleted
+	Node *subtreeRoot = &root;
+	char subtreeKey = token[0];
+	Node *node = &root;
+	for (size_t i = 0; i < token.size(); i++) {
+		// Go to the next node, abort if the tree ends unexpectedly
+		auto it = node->children.find(token[i]);
+		if (it == node->children.end()) {
+			return false;
+		}
+
+		// Reset the subtree handler if this node is shared with another
+		// token (it carries a type or branches), unless it is the last node
+		node = it->second.get();
+		if ((node->type != EmptyToken || node->children.size() > 1) &&
+		    (i + 1 != token.size())) {
+			subtreeRoot = node;
+			subtreeKey = token[i + 1];
+		}
+	}
+
+	// If the node type is already EmptyToken, we cannot do anything here --
+	// the token was never registered
+	if (node->type == EmptyToken) {
+		return false;
+	}
+
+	// If the target node has children, we cannot delete the subtree. Set the
+	// type to EmptyToken instead
+	if (!node->children.empty()) {
+		node->type = EmptyToken;
+		return true;
+	}
+
+	// If we end up here, we can safely delete the complete subtree
+	subtreeRoot->children.erase(subtreeKey);
+	return true;
+}
+
+/**
+ * Looks up the given token string and returns the type id stored at its
+ * final node, or EmptyToken if the token is not registered.
+ */
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
+{
+	// Follow the trie along the characters of the given token
+	const Node *node = &root;
+	for (char c : token) {
+		auto it = node->children.find(c);
+		if (it == node->children.end()) {
+			return EmptyToken;
+		}
+		node = it->second.get();
+	}
+	return node->type;
+}
+}
+
diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp
new file mode 100644
index 0000000..36c2ffa
--- /dev/null
+++ b/src/formats/osdm/TokenTrie.hpp
@@ -0,0 +1,150 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file TokenTrie.hpp
+ *
+ * Class representing a token trie that can be updated dynamically.
+ *
+ * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
+
+#include <cstdint>
+#include <memory>
+#include <limits>
+#include <unordered_map>
+
+namespace ousia {
+
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
+
+/**
+ * Token which is not a token.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Token which represents a text token.
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
+ *
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
+ * three. Then the token tree would look like this:
+ *
+ * \code{*.txt}
+ *        ~ (0)
+ *       /     \
+ *      a (2)   b (0)
+ *      |       |
+ *      a (0)   a (0)
+ *      |       |
+ *      b (1)   c (3)
+ * \endcode
+ *
+ * Where the number indicates the corresponding token descriptor identifier.
+ */
+class TokenTrie {
+public:
+	/**
+	 * Structure used to build the node tree.
+	 */
+	struct Node {
+		/**
+		 * Type used for the child map.
+		 */
+		using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
+
+		/**
+		 * Map from single characters to the corresponding child nodes.
+		 */
+		ChildMap children;
+
+		/**
+		 * Id of the token type attached to this node. Set to EmptyToken if
+		 * no token ends at this node.
+		 */
+		TokenTypeId type;
+
+		/**
+		 * Default constructor, initializes the type with EmptyToken.
+		 */
+		Node();
+	};
+
+private:
+	/**
+	 * Root node of the internal token tree.
+	 */
+	Node root;
+
+public:
+	/**
+	 * Registers a token containing the given string. Returns false if the
+	 * token already exists, true otherwise.
+	 *
+	 * @param token is the character sequence that should be registered as
+	 * token.
+	 * @param type is the descriptor that should be set for this token.
+	 * @return true if the operation is successful, false otherwise.
+	 */
+	bool registerToken(const std::string &token, TokenTypeId type) noexcept;
+
+	/**
+	 * Unregisters the token from the token tree. Returns true if the token was
+	 * unregistered successfully, false otherwise.
+	 *
+	 * @param token is the character sequence that should be unregistered.
+	 * @return true if the operation was successful, false otherwise.
+	 */
+	bool unregisterToken(const std::string &token) noexcept;
+
+	/**
+	 * Looks up the given token within the TokenTrie. This function is mostly
+	 * thought for debugging and unit testing.
+	 *
+	 * @param token is the character sequence that should be searched.
+	 * @return the attached token type id or EmptyToken if the given token is
+	 * not found.
+	 */
+	TokenTypeId hasToken(const std::string &token) const noexcept;
+
+	/**
+	 * Returns a reference at the root node to be used for traversing the token
+	 * tree.
+	 *
+	 * @return a reference at the root node.
+	 */
+	const Node *getRoot() const noexcept { return &root; }
+};
+}
+
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
+