summaryrefslogtreecommitdiff
path: root/src/plugins
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins')
-rw-r--r--src/plugins/plain/DynamicTokenizer.cpp544
-rw-r--r--src/plugins/plain/DynamicTokenizer.hpp252
-rw-r--r--src/plugins/plain/PlainFormatStreamReader.cpp641
-rw-r--r--src/plugins/plain/PlainFormatStreamReader.hpp347
-rw-r--r--src/plugins/plain/TokenTrie.cpp119
-rw-r--r--src/plugins/plain/TokenTrie.hpp150
6 files changed, 0 insertions, 2053 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
deleted file mode 100644
index f2cfcd1..0000000
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <memory>
-#include <vector>
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Exceptions.hpp>
-#include <core/common/Utils.hpp>
-
-#include "DynamicTokenizer.hpp"
-
-namespace ousia {
-
-namespace {
-
-/* Internal class TokenMatch */
-
-/**
- * Contains information about a matching token.
- */
-struct TokenMatch {
- /**
- * Token that was matched.
- */
- DynamicToken token;
-
- /**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
- */
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
-
- /**
- * Constructor of the TokenMatch class.
- */
- TokenMatch() : textLength(0), textEnd(0) {}
-
- /**
- * Returns true if this TokenMatch instance actually represents a match.
- */
- bool hasMatch() { return token.type != EmptyToken; }
-};
-
-/* Internal class TokenLookup */
-
-/**
- * The TokenLookup class is used to represent a thread in a running token
- * lookup.
- */
-class TokenLookup {
-private:
- /**
- * Current node within the token trie.
- */
- TokenTrie::Node const *node;
-
- /**
- * Start offset within the source file.
- */
- size_t start;
-
- /**
- * Current length of the data within the text handler. The text buffer needs
- * to be trimmed to this length if this token matches.
- */
- size_t textLength;
-
- /**
- * End location of the current text handler. This location needs to be used
- * for the text token that is emitted before the actual token.
- */
- size_t textEnd;
-
-public:
- /**
- * Constructor of the TokenLookup class.
- *
- * @param node is the current node.
- * @param start is the start position.
- * @param textLength is the text buffer length of the previous text token.
- * @param textEnd is the current end location of the previous text token.
- */
- TokenLookup(const TokenTrie::Node *node, size_t start,
- size_t textLength, size_t textEnd)
- : node(node), start(start), textLength(textLength), textEnd(textEnd)
- {
- }
-
- /**
- * Tries to extend the current path in the token trie with the given
- * character. If a complete token is matched, stores this match in the
- * tokens list (in case it is longer than any previous token).
- *
- * @param c is the character that should be appended to the current prefix.
- * @param lookups is a list to which new TokeLookup instances are added --
- * which could potentially be expanded in the next iteration.
- * @param match is the DynamicToken instance to which the matching token
- * should be written.
- * @param tokens is a reference at the internal token list of the
- * DynamicTokenizer.
- * @param end is the end byte offset of the current character.
- * @param sourceId is the source if of this file.
- */
- void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
- const std::vector<std::string> &tokens, SourceOffset end,
- SourceId sourceId)
- {
- // Check whether we can continue the current token path with the given
- // character without visiting an already visited node
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- return;
- }
-
- // Check whether the new node represents a complete token a whether it
- // is longer than the current token. If yes, replace the current token.
- node = it->second.get();
- if (node->type != EmptyToken) {
- const std::string &str = tokens[node->type];
- size_t len = str.size();
- if (len > match.token.content.size()) {
- match.token =
- DynamicToken{node->type, str, {sourceId, start, end}};
- match.textLength = textLength;
- match.textEnd = textEnd;
- }
- }
-
- // If this state can possibly be advanced, store it in the states list.
- if (!node->children.empty()) {
- lookups.emplace_back(*this);
- }
- }
-};
-
-/* Internal class TextHandlerBase */
-
-/**
- * Base class used for those classes that may be used as TextHandler in the
- * DynamicTokenizer::next function.
- */
-class TextHandlerBase {
-public:
- /**
- * Start position of the extracted text.
- */
- size_t textStart;
-
- /**
- * End position of the extracted text.
- */
- size_t textEnd;
-
- /**
- * Buffer containing the extracted text.
- */
- std::vector<char> textBuf;
-
- /**
- * Constructor of the TextHandlerBase base class. Initializes the start and
- * end position with zeros.
- */
- TextHandlerBase() : textStart(0), textEnd(0) {}
-
- /**
- * Transforms the given token into a text token containing the extracted
- * text.
- *
- * @param token is the output token to which the text should be written.
- * @param sourceId is the source id of the underlying file.
- */
- void buildTextToken(TokenMatch &match, SourceId sourceId)
- {
- if (match.hasMatch()) {
- match.token.content =
- std::string{textBuf.data(), match.textLength};
- match.token.location =
- SourceLocation{sourceId, textStart, match.textEnd};
- } else {
- match.token.content = std::string{textBuf.data(), textBuf.size()};
- match.token.location = SourceLocation{sourceId, textStart, textEnd};
- }
- match.token.type = TextToken;
- }
-
- /**
- * Returns true if this whitespace handler has found any text and a text
- * token could be emitted.
- *
- * @return true if the internal data buffer is non-empty.
- */
- bool hasText() { return !textBuf.empty(); }
-};
-
-/* Internal class PreservingTextHandler */
-
-/**
- * The PreservingTextHandler class preserves all characters unmodified,
- * including whitepace characters.
- */
-class PreservingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Appends the given character to the internal text buffer, does not
- * eliminate whitespace.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
- textBuf.push_back(c);
- }
-};
-
-/* Internal class TrimmingTextHandler */
-
-/**
- * The TrimmingTextHandler class trims all whitespace characters at the begin
- * and the end of a text section but leaves all other characters unmodified,
- * including whitepace characters.
- */
-class TrimmingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Buffer used internally to temporarily store all whitespace characters.
- * They are only added to the output buffer if another non-whitespace
- * character is reached.
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * whitespace characters at the begin and end of the text.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- whitespaceBuf.push_back(c);
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (!whitespaceBuf.empty()) {
- textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
- whitespaceBuf.end());
- whitespaceBuf.clear();
- }
- textBuf.push_back(c);
- }
-};
-
-/* Internal class CollapsingTextHandler */
-
-/**
- * The CollapsingTextHandler trims characters at the beginning and end of the
- * text and reduced multiple whitespace characters to a single blank.
- */
-class CollapsingTextHandler : public TextHandlerBase {
-public:
- using TextHandlerBase::TextHandlerBase;
-
- /**
- * Flag set to true if a whitespace character was reached.
- */
- bool hasWhitespace = false;
-
- /**
- * Appends the given character to the internal text buffer, eliminates
- * redundant whitespace characters.
- *
- * @param c is the character that should be appended to the internal buffer.
- * @param start is the start byte offset of the given character.
- * @param end is the end byte offset of the given character.
- */
- void append(char c, size_t start, size_t end)
- {
- // Handle whitespace characters
- if (Utils::isWhitespace(c)) {
- if (!textBuf.empty()) {
- hasWhitespace = true;
- }
- return;
- }
-
- // Set the start and end offset correctly
- if (textBuf.empty()) {
- textStart = start;
- }
- textEnd = end;
-
- // Store the character
- if (hasWhitespace) {
- textBuf.push_back(' ');
- hasWhitespace = false;
- }
- textBuf.push_back(c);
- }
-};
-}
-
-/* Class DynamicTokenizer */
-
-DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
- : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
-{
-}
-
-template <typename TextHandler, bool read>
-bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
-{
- // If we're in the read mode, reset the char reader peek position to the
- // current read position
- if (read) {
- reader.resetPeek();
- }
-
- // Prepare the lookups in the token trie
- const TokenTrie::Node *root = trie.getRoot();
- TokenMatch match;
- std::vector<TokenLookup> lookups;
- std::vector<TokenLookup> nextLookups;
-
- // Instantiate the text handler
- TextHandler textHandler;
-
- // Peek characters from the reader and try to advance the current token tree
- // cursor
- char c;
- size_t charStart = reader.getPeekOffset();
- const SourceId sourceId = reader.getSourceId();
- while (reader.peek(c)) {
- const size_t charEnd = reader.getPeekOffset();
- const size_t textLength = textHandler.textBuf.size();
- const size_t textEnd = textHandler.textEnd;
-
- // If we do not have a match yet, start a new lookup from the root
- if (!match.hasMatch()) {
- TokenLookup{root, charStart, textLength, textEnd}.advance(
- c, nextLookups, match, tokens, charEnd, sourceId);
- }
-
- // Try to advance all other lookups with the new character
- for (TokenLookup &lookup : lookups) {
- lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
- }
-
- // We have found a token and there are no more states to advance or the
- // text handler has found something -- abort to return the new token
- if (match.hasMatch()) {
- if ((nextLookups.empty() || textHandler.hasText())) {
- break;
- }
- } else {
- // Record all incomming characters
- textHandler.append(c, charStart, charEnd);
- }
-
- // Swap the lookups and the nextLookups list
- lookups = std::move(nextLookups);
- nextLookups.clear();
-
- // Advance the offset
- charStart = charEnd;
- }
-
- // If we found text, emit that text
- if (textHandler.hasText() &&
- (!match.hasMatch() || match.textLength > 0)) {
- textHandler.buildTextToken(match, sourceId);
- }
-
- // Move the read/peek cursor to the end of the token, abort if an error
- // happens while doing so
- if (match.hasMatch()) {
- // Make sure we have a valid location
- if (match.token.location.getEnd() == InvalidSourceOffset) {
- throw OusiaException{"Token end position offset out of range"};
- }
-
- // Seek to the end of the current token
- const size_t end = match.token.location.getEnd();
- if (read) {
- reader.seek(end);
- } else {
- reader.seekPeekCursor(end);
- }
- token = match.token;
- } else {
- token = DynamicToken{};
- }
- return match.hasMatch();
-}
-
-bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token)
-{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, true>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, true>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, true>(reader, token);
- }
- return false;
-}
-
-bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token)
-{
- switch (whitespaceMode) {
- case WhitespaceMode::PRESERVE:
- return next<PreservingTextHandler, false>(reader, token);
- case WhitespaceMode::TRIM:
- return next<TrimmingTextHandler, false>(reader, token);
- case WhitespaceMode::COLLAPSE:
- return next<CollapsingTextHandler, false>(reader, token);
- }
- return false;
-}
-
-TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
-{
- // Abort if an empty token should be registered
- if (token.empty()) {
- return EmptyToken;
- }
-
- // Search for a new slot in the tokens list
- TokenTypeId type = EmptyToken;
- for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
- if (tokens[i].empty()) {
- tokens[i] = token;
- type = i;
- break;
- }
- }
-
- // No existing slot was found, add a new one -- make sure we do not
- // override the special token type handles
- if (type == EmptyToken) {
- type = tokens.size();
- if (type == TextToken || type == EmptyToken) {
- throw OusiaException{"Token type ids depleted!"};
- }
- tokens.emplace_back(token);
- }
- nextTokenTypeId = type + 1;
-
- // Try to register the token in the trie -- if this fails, remove it
- // from the tokens list
- if (!trie.registerToken(token, type)) {
- tokens[type] = std::string();
- nextTokenTypeId = type;
- return EmptyToken;
- }
- return type;
-}
-
-bool DynamicTokenizer::unregisterToken(TokenTypeId type)
-{
- // Unregister the token from the trie, abort if an invalid type is given
- if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
- tokens[type] = std::string{};
- nextTokenTypeId = type;
- return true;
- }
- return false;
-}
-
-std::string DynamicTokenizer::getTokenString(TokenTypeId type)
-{
- if (type < tokens.size()) {
- return tokens[type];
- }
- return std::string{};
-}
-
-void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
-{
- whitespaceMode = mode;
-}
-
-WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
-
-/* Explicitly instantiate all possible instantiations of the "next" member
- function */
-template bool DynamicTokenizer::next<PreservingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
- CharReader &reader, DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<PreservingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
- CharReader &reader,DynamicToken &token);
-}
-
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
deleted file mode 100644
index 0cac2e8..0000000
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file DynamicTokenizer.hpp
- *
- * Tokenizer that can be reconfigured at runtime used for parsing the plain
- * text format.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
-#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
-
-#include <set>
-#include <string>
-#include <vector>
-
-#include <core/common/Location.hpp>
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-// Forward declarations
-class CharReader;
-
-/**
- * The DynamicToken structure describes a token discovered by the Tokenizer.
- */
-struct DynamicToken {
- /**
- * Id of the type of this token.
- */
- TokenTypeId type;
-
- /**
- * String that was matched.
- */
- std::string content;
-
- /**
- * Location from which the string was extracted.
- */
- SourceLocation location;
-
- /**
- * Default constructor.
- */
- DynamicToken() : type(EmptyToken) {}
-
- /**
- * Constructor of the DynamicToken struct.
- *
- * @param id represents the token type.
- * @param content is the string content that has been extracted.
- * @param location is the location of the extracted string content in the
- * source file.
- */
- DynamicToken(TokenTypeId type, const std::string &content,
- SourceLocation location)
- : type(type), content(content), location(location)
- {
- }
-
- /**
- * Constructor of the DynamicToken struct, only initializes the token type
- *
- * @param type is the id corresponding to the type of the token.
- */
- DynamicToken(TokenTypeId type) : type(type) {}
-
- /**
- * The getLocation function allows the tokens to be directly passed as
- * parameter to Logger or LoggableException instances.
- *
- * @return a reference at the location field
- */
- const SourceLocation &getLocation() const { return location; }
-};
-
-/**
- * Enum specifying the whitespace handling of the DynamicTokenizer class when
- * reading non-token text.
- */
-enum class WhitespaceMode {
- /**
- * Preserves all whitespaces as they are found in the source file.
- */
- PRESERVE,
-
- /**
- * Trims whitespace at the beginning and the end of the found text.
- */
- TRIM,
-
- /**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
- COLLAPSE
-};
-
-/**
- * The DynamicTokenizer is used to extract tokens and chunks of text from a
- * CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters. Note that the
- * DynamicTokenizer always tries to extract the longest possible token from the
- * tokenizer.
- */
-class DynamicTokenizer {
-private:
- /**
- * Internally used token trie. This object holds all registered tokens.
- */
- TokenTrie trie;
-
- /**
- * Flag defining whether whitespaces should be preserved or not.
- */
- WhitespaceMode whitespaceMode;
-
- /**
- * Vector containing all registered token types.
- */
- std::vector<std::string> tokens;
-
- /**
- * Next index in the tokens list where to search for a new token id.
- */
- size_t nextTokenTypeId;
-
- /**
- * Templated function used internally to read the current token. The
- * function is templated in order to force code generation for all six
- * combiations of whitespace modes and reading/peeking.
- *
- * @tparam TextHandler is the type to be used for the textHandler instance.
- * @tparam read specifies whether the function should start from and advance
- * the read pointer of the char reader.
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is the token structure into which the token information
- * should be written.
- * @return false if the end of the stream has been reached, true otherwise.
- */
- template <typename TextHandler, bool read>
- bool next(CharReader &reader, DynamicToken &token);
-
-public:
- /**
- * Constructor of the DynamicTokenizer class.
- *
- * @param whitespaceMode specifies how whitespace should be handled.
- */
- DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
-
- /**
- * Registers the given string as a token. Returns a const pointer at a
- * TokenDescriptor that will be used to reference the newly created token.
- *
- * @param token is the token string that should be registered.
- * @return a unique identifier for the registered token or EmptyToken if
- * an error occured.
- */
- TokenTypeId registerToken(const std::string &token);
-
- /**
- * Unregisters the token belonging to the given TokenTypeId.
- *
- * @param type is the token type that should be unregistered. The
- *TokenTypeId
- * must have been returned by registerToken.
- * @return true if the operation was successful, false otherwise (e.g.
- * because the given TokenDescriptor was already unregistered).
- */
- bool unregisterToken(TokenTypeId type);
-
- /**
- * Returns the token that was registered under the given TokenTypeId id or
- *an
- * empty string if an invalid TokenTypeId id is given.
- *
- * @param type is the TokenTypeId id for which the corresponding token
- *string
- * should be returned.
- * @return the registered token string or an empty string if the given type
- * was invalid.
- */
- std::string getTokenString(TokenTypeId type);
-
- /**
- * Sets the whitespace mode.
- *
- * @param whitespaceMode defines how whitespace should be treated in text
- * tokens.
- */
- void setWhitespaceMode(WhitespaceMode mode);
-
- /**
- * Returns the current value of the whitespace mode.
- *
- * @return the whitespace mode.
- */
- WhitespaceMode getWhitespaceMode();
-
- /**
- * Reads a new token from the CharReader and stores it in the given
- * DynamicToken instance.
- *
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is a reference at the token instance into which the Token
- * information should be written.
- * @return true if a token could be read, false if the end of the stream
- * has been reached.
- */
- bool read(CharReader &reader, DynamicToken &token);
-
- /**
- * The peek method does not advance the read position of the char reader,
- * but reads the next token from the current char reader peek position.
- *
- * @param reader is the CharReader instance from which the data should be
- * read.
- * @param token is a reference at the token instance into which the Token
- * information should be written.
- * @return true if a token could be read, false if the end of the stream
- * has been reached.
- */
- bool peek(CharReader &reader, DynamicToken &token);
-};
-}
-
-#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
-
diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp
deleted file mode 100644
index 05769f0..0000000
--- a/src/plugins/plain/PlainFormatStreamReader.cpp
+++ /dev/null
@@ -1,641 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <core/common/CharReader.hpp>
-#include <core/common/Logger.hpp>
-#include <core/common/Utils.hpp>
-#include <core/common/VariantReader.hpp>
-
-#include "PlainFormatStreamReader.hpp"
-
-namespace ousia {
-
-/**
- * Plain format default tokenizer.
- */
-class PlainFormatTokens : public DynamicTokenizer {
-public:
- /**
- * Id of the backslash token.
- */
- TokenTypeId Backslash;
-
- /**
- * Id of the line comment token.
- */
- TokenTypeId LineComment;
-
- /**
- * Id of the block comment start token.
- */
- TokenTypeId BlockCommentStart;
-
- /**
- * Id of the block comment end token.
- */
- TokenTypeId BlockCommentEnd;
-
- /**
- * Id of the field start token.
- */
- TokenTypeId FieldStart;
-
- /**
- * Id of the field end token.
- */
- TokenTypeId FieldEnd;
-
- /**
- * Registers the plain format tokens in the internal tokenizer.
- */
- PlainFormatTokens()
- {
- Backslash = registerToken("\\");
- LineComment = registerToken("%");
- BlockCommentStart = registerToken("%{");
- BlockCommentEnd = registerToken("}%");
- FieldStart = registerToken("{");
- FieldEnd = registerToken("}");
- }
-};
-
-static const PlainFormatTokens Tokens;
-
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
- /**
- * Internal character buffer.
- */
- std::vector<char> buf;
-
- /**
- * Start location of the character data.
- */
- SourceOffset start;
-
- /**
- * End location of the character data.
- */
- SourceOffset end;
-
-public:
- /**
- * Default constructor, initializes start and end with zeros.
- */
- DataHandler() : start(0), end(0) {}
-
- /**
- * Returns true if the internal buffer is empty.
- *
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
- */
- bool isEmpty() { return buf.empty(); }
-
- /**
- * Appends a single character to the internal buffer.
- *
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
- */
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
- {
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
- }
-
- /**
- * Appends a string to the internal buffer.
- *
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
- */
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
- {
- if (isEmpty()) {
- start = stringStart;
- }
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
- }
-
- /**
- * Converts the internal buffer to a variant with attached location
- * information.
- *
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
- */
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
-};
-
-PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,
- Logger &logger)
- : reader(reader), logger(logger), tokenizer(Tokens)
-{
- // Place an intial command representing the complete file on the stack
- commands.push(Command{"", Variant::mapType{}, true, true, true});
-}
-
-Variant PlainFormatStreamReader::parseIdentifier(size_t start, bool allowNSSep)
-{
- bool first = true;
- bool hasCharSiceNSSep = false;
- std::vector<char> identifier;
- size_t end = reader.getPeekOffset();
- char c, c2;
- while (reader.peek(c)) {
- // Abort if this character is not a valid identifer character
- if ((first && Utils::isIdentifierStartCharacter(c)) ||
- (!first && Utils::isIdentifierCharacter(c))) {
- identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) {
- identifier.push_back(c);
- } else {
- if (c == ':' && allowNSSep) {
- logger.error(
- "Expected character before and after namespace separator \":\"",
- reader);
- }
- reader.resetPeek();
- break;
- }
-
- // This is no longer the first character
- first = false;
-
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
-
- end = reader.getPeekOffset();
- reader.consumePeek();
- }
-
- // Return the identifier at its location
- Variant res =
- Variant::fromString(std::string(identifier.data(), identifier.size()));
- res.setLocation({reader.getSourceId(), start, end});
- return res;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseBeginCommand()
-{
- // Expect a '{' after the command
- reader.consumeWhitespace();
- if (!reader.expect('{')) {
- logger.error("Expected \"{\" after \\begin", reader);
- return State::NONE;
- }
-
- // Parse the name of the command that should be opened
- Variant commandName = parseIdentifier(reader.getOffset(), true);
- if (commandName.asString().empty()) {
- logger.error("Expected identifier", commandName);
- return State::ERROR;
- }
-
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- SourceOffset start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
- }
- }
-
- if (!reader.expect('}')) {
- logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Parse the arguments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
-
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), true);
-
- return State::COMMAND;
-}
-
-static bool checkStillInField(const PlainFormatStreamReader::Command &cmd,
- const Variant &endName, Logger &logger)
-{
- if (cmd.inField && !cmd.inRangeField) {
- logger.error(std::string("\\end in open field of command \"") +
- cmd.name.asString() + std::string("\""),
- endName);
- logger.note(std::string("Open command started here:"), cmd.name);
- return true;
- }
- return false;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseEndCommand()
-{
- // Expect a '{' after the command
- if (!reader.expect('{')) {
- logger.error("Expected \"{\" after \\end", reader);
- return State::NONE;
- }
-
- // Fetch the name of the command that should be ended here
- Variant name = parseIdentifier(reader.getOffset(), true);
-
- // Make sure the given command name is not empty
- if (name.asString().empty()) {
- logger.error("Expected identifier", name);
- return State::ERROR;
- }
-
- // Make sure the command name is terminated with a '}'
- if (!reader.expect('}')) {
- logger.error("Expected \"}\"", reader);
- return State::ERROR;
- }
-
- // Unroll the command stack up to the last range command
- while (!commands.top().hasRange) {
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
- commands.pop();
- }
-
- // Make sure we're not in an open field of this command
- if (checkStillInField(commands.top(), name, logger)) {
- return State::ERROR;
- }
-
- // Special error message if the top-level command is reached
- if (commands.size() == 1) {
- logger.error(std::string("Cannot end command \"") + name.asString() +
- std::string("\" here, no command open"),
- name);
- return State::ERROR;
- }
-
- // Inform the about command mismatches
- const Command &cmd = commands.top();
- if (commands.top().name.asString() != name.asString()) {
- logger.error(std::string("Trying to end command \"") +
- cmd.name.asString() +
- std::string("\", but open command is \"") +
- name.asString() + std::string("\""),
- name);
- logger.note("Last command was opened here:", cmd.name);
- return State::ERROR;
- }
-
- // Set the location to the location of the command that was ended, then end
- // the current command
- location = name.getLocation();
- commands.pop();
- return cmd.inRangeField ? State::FIELD_END : State::NONE;
-}
-
-Variant PlainFormatStreamReader::parseCommandArguments(Variant commandArgName)
-{
- // Parse the arguments using the universal VariantReader
- Variant commandArguments;
- if (reader.expect('[')) {
- auto res = VariantReader::parseObject(reader, logger, ']');
- commandArguments = res.second;
- } else {
- commandArguments = Variant::mapType{};
- }
-
- // Insert the parsed name, make sure "name" was not specified in the
- // arguments
- if (commandArgName.isString()) {
- auto res =
- commandArguments.asMap().emplace("name", std::move(commandArgName));
- if (!res.second) {
- logger.error("Name argument specified multiple times",
- SourceLocation{}, MessageMode::NO_CONTEXT);
- logger.note("First occurance is here: ", commandArgName);
- logger.note("Second occurance is here: ", res.first->second);
- }
- }
- return commandArguments;
-}
-
-void PlainFormatStreamReader::pushCommand(Variant commandName,
- Variant commandArguments,
- bool hasRange)
-{
- // Store the location on the stack
- location = commandName.getLocation();
-
- // Place the command on the command stack, remove the last commands if we're
- // not currently inside a field of these commands
- while (!commands.top().inField) {
- commands.pop();
- }
- commands.push(Command{std::move(commandName), std::move(commandArguments),
- hasRange, false, false});
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parseCommand(
- size_t start)
-{
- // Parse the commandName as a first identifier
- Variant commandName = parseIdentifier(start, true);
- if (commandName.asString().empty()) {
- logger.error("Empty command name", reader);
- return State::NONE;
- }
-
- // Handle the special "begin" and "end" commands
- const auto commandNameComponents =
- Utils::split(commandName.asString(), ':');
- const bool isBegin = commandNameComponents[0] == "begin";
- const bool isEnd = commandNameComponents[0] == "end";
- if (isBegin || isEnd) {
- if (commandNameComponents.size() > 1) {
- logger.error(
- "Special commands \"\\begin\" and \"\\end\" may not contain a "
- "namespace separator \":\"",
- commandName);
- }
- if (isBegin) {
- return parseBeginCommand();
- } else if (isEnd) {
- return parseEndCommand();
- }
- }
-
- // Check whether the next character is a '#', indicating the start of the
- // command name
- Variant commandArgName;
- start = reader.getOffset();
- if (reader.expect('#')) {
- commandArgName = parseIdentifier(start);
- if (commandArgName.asString().empty()) {
- logger.error("Expected identifier after \"#\"", commandArgName);
- }
- }
-
- // Parse the arguments
- Variant commandArguments = parseCommandArguments(std::move(commandArgName));
-
- // Push the command onto the command stack
- pushCommand(std::move(commandName), std::move(commandArguments), false);
-
- return State::COMMAND;
-}
-
-void PlainFormatStreamReader::parseBlockComment()
-{
- DynamicToken token;
- size_t depth = 1;
- while (tokenizer.read(reader, token)) {
- if (token.type == Tokens.BlockCommentEnd) {
- depth--;
- if (depth == 0) {
- return;
- }
- }
- if (token.type == Tokens.BlockCommentStart) {
- depth++;
- }
- }
-
- // Issue an error if the file ends while we are in a block comment
- logger.error("File ended while being in a block comment", reader);
-}
-
-void PlainFormatStreamReader::parseLineComment()
-{
- char c;
- while (reader.read(c)) {
- if (c == '\n') {
- return;
- }
- }
-}
-
-bool PlainFormatStreamReader::checkIssueData(DataHandler &handler)
-{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
- location = data.getLocation();
- reader.resetPeek();
- return true;
- }
- return false;
-}
-
-bool PlainFormatStreamReader::checkIssueFieldStart()
-{
- // Fetch the current command, and check whether we're currently inside a
- // field of this command
- Command &cmd = commands.top();
- if (!cmd.inField) {
- // If this is a range command, we're now implicitly inside the field of
- // this command -- we'll have to issue a field start command!
- if (cmd.hasRange) {
- cmd.inField = true;
- cmd.inRangeField = true;
- reader.resetPeek();
- return true;
- }
-
- // This was not a range command, so obviously we're now within
- // a field of some command -- so unroll the command stack until a
- // command with open field is reached
- while (!commands.top().inField) {
- commands.pop();
- }
- }
- return false;
-}
-
-PlainFormatStreamReader::State PlainFormatStreamReader::parse()
-{
- // Handler for incoming data
- DataHandler handler;
-
- // Read tokens until the outer loop should be left
- DynamicToken token;
- while (tokenizer.peek(reader, token)) {
- const TokenTypeId type = token.type;
-
- // Special handling for Backslash and Text
- if (type == Tokens.Backslash) {
- // Before appending anything to the output data or starting a new
- // command, check whether FIELD_START has to be issued, as the
- // current command is a command with range
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Check whether a command starts now, without advancing the peek
- // cursor
- char c;
- if (!reader.fetchPeek(c)) {
- logger.error("Trailing backslash at the end of the file.",
- token);
- return State::END;
- }
-
- // Try to parse a command
- if (Utils::isIdentifierStartCharacter(c)) {
- // Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // Parse the actual command
- State res = parseCommand(token.location.getStart());
- switch (res) {
- case State::ERROR:
- throw LoggableException(
- "Last error was irrecoverable, ending parsing "
- "process");
- case State::NONE:
- continue;
- default:
- return res;
- }
- }
-
- // This was not a special character, just append the given character
- // to the data buffer, use the escape character start as start
- // location and the peek offset as end location
- reader.peek(c); // Peek the previously fetched character
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
- reader.consumePeek();
- continue;
- } else if (type == TextToken) {
- // Check whether FIELD_START has to be issued before appending text
- if (checkIssueFieldStart()) {
- location = token.location;
- return State::FIELD_START;
- }
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
- reader.consumePeek();
- continue;
- }
-
- // A non-text token was reached, make sure all pending data commands
- // have been issued
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // We will handle the token now, consume the peeked characters
- reader.consumePeek();
-
- // Update the location to the current token location
- location = token.location;
-
- if (token.type == Tokens.LineComment) {
- parseLineComment();
- } else if (token.type == Tokens.BlockCommentStart) {
- parseBlockComment();
- } else if (token.type == Tokens.FieldStart) {
- Command &cmd = commands.top();
- if (!cmd.inField) {
- cmd.inField = true;
- return State::FIELD_START;
- }
- logger.error(
- "Got field start token \"{\", but no command for which to "
- "start the field. Did you mean \"\\{\"?",
- token);
- } else if (token.type == Tokens.FieldEnd) {
- // Try to end an open field of the current command -- if the current
- // command is not inside an open field, end this command and try to
- // close the next one
- for (int i = 0; i < 2 && commands.size() > 1; i++) {
- Command &cmd = commands.top();
- if (!cmd.inRangeField) {
- if (cmd.inField) {
- cmd.inField = false;
- return State::FIELD_END;
- }
- commands.pop();
- } else {
- break;
- }
- }
- logger.error(
- "Got field end token \"}\", but there is no field to end. Did "
- "you mean \"\\}\"?",
- token);
- } else {
- logger.error("Unexpected token \"" + token.content + "\"", token);
- }
- }
-
- // Issue available data
- if (checkIssueData(handler)) {
- return State::DATA;
- }
-
- // Make sure all open commands and fields have been ended at the end of the
- // stream
- while (commands.size() > 1) {
- Command &cmd = commands.top();
- if (cmd.inField || cmd.hasRange) {
- logger.error("Reached end of stream, but command \"" +
- cmd.name.asString() + "\" has not been ended",
- cmd.name);
- }
- commands.pop();
- }
-
- location = SourceLocation{reader.getSourceId(), reader.getOffset()};
- return State::END;
-}
-
-const Variant &PlainFormatStreamReader::getCommandName()
-{
- return commands.top().name;
-}
-
-const Variant &PlainFormatStreamReader::getCommandArguments()
-{
- return commands.top().arguments;
-}
-}
-
diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp
deleted file mode 100644
index 2ee261c..0000000
--- a/src/plugins/plain/PlainFormatStreamReader.hpp
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file PlainFormatStreamReader.hpp
- *
- * Provides low-level classes for reading the plain TeX-esque
- * format. The classes provided here do not build any model objects and do not
- * implement the Parser interfaces.
- *
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_
-#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_
-
-#include <stack>
-
-#include <core/common/Variant.hpp>
-
-#include "DynamicTokenizer.hpp"
-
-namespace ousia {
-
-// Forward declarations
-class CharReader;
-class Logger;
-class DataHandler;
-
-/**
- * The PlainFormatStreamReader class provides a low-level reader for the plain
- * TeX-esque format. The parser is constructed around a "parse" function, which
- * reads data from the underlying CharReader until a new state is reached and
- * indicates this state in a return value. The calling code then has to pull
- * corresponding data from the stream reader. The reader already handles some
- * invalid cases, but recovers from most errors and happily continues parsing.
- */
-class PlainFormatStreamReader {
-public:
- /**
- * Enum used to indicate which state the PlainFormatStreamReader class is in
- * after calling the "parse" function.
- */
- enum class State {
- /**
- * State returned if a fully featured command has been read. A command
- * consists of the command name and its arguments (which optionally
- * includes the name).
- */
- COMMAND,
-
- /**
- * State returned if data is given. The reader must decide which field
- * or command this should be routed to. Trailing or leading whitespace
- * has been removed. Only called if the data is non-empty.
- */
- DATA,
-
- /**
- * A user-defined entity has been found. The entity sequence is stored
- * in the command name.
- */
- ENTITY,
-
- /**
- * State returned if an annotation was started. An annotation consists
- * of the command name and its arguments (which optionally include the
- * name).
- */
- ANNOTATION_START,
-
- /**
- * State returned if an annotation ends. The reader indicates which
- * annotation ends.
- */
- ANNOTATION_END,
-
- /**
- * State returned if a new field started. The reader assures that the
- * current field ends before a new field is started and that the field
- * is not started if data has been given outside of a field. The
- * field number is set to the current field index.
- */
- FIELD_START,
-
- /**
- * State returned if the current field ends. The reader assures that a
- * field was actually open.
- */
- FIELD_END,
-
- /**
- * The end of the stream has been reached.
- */
- END,
-
- /**
- * Returned from internal functions if nothing should be done.
- */
- NONE,
-
- /**
- * Returned from internal function to indicate irrecoverable errors.
- */
- ERROR
- };
-
- /**
- * Entry used for the command stack.
- */
- struct Command {
- /**
- * Name and location of the current command.
- */
- Variant name;
-
- /**
- * Arguments that were passed to the command.
- */
- Variant arguments;
-
- /**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
- */
- bool inRangeField;
-
- /**
- * Default constructor.
- */
- Command() : hasRange(false), inField(false), inRangeField(false) {}
-
- /**
- * Constructor of the Command class.
- *
- * @param name is a string variant with name and location of the
- * command.
- * @param arguments is a map variant with the arguments given to the
- * command.
- * @param hasRange should be set to true if this is a command with
- * explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we are currently inside the
- * outer field of the command.
- */
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField)
- : name(std::move(name)),
- arguments(std::move(arguments)),
- hasRange(hasRange),
- inField(inField),
- inRangeField(inRangeField)
- {
- }
- };
-
-private:
- /**
- * Reference to the CharReader instance from which the incoming bytes are
- * read.
- */
- CharReader &reader;
-
- /**
- * Reference at the logger instance to which all error messages are sent.
- */
- Logger &logger;
-
- /**
- * Tokenizer instance used to read individual tokens from the text.
- */
- DynamicTokenizer tokenizer;
-
- /**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
- */
- Variant data;
-
- /**
- * Contains the location of the last token.
- */
- SourceLocation location;
-
- /**
- * Contains the field index of the current command.
- */
- size_t fieldIdx;
-
- /**
- * Function used internally to parse an identifier.
- *
- * @param start is the start byte offset of the identifier (including the
- * backslash).
- * @param allowNSSep should be set to true if the namespace separator is
- * allowed in the identifier name. Issues error if the namespace separator
- * is placed incorrectly.
- */
- Variant parseIdentifier(size_t start, bool allowNSSep = false);
-
- /**
- * Function used internally to handle the special "\begin" command.
- */
- State parseBeginCommand();
-
- /**
- * Function used internally to handle the special "\end" command.
- */
- State parseEndCommand();
-
- /**
- * Pushes the parsed command onto the command stack.
- */
- void pushCommand(Variant commandName, Variant commandArguments, bool hasRange);
-
- /**
- * Parses the command arguments.
- */
- Variant parseCommandArguments(Variant commandArgName);
-
- /**
- * Function used internally to parse a command.
- *
- * @param start is the start byte offset of the command (including the
- * backslash)
- * @return true if a command was actually parsed, false otherwise.
- */
- State parseCommand(size_t start);
-
- /**
- * Function used internally to parse a block comment.
- */
- void parseBlockComment();
-
- /**
- * Function used internally to parse a generic comment.
- */
- void parseLineComment();
-
- /**
- * Checks whether there is any data pending to be issued, if yes, issues it.
- *
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
- * @return true if there was any data and DATA should be returned by the
- * parse function, false otherwise.
- */
- bool checkIssueData(DataHandler &handler);
-
- /**
- * Called before any data is appended to the internal data handler. Checks
- * whether a new field should be started or implicitly ended.
- *
- * @return true if FIELD_START should be returned by the parse function.
- */
- bool checkIssueFieldStart();
-
-public:
- /**
- * Constructor of the PlainFormatStreamReader class. Attaches the new
- * PlainFormatStreamReader to the given CharReader and Logger instances.
- *
- * @param reader is the reader instance from which incoming characters
- * should be read.
- * @param logger is the logger instance to which errors should be written.
- */
- PlainFormatStreamReader(CharReader &reader, Logger &logger);
-
- /**
- * Continues parsing. Returns one of the states defined in the State enum.
- * Callers should stop once the State::END state is reached. Use the getter
- * functions to get more information about the current state, such as the
- * command name or the data or the current field index.
- *
- * @return the new state the parser has reached.
- */
- State parse();
-
- /**
- * Returns a reference at the internally stored data. Only valid if
- * State::DATA was returned by the "parse" function.
- *
- * @return a reference at a variant containing the data parsed by the
- * "parse" function.
- */
- const Variant &getData() { return data; }
-
- /**
- * Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
- *
- * @return a reference at a variant containing name and location of the
- * parsed command.
- */
- const Variant &getCommandName();
-
- /**
- * Returns a reference at the internally stored command name. Only valid if
- * State::COMMAND was returned by the "parse" function.
- *
- * @return a reference at a variant containing arguments given to the
- * command.
- */
- const Variant &getCommandArguments();
-
- /**
- * Returns a reference at the char reader.
- *
- * @return the last internal token location.
- */
- SourceLocation &getLocation() {return location;}
-};
-}
-
-#endif /* _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ */
-
diff --git a/src/plugins/plain/TokenTrie.cpp b/src/plugins/plain/TokenTrie.cpp
deleted file mode 100644
index 4a0430b..0000000
--- a/src/plugins/plain/TokenTrie.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "TokenTrie.hpp"
-
-namespace ousia {
-
-/* Class DynamicTokenTree::Node */
-
-TokenTrie::Node::Node() : type(EmptyToken) {}
-
-/* Class DynamicTokenTree */
-
-bool TokenTrie::registerToken(const std::string &token,
- TokenTypeId type) noexcept
-{
- // Abort if the token is empty -- this would taint the root node
- if (token.empty()) {
- return false;
- }
-
- // Iterate over each character in the given string and insert them as
- // (new) nodes
- Node *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- // Insert a new node if this one does not exist
- const char c = token[i];
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- it = node->children.emplace(c, std::make_shared<Node>()).first;
- }
- node = it->second.get();
- }
-
- // If the resulting node already has a type set, we're screwed.
- if (node->type != EmptyToken) {
- return false;
- }
-
- // Otherwise just set the type to the given type.
- node->type = type;
- return true;
-}
-
-bool TokenTrie::unregisterToken(const std::string &token) noexcept
-{
- // We cannot remove empty tokens as we need to access the first character
- // upfront
- if (token.empty()) {
- return false;
- }
-
- // First pass -- search the node in the path that can be deleted
- Node *subtreeRoot = &root;
- char subtreeKey = token[0];
- Node *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- // Go to the next node, abort if the tree ends unexpectedly
- auto it = node->children.find(token[i]);
- if (it == node->children.end()) {
- return false;
- }
-
- // Reset the subtree handler if this node has another type
- node = it->second.get();
- if ((node->type != EmptyToken || node->children.size() > 1) &&
- (i + 1 != token.size())) {
- subtreeRoot = node;
- subtreeKey = token[i + 1];
- }
- }
-
- // If the node type is already EmptyToken, we cannot do anything here
- if (node->type == EmptyToken) {
- return false;
- }
-
- // If the target node has children, we cannot delete the subtree. Set the
- // type to EmptyToken instead
- if (!node->children.empty()) {
- node->type = EmptyToken;
- return true;
- }
-
- // If we end up here, we can safely delete the complete subtree
- subtreeRoot->children.erase(subtreeKey);
- return true;
-}
-
-TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
-{
- Node const *node = &root;
- for (size_t i = 0; i < token.size(); i++) {
- const char c = token[i];
- auto it = node->children.find(c);
- if (it == node->children.end()) {
- return EmptyToken;
- }
- node = it->second.get();
- }
- return node->type;
-}
-}
-
diff --git a/src/plugins/plain/TokenTrie.hpp b/src/plugins/plain/TokenTrie.hpp
deleted file mode 100644
index 36c2ffa..0000000
--- a/src/plugins/plain/TokenTrie.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file TokenTrie.hpp
- *
- * Class representing a token trie that can be updated dynamically.
- *
- * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
- * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
- */
-
-#ifndef _OUSIA_TOKEN_TRIE_HPP_
-#define _OUSIA_TOKEN_TRIE_HPP_
-
-#include <cstdint>
-#include <memory>
-#include <limits>
-#include <unordered_map>
-
-namespace ousia {
-
-/**
- * The TokenTypeId is used to give each token type a unique id.
- */
-using TokenTypeId = uint32_t;
-
-/**
- * Token which is not a token.
- */
-constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
-
-/**
- * Token which represents a text token.
- */
-constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
-
-/**
- * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
- * the longest consecutive token in the text. This is equivalent to a prefix
- * trie.
- *
- * A token trie is a construct that structures all special tokens a Tokenizer
- * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
- * three. Then the token tree would look like this:
- *
- * \code{*.txt}
- * ~ (0)
- * / \
- * a (2) b (0)
- * | |
- * a (0) a (0)
- * | |
- * b (1) c (0)
- * \endcode
- *
- * Where the number indicates the corresponding token descriptor identifier.
- */
-class TokenTrie {
-public:
- /**
- * Structure used to build the node tree.
- */
- struct Node {
- /**
- * Type used for the child map.
- */
- using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
-
- /**
- * Map from single characters to the corresponding child nodes.
- */
- ChildMap children;
-
- /**
- * Type id of the token ending at this node. Set to EmptyToken if
- * no token is attached to this node.
- */
- TokenTypeId type;
-
- /**
- * Default constructor, initializes the type with EmptyToken.
- */
- Node();
- };
-
-private:
- /**
- * Root node of the internal token tree.
- */
- Node root;
-
-public:
- /**
- * Registers a token containing the given string. Returns false if the
- * token already exists, true otherwise.
- *
- * @param token is the character sequence that should be registered as
- * token.
- * @param type is the descriptor that should be set for this token.
- * @return true if the operation is successful, false otherwise.
- */
- bool registerToken(const std::string &token, TokenTypeId type) noexcept;
-
- /**
- * Unregisters the token from the token tree. Returns true if the token was
- * unregistered successfully, false otherwise.
- *
- * @param token is the character sequence that should be unregistered.
- * @return true if the operation was successful, false otherwise.
- */
- bool unregisterToken(const std::string &token) noexcept;
-
- /**
- * Returns the type of the given token if it exists within the TokenTrie.
- * This function is mostly intended for debugging and unit testing.
- *
- * @param token is the character sequence that should be searched.
- * @return the attached token type id, or EmptyToken if the given token
- * is not found.
- */
- TokenTypeId hasToken(const std::string &token) const noexcept;
-
- /**
- * Returns a reference at the root node to be used for traversing the token
- * tree.
- *
- * @return a reference at the root node.
- */
- const Node *getRoot() const noexcept { return &root; }
-};
-}
-
-#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
-