diff options
Diffstat (limited to 'src/plugins')
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.cpp | 544 | ||||
-rw-r--r-- | src/plugins/plain/DynamicTokenizer.hpp | 252 | ||||
-rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.cpp | 641 | ||||
-rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.hpp | 347 | ||||
-rw-r--r-- | src/plugins/plain/TokenTrie.cpp | 119 | ||||
-rw-r--r-- | src/plugins/plain/TokenTrie.hpp | 150 |
6 files changed, 0 insertions, 2053 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp deleted file mode 100644 index f2cfcd1..0000000 --- a/src/plugins/plain/DynamicTokenizer.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <memory> -#include <vector> - -#include <core/common/CharReader.hpp> -#include <core/common/Exceptions.hpp> -#include <core/common/Utils.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { - /** - * Token that was matched. - */ - DynamicToken token; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - - /** - * Constructor of the TokenMatch class. - */ - TokenMatch() : textLength(0), textEnd(0) {} - - /** - * Returns true if this TokenMatch instance actually represents a match. 
- */ - bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: - /** - * Current node within the token trie. - */ - TokenTrie::Node const *node; - - /** - * Start offset within the source file. - */ - size_t start; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - -public: - /** - * Constructor of the TokenLookup class. - * - * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. - */ - TokenLookup(const TokenTrie::Node *node, size_t start, - size_t textLength, size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) - { - } - - /** - * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). - * - * @param c is the character that should be appended to the current prefix. - * @param lookups is a list to which new TokeLookup instances are added -- - * which could potentially be expanded in the next iteration. - * @param match is the DynamicToken instance to which the matching token - * should be written. - * @param tokens is a reference at the internal token list of the - * DynamicTokenizer. - * @param end is the end byte offset of the current character. - * @param sourceId is the source if of this file. 
- */ - void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, - const std::vector<std::string> &tokens, SourceOffset end, - SourceId sourceId) - { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node - auto it = node->children.find(c); - if (it == node->children.end()) { - return; - } - - // Check whether the new node represents a complete token a whether it - // is longer than the current token. If yes, replace the current token. - node = it->second.get(); - if (node->type != EmptyToken) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - DynamicToken{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } - } - - // If this state can possibly be advanced, store it in the states list. - if (!node->children.empty()) { - lookups.emplace_back(*this); - } - } -}; - -/* Internal class TextHandlerBase */ - -/** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. - */ -class TextHandlerBase { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector<char> textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - TextHandlerBase() : textStart(0), textEnd(0) {} - - /** - * Transforms the given token into a text token containing the extracted - * text. - * - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. 
- */ - void buildTextToken(TokenMatch &match, SourceId sourceId) - { - if (match.hasMatch()) { - match.token.content = - std::string{textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, textStart, match.textEnd}; - } else { - match.token.content = std::string{textBuf.data(), textBuf.size()}; - match.token.location = SourceLocation{sourceId, textStart, textEnd}; - } - match.token.type = TextToken; - } - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. 
- */ - std::vector<char> whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. 
- */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template <typename TextHandler, bool read> -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ - // If we're in the read mode, reset the char reader peek position to the - // current read position - if (read) { - reader.resetPeek(); - } - - // Prepare the lookups in the token trie - const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; - std::vector<TokenLookup> lookups; - std::vector<TokenLookup> nextLookups; - - // Instantiate the text handler - TextHandler textHandler; - - // Peek characters from the reader and try to advance the current token tree - // cursor - char c; - size_t charStart = reader.getPeekOffset(); - const SourceId sourceId = reader.getSourceId(); - while (reader.peek(c)) { - const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; - - // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); - } - - // Try to advance all other lookups with the new character - for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); - } - - // We have found a token and there are no more states to advance or the - // text handler has found something 
-- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { - break; - } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); - } - - // Swap the lookups and the nextLookups list - lookups = std::move(nextLookups); - nextLookups.clear(); - - // Advance the offset - charStart = charEnd; - } - - // If we found text, emit that text - if (textHandler.hasText() && - (!match.hasMatch() || match.textLength > 0)) { - textHandler.buildTextToken(match, sourceId); - } - - // Move the read/peek cursor to the end of the token, abort if an error - // happens while doing so - if (match.hasMatch()) { - // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { - throw OusiaException{"Token end position offset out of range"}; - } - - // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); - if (read) { - reader.seek(end); - } else { - reader.seekPeekCursor(end); - } - token = match.token; - } else { - token = DynamicToken{}; - } - return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, true>(reader, token); - case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, true>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, true>(reader, token); - } - return false; -} - -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next<PreservingTextHandler, false>(reader, token); - case WhitespaceMode::TRIM: - return next<TrimmingTextHandler, false>(reader, token); - case WhitespaceMode::COLLAPSE: - return next<CollapsingTextHandler, false>(reader, token); - } - return false; -} - -TokenTypeId DynamicTokenizer::registerToken(const std::string &token) -{ - 
// Abort if an empty token should be registered - if (token.empty()) { - return EmptyToken; - } - - // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; - type = i; - break; - } - } - - // No existing slot was found, add a new one -- make sure we do not - // override the special token type handles - if (type == EmptyToken) { - type = tokens.size(); - if (type == TextToken || type == EmptyToken) { - throw OusiaException{"Token type ids depleted!"}; - } - tokens.emplace_back(token); - } - nextTokenTypeId = type + 1; - - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list - if (!trie.registerToken(token, type)) { - tokens[type] = std::string(); - nextTokenTypeId = type; - return EmptyToken; - } - return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ - // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return true; - } - return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ - whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member - function */ -template bool DynamicTokenizer::next<PreservingTextHandler, false>( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, false>( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, false>( - CharReader &reader,DynamicToken &token); -template bool 
DynamicTokenizer::next<PreservingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<TrimmingTextHandler, true>( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next<CollapsingTextHandler, true>( - CharReader &reader,DynamicToken &token); -} - diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp deleted file mode 100644 index 0cac2e8..0000000 --- a/src/plugins/plain/DynamicTokenizer.hpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include <set> -#include <string> -#include <vector> - -#include <core/common/Location.hpp> - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. 
- */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - DynamicToken() : type(EmptyToken) {} - - /** - * Constructor of the DynamicToken struct. - * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - DynamicToken(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the DynamicToken struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - DynamicToken(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { - /** - * Preserves all whitespaces as they are found in the source file. - */ - PRESERVE, - - /** - * Trims whitespace at the beginning and the end of the found text. - */ - TRIM, - - /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ - COLLAPSE -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: - /** - * Internally used token trie. This object holds all registered tokens. 
- */ - TokenTrie trie; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - WhitespaceMode whitespaceMode; - - /** - * Vector containing all registered token types. - */ - std::vector<std::string> tokens; - - /** - * Next index in the tokens list where to search for a new token id. - */ - size_t nextTokenTypeId; - - /** - * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. - * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is the token structure into which the token information - * should be written. - * @return false if the end of the stream has been reached, true otherwise. - */ - template <typename TextHandler, bool read> - bool next(CharReader &reader, DynamicToken &token); - -public: - /** - * Constructor of the DynamicTokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. - */ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - - /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. - * - * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if - * an error occured. - */ - TokenTypeId registerToken(const std::string &token); - - /** - * Unregisters the token belonging to the given TokenTypeId. - * - * @param type is the token type that should be unregistered. The - *TokenTypeId - * must have been returned by registerToken. - * @return true if the operation was successful, false otherwise (e.g. 
- * because the given TokenDescriptor was already unregistered). - */ - bool unregisterToken(TokenTypeId type); - - /** - * Returns the token that was registered under the given TokenTypeId id or - *an - * empty string if an invalid TokenTypeId id is given. - * - * @param type is the TokenTypeId id for which the corresponding token - *string - * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenTypeId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. - */ - WhitespaceMode getWhitespaceMode(); - - /** - * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool read(CharReader &reader, DynamicToken &token); - - /** - * The peek method does not advance the read position of the char reader, - * but reads the next token from the current char reader peek position. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. 
- */ - bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp deleted file mode 100644 index 05769f0..0000000 --- a/src/plugins/plain/PlainFormatStreamReader.cpp +++ /dev/null @@ -1,641 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <core/common/CharReader.hpp> -#include <core/common/Logger.hpp> -#include <core/common/Utils.hpp> -#include <core/common/VariantReader.hpp> - -#include "PlainFormatStreamReader.hpp" - -namespace ousia { - -/** - * Plain format default tokenizer. - */ -class PlainFormatTokens : public DynamicTokenizer { -public: - /** - * Id of the backslash token. - */ - TokenTypeId Backslash; - - /** - * Id of the line comment token. - */ - TokenTypeId LineComment; - - /** - * Id of the block comment start token. - */ - TokenTypeId BlockCommentStart; - - /** - * Id of the block comment end token. - */ - TokenTypeId BlockCommentEnd; - - /** - * Id of the field start token. - */ - TokenTypeId FieldStart; - - /** - * Id of the field end token. - */ - TokenTypeId FieldEnd; - - /** - * Registers the plain format tokens in the internal tokenizer. 
- */ - PlainFormatTokens() - { - Backslash = registerToken("\\"); - LineComment = registerToken("%"); - BlockCommentStart = registerToken("%{"); - BlockCommentEnd = registerToken("}%"); - FieldStart = registerToken("{"); - FieldEnd = registerToken("}"); - } -}; - -static const PlainFormatTokens Tokens; - -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector<char> buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. - */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. - */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. 
- * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. - */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - -PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader, - Logger &logger) - : reader(reader), logger(logger), tokenizer(Tokens) -{ - // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); -} - -Variant PlainFormatStreamReader::parseIdentifier(size_t start, bool allowNSSep) -{ - bool first = true; - bool hasCharSiceNSSep = false; - std::vector<char> identifier; - size_t end = reader.getPeekOffset(); - char c, c2; - while (reader.peek(c)) { - // Abort if this character is not a valid identifer character - if ((first && Utils::isIdentifierStartCharacter(c)) || - (!first && Utils::isIdentifierCharacter(c))) { - identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { - identifier.push_back(c); - } else { - if (c == ':' && allowNSSep) { - logger.error( - "Expected character before and after namespace separator \":\"", - reader); - } - reader.resetPeek(); - break; - } - - // This is no longer the first character - first = false; - - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); - - end = reader.getPeekOffset(); - reader.consumePeek(); - } - - // Return the identifier at its location - Variant res = - Variant::fromString(std::string(identifier.data(), identifier.size())); - res.setLocation({reader.getSourceId(), start, end}); - return res; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseBeginCommand() -{ - // Expect a '{' after the command - 
reader.consumeWhitespace(); - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\begin", reader); - return State::NONE; - } - - // Parse the name of the command that should be opened - Variant commandName = parseIdentifier(reader.getOffset(), true); - if (commandName.asString().empty()) { - logger.error("Expected identifier", commandName); - return State::ERROR; - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - SourceOffset start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Parse the arguments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), true); - - return State::COMMAND; -} - -static bool checkStillInField(const PlainFormatStreamReader::Command &cmd, - const Variant &endName, Logger &logger) -{ - if (cmd.inField && !cmd.inRangeField) { - logger.error(std::string("\\end in open field of command \"") + - cmd.name.asString() + std::string("\""), - endName); - logger.note(std::string("Open command started here:"), cmd.name); - return true; - } - return false; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseEndCommand() -{ - // Expect a '{' after the command - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\end", reader); - return State::NONE; - } - - // Fetch the name of the command that should be ended here - Variant name = parseIdentifier(reader.getOffset(), true); - - // Make sure the given command name is not empty - if (name.asString().empty()) { - logger.error("Expected identifier", name); - return State::ERROR; - } - - // 
Make sure the command name is terminated with a '}' - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Unroll the command stack up to the last range command - while (!commands.top().hasRange) { - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - commands.pop(); - } - - // Make sure we're not in an open field of this command - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - - // Special error message if the top-level command is reached - if (commands.size() == 1) { - logger.error(std::string("Cannot end command \"") + name.asString() + - std::string("\" here, no command open"), - name); - return State::ERROR; - } - - // Inform the about command mismatches - const Command &cmd = commands.top(); - if (commands.top().name.asString() != name.asString()) { - logger.error(std::string("Trying to end command \"") + - cmd.name.asString() + - std::string("\", but open command is \"") + - name.asString() + std::string("\""), - name); - logger.note("Last command was opened here:", cmd.name); - return State::ERROR; - } - - // Set the location to the location of the command that was ended, then end - // the current command - location = name.getLocation(); - commands.pop(); - return cmd.inRangeField ? 
State::FIELD_END : State::NONE; -} - -Variant PlainFormatStreamReader::parseCommandArguments(Variant commandArgName) -{ - // Parse the arguments using the universal VariantReader - Variant commandArguments; - if (reader.expect('[')) { - auto res = VariantReader::parseObject(reader, logger, ']'); - commandArguments = res.second; - } else { - commandArguments = Variant::mapType{}; - } - - // Insert the parsed name, make sure "name" was not specified in the - // arguments - if (commandArgName.isString()) { - auto res = - commandArguments.asMap().emplace("name", std::move(commandArgName)); - if (!res.second) { - logger.error("Name argument specified multiple times", - SourceLocation{}, MessageMode::NO_CONTEXT); - logger.note("First occurance is here: ", commandArgName); - logger.note("Second occurance is here: ", res.first->second); - } - } - return commandArguments; -} - -void PlainFormatStreamReader::pushCommand(Variant commandName, - Variant commandArguments, - bool hasRange) -{ - // Store the location on the stack - location = commandName.getLocation(); - - // Place the command on the command stack, remove the last commands if we're - // not currently inside a field of these commands - while (!commands.top().inField) { - commands.pop(); - } - commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parseCommand( - size_t start) -{ - // Parse the commandName as a first identifier - Variant commandName = parseIdentifier(start, true); - if (commandName.asString().empty()) { - logger.error("Empty command name", reader); - return State::NONE; - } - - // Handle the special "begin" and "end" commands - const auto commandNameComponents = - Utils::split(commandName.asString(), ':'); - const bool isBegin = commandNameComponents[0] == "begin"; - const bool isEnd = commandNameComponents[0] == "end"; - if (isBegin || isEnd) { - if (commandNameComponents.size() > 1) { - 
logger.error( - "Special commands \"\\begin\" and \"\\end\" may not contain a " - "namespace separator \":\"", - commandName); - } - if (isBegin) { - return parseBeginCommand(); - } else if (isEnd) { - return parseEndCommand(); - } - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); - - return State::COMMAND; -} - -void PlainFormatStreamReader::parseBlockComment() -{ - DynamicToken token; - size_t depth = 1; - while (tokenizer.read(reader, token)) { - if (token.type == Tokens.BlockCommentEnd) { - depth--; - if (depth == 0) { - return; - } - } - if (token.type == Tokens.BlockCommentStart) { - depth++; - } - } - - // Issue an error if the file ends while we are in a block comment - logger.error("File ended while being in a block comment", reader); -} - -void PlainFormatStreamReader::parseLineComment() -{ - char c; - while (reader.read(c)) { - if (c == '\n') { - return; - } - } -} - -bool PlainFormatStreamReader::checkIssueData(DataHandler &handler) -{ - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); - location = data.getLocation(); - reader.resetPeek(); - return true; - } - return false; -} - -bool PlainFormatStreamReader::checkIssueFieldStart() -{ - // Fetch the current command, and check whether we're currently inside a - // field of this command - Command &cmd = commands.top(); - if (!cmd.inField) { - // If this is a range command, we're now implicitly inside the field of - // this command -- we'll have to issue a field 
start command! - if (cmd.hasRange) { - cmd.inField = true; - cmd.inRangeField = true; - reader.resetPeek(); - return true; - } - - // This was not a range command, so obviously we're now inside within - // a field of some command -- so unroll the commands stack until a - // command with open field is reached - while (!commands.top().inField) { - commands.pop(); - } - } - return false; -} - -PlainFormatStreamReader::State PlainFormatStreamReader::parse() -{ - // Handler for incomming data - DataHandler handler; - - // Read tokens until the outer loop should be left - DynamicToken token; - while (tokenizer.peek(reader, token)) { - const TokenTypeId type = token.type; - - // Special handling for Backslash and Text - if (type == Tokens.Backslash) { - // Before appending anything to the output data or starting a new - // command, check whether FIELD_START has to be issued, as the - // current command is a command with range - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Check whether a command starts now, without advancing the peek - // cursor - char c; - if (!reader.fetchPeek(c)) { - logger.error("Trailing backslash at the end of the file.", - token); - return State::END; - } - - // Try to parse a command - if (Utils::isIdentifierStartCharacter(c)) { - // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { - return State::DATA; - } - - // Parse the actual command - State res = parseCommand(token.location.getStart()); - switch (res) { - case State::ERROR: - throw LoggableException( - "Last error was irrecoverable, ending parsing " - "process"); - case State::NONE: - continue; - default: - return res; - } - } - - // This was not a special character, just append the given character - // to the data buffer, use the escape character start as start - // location and the peek offset as end location - reader.peek(c); // Peek the previously fetched character - handler.append(c, 
token.location.getStart(), - reader.getPeekOffset()); - reader.consumePeek(); - continue; - } else if (type == TextToken) { - // Check whether FIELD_START has to be issued before appending text - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - - reader.consumePeek(); - continue; - } - - // A non-text token was reached, make sure all pending data commands - // have been issued - if (checkIssueData(handler)) { - return State::DATA; - } - - // We will handle the token now, consume the peeked characters - reader.consumePeek(); - - // Update the location to the current token location - location = token.location; - - if (token.type == Tokens.LineComment) { - parseLineComment(); - } else if (token.type == Tokens.BlockCommentStart) { - parseBlockComment(); - } else if (token.type == Tokens.FieldStart) { - Command &cmd = commands.top(); - if (!cmd.inField) { - cmd.inField = true; - return State::FIELD_START; - } - logger.error( - "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", - token); - } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } - } - logger.error( - "Got field end token \"}\", but there is no field to end. 
Did " - "you mean \"\\}\"?", - token); - } else { - logger.error("Unexpected token \"" + token.content + "\"", token); - } - } - - // Issue available data - if (checkIssueData(handler)) { - return State::DATA; - } - - // Make sure all open commands and fields have been ended at the end of the - // stream - while (commands.size() > 1) { - Command &cmd = commands.top(); - if (cmd.inField || cmd.hasRange) { - logger.error("Reached end of stream, but command \"" + - cmd.name.asString() + "\" has not been ended", - cmd.name); - } - commands.pop(); - } - - location = SourceLocation{reader.getSourceId(), reader.getOffset()}; - return State::END; -} - -const Variant &PlainFormatStreamReader::getCommandName() -{ - return commands.top().name; -} - -const Variant &PlainFormatStreamReader::getCommandArguments() -{ - return commands.top().arguments; -} -} - diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp deleted file mode 100644 index 2ee261c..0000000 --- a/src/plugins/plain/PlainFormatStreamReader.hpp +++ /dev/null @@ -1,347 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file PlainFormatStreamReader.hpp - * - * Provides classes for low-level classes for reading the plain TeX-esque - * format. 
The class provided here do not build any model objects and does not - * implement the Parser interfaces. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ -#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ - -#include <stack> - -#include <core/common/Variant.hpp> - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; -class Logger; -class DataHandler; - -/** - * The PlainFormatStreamReader class provides a low-level reader for the plain - * TeX-esque format. The parser is constructed around a "parse" function, which - * reads data from the underlying CharReader until a new state is reached and - * indicates this state in a return value. The calling code then has to pull - * corresponding data from the stream reader. The reader already handles some - * invalid cases, but recovers from most errors and happily continues parsing. - */ -class PlainFormatStreamReader { -public: - /** - * Enum used to indicate which state the PlainFormatStreamReader class is in - * after calling the "parse" function. - */ - enum class State { - /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). - */ - COMMAND, - - /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - - /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. - */ - ENTITY, - - /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). - */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. 
- */ - ANNOTATION_END, - - /** - * State returned if a new field started. The reader assures that the - * current field ends before a new field is started and that the field - * is not started if data has been given outside of a field. The - * field number is set to the current field index. - */ - FIELD_START, - - /** - * State returned if the current field ends. The reader assures that a - * field was actually open. - */ - FIELD_END, - - /** - * The end of the stream has been reached. - */ - END, - - /** - * Returned from internal functions if nothing should be done. - */ - NONE, - - /** - * Returned from internal function to indicate irrecoverable errors. - */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; - - /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). - */ - bool inRangeField; - - /** - * Default constructor. - */ - Command() : hasRange(false), inField(false), inRangeField(false) {} - - /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. 
- */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField) - { - } - }; - -private: - /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. - */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - DynamicTokenizer tokenizer; - - /** - * Stack containing the current commands. - */ - std::stack<Command> commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). - */ - Variant data; - - /** - * Contains the location of the last token. - */ - SourceLocation location; - - /** - * Contains the field index of the current command. - */ - size_t fieldIdx; - - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. - */ - void pushCommand(Variant commandName, Variant commandArguments, bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. 
- * - * @param start is the start byte offset of the command (including the - * backslash) - * @return true if a command was actuall parsed, false otherwise. - */ - State parseCommand(size_t start); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @param handler is the data handler that contains the data that may be - * returned to the user. - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(DataHandler &handler); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - -public: - /** - * Constructor of the PlainFormatStreamReader class. Attaches the new - * PlainFormatStreamReader to the given CharReader and Logger instances. - * - * @param reader is the reader instance from which incomming characters - * should be read. - * @param logger is the logger instance to which errors should be written. - */ - PlainFormatStreamReader(CharReader &reader, Logger &logger); - - /** - * Continues parsing. Returns one of the states defined in the State enum. - * Callers should stop once the State::END state is reached. Use the getter - * functions to get more information about the current state, such as the - * command name or the data or the current field index. - * - * @return the new state the parser has reached. - */ - State parse(); - - /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. 
- * - * @return a reference at a variant containing the data parsed by the - * "parse" function. - */ - const Variant &getData() { return data; } - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing name and location of the - * parsed command. - */ - const Variant &getCommandName(); - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing arguments given to the - * command. - */ - const Variant &getCommandArguments(); - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - SourceLocation &getLocation() {return location;} -}; -} - -#endif /* _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ */ - diff --git a/src/plugins/plain/TokenTrie.cpp b/src/plugins/plain/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/plugins/plain/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::make_shared<Node>()).first; - } - node = it->second.get(); - } - - // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { - return false; - } - - // Otherwise just set the type to the given type. - node->type = type; - return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another type - node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { - return false; - } - - // If the target node has children, we cannot delete the subtree. 
Set the - // type to EmptyToken instead - if (!node->children.empty()) { - node->type = EmptyToken; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return EmptyToken; - } - node = it->second.get(); - } - return node->type; -} -} - diff --git a/src/plugins/plain/TokenTrie.hpp b/src/plugins/plain/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/plugins/plain/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include <cstdint> -#include <memory> -#include <limits> -#include <unordered_map> - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. 
/**
 * The TokenTypeId is used to give each token type a unique id.
 */
using TokenTypeId = uint32_t;

/**
 * Id marking "no token". This is the largest value a TokenTypeId can hold.
 */
constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();

/**
 * Id representing a text token. This is the value directly below EmptyToken.
 */
constexpr TokenTypeId TextToken = EmptyToken - 1;

/**
 * The TokenTrie is a prefix trie over the character sequences of all
 * registered tokens. It is used by the tokenizer to find the longest token
 * matching the text.
 *
 * Consider the tokens "aab", "a" and "bac" registered with the types one, two
 * and three. Writing "-" for nodes without a token (EmptyToken), the trie
 * looks like this:
 *
 * \code{*.txt}
 *        ~ (-)
 *       /     \
 *      a (2)  b (-)
 *      |      |
 *      a (-)  a (-)
 *      |      |
 *      b (1)  c (3)
 * \endcode
 */
class TokenTrie {
public:
    /**
     * Node of the trie.
     */
    struct Node {
        /**
         * Type used for the child map.
         */
        using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;

        /**
         * Map from a single character to the corresponding child node.
         */
        ChildMap children;

        /**
         * Type of the token that ends at this node, EmptyToken if no token
         * ends here.
         */
        TokenTypeId type;

        /**
         * Default constructor (defined in the implementation file).
         */
        Node();
    };

private:
    /**
     * Root node of the trie.
     */
    Node root;

public:
    /**
     * Registers the given character sequence as token.
     *
     * @param token is the character sequence that should be registered as
     * token.
     * @param type is the type id that should be attached to this token.
     * @return true if the operation was successful, false otherwise.
     */
    bool registerToken(const std::string &token, TokenTypeId type) noexcept;

    /**
     * Removes the given character sequence from the trie.
     *
     * @param token is the character sequence that should be unregistered.
     * @return true if the operation was successful, false otherwise.
     */
    bool unregisterToken(const std::string &token) noexcept;

    /**
     * Looks up the given character sequence. According to the original
     * documentation this function is mostly meant for debugging and unit
     * testing.
     *
     * @param token is the character sequence that should be searched.
     * @return the type id attached to the token.
     */
    TokenTypeId hasToken(const std::string &token) const noexcept;

    /**
     * Returns a pointer to the root node, to be used for traversing the trie.
     *
     * @return a pointer to the root node.
     */
    const Node *getRoot() const noexcept { return &root; }
};