diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-10 16:42:20 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-10 16:42:20 +0100 |
commit | 6352751f87a4acdd9a32baf4f1133819678d06b0 (patch) | |
tree | 23f1764605227d25ee9861fcf23e2dc381bdc187 /src/formats | |
parent | e85e174028de377cee9f6dfcccbe418d185cc7b4 (diff) |
Renamed some files and changed folder structure
Diffstat (limited to 'src/formats')
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.cpp | 544 | ||||
-rw-r--r-- | src/formats/osdm/DynamicTokenizer.hpp | 252 | ||||
-rw-r--r-- | src/formats/osdm/OsdmStreamParser.cpp | 640 | ||||
-rw-r--r-- | src/formats/osdm/OsdmStreamParser.hpp | 351 | ||||
-rw-r--r-- | src/formats/osdm/TokenTrie.cpp | 119 | ||||
-rw-r--r-- | src/formats/osdm/TokenTrie.hpp | 150 |
6 files changed, 2056 insertions, 0 deletions
diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp new file mode 100644 index 0000000..f2cfcd1 --- /dev/null +++ b/src/formats/osdm/DynamicTokenizer.cpp @@ -0,0 +1,544 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <memory> +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Exceptions.hpp> +#include <core/common/Utils.hpp> + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t textLength, size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the DynamicToken instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<std::string> &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/* Internal class TextHandlerBase */ + +/** + * Base class used for those classes that may be used as TextHandler in the + * DynamicTokenizer::next function. + */ +class TextHandlerBase { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the TextHandlerBase base class. Initializes the start and + * end position with zeros. + */ + TextHandlerBase() : textStart(0), textEnd(0) {} + + /** + * Transforms the given token into a text token containing the extracted + * text. + * + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ + void buildTextToken(TokenMatch &match, SourceId sourceId) + { + if (match.hasMatch()) { + match.token.content = + std::string{textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, textStart, match.textEnd}; + } else { + match.token.content = std::string{textBuf.data(), textBuf.size()}; + match.token.location = SourceLocation{sourceId, textStart, textEnd}; + } + match.token.type = TextToken; + } + + /** + * Returns true if this whitespace handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } +}; + +/* Internal class PreservingTextHandler */ + +/** + * The PreservingTextHandler class preserves all characters unmodified, + * including whitepace characters. + */ +class PreservingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/* Internal class TrimmingTextHandler */ + +/** + * The TrimmingTextHandler class trims all whitespace characters at the begin + * and the end of a text section but leaves all other characters unmodified, + * including whitepace characters. + */ +class TrimmingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + + /** + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. + */ + std::vector<char> whitespaceBuf; + + /** + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the begin and end of the text. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/* Internal class CollapsingTextHandler */ + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduced multiple whitespace characters to a single blank. + */ +class CollapsingTextHandler : public TextHandlerBase { +public: + using TextHandlerBase::TextHandlerBase; + + /** + * Flag set to true if a whitespace character was reached. + */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; +} + +/* Class DynamicTokenizer */ + +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template <typename TextHandler, bool read> +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector<TokenLookup> lookups; + std::vector<TokenLookup> nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && + (!match.hasMatch() || match.textLength > 0)) { + textHandler.buildTextToken(match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, true>(reader, token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, true>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, true>(reader, token); + } + return false; +} + +bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingTextHandler, false>(reader, token); + case WhitespaceMode::TRIM: + return next<TrimmingTextHandler, false>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingTextHandler, false>(reader, token); + } + return false; +} + +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string(); + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next<PreservingTextHandler, false>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, false>( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, false>( + CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<PreservingTextHandler, true>( + CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<TrimmingTextHandler, true>( + CharReader &reader,DynamicToken &token); +template bool DynamicTokenizer::next<CollapsingTextHandler, true>( + CharReader &reader,DynamicToken &token); +} + diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp new file mode 100644 index 0000000..0cac2e8 --- /dev/null +++ b/src/formats/osdm/DynamicTokenizer.hpp @@ -0,0 +1,252 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <set> +#include <string> +#include <vector> + +#include <core/common/Location.hpp> + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * Enum specifying the whitespace handling of the DynamicTokenizer class when + * reading non-token text. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer. + */ +class DynamicTokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector<std::string> tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(CharReader &reader, DynamicToken &token); + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, DynamicToken &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, DynamicToken &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osdm/OsdmStreamParser.cpp new file mode 100644 index 0000000..8cb8caf --- /dev/null +++ b/src/formats/osdm/OsdmStreamParser.cpp @@ -0,0 +1,640 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> +#include <core/common/VariantReader.hpp> + +#include "OsdmStreamParser.hpp" + +namespace ousia { + +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public DynamicTokenizer { +public: + /** + * Id of the backslash token. + */ + TokenTypeId Backslash; + + /** + * Id of the line comment token. + */ + TokenTypeId LineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId BlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId BlockCommentEnd; + + /** + * Id of the field start token. + */ + TokenTypeId FieldStart; + + /** + * Id of the field end token. + */ + TokenTypeId FieldEnd; + + /** + * Registers the plain format tokens in the internal tokenizer. + */ + PlainFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + } +}; + +static const PlainFormatTokens Tokens; + +/** + * Class used internally to collect data issued via "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ + std::vector<char> buf; + + /** + * Start location of the character data. + */ + SourceOffset start; + + /** + * End location of the character data. + */ + SourceOffset end; + +public: + /** + * Default constructor, initializes start and end with zeros. + */ + DataHandler() : start(0), end(0) {} + + /** + * Returns true if the internal buffer is empty. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ + bool isEmpty() { return buf.empty(); } + + /** + * Appends a single character to the internal buffer. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ + void append(char c, SourceOffset charStart, SourceOffset charEnd) + { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; + } + + /** + * Appends a string to the internal buffer. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) + { + if (isEmpty()) { + start = stringStart; + } + std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); + end = stringEnd; + } + + /** + * Converts the internal buffer to a variant with attached location + * information. + * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ + Variant toVariant(SourceId sourceId) + { + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); + return res; + } +}; + +OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) + : reader(reader), logger(logger), tokenizer(Tokens) +{ + // Place an intial command representing the complete file on the stack + commands.push(Command{"", Variant::mapType{}, true, true, true}); +} + +Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +{ + bool first = true; + bool hasCharSiceNSSep = false; + std::vector<char> identifier; + size_t end = reader.getPeekOffset(); + char c, c2; + while (reader.peek(c)) { + // Abort if this character is not a valid identifer character + if ((first && Utils::isIdentifierStartCharacter(c)) || + (!first && Utils::isIdentifierCharacter(c))) { + identifier.push_back(c); + } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + Utils::isIdentifierStartCharacter(c2)) { + identifier.push_back(c); + } else { + if (c == ':' && allowNSSep) { + logger.error( + "Expected character before and after namespace separator " + "\":\"", + reader); + } + reader.resetPeek(); + break; + } + + // This is no longer the first character + first = false; + + // Advance the hasCharSiceNSSep flag + hasCharSiceNSSep = allowNSSep && (c != ':'); + + end = reader.getPeekOffset(); + reader.consumePeek(); + } + + // Return the identifier at its location + Variant res = + Variant::fromString(std::string(identifier.data(), identifier.size())); + res.setLocation({reader.getSourceId(), start, end}); + return res; +} + +OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +{ + // Expect a '{' after the command + reader.consumeWhitespace(); + if (!reader.expect('{')) { + logger.error("Expected \"{\" after \\begin", reader); + return State::NONE; + } + + // Parse the name of the command that should be opened + Variant commandName = parseIdentifier(reader.getOffset(), true); + if (commandName.asString().empty()) { + logger.error("Expected identifier", commandName); + return State::ERROR; + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + SourceOffset start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Parse the arguments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), true); + + return State::COMMAND; +} + +static bool checkStillInField(const OsdmStreamParser::Command &cmd, + const Variant &endName, Logger &logger) +{ + if (cmd.inField && !cmd.inRangeField) { + logger.error(std::string("\\end in open field of command \"") + + cmd.name.asString() + std::string("\""), + endName); + logger.note(std::string("Open command started here:"), cmd.name); + return true; + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +{ + // Expect a '{' after the command + if (!reader.expect('{')) { + logger.error("Expected \"{\" after \\end", reader); + return State::NONE; + } + + // Fetch the name of the command that should be ended here + Variant name = parseIdentifier(reader.getOffset(), true); + + // Make sure the given command name is not empty + if (name.asString().empty()) { + logger.error("Expected identifier", name); + return State::ERROR; + } + + // Make sure the command name is terminated with a '}' + if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Unroll the command stack up to the last range command + while (!commands.top().hasRange) { + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + commands.pop(); + } + + // Make sure we're not in an open field of this command + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + + // Special error message if the top-level command is reached + if (commands.size() == 1) { + logger.error(std::string("Cannot end command \"") + name.asString() + + std::string("\" here, no command open"), + name); + return State::ERROR; + } + + // Inform the about command mismatches + const Command &cmd = commands.top(); + if (commands.top().name.asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + + cmd.name.asString() + + std::string("\", but open command is \"") + + name.asString() + std::string("\""), + name); + logger.note("Last command was opened here:", cmd.name); + return State::ERROR; + } + + // Set the location to the location of the command that was ended, then end + // the current command + location = name.getLocation(); + commands.pop(); + return cmd.inRangeField ? State::FIELD_END : State::NONE; +} + +Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +{ + // Parse the arguments using the universal VariantReader + Variant commandArguments; + if (reader.expect('[')) { + auto res = VariantReader::parseObject(reader, logger, ']'); + commandArguments = res.second; + } else { + commandArguments = Variant::mapType{}; + } + + // Insert the parsed name, make sure "name" was not specified in the + // arguments + if (commandArgName.isString()) { + auto res = + commandArguments.asMap().emplace("name", std::move(commandArgName)); + if (!res.second) { + logger.error("Name argument specified multiple times", + SourceLocation{}, MessageMode::NO_CONTEXT); + logger.note("First occurance is here: ", commandArgName); + logger.note("Second occurance is here: ", res.first->second); + } + } + return commandArguments; +} + +void OsdmStreamParser::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) +{ + // Store the location on the stack + location = commandName.getLocation(); + + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!commands.top().inField) { + commands.pop(); + } + commands.push(Command{std::move(commandName), std::move(commandArguments), + hasRange, false, false}); +} + +OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +{ + // Parse the commandName as a first identifier + Variant commandName = parseIdentifier(start, true); + if (commandName.asString().empty()) { + logger.error("Empty command name", reader); + return State::NONE; + } + + // Handle the special "begin" and "end" commands + const auto commandNameComponents = + Utils::split(commandName.asString(), ':'); + const bool isBegin = commandNameComponents[0] == "begin"; + const bool isEnd = commandNameComponents[0] == "end"; + if (isBegin || isEnd) { + if (commandNameComponents.size() > 1) { + logger.error( + "Special commands \"\\begin\" and \"\\end\" may not contain a " + "namespace separator \":\"", + commandName); + } + if (isBegin) { + return parseBeginCommand(); + } else if (isEnd) { + return parseEndCommand(); + } + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + // Parse the arugments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); + + return State::COMMAND; +} + +void OsdmStreamParser::parseBlockComment() +{ + DynamicToken token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == Tokens.BlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == Tokens.BlockCommentStart) { + depth++; + } + } + + // Issue an error if the file ends while we are in a block comment + logger.error("File ended while being in a block comment", reader); +} + +void OsdmStreamParser::parseLineComment() +{ + char c; + while (reader.read(c)) { + if (c == '\n') { + return; + } + } +} + +bool OsdmStreamParser::checkIssueData(DataHandler &handler) +{ + if (!handler.isEmpty()) { + data = handler.toVariant(reader.getSourceId()); + location = data.getLocation(); + reader.resetPeek(); + return true; + } + return false; +} + +bool OsdmStreamParser::checkIssueFieldStart() +{ + // Fetch the current command, and check whether we're currently inside a + // field of this command + Command &cmd = commands.top(); + if (!cmd.inField) { + // If this is a range command, we're now implicitly inside the field of + // this command -- we'll have to issue a field start command! + if (cmd.hasRange) { + cmd.inField = true; + cmd.inRangeField = true; + reader.resetPeek(); + return true; + } + + // This was not a range command, so obviously we're now inside within + // a field of some command -- so unroll the commands stack until a + // command with open field is reached + while (!commands.top().inField) { + commands.pop(); + } + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parse() +{ + // Handler for incomming data + DataHandler handler; + + // Read tokens until the outer loop should be left + DynamicToken token; + while (tokenizer.peek(reader, token)) { + const TokenTypeId type = token.type; + + // Special handling for Backslash and Text + if (type == Tokens.Backslash) { + // Before appending anything to the output data or starting a new + // command, check whether FIELD_START has to be issued, as the + // current command is a command with range + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Check whether a command starts now, without advancing the peek + // cursor + char c; + if (!reader.fetchPeek(c)) { + logger.error("Trailing backslash at the end of the file.", + token); + return State::END; + } + + // Try to parse a command + if (Utils::isIdentifierStartCharacter(c)) { + // Make sure to issue any data before it is to late + if (checkIssueData(handler)) { + return State::DATA; + } + + // Parse the actual command + State res = parseCommand(token.location.getStart()); + switch (res) { + case State::ERROR: + throw LoggableException( + "Last error was irrecoverable, ending parsing " + "process"); + case State::NONE: + continue; + default: + return res; + } + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + reader.peek(c); // Peek the previously fetched character + handler.append(c, token.location.getStart(), + reader.getPeekOffset()); + reader.consumePeek(); + continue; + } else if (type == TextToken) { + // Check whether FIELD_START has to be issued before appending text + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Append the text to the data handler + handler.append(token.content, token.location.getStart(), + token.location.getEnd()); + + reader.consumePeek(); + continue; + } + + // A non-text token was reached, make sure all pending data commands + // have been issued + if (checkIssueData(handler)) { + return State::DATA; + } + + // We will handle the token now, consume the peeked characters + reader.consumePeek(); + + // Update the location to the current token location + location = token.location; + + if (token.type == Tokens.LineComment) { + parseLineComment(); + } else if (token.type == Tokens.BlockCommentStart) { + parseBlockComment(); + } else if (token.type == Tokens.FieldStart) { + Command &cmd = commands.top(); + if (!cmd.inField) { + cmd.inField = true; + return State::FIELD_START; + } + logger.error( + "Got field start token \"{\", but no command for which to " + "start the field. Did you mean \"\\{\"?", + token); + } else if (token.type == Tokens.FieldEnd) { + // Try to end an open field of the current command -- if the current + // command is not inside an open field, end this command and try to + // close the next one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + return State::FIELD_END; + } + commands.pop(); + } else { + break; + } + } + logger.error( + "Got field end token \"}\", but there is no field to end. Did " + "you mean \"\\}\"?", + token); + } else { + logger.error("Unexpected token \"" + token.content + "\"", token); + } + } + + // Issue available data + if (checkIssueData(handler)) { + return State::DATA; + } + + // Make sure all open commands and fields have been ended at the end of the + // stream + while (commands.size() > 1) { + Command &cmd = commands.top(); + if (cmd.inField || cmd.hasRange) { + logger.error("Reached end of stream, but command \"" + + cmd.name.asString() + "\" has not been ended", + cmd.name); + } + commands.pop(); + } + + location = SourceLocation{reader.getSourceId(), reader.getOffset()}; + return State::END; +} + +const Variant &OsdmStreamParser::getCommandName() +{ + return commands.top().name; +} + +const Variant &OsdmStreamParser::getCommandArguments() +{ + return commands.top().arguments; +} +} + diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osdm/OsdmStreamParser.hpp new file mode 100644 index 0000000..48d8fb7 --- /dev/null +++ b/src/formats/osdm/OsdmStreamParser.hpp @@ -0,0 +1,351 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file OsdmStreamParser.hpp + * + * Provides classes for low-level classes for reading the TeX-esque osdm + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ +#define _OUSIA_OSDM_STREAM_PARSER_HPP_ + +#include <stack> + +#include <core/common/Variant.hpp> + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incommind file is + * syntactically valid and tries to recorver from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsdmStreamParser { +public: + /** + * Enum used to indicate which state the OsdmStreamParser class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. The reader indicates which + * annotation ends. + */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END, + + /** + * Returned from internal functions if nothing should be done. + */ + NONE, + + /** + * Returned from internal function to indicate irrecoverable errors. + */ + ERROR + }; + + /** + * Entry used for the command stack. + */ + struct Command { + /** + * Name and location of the current command. + */ + Variant name; + + /** + * Arguments that were passed to the command. + */ + Variant arguments; + + /** + * Set to true if this is a command with clear begin and end. + */ + bool hasRange; + + /** + * Set to true if we are currently inside a field of this command. + */ + bool inField; + + /** + * Set to true if we are currently in the range field of the command + * (implies inField being set to true). + */ + bool inRangeField; + + /** + * Default constructor. + */ + Command() : hasRange(false), inField(false), inRangeField(false) {} + + /** + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. + * @param inField is set to true if we currently are inside a field + * of this command. + * @param inRangeField is set to true if we currently inside the outer + * field of the command. + */ + Command(Variant name, Variant arguments, bool hasRange, bool inField, + bool inRangeField) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange), + inField(inField), + inRangeField(inRangeField) + { + } + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + DynamicTokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack<Command> commands; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). + */ + Variant data; + + /** + * Contains the location of the last token. + */ + SourceLocation location; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internall to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + */ + State parseEndCommand(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Parses the command arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. + * + * @param start is the start byte offset of the command (including the + * backslash) + * @return true if a command was actuall parsed, false otherwise. + */ + State parseCommand(size_t start); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseLineComment(); + + /** + * Checks whether there is any data pending to be issued, if yes, issues it. + * + * @param handler is the data handler that contains the data that may be + * returned to the user. + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(DataHandler &handler); + + /** + * Called before any data is appended to the internal data handler. Checks + * whether a new field should be started or implicitly ended. + * + * @return true if FIELD_START should be returned by the parse function. + */ + bool checkIssueFieldStart(); + +public: + /** + * Constructor of the OsdmStreamParser class. Attaches the new + * OsdmStreamParser to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsdmStreamParser(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. + */ + const Variant &getData() { return data; } + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing name and location of the + * parsed command. + */ + const Variant &getCommandName(); + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing arguments given to the + * command. + */ + const Variant &getCommandArguments(); + + /** + * Returns a reference at the char reader. + * + * @return the last internal token location. + */ + SourceLocation &getLocation() { return location; } +}; +} + +#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ + diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/formats/osdm/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class DynamicTokenTree */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/formats/osdm/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include <cstdint> +#include <memory> +#include <limits> +#include <unordered_map> + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) + * \endcode + * + * Where the number indicates the corresponding token descriptor identifier. + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + |