/*
    Ousía
    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <memory>
#include <vector>

#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include <core/common/Utils.hpp>

#include "TokenizedData.hpp"
#include "Tokenizer.hpp"

namespace ousia {

namespace {

/* Internal class TokenMatch */

/**
 * Contains information about a matching token.
 */
struct TokenMatch {
	/**
	 * Token that was matched.
	 */
	Token token;

	/**
	 * Position at which this token starts in the TokenizedData instance.
	 */
	size_t dataStartOffset;

	/**
	 * Set to true if the matched token is a primary token.
	 */
	bool primary;

	/**
	 * Constructor of the TokenMatch class.
	 */
	TokenMatch() : dataStartOffset(0), primary(false) {}

	/**
	 * Returns true if this TokenMatch instance actually represents a match.
	 *
	 * @return true if the TokenMatch actually has a match.
	 */
	bool hasMatch() const { return token.id != Tokens::Empty; }

	/**
	 * Returns the length of the matched token.
	 *
	 * @return the length of the token string.
	 */
	size_t size() const { return token.content.size(); }
};
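/*
 * Note: "primary" tokens are those that the Tokenizer may return as
 * standalone Token instances from read() or peek(). Non-primary tokens are
 * never returned on their own; they are only marked as ranges inside the
 * surrounding TokenizedData instance (see the call to TokenizedData::mark
 * below).
 */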
/* Internal class TokenLookup */

/**
 * The TokenLookup class is used to represent a thread in a running token
 * lookup.
 */
class TokenLookup {
private:
	/**
	 * Current node within the token trie.
	 */
	TokenTrie::Node const *node;

	/**
	 * Start offset within the source file.
	 */
	size_t start;

	/**
	 * Position at which this token starts in the TokenizedData instance.
	 */
	size_t dataStartOffset;

public:
	/**
	 * Constructor of the TokenLookup class.
	 *
	 * @param node is the current node.
	 * @param start is the start position in the source file.
	 * @param dataStartOffset is the current length of the TokenizedData
	 * buffer.
	 */
	TokenLookup(const TokenTrie::Node *node, size_t start,
	            size_t dataStartOffset)
	    : node(node), start(start), dataStartOffset(dataStartOffset)
	{
	}

	/**
	 * Tries to extend the current path in the token trie with the given
	 * character. If a complete token is matched, stores the match in the
	 * given TokenMatch reference and returns true.
	 *
	 * @param c is the character that should be appended to the current
	 * prefix.
	 * @param lookups is a list to which new TokenLookup instances are added
	 * -- which could potentially be expanded in the next iteration.
	 * @param match is the TokenMatch instance to which the matching token
	 * should be written.
	 * @param tokens is a reference at the internal token list of the
	 * Tokenizer.
	 * @param end is the end byte offset of the current character.
	 * @param sourceId is the source id of this file.
	 * @return true if a token was matched, false otherwise.
	 */
	bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
	             const std::vector<Tokenizer::TokenDescriptor> &tokens,
	             SourceOffset end, SourceId sourceId)
	{
		// Set to true once a token has been matched
		bool res = false;

		// Check whether we can continue the current token path, if not, abort
		auto it = node->children.find(c);
		if (it == node->children.end()) {
			return res;
		}

		// Check whether the new node represents a complete token and whether
		// it is longer than the current token. If yes, replace the current
		// token.
		node = it->second.get();
		if (node->id != Tokens::Empty) {
			const Tokenizer::TokenDescriptor &descr = tokens[node->id];
			match.token = Token(node->id, descr.string,
			                    SourceLocation(sourceId, start, end));
			match.dataStartOffset = dataStartOffset;
			match.primary = descr.primary;
			res = true;
		}

		// If this state can possibly be advanced, store it in the states
		// list.
		if (!node->children.empty()) {
			lookups.emplace_back(*this);
		}
		return res;
	}
};
}

/* Class Tokenizer */

Tokenizer::Tokenizer() : nextTokenId(0) {}
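/*
 * Matching strategy (informal sketch, the example tokens are hypothetical):
 * "next" advances one TokenLookup thread per possible token start position
 * through the token trie, all in lockstep, and keeps the longest primary
 * match. For instance, with "<" and "<<" both registered as primary tokens,
 * the input "<<x" first matches "<" after one character; the surviving
 * thread then extends the match to "<<" on the second character, so the
 * longer token wins.
 */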
template <bool read>
bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
	// If we're in the read mode, reset the char reader peek position to the
	// current read position
	if (read) {
		reader.resetPeek();
	}

	// Prepare the lookups in the token trie
	const TokenTrie::Node *root = trie.getRoot();
	TokenMatch bestMatch;
	std::vector<TokenLookup> lookups;
	std::vector<TokenLookup> nextLookups;

	// Peek characters from the reader and try to advance the current token
	// tree cursor
	char c;
	const size_t initialDataSize = data.size();
	size_t charStart = reader.getPeekOffset();
	const SourceId sourceId = reader.getSourceId();
	while (reader.peek(c)) {
		const size_t charEnd = reader.getPeekOffset();
		const size_t dataStartOffset = data.size();

		// If we do not have a match yet, start a new lookup from the root
		if (!bestMatch.hasMatch()) {
			lookups.emplace_back(root, charStart, dataStartOffset);
		}

		// Try to advance all other lookups with the new character
		TokenMatch match;
		for (TokenLookup &lookup : lookups) {
			// Continue if the current lookup did not match
			if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
			                    sourceId)) {
				continue;
			}

			// If the matched token is primary, check whether it is better
			// than the current best match, if yes, replace the best match.
			// In any case just continue
			if (match.primary) {
				if (match.size() > bestMatch.size()) {
					bestMatch = match;
				}
				continue;
			}

			// Otherwise -- if the matched token is a non-primary token (and
			// no primary token has been found until now) -- mark the match
			// in the TokenizedData
			if (!bestMatch.hasMatch()) {
				data.mark(match.token.id, data.size() - match.size() + 1,
				          match.size());
			}
		}

		// We have found a token and there are no more states to advance or
		// the text handler has found something -- abort to return the new
		// token
		if (bestMatch.hasMatch()) {
			if ((nextLookups.empty() || data.size() > initialDataSize)) {
				break;
			}
		} else {
			// Record all incoming characters
			data.append(c, charStart, charEnd);
		}

		// Swap the lookups and the nextLookups list
		lookups = std::move(nextLookups);
		nextLookups.clear();

		// Advance the offset
		charStart = charEnd;
	}

	// If we found data, emit a corresponding data token
	if (data.size() > initialDataSize &&
	    (!bestMatch.hasMatch() ||
	     bestMatch.dataStartOffset > initialDataSize)) {
		// If we have a "bestMatch" which starts after text data has started,
		// trim the TokenizedData to this offset
		if (bestMatch.dataStartOffset > initialDataSize) {
			data.trim(bestMatch.dataStartOffset);
		}

		// Create a token containing the data location
		bestMatch.token = Token{data.getLocation()};
	} else if (bestMatch.hasMatch() &&
	           bestMatch.dataStartOffset == initialDataSize) {
		data.trim(initialDataSize);
	}

	// Move the read/peek cursor to the end of the token, abort if an error
	// happens while doing so
	if (bestMatch.hasMatch()) {
		// Make sure we have a valid location
		if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
			throw OusiaException{"Token end position offset out of range"};
		}

		// Seek to the end of the current token
		const size_t end = bestMatch.token.location.getEnd();
		if (read) {
			reader.seek(end);
		} else {
			reader.seekPeekCursor(end);
		}

		token = bestMatch.token;
	} else {
		token = Token{};
	}
	return bestMatch.hasMatch();
}

bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
	return next<true>(reader, token, data);
}

bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
	return next<false>(reader, token, data);
}

TokenId Tokenizer::registerToken(const std::string &token, bool primary)
{
	// Abort if an empty token should be registered
	if (token.empty()) {
		return Tokens::Empty;
	}

	// Search for a new slot in the tokens list
	TokenId type = Tokens::Empty;
	for (size_t i = nextTokenId; i < tokens.size(); i++) {
		if (!tokens[i].valid()) {
			tokens[i] = TokenDescriptor(token, primary);
			type = i;
			break;
		}
	}

	// No existing slot was found, add a new one -- make sure we do not
	// override the special token type handles
	if (type == Tokens::Empty) {
		type = tokens.size();
		if (type >= Tokens::MaxTokenId) {
			throw OusiaException{"Token type ids depleted!"};
		}
		tokens.emplace_back(token, primary);
	}
	nextTokenId = type + 1;

	// Try to register the token in the trie -- if this fails, remove it
	// from the tokens list
	if (!trie.registerToken(token, type)) {
		tokens[type] = TokenDescriptor();
		nextTokenId = type;
		return Tokens::Empty;
	}
	return type;
}

bool Tokenizer::unregisterToken(TokenId id)
{
	// Unregister the token from the trie, abort if an invalid type is given
	if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
		tokens[id] = TokenDescriptor();
		nextTokenId = id;
		return true;
	}
	return false;
}

static Tokenizer::TokenDescriptor EmptyTokenDescriptor;

const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
{
	if (id < tokens.size()) {
		return tokens[id];
	}
	return EmptyTokenDescriptor;
}

/* Explicitly instantiate all possible instantiations of the "next" member
   function */
template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
}
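/*
 * Usage sketch (informal; the CharReader constructor call below is an
 * assumption, check CharReader.hpp for the actual interface):
 *
 *     Tokenizer tokenizer;
 *     TokenId tStar = tokenizer.registerToken("*", true);
 *
 *     CharReader reader("foo*bar");
 *     Token token;
 *     TokenizedData data;
 *     tokenizer.read(reader, token, data);  // data token covering "foo"
 *     tokenizer.read(reader, token, data);  // token.id == tStar
 *
 * peek() behaves like read(), but only advances the peek cursor of the
 * reader, so the same tokens can still be consumed by later calls to read().
 */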