/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <memory>
#include <string>
#include <vector>

#include <core/common/CharReader.hpp>
#include <core/common/Exceptions.hpp>
#include "TokenizedData.hpp"
#include "Tokenizer.hpp"
namespace ousia {
namespace {
/* Internal class TokenMatch */
/**
* Contains information about a matching token.
*/
struct TokenMatch {
/**
* Token that was matched.
*/
Token token;
/**
* Position at which this token starts in the TokenizedData instance.
*/
size_t dataStartOffset;
/**
* Set to true if the matched token is a primary token.
*/
bool primary;
/**
* Constructor of the TokenMatch class.
*/
TokenMatch() : dataStartOffset(0), primary(false) {}
/**
* Returns true if this TokenMatch instance actually represents a match.
*
* @return true if the TokenMatch actually has a match.
*/
bool hasMatch() const { return token.id != Tokens::Empty; }
/**
* Returns the length of the matched token.
*
* @return the length of the token string.
*/
size_t size() const { return token.content.size(); }
};
/* Internal class TokenLookup */
/**
* The TokenLookup class represents a single thread in a running token lookup:
* an active path within the token trie, together with the source offset at
* which this path started.
*/
class TokenLookup {
private:
/**
* Current node within the token trie.
*/
TokenTrie::Node const *node;
/**
* Start offset within the source file.
*/
size_t start;
/**
* Position at which this token starts in the TokenizedData instance.
*/
size_t dataStartOffset;
public:
/**
* Constructor of the TokenLookup class.
*
* @param node is the current node.
* @param start is the start position in the source file.
* @param dataStartOffset is the current length of the TokenizedData buffer.
*/
TokenLookup(const TokenTrie::Node *node, size_t start,
size_t dataStartOffset)
: node(node), start(start), dataStartOffset(dataStartOffset)
{
}
/**
* Tries to extend the current path in the token trie with the given
* character. If a complete token is matched, stores the match in the given
* TokenMatch reference and returns true.
*
* @param c is the character that should be appended to the current prefix.
* @param lookups is a list to which new TokenLookup instances are added --
* these may be advanced further in the next iteration.
* @param match is the TokenMatch instance to which the matching token
* should be written.
* @param tokens is a reference to the internal token list of the
* Tokenizer.
* @param end is the end byte offset of the current character.
* @param sourceId is the source id of this file.
* @return true if a token was matched, false otherwise.
*/
bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
             const std::vector<Tokenizer::TokenDescriptor> &tokens,
             SourceOffset end, SourceId sourceId)
{
// Set to true once a token has been matched
bool res = false;
// Check whether we can continue the current token path, if not, abort
auto it = node->children.find(c);
if (it == node->children.end()) {
return res;
}
// Check whether the new node represents a complete token and whether it
// is longer than the current token. If yes, replace the current token.
node = it->second.get();
if (node->id != Tokens::Empty) {
const Tokenizer::TokenDescriptor &descr = tokens[node->id];
match.token = Token(node->id, descr.string,
SourceLocation(sourceId, start, end));
match.dataStartOffset = dataStartOffset;
match.primary = descr.primary;
res = true;
}
// If this state can possibly be advanced, store it in the states list.
if (!node->children.empty()) {
lookups.emplace_back(*this);
}
return res;
}
};
}
/* Class Tokenizer */
Tokenizer::Tokenizer() : nextTokenId(0) {}
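/*
 * Implementation note: "next" performs a longest-match search over the token
 * trie. While no token has been matched yet, a fresh TokenLookup is started
 * at the trie root for every incoming character, so a token may begin at any
 * offset. All active lookups are advanced in parallel; primary tokens compete
 * for the longest match, whereas non-primary tokens are merely marked inside
 * the TokenizedData buffer. Characters that do not end up inside the winning
 * primary token are collected in "data" and may be returned as a single data
 * token.
 */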
template <bool read>
bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
{
// If we're in the read mode, reset the char reader peek position to the
// current read position
if (read) {
reader.resetPeek();
}
// Prepare the lookups in the token trie
const TokenTrie::Node *root = trie.getRoot();
TokenMatch bestMatch;
std::vector<TokenLookup> lookups;
std::vector<TokenLookup> nextLookups;
// Peek characters from the reader and try to advance the current token tree
// cursor
char c;
const size_t initialDataSize = data.size();
size_t charStart = reader.getPeekOffset();
const SourceId sourceId = reader.getSourceId();
while (reader.peek(c)) {
const size_t charEnd = reader.getPeekOffset();
const size_t dataStartOffset = data.size();
// If we do not have a match yet, start a new lookup from the root
if (!bestMatch.hasMatch()) {
lookups.emplace_back(root, charStart, dataStartOffset);
}
// Try to advance all other lookups with the new character
TokenMatch match;
for (TokenLookup &lookup : lookups) {
// Skip to the next lookup if this one did not match a token
if (!lookup.advance(c, nextLookups, match, tokens, charEnd,
sourceId)) {
continue;
}
// If the matched token is primary, check whether it is better than
// the current best match, if yes, replace the best match. In any
// case just continue
if (match.primary) {
if (match.size() > bestMatch.size()) {
bestMatch = match;
}
continue;
}
// Otherwise -- if the matched token is a non-primary token (and no
// primary token has been found until now) -- mark the match in the
// TokenizedData
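// Note: the current character has not been appended to "data" yet,
// which accounts for the "+ 1" in the start offset passed to mark()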
if (!bestMatch.hasMatch()) {
data.mark(match.token.id, data.size() - match.size() + 1,
match.size());
}
}
// We have found a token and there are no more states to advance or the
// text handler has found something -- abort to return the new token
if (bestMatch.hasMatch()) {
if (nextLookups.empty() || data.size() > initialDataSize) {
break;
}
} else {
// Record all incoming characters
data.append(c, charStart, charEnd);
}
// Swap the lookups and the nextLookups list
lookups = std::move(nextLookups);
nextLookups.clear();
// Advance the offset
charStart = charEnd;
}
// If we found data, emit a corresponding data token
if (data.size() > initialDataSize &&
(!bestMatch.hasMatch() ||
bestMatch.dataStartOffset > initialDataSize)) {
// If we have a "bestMatch" which starts after text data has started,
// trim the TokenizedData to this offset
if (bestMatch.dataStartOffset > initialDataSize) {
data.trim(bestMatch.dataStartOffset);
}
// Create a token containing the data location
bestMatch.token = Token{data.getLocation()};
}
// Move the read/peek cursor to the end of the token, abort if an error
// happens while doing so
if (bestMatch.hasMatch()) {
// Make sure we have a valid location
if (bestMatch.token.location.getEnd() == InvalidSourceOffset) {
throw OusiaException{"Token end position offset out of range"};
}
// Seek to the end of the current token
const size_t end = bestMatch.token.location.getEnd();
if (read) {
reader.seek(end);
} else {
reader.seekPeekCursor(end);
}
token = bestMatch.token;
} else {
token = Token{};
}
return bestMatch.hasMatch();
}
bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data)
{
return next<true>(reader, token, data);
}
bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data)
{
return next<false>(reader, token, data);
}
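/*
 * Illustrative usage sketch (hypothetical driver code; the CharReader and
 * TokenizedData constructor arguments are assumptions, not taken from the
 * actual interfaces):
 *
 *   Tokenizer tokenizer;
 *   TokenId star = tokenizer.registerToken("*", true);
 *   CharReader reader("a *b* c");  // assumed string constructor
 *   Token token;
 *   TokenizedData data;            // assumed default constructor
 *   while (tokenizer.read(reader, token, data)) {
 *       // "token" alternates between data tokens and the "*" token;
 *       // non-primary tokens would only be marked inside "data"
 *   }
 */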
TokenId Tokenizer::registerToken(const std::string &token, bool primary)
{
// Abort if an empty token should be registered
if (token.empty()) {
return Tokens::Empty;
}
// Search for a new slot in the tokens list
TokenId type = Tokens::Empty;
for (size_t i = nextTokenId; i < tokens.size(); i++) {
if (!tokens[i].valid()) {
tokens[i] = TokenDescriptor(token, primary);
type = i;
break;
}
}
// No existing slot was found, add a new one -- make sure we do not
// override the special token type handles
if (type == Tokens::Empty) {
type = tokens.size();
if (type >= Tokens::MaxTokenId) {
throw OusiaException{"Token type ids depleted!"};
}
tokens.emplace_back(token, primary);
}
nextTokenId = type + 1;
// Try to register the token in the trie -- if this fails, remove it from
// the tokens list
if (!trie.registerToken(token, type)) {
tokens[type] = TokenDescriptor();
nextTokenId = type;
return Tokens::Empty;
}
return type;
}
bool Tokenizer::unregisterToken(TokenId id)
{
// Unregister the token from the trie, abort if an invalid type is given
if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) {
tokens[id] = TokenDescriptor();
nextTokenId = id;
return true;
}
return false;
}
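/*
 * Note on id reuse: unregisterToken resets nextTokenId to the freed slot and
 * registerToken scans for free slots starting at nextTokenId, so freed ids
 * are recycled. Sketch (the ids shown are those the current implementation
 * would yield for a fresh Tokenizer):
 *
 *   TokenId a = tokenizer.registerToken("{", true);  // slot 0
 *   tokenizer.unregisterToken(a);                    // slot 0 freed
 *   TokenId b = tokenizer.registerToken("}", true);  // slot 0 reused
 */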
/* Fallback descriptor returned by lookupToken if the given id is not known */
static Tokenizer::TokenDescriptor EmptyTokenDescriptor;
const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const
{
if (id < tokens.size()) {
return tokens[id];
}
return EmptyTokenDescriptor;
}
/* Explicitly instantiate both possible instances of the "next" member
   function */
template bool Tokenizer::next<true>(CharReader &, Token &, TokenizedData &);
template bool Tokenizer::next<false>(CharReader &, Token &, TokenizedData &);
}