/*
    Ousía
    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see .
*/
#include 
#include 
#include 
#include 
#include 
#include 
#include "Tokenizer.hpp"
namespace ousia {
namespace {
/* Internal class TokenMatch */
/**
 * Contains information about a matching token.
 */
struct TokenMatch {
	/**
	 * Token that was matched.
	 */
	Token token;
	/**
	 * Current length of the data within the text handler. The text buffer needs
	 * to be trimmed to this length if this token matches.
	 */
	size_t textLength;
	/**
	 * End location of the current text handler. This location needs to be used
	 * for the text token that is emitted before the actual token.
	 */
	size_t textEnd;
	/**
	 * Constructor of the TokenMatch class.
	 */
	TokenMatch() : textLength(0), textEnd(0) {}
	/**
	 * Returns true if this TokenMatch instance actually represents a match.
	 */
	bool hasMatch() { return token.id != Tokens::Empty; }
};
/* Internal class TokenLookup */
/**
 * The TokenLookup class is used to represent a thread in a running token
 * lookup.
 */
class TokenLookup {
private:
	/**
	 * Current node within the token trie.
	 */
	TokenTrie::Node const *node;
	/**
	 * Start offset within the source file.
	 */
	size_t start;
	/**
	 * Current length of the data within the text handler. The text buffer needs
	 * to be trimmed to this length if this token matches.
	 */
	size_t textLength;
	/**
	 * End location of the current text handler. This location needs to be used
	 * for the text token that is emitted before the actual token.
	 */
	size_t textEnd;
public:
	/**
	 * Constructor of the TokenLookup class.
	 *
	 * @param node is the current node.
	 * @param start is the start position.
	 * @param textLength is the text buffer length of the previous text token.
	 * @param textEnd is the current end location of the previous text token.
	 */
	TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength,
	            size_t textEnd)
	    : node(node), start(start), textLength(textLength), textEnd(textEnd)
	{
	}
	/**
	 * Tries to extend the current path in the token trie with the given
	 * character. If a complete token is matched, stores this match in the
	 * tokens list (in case it is longer than any previous token).
	 *
	 * @param c is the character that should be appended to the current prefix.
	 * @param lookups is a list to which new TokeLookup instances are added --
	 * which could potentially be expanded in the next iteration.
	 * @param match is the Token instance to which the matching token
	 * should be written.
	 * @param tokens is a reference at the internal token list of the
	 * Tokenizer.
	 * @param end is the end byte offset of the current character.
	 * @param sourceId is the source if of this file.
	 */
	void advance(char c, std::vector &lookups, TokenMatch &match,
	             const std::vector &tokens, SourceOffset end,
	             SourceId sourceId)
	{
		// Check whether we can continue the current token path with the given
		// character without visiting an already visited node
		auto it = node->children.find(c);
		if (it == node->children.end()) {
			return;
		}
		// Check whether the new node represents a complete token a whether it
		// is longer than the current token. If yes, replace the current token.
		node = it->second.get();
		if (node->type != Tokens::Empty) {
			const std::string &str = tokens[node->type];
			size_t len = str.size();
			if (len > match.token.content.size()) {
				match.token =
				    Token{node->type, str, {sourceId, start, end}};
				match.textLength = textLength;
				match.textEnd = textEnd;
			}
		}
		// If this state can possibly be advanced, store it in the states list.
		if (!node->children.empty()) {
			lookups.emplace_back(*this);
		}
	}
};
/**
 * Transforms the given token into a data token containing the extracted
 * text.
 *
 * @param handler is the WhitespaceHandler containing the collected data.
 * @param token is the output token to which the text should be written.
 * @param sourceId is the source id of the underlying file.
 */
static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match,
                           SourceId sourceId)
{
	if (match.hasMatch()) {
		match.token.content =
		    std::string{handler.textBuf.data(), match.textLength};
		match.token.location =
		    SourceLocation{sourceId, handler.textStart, match.textEnd};
	} else {
		match.token.content = handler.toString();
		match.token.location =
		    SourceLocation{sourceId, handler.textStart, handler.textEnd};
	}
	match.token.id = Tokens::Data;
}
}
/* Class Tokenizer */
Tokenizer::Tokenizer(WhitespaceMode whitespaceMode)
    : whitespaceMode(whitespaceMode), nextTokenId(0)
{
}
template 
bool Tokenizer::next(CharReader &reader, Token &token)
{
	// If we're in the read mode, reset the char reader peek position to the
	// current read position
	if (read) {
		reader.resetPeek();
	}
	// Prepare the lookups in the token trie
	const TokenTrie::Node *root = trie.getRoot();
	TokenMatch match;
	std::vector lookups;
	std::vector nextLookups;
	// Instantiate the text handler
	TextHandler textHandler;
	// Peek characters from the reader and try to advance the current token tree
	// cursor
	char c;
	size_t charStart = reader.getPeekOffset();
	const SourceId sourceId = reader.getSourceId();
	while (reader.peek(c)) {
		const size_t charEnd = reader.getPeekOffset();
		const size_t textLength = textHandler.textBuf.size();
		const size_t textEnd = textHandler.textEnd;
		// If we do not have a match yet, start a new lookup from the root
		if (!match.hasMatch()) {
			TokenLookup{root, charStart, textLength, textEnd}.advance(
			    c, nextLookups, match, tokens, charEnd, sourceId);
		}
		// Try to advance all other lookups with the new character
		for (TokenLookup &lookup : lookups) {
			lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
		}
		// We have found a token and there are no more states to advance or the
		// text handler has found something -- abort to return the new token
		if (match.hasMatch()) {
			if ((nextLookups.empty() || textHandler.hasText())) {
				break;
			}
		} else {
			// Record all incomming characters
			textHandler.append(c, charStart, charEnd);
		}
		// Swap the lookups and the nextLookups list
		lookups = std::move(nextLookups);
		nextLookups.clear();
		// Advance the offset
		charStart = charEnd;
	}
	// If we found text, emit that text
	if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) {
		buildDataToken(textHandler, match, sourceId);
	}
	// Move the read/peek cursor to the end of the token, abort if an error
	// happens while doing so
	if (match.hasMatch()) {
		// Make sure we have a valid location
		if (match.token.location.getEnd() == InvalidSourceOffset) {
			throw OusiaException{"Token end position offset out of range"};
		}
		// Seek to the end of the current token
		const size_t end = match.token.location.getEnd();
		if (read) {
			reader.seek(end);
		} else {
			reader.seekPeekCursor(end);
		}
		token = match.token;
	} else {
		token = Token{};
	}
	return match.hasMatch();
}
bool Tokenizer::read(CharReader &reader, Token &token)
{
	switch (whitespaceMode) {
		case WhitespaceMode::PRESERVE:
			return next(reader, token);
		case WhitespaceMode::TRIM:
			return next(reader, token);
		case WhitespaceMode::COLLAPSE:
			return next(reader, token);
	}
	return false;
}
bool Tokenizer::peek(CharReader &reader, Token &token)
{
	switch (whitespaceMode) {
		case WhitespaceMode::PRESERVE:
			return next(reader, token);
		case WhitespaceMode::TRIM:
			return next(reader, token);
		case WhitespaceMode::COLLAPSE:
			return next(reader, token);
	}
	return false;
}
TokenId Tokenizer::registerToken(const std::string &token)
{
	// Abort if an empty token should be registered
	if (token.empty()) {
		return Tokens::Empty;
	}
	// Search for a new slot in the tokens list
	TokenId type = Tokens::Empty;
	for (size_t i = nextTokenId; i < tokens.size(); i++) {
		if (tokens[i].empty()) {
			tokens[i] = token;
			type = i;
			break;
		}
	}
	// No existing slot was found, add a new one -- make sure we do not
	// override the special token type handles
	if (type == Tokens::Empty) {
		type = tokens.size();
		if (type == Tokens::Data || type == Tokens::Empty) {
			throw OusiaException{"Token type ids depleted!"};
		}
		tokens.emplace_back(token);
	}
	nextTokenId = type + 1;
	// Try to register the token in the trie -- if this fails, remove it
	// from the tokens list
	if (!trie.registerToken(token, type)) {
		tokens[type] = std::string{};
		nextTokenId = type;
		return Tokens::Empty;
	}
	return type;
}
bool Tokenizer::unregisterToken(TokenId type)
{
	// Unregister the token from the trie, abort if an invalid type is given
	if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
		tokens[type] = std::string{};
		nextTokenId = type;
		return true;
	}
	return false;
}
std::string Tokenizer::getTokenString(TokenId type)
{
	if (type < tokens.size()) {
		return tokens[type];
	}
	return std::string{};
}
void Tokenizer::setWhitespaceMode(WhitespaceMode mode)
{
	whitespaceMode = mode;
}
WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; }
/* Explicitly instantiate all possible instantiations of the "next" member
   function */
template bool Tokenizer::next(
    CharReader &reader, Token &token);
template bool Tokenizer::next(
    CharReader &reader, Token &token);
template bool Tokenizer::next(
    CharReader &reader, Token &token);
template bool Tokenizer::next(
    CharReader &reader, Token &token);
template bool Tokenizer::next(
    CharReader &reader, Token &token);
template bool Tokenizer::next(
    CharReader &reader, Token &token);
}