/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file TokenizedData.hpp
*
* The TokenizedData class defined in this file stores string data extracted
* from a document including user defined tokens. Tokens can be dynamically
* enabled and disabled. And the data up to the next enabled token can be
* returned. Additionally, the data provided by the TokenizedData class is
* processed according to a whitespace mode that can be dynamically updated.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_TOKENIZED_DATA_HPP_
#define _OUSIA_TOKENIZED_DATA_HPP_
// NOTE(review): the original include targets were lost in extraction; these
// are reconstructed from the identifiers used below (std::shared_ptr,
// std::string, SourceId/SourceOffset/SourceLocation, Token/TokenSet/TokenId,
// WhitespaceMode) — verify the project header paths against the repository.
#include <cstdint>
#include <memory>
#include <string>

#include <core/common/Location.hpp>
#include <core/common/Token.hpp>
#include <core/common/Whitespace.hpp>
namespace ousia {
// Forward declaration
class TokenizedDataImpl;
class TokenizedDataReader;
class TokenizedDataReaderFork;
/**
* Internally used structure representing a cursor within the TokenizedData
* stream.
*/
struct TokenizedDataCursor {
	/**
	 * Offset into the byte buffer.
	 */
	size_t bufPos{0};

	/**
	 * Offset into the token mark buffer.
	 */
	size_t markPos{0};

	/**
	 * Default constructor. Both offsets start at zero, so a freshly created
	 * cursor points at the very beginning of the stream.
	 */
	TokenizedDataCursor() {}
};
/**
 * The TokenizedData class stores data extracted from a user defined document.
 * The data stored in a TokenizedData instance can be accessed via the
 * TokenizedDataReader class, instances of which are created by the reader()
 * method.
 */
class TokenizedData {
private:
/**
* Shared pointer pointing at the internal data. This data is shared with
* all the TokenizedDataReader instances.
*/
std::shared_ptr impl;
public:
/**
* Default constructor, creates a new instance of TokenizedData, sets the
* internal SourceId to the InvalidSourceId constant.
*/
TokenizedData();
/**
* Creates a new instance of TokenizedData, takes a SourceId.
*
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
TokenizedData(SourceId sourceId);
/**
* Creates a new instance of TokenizedData, takes a SourceId and an initial
* string buffer.
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
TokenizedData(const std::string &data, SourceOffset offsStart = 0,
SourceId sourceId = InvalidSourceId);
/**
* Destructor. Needs to be defined explicitly for freeing a shared pointer
* of the incomplete TokenizedDataImpl type.
*/
~TokenizedData();
/**
* Appends a complete string to the internal character buffer. Note that the
* start and end positions for each character in the given data string will
* be interpolated and may thus be incorrect (e.g. when multi-character
* linebreaks or multi-character characters (not handled now) are read).
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param protect if set to true, the appended characters will not be
* affected by whitespace handling, they will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
size_t append(const std::string &data, SourceOffset offsStart = 0,
bool protect = false);
/**
* Appends a single character to the internal character buffer.
*
* @param c is the character that should be appended to the buffer.
* @param start is the start offset in bytes in the input file.
* @param end is the end offset in bytes in the input file.
* @param protect if set to true, the appended character will not be
* affected by whitespace handling, it will be returned as is.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
bool protect = false);
/**
* Marks the whitespace character at the given buffer position as protected.
*
* @param bufPos is the position of the character for which the "protected"
* flag should be set.
*/
void protect(size_t bufPos);
/**
* Stores a token ending at the last character of the current buffer.
*
* @param id is the id of the token for which the mark should be stored.
* @param len is the length of the token.
*/
void mark(TokenId id, TokenLength len);
/**
* Stores a token at the given position.
*
* @param id is the if of the token for which the mark should be stored.
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
*/
void mark(TokenId id, size_t bufStart, TokenLength len);
/**
* Resets the TokenizedData instance to the state it had when it was
* constructred.
*/
void clear();
/**
* Trims the length of the TokenizedData instance to the given length. Note
* that this function does not remove any token matches for performance
* reasons, it merely renders them incaccessible. Appending new data after
* calling trim will make the token marks accessible again. Thus this method
* should be the last function called to modify the data buffer and the
* token marks.
*
* @param length is the number of characters to which the TokenizedData
* instance should be trimmed.
*/
void trim(size_t length);
/**
* Returns the number of characters currently represented by this
* TokenizedData instance.
*/
size_t size() const;
/**
* Returns true if the TokenizedData instance is empty, false otherwise.
*
* @return true if not data is stored inside the TokenizedData instance.
*/
bool empty() const;
/**
* Returns the location of the entire TokenizedData instance.
*
* @return the location of the entire data represented by this instance.
*/
SourceLocation getLocation() const;
/**
* Returns a TokenizedDataReader instance that can be used to access the
* data.
*
* @return a new TokenizedDataReader instance pointing at the beginning of
* the internal buffer.
*/
TokenizedDataReader reader() const;
/**
* Returns true if at least one non-whitespace character is stored in the
* TokenizedData structure.
*
* @return true if the at least one character in the TokenizedData structure
* is a non-whitespace character.
*/
bool hasNonWhitespaceChar() const;
/**
* Returns true if the last character of the TokenizedData structure is a
* whitespace character.
*
* @return true if the last character is a whitespace character.
*/
bool lastCharIsWhitespace() const;
/**
* Returns true if the first character of the TokenizedData structure is a
* whitespace character.
*
* @return true if the first character is a whitespace character.
*/
bool firstCharIsWhitespace() const;
};
/**
 * The TokenizedDataReader class provides sequential read and peek access to
 * the text and token marks stored in a TokenizedData instance without
 * modifying the underlying data.
 */
class TokenizedDataReader {
private:
friend TokenizedData;
/**
* Shared pointer pointing at the internal data. This data is shared with
* all the TokenizedDataReader instances.
*/
std::shared_ptr impl;
/**
* Position from which the last element was read from the internal buffer.
*/
TokenizedDataCursor readCursor;
/**
* Position from which the last element was peeked from the internal buffer.
*/
TokenizedDataCursor peekCursor;
protected:
/**
* Protected constructor of TokenizedDataReader, taking a reference to the
* internal TokenizedDataImpl structure storing the data that is accessed by
* the reader.
*
* @param impl is the TokenizedDataImpl instance that holds the actual data.
* @param readCursor is the cursor position from which tokens and text are
* read.
* @param peekCursor is the cursor position from which tokens and text are
* peeked.
*/
TokenizedDataReader(std::shared_ptr impl,
const TokenizedDataCursor &readCursor,
const TokenizedDataCursor &peekCursor);
public:
/**
* Returns a new TokenizedDataReaderFork from which tokens and text can be
* read without advancing this reader instance.
*/
TokenizedDataReaderFork fork();
/**
* Returns true if this TokenizedData instance is at the end.
*
* @return true if the end of the TokenizedData instance has been reached.
*/
bool atEnd() const;
/**
* Stores the next token in the given token reference, returns true if the
* operation was successful, false if there are no more tokens. Advances the
* internal cursor and re
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
* @param tokens is the set of token identifers, representing the currently
* enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @param endAtWhitespace if true, only delivers data until the first
* whitespace character after a sequence of non-whitespace characters. Does
* not affect the delivery of non-data tokens.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
bool read(Token &token, const TokenSet &tokens = TokenSet{},
WhitespaceMode mode = WhitespaceMode::TRIM,
bool endAtWhitespace = false);
/**
* Stores the next token in the given token reference, returns true if the
* operation was successful, false if there are no more tokens.
*
* @param token is an output parameter into which the read token will be
* stored. The TokenId is set to Tokens::Empty if there are no more tokens.
* @param tokens is the set of token identifers, representing the currently
* enabled tokens.
* @param mode is the whitespace mode that should be used when a text token
* is returned.
* @param endAtWhitespace if true, only delivers data until the first
* whitespace character after a sequence of non-whitespace characters. Does
* not affect the delivery of non-data tokens.
* @return true if the operation was successful and there is a next token,
* false if there are no more tokens.
*/
bool peek(Token &token, const TokenSet &tokens = TokenSet{},
WhitespaceMode mode = WhitespaceMode::TRIM,
bool endAtWhitespace = false);
/**
* Consumes the peeked tokens, the read cursor will now be at the position
* of the peek cursor.
*/
void consumePeek() { readCursor = peekCursor; }
/**
* Resets the peek cursor to the position of the read cursor.
*/
void resetPeek() { peekCursor = readCursor; }
};
/**
* The TokenizedDataReaderFork class is created when forking a
* TokenizedDataReader
*/
class TokenizedDataReaderFork : public TokenizedDataReader {
private:
	friend TokenizedDataReader;

	/**
	 * Reference pointing at the parent TokenizedDataReader to which changes may
	 * be committed.
	 */
	TokenizedDataReader &parent;

	/**
	 * Private constructor of TokenizedDataReaderFork, taking a reference to the
	 * internal TokenizedDataImpl structure storing the data that is accessed by
	 * the reader and a reference at the parent TokenizedDataReader.
	 *
	 * @param parent is the TokenizedDataReader instance to which the current
	 * read/peek progress may be committed.
	 * @param impl is the TokenizedDataImpl instance that holds the actual data.
	 * @param readCursor is the cursor position from which tokens and text are
	 * read.
	 * @param peekCursor is the cursor position from which tokens and text are
	 * peeked.
	 */
	TokenizedDataReaderFork(TokenizedDataReader &parent,
	                        std::shared_ptr<TokenizedDataImpl> impl,
	                        const TokenizedDataCursor &readCursor,
	                        const TokenizedDataCursor &peekCursor)
	    : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent)
	{
	}

public:
	/**
	 * Commits the read/peek progress to the underlying parent. Note that the
	 * assignment intentionally slices this fork down to its TokenizedDataReader
	 * base, copying only the shared data pointer and the two cursors.
	 */
	void commit() { parent = *this; }
};
}
#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */