/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file Token.hpp
*
* Definition of the TokenId type, constants for some special tokens and the
* Token structure.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_TOKEN_HPP_
#define _OUSIA_TOKEN_HPP_
#include <cstdint>
#include <limits>
#include <string>
#include <unordered_set>

// NOTE(review): the path of the project header declaring SourceLocation was
// reconstructed -- confirm against the source tree.
#include <core/common/Location.hpp>
namespace ousia {
/**
 * Unsigned 32-bit integer type used to identify tokens.
 */
using TokenId = uint32_t;
/**
 * Unsigned 16-bit integer type used for storing token lengths.
 */
using TokenLength = uint16_t;
/**
 * Type used for storing token sets: an unordered set of TokenId values.
 */
using TokenSet = std::unordered_set<TokenId>;
/**
 * Namespace containing constants for TokenId instances with special meaning.
 * All constants are counted downwards from the largest value representable by
 * TokenId.
 */
namespace Tokens {
/**
 * Token which is not a token.
 */
constexpr TokenId Empty = std::numeric_limits<TokenId>::max();
/**
 * Token which represents data (represented as TokenizedData).
 */
constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1;
/**
 * Token which represents a newline token.
 */
constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;
/**
 * Token which represents a paragraph token -- issued if two consecutive
 * newlines occur with optionally any amount of whitespace between them. The
 * paragraph token is not repeated until more text is reached.
 */
constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;
/**
 * Token which represents a section token -- issued if three or more
 * consecutive newlines occur with optionally any amount of whitespace between
 * them. The section token is not repeated until more text is reached.
 */
constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;
/**
 * Token which represents an indentation token -- issued if the indentation of
 * this line is larger than the indentation of the previous line.
 */
constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;
/**
 * Token which represents a dedentation -- issued if the indentation of
 * this line is smaller than the indentation of the previous line.
 */
constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;
/**
 * Maximum token id to be used. Tokens allocated for users should not surpass
 * this value; Token::isSpecial() treats every id larger than this value as a
 * special token.
 */
constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255;
}
/**
* The Token structure describes a token discovered by the Tokenizer or read
* from the TokenizedData struct.
*/
struct Token {
/**
* Id of the id of this token.
*/
TokenId id;
/**
* String that was matched.
*/
std::string content;
/**
* Location from which the string was extracted.
*/
SourceLocation location;
/**
* Default constructor.
*/
Token() : id(Tokens::Empty) {}
/**
* Constructor of a "data" token with no explicit content.
*
* @param location is the location of the extracted string content in the
* source file.
*/
Token(const SourceLocation &location) : id(Tokens::Data), location(location)
{
}
/**
* Constructor of the Token struct.
*
* @param id represents the token id.
* @param content is the string content that has been extracted.
* @param location is the location of the extracted string content in the
* source file.
*/
Token(TokenId id, const std::string &content,
const SourceLocation &location)
: id(id), content(content), location(location)
{
}
/**
* Constructor of the a "data" Token with the given string data and
* location.
*
* @param content is the string content that should be stored in the token.
* @param location is the location of the content within the source file.
*/
Token(const std::string &content,
const SourceLocation &location = SourceLocation{})
: id(Tokens::Data), content(content), location(location)
{
}
/**
* Constructor of the Token struct, only initializes the token id
*
* @param id is the id corresponding to the id of the token.
*/
Token(TokenId id) : id(id) {}
/**
* Returns true if this token is special.
*
* @return true if the TokenId indicates that this token is a "special"
* token.
*/
bool isSpecial() const { return isSpecial(id); }
/**
* Returns true if the given token id is special.
*
* @param id is the token id that should be checked for being special.
* @return true if the TokenId indicates that this token is a "special"
* token.
*/
static bool isSpecial(TokenId id) {return id > Tokens::MaxTokenId; }
/**
* Returns the name of the token -- which is either its content or the name
* of the special token (if it is one).
*
* @return the human readable name of this token instance.
*/
std::string name() const;
/**
* Returns the name of the special token or an empty string if it is not a
* special token.
*
* @param id is the TokenId for which the special name should be returned.
*/
static const char* specialName(TokenId id);
/**
* The getLocation function allows the tokens to be directly passed as
* parameter to Logger or LoggableException instances.
*
* @return a reference at the location field
*/
const SourceLocation &getLocation() const { return location; }
};
}
#endif /* _OUSIA_TOKEN_HPP_ */