/*
    Ousía
    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file Token.hpp
 *
 * Definition of the TokenId type and constants for some special tokens.
 *
 * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
 */

#ifndef _OUSIA_TOKEN_HPP_
#define _OUSIA_TOKEN_HPP_

#include <cstdint>
#include <limits>
#include <string>
#include <unordered_set>

#include <core/common/Location.hpp>  // SourceLocation

namespace ousia {

/**
 * The TokenId type is used to give each token a unique id.
 */
using TokenId = uint32_t;

/**
 * Type used for storing token lengths.
 */
using TokenLength = uint16_t;

/**
 * Type used for storing sets of token ids.
 */
using TokenSet = std::unordered_set<TokenId>;

/**
 * Namespace containing constants for TokenId instances with special meaning.
 */
namespace Tokens {
/**
 * Token which is not a token.
 */
constexpr TokenId Empty = std::numeric_limits<TokenId>::max();

/**
 * Token which represents data (represented as TokenizedData).
 */
constexpr TokenId Data = std::numeric_limits<TokenId>::max() - 1;

/**
 * Token which represents a newline token.
 */
constexpr TokenId Newline = std::numeric_limits<TokenId>::max() - 2;

/**
 * Token which represents a paragraph token -- issued if two consecutive
 * newlines occur with optionally any amount of whitespace between them. The
 * paragraph token is not repeated until more text is reached.
 */
constexpr TokenId Paragraph = std::numeric_limits<TokenId>::max() - 3;

/**
 * Token which represents a section token -- issued if three or more
 * consecutive newlines occur with optionally any amount of whitespace between
 * them. The section token is not repeated until more text is reached.
 */
constexpr TokenId Section = std::numeric_limits<TokenId>::max() - 4;

/**
 * Token which represents an indentation token -- issued if the indentation of
 * this line is larger than the indentation of the previous line.
 */
constexpr TokenId Indent = std::numeric_limits<TokenId>::max() - 5;

/**
 * Token which represents a dedentation token -- issued if the indentation of
 * this line is smaller than the indentation of the previous line.
 */
constexpr TokenId Dedent = std::numeric_limits<TokenId>::max() - 6;

/**
 * Maximum token id to be used. Tokens allocated for users should not surpass
 * this value.
 */
constexpr TokenId MaxTokenId = std::numeric_limits<TokenId>::max() - 255;
}
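/*
 * Illustrative sketch (not part of the interface defined in this file): user
 * token ids are expected to stay at or below Tokens::MaxTokenId, while the
 * values above it are reserved for the special constants defined above. A
 * caller could, for example, collect the ids it wants a tokenizer to look for
 * in a TokenSet as follows; the "userDefinedId" value and the "enabled"
 * variable are purely hypothetical:
 *
 *     TokenId userDefinedId = 42;  // some id handed out by a registry
 *     TokenSet enabled{Tokens::Newline, Tokens::Paragraph, userDefinedId};
 *     bool special = Tokens::Paragraph > Tokens::MaxTokenId;  // true, cf. Token::isSpecial()
 */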
/**
 * The Token structure describes a token discovered by the Tokenizer or read
 * from the TokenizedData struct.
 */
struct Token {
	/**
	 * Id of this token.
	 */
	TokenId id;

	/**
	 * String that was matched.
	 */
	std::string content;

	/**
	 * Location from which the string was extracted.
	 */
	SourceLocation location;

	/**
	 * Default constructor.
	 */
	Token() : id(Tokens::Empty) {}

	/**
	 * Constructor of a "data" token with no explicit content.
	 *
	 * @param location is the location of the extracted string content in the
	 * source file.
	 */
	Token(SourceLocation location) : id(Tokens::Data), location(location) {}

	/**
	 * Constructor of the Token struct.
	 *
	 * @param id represents the token id.
	 * @param content is the string content that has been extracted.
	 * @param location is the location of the extracted string content in the
	 * source file.
	 */
	Token(TokenId id, const std::string &content, SourceLocation location)
	    : id(id), content(content), location(location)
	{
	}

	/**
	 * Constructor of the Token struct, only initializes the token id.
	 *
	 * @param id is the id of the token.
	 */
	Token(TokenId id) : id(id) {}

	/**
	 * Returns true if this token is special.
	 *
	 * @return true if the TokenId indicates that this token is a "special"
	 * token.
	 */
	bool isSpecial() const { return id > Tokens::MaxTokenId; }

	/**
	 * The getLocation function allows the tokens to be directly passed as
	 * parameter to Logger or LoggableException instances.
	 *
	 * @return a reference to the location field.
	 */
	const SourceLocation &getLocation() const { return location; }
};

/**
 * Class describing the user defined syntax for a single field or annotation.
 */
struct TokenSyntaxDescriptor {
	/**
	 * Possible start token or Tokens::Empty if no token is set.
	 */
	TokenId start;

	/**
	 * Possible end token or Tokens::Empty if no token is set.
	 */
	TokenId end;

	/**
	 * Possible representation token or Tokens::Empty if no token is set.
	 */
	TokenId shortForm;

	/**
	 * Flag specifying whether this TokenSyntaxDescriptor describes an
	 * annotation.
	 */
	bool isAnnotation;

	/**
	 * Default constructor, sets all token ids to Tokens::Empty and
	 * isAnnotation to false.
	 */
	TokenSyntaxDescriptor()
	    : start(Tokens::Empty),
	      end(Tokens::Empty),
	      shortForm(Tokens::Empty),
	      isAnnotation(false)
	{
	}

	/**
	 * Member initializer constructor.
	 *
	 * @param start is a possible start token.
	 * @param end is a possible end token.
	 * @param shortForm is a possible short form token.
	 * @param isAnnotation is set to true if this syntax descriptor describes
	 * an annotation.
	 */
	TokenSyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm,
	                      bool isAnnotation)
	    : start(start),
	      end(end),
	      shortForm(shortForm),
	      isAnnotation(isAnnotation)
	{
	}

	/**
	 * Inserts all tokens referenced in this TokenSyntaxDescriptor into the
	 * given TokenSet. Skips token ids set to Tokens::Empty.
	 *
	 * @param set is the TokenSet instance into which the Tokens should be
	 * inserted.
	 */
	void insertIntoTokenSet(TokenSet &set) const;
};
}

#endif /* _OUSIA_TOKEN_HPP_ */
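/*
 * Usage sketch (illustrative only; the collectTokens helper and the
 * descriptor list are hypothetical and not declared in this header): code
 * holding the TokenSyntaxDescriptor instances of the currently active fields
 * could merge all referenced token ids into a single TokenSet before passing
 * it on to a tokenizer:
 *
 *     #include <vector>
 *
 *     ousia::TokenSet collectTokens(
 *         const std::vector<ousia::TokenSyntaxDescriptor> &descrs)
 *     {
 *         ousia::TokenSet res;
 *         for (const auto &descr : descrs) {
 *             descr.insertIntoTokenSet(res);  // skips ids set to Tokens::Empty
 *         }
 *         return res;
 *     }
 */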