summaryrefslogtreecommitdiff
path: root/src/core/model/Syntax.hpp
blob: 5f360bc4a0aa5fd354495102a23d712731a75969 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/*
    Ousía
    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file Syntax.hpp
 *
 * This header contains the Descriptor classes for user definable syntax for
 * Document entities or fields. These classes are referenced in Ontology.hpp.
 */

#ifndef _OUSIA_MODEL_SYNTAX_HPP_
#define _OUSIA_MODEL_SYNTAX_HPP_

#include <core/common/Token.hpp>
#include "Node.hpp"

namespace ousia {

/**
 * Describes one token that is used as user-defined syntax. A descriptor either
 * carries a literal token string or refers to a special token through its id.
 */
struct TokenDescriptor {
	/**
	 * Literal text of the token. Only meaningful for non-special tokens.
	 */
	std::string token;

	/**
	 * Id that uniquely identifies this token. Holds Tokens::Empty unless an
	 * id was passed to the constructor.
	 */
	TokenId id = Tokens::Empty;

	/**
	 * True if this TokenDescriptor refers to a special token.
	 */
	bool special = false;

	/**
	 * Greediness of the token, true by default. At the moment this flag is
	 * only evaluated for "shortForm" tokens. When it is false, the
	 * corresponding handler receives only a single data command if it was
	 * opened for the implicit default field.
	 */
	bool greedy = true;

	/**
	 * Creates a descriptor for a non-special token. The id stays at
	 * Tokens::Empty, the special flag stays false and the descriptor is
	 * greedy.
	 *
	 * @param token is the literal text of the token. Defaults to the empty
	 *              string.
	 */
	TokenDescriptor(std::string token = std::string())
	    : token(std::move(token))
	{
	}

	/**
	 * Creates a descriptor for a special token. The token string is left
	 * empty and the special flag is set to true.
	 *
	 * @param id is the id of the special token.
	 * @param greedy is the value stored in the greedy flag. Defaults to true.
	 */
	TokenDescriptor(TokenId id, bool greedy = true)
	    : id(id), special(true), greedy(greedy)
	{
	}

	/**
	 * Checks whether this descriptor carries neither a token string nor an id.
	 *
	 * @return true if and only if the token string is empty and the id equals
	 * Tokens::Empty.
	 */
	bool isEmpty() const
	{
		// A non-empty string alone is enough to make the descriptor non-empty.
		if (!token.empty()) {
			return false;
		}
		return id == Tokens::Empty;
	}

	/**
	 * Checks whether the token is valid. This is the case if the descriptor is
	 * marked as special token, is empty, or has a valid token string set.
	 *
	 * @return true if the token descriptor is valid, false otherwise.
	 */
	bool isValid() const;
};

/**
 * Describes the user-defined syntax of a StructuredClass, an AnnotationClass
 * or a FieldDescriptor.
 *
 * Instances are used while a Document is being parsed. Each one lists the
 * tokens that are relevant for a single Descriptor which could be created at
 * the current point of the parsing process.
 */
struct SyntaxDescriptor {
	/**
	 * Open token, or Tokens::Empty if there is none.
	 */
	TokenId open = Tokens::Empty;

	/**
	 * Close token, or Tokens::Empty if there is none.
	 */
	TokenId close = Tokens::Empty;

	/**
	 * Short form token, or Tokens::Empty if there is none.
	 */
	TokenId shortForm = Tokens::Empty;

	/**
	 * Descriptor this SyntaxDescriptor belongs to. It may be a FieldDescriptor
	 * or a class Descriptor (StructuredClass or AnnotationClass), which is why
	 * only the common type Node can be used as inner argument here.
	 */
	Rooted<Node> descriptor;

	/**
	 * Depth of this SyntaxDescriptor. Relative to the current leaf of the
	 * parsed document, the depth is the number of transparent elements that
	 * would be needed to construct an instance of the referenced descriptor.
	 * The default constructor leaves the depth at -1.
	 *
	 * TODO: What do negative values mean?
	 */
	ssize_t depth = -1;

	/**
	 * True (the default) if the shortForm is greedy. False if the
	 * corresponding handler should receive at most one piece of data when it
	 * was started implicitly.
	 */
	bool greedyShortForm = true;

	/**
	 * Default constructor. All token ids are Tokens::Empty, the descriptor
	 * handle is nullptr, the depth is -1 and the short form is greedy.
	 */
	SyntaxDescriptor() : descriptor(nullptr) {}

	/**
	 * Constructor that initializes every member from the given arguments.
	 *
	 * @param open is a possible open token.
	 * @param close is a possible close token.
	 * @param shortForm is a possible short form token.
	 * @param descriptor is the Descriptor this SyntaxDescriptor belongs to.
	 * @param depth is the number of transparent elements that would be needed
	 * to construct an instance of the referenced descriptor, seen from the
	 * current leaf in the parsed document.
	 * @param greedyShortForm should be false if the shortForm token is to be
	 * treated in a non-greedy way, which means that it is given at most one
	 * piece of data if it was started implicitly.
	 */
	SyntaxDescriptor(TokenId open, TokenId close, TokenId shortForm,
	                 Handle<Node> descriptor, ssize_t depth,
	                 bool greedyShortForm)
	    : open(open), close(close), shortForm(shortForm),
	      descriptor(descriptor), depth(depth),
	      greedyShortForm(greedyShortForm)
	{
	}

	/**
	 * Equality operator. Two SyntaxDescriptor instances compare equal only if
	 * they are exactly equal.
	 *
	 * @param o1 is the left-hand side of the comparison.
	 * @param o2 is the right-hand side of the comparison.
	 * @return true if both syntax descriptors are equal, false otherwise.
	 */
	friend bool operator==(const SyntaxDescriptor &o1,
	                       const SyntaxDescriptor &o2);

	/**
	 * Ordering operator. SyntaxDescriptor instances are ordered by their
	 * depth, their open, close and shortForm TokenId and the descriptor
	 * pointer. In addition, SyntaxDescriptors that belong to FieldDescriptors
	 * are preferred.
	 *
	 * @param o1 is the left-hand side of the comparison.
	 * @param o2 is the right-hand side of the comparison.
	 * @return true if o1 should be ordered before o2.
	 */
	friend bool operator<(const SyntaxDescriptor &o1,
	                      const SyntaxDescriptor &o2);

	/**
	 * Adds every token referenced by this SyntaxDescriptor to the given
	 * TokenSet. Token ids that are set to Tokens::Empty are skipped.
	 *
	 * @param set is the TokenSet instance that receives the tokens.
	 */
	void insertIntoTokenSet(TokenSet &set) const;

	/**
	 * Checks whether this SyntaxDescriptor belongs to an AnnotationClass.
	 *
	 * @return true if and only if this SyntaxDescriptor belongs to an
	 * AnnotationClass.
	 */
	bool isAnnotation() const;

	/**
	 * Checks whether this SyntaxDescriptor belongs to a StructuredClass.
	 *
	 * @return true if and only if this SyntaxDescriptor belongs to a
	 * StructuredClass.
	 */
	bool isStruct() const;

	/**
	 * Checks whether this SyntaxDescriptor belongs to a FieldDescriptor.
	 *
	 * @return true if and only if this SyntaxDescriptor belongs to a
	 * FieldDescriptor.
	 */
	bool isFieldDescriptor() const;

	/**
	 * Checks whether all token entries (open, close and shortForm) of this
	 * SyntaxDescriptor are empty.
	 *
	 * @return true if and only if open, close and shortForm hold only empty
	 * entries.
	 */
	bool isEmpty() const;
};
}
#endif