1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file TokenTrie.hpp
*
* Class representing a token trie that can be updated dynamically.
*
* @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_TOKEN_TRIE_HPP_
#define _OUSIA_TOKEN_TRIE_HPP_
#include <cstdint>
#include <memory>
#include <limits>
#include <unordered_map>
#include "Token.hpp"
namespace ousia {
/**
* The Tokenizer internally uses a TokenTrie to be efficiently able to identify
* the longest consecutive token in the text. This is equivalent to a prefix
* trie.
*
* A token trie is a construct that structures all special tokens a Tokenizer
* recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
* three. Then the token tree would look like this:
*
* \code{*.txt}
* ~ (0)
* / \
* a (2) b (0)
* | |
* a (0) a (0)
* | |
* b (1) c (0)
* \endcode
*
* Where the number indicates the corresponding token descriptor identifier.
*/
class TokenTrie {
public:
/**
* Structure used to build the node tree.
*/
struct Node {
/**
* Type used for the child map.
*/
using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
/**
* Map from single characters at the corresponding child nodes.
*/
ChildMap children;
/**
* Reference at the corresponding token descriptor. Set to nullptr if
* no token is attached to this node.
*/
TokenId type;
/**
* Default constructor, initializes the descriptor with nullptr.
*/
Node();
};
private:
/**
* Root node of the internal token tree.
*/
Node root;
public:
/**
* Registers a token containing the given string. Returns false if the
* token already exists, true otherwise.
*
* @param token is the character sequence that should be registered as
* token.
* @param type is the descriptor that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
bool registerToken(const std::string &token, TokenId type) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
* unregistered successfully, false otherwise.
*
* @param token is the character sequence that should be unregistered.
* @return true if the operation was successful, false otherwise.
*/
bool unregisterToken(const std::string &token) noexcept;
/**
* Returns true, if the given token exists within the TokenTree. This
* function is mostly thought for debugging and unit testing.
*
* @param token is the character sequence that should be searched.
* @return the attached token descriptor or nullptr if the given token is
* not found.
*/
TokenId hasToken(const std::string &token) const noexcept;
/**
* Returns a reference at the root node to be used for traversing the token
* tree.
*
* @return a reference at the root node.
*/
const Node *getRoot() const noexcept { return &root; }
};
}
#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
|