diff options
Diffstat (limited to 'src/core/utils')
-rw-r--r-- | src/core/utils/CodeTokenizer.hpp | 53 |
1 files changed, 49 insertions, 4 deletions
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp index f26a74c..18cf02a 100644 --- a/src/core/utils/CodeTokenizer.hpp +++ b/src/core/utils/CodeTokenizer.hpp @@ -28,6 +28,17 @@ namespace ousia { namespace utils { +/* + * This enum contains all special tokens the CodeTokenizer supports, namely: + * + * 1.) An ambiguous token - in most programming languages single-quotes ' or + * double-quotes " - to delimit string tokens. + * 2.) A start token for line comments, which would e.g. be // in Java. + * 3.) A start token for a block comment + * 4.) An end token for a block comment. + * 5.) The linebreak token (this does not have to be specified by the user) + * 6.) The escape token, which would e.g. be \ in Java. + */ enum class CodeTokenMode { STRING_START_END, LINE_COMMENT, @@ -38,6 +49,10 @@ enum class CodeTokenMode { NONE }; +/** + * A CodeTokenDescriptor draws the connection between an id returned by the + * underlying Tokenizer and the mode this token represents. + */ struct CodeTokenDescriptor { CodeTokenMode mode; int id; @@ -45,7 +60,10 @@ struct CodeTokenDescriptor { CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} }; - +/** + * The CodeTokenizer is a finite state machine with the states NORMAL, being + * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. + */ enum class CodeTokenizerState { NORMAL, IN_BLOCK_COMMENT, @@ -53,6 +71,14 @@ enum class CodeTokenizerState { IN_STRING }; +/** + * The purpose of a CodeTokenizer is to make it easier to parse classical + * programming code. It adds the following features to a regular Tokenizer: + * 1.) String tokens (e.g. "string" in Java code) instead of 3 separate tokens + * for the opening delimiter, the text and the closing delimiter. + * 2.) Escaping in String tokens. + * 3.) 
Comment Tokens (for line comments as well as block comments) + */ class CodeTokenizer : public Tokenizer { private: std::map<int, CodeTokenDescriptor> descriptors; @@ -62,17 +88,36 @@ private: int returnTokenId; bool escaped = false; - Token constructToken(const Token& t); - void buffer(const Token& t); + Token constructToken(const Token &t); + void buffer(const Token &t); protected: bool doPrepare(const Token &t, std::deque<Token> &peeked) override; public: + /** + * If you do not want comment tokens to be returned you can set this to + * true. + */ bool ignoreComments = false; + /** + * + * @param input a BufferedCharReader containing the input for this + * tokenizer, + * as with a regular tokenizer. + * @param root a TokenTreeNode representing the root of the TokenTree. + * Please note that you have to specify all tokenIDs here that you use + * in the descriptors map. + * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. + * In this way you can specify the meaning of certain Tokens. Say you + * specified the Token "//" with the id 1 in the TokenTree. Then you could + * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map + * and this CodeTokenizer would recognize the token "//" as starting a + * line comment. + */ CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, - std::map<int, CodeTokenDescriptor> descriptors) + std::map<int, CodeTokenDescriptor> descriptors) : Tokenizer(input, root), descriptors(descriptors) { } |