1 files changed, 49 insertions, 4 deletions
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index f26a74c..18cf02a 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -28,6 +28,17 @@
 namespace ousia {
 namespace utils {
 
+/**
+ * This enum contains all special tokens the CodeTokenizer supports, namely:
+ *
+ * 1.) An ambiguous token - in most programming languages single-quotes ' or
+ * double-quotes " - to delimit string tokens.
+ * 2.) A start token for line comments, which would e.g. be // in Java.
+ * 3.) A start token for a block comment
+ * 4.) An end token for a block comment.
+ * 5.) The linebreak token (this does not have to be specified by the user)
+ * 6.) The escape token, which would e.g. be \ in Java.
+ */
 enum class CodeTokenMode {
 	STRING_START_END,
 	LINE_COMMENT,
@@ -38,6 +49,10 @@ enum class CodeTokenMode {
 	NONE
 };
 
+/**
+ * A CodeTokenDescriptor draws the connection between an id returned by the
+ * underlying Tokenizer and the mode this token represents.
+ */
 struct CodeTokenDescriptor {
 	CodeTokenMode mode;
 	int id;
@@ -45,7 +60,10 @@ struct CodeTokenDescriptor {
 	CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
 };
 
-
+/**
+ * The CodeTokenizer is a finite state machine with the states NORMAL,
+ * IN_BLOCK_COMMENT, IN_LINE_COMMENT and IN_STRING.
+ */
 enum class CodeTokenizerState {
 	NORMAL,
 	IN_BLOCK_COMMENT,
@@ -53,6 +71,14 @@ enum class CodeTokenizerState {
 	IN_STRING
 };
 
+/**
+ * The purpose of a CodeTokenizer is to make it easier to parse classical
+ * programming Code. It adds the following features to a regular Tokenizer:
+ * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens
+ * for the opening delimiter, the text and the closing delimiter.
+ * 2.) Escaping in String tokens.
+ * 3.) Comment Tokens (for line comments as well as block comments)
+ */
 class CodeTokenizer : public Tokenizer {
 private:
 	std::map<int, CodeTokenDescriptor> descriptors;
@@ -62,17 +88,36 @@ private:
 	int returnTokenId;
 	bool escaped = false;
 
-	Token constructToken(const Token& t);
-	void buffer(const Token& t);
+	Token constructToken(const Token &t);
+	void buffer(const Token &t);
 
 protected:
 	bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
 
 public:
+	/**
+	 * If you do not want comment tokens to be returned you can set this to
+	 * true.
+	 */
 	bool ignoreComments = false;
 
+	/**
+	 * Constructs a CodeTokenizer from an input reader, a token tree and a
+	 * map of token descriptors.
+	 * @param input a BufferedCharReader containing the input for this
+	 * tokenizer, as with a regular tokenizer.
+	 * @param root a TokenTreeNode representing the root of the TokenTree.
+	 * Please note that you have to specify all tokenIDs here that you use
+	 * in the descriptors map.
+	 * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
+	 * In this way you can specify the meaning of certain Tokens. Say you
+	 * specified the Token "//" with the id 1 in the TokenTree. Then you could
+	 * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
+	 * and this CodeTokenizer would recognize the token "//" as starting a
+	 * line comment.
+	 */
 	CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
-	          std::map<int, CodeTokenDescriptor> descriptors)
+	              std::map<int, CodeTokenDescriptor> descriptors)
 	    : Tokenizer(input, root), descriptors(descriptors)
 	{
 	}