summaryrefslogtreecommitdiff
path: root/src/core/utils
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/utils')
-rw-r--r--src/core/utils/CodeTokenizer.hpp53
1 files changed, 49 insertions, 4 deletions
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index f26a74c..18cf02a 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -28,6 +28,17 @@
namespace ousia {
namespace utils {
+/*
+ * This enum contains all special tokens the CodeTokenizer supports, namely:
+ *
+ * 1.) An ambiguous token - in most programming languages single quotes ' or
+ * double quotes " - to delimit string tokens.
+ * 2.) A start token for line comments, which would e.g. be // in Java.
+ * 3.) A start token for a block comment.
+ * 4.) An end token for a block comment.
+ * 5.) The linebreak token (this does not have to be specified by the user).
+ * 6.) The escape token, which would e.g. be \ in Java.
+ */
enum class CodeTokenMode {
STRING_START_END,
LINE_COMMENT,
@@ -38,6 +49,10 @@ enum class CodeTokenMode {
NONE
};
+/**
+ * A CodeTokenDescriptor draws the connection between an id returned by the
+ * underlying Tokenizer and the mode this token represents.
+ */
struct CodeTokenDescriptor {
CodeTokenMode mode;
int id;
@@ -45,7 +60,10 @@ struct CodeTokenDescriptor {
CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
};
-
+/**
+ * The CodeTokenizer is a finite state machine with the states NORMAL,
+ * IN_BLOCK_COMMENT, IN_LINE_COMMENT and IN_STRING.
+ */
enum class CodeTokenizerState {
NORMAL,
IN_BLOCK_COMMENT,
@@ -53,6 +71,14 @@ enum class CodeTokenizerState {
IN_STRING
};
+/**
+ * The purpose of a CodeTokenizer is to make it easier to parse classical
+ * program code. It adds the following features to a regular Tokenizer:
+ * 1.) String tokens (e.g. "string" in Java code) instead of 3 separate tokens
+ * for the opening delimiter, the text and the closing delimiter.
+ * 2.) Escaping in string tokens.
+ * 3.) Comment tokens (for line comments as well as block comments).
+ */
class CodeTokenizer : public Tokenizer {
private:
std::map<int, CodeTokenDescriptor> descriptors;
@@ -62,17 +88,36 @@ private:
int returnTokenId;
bool escaped = false;
- Token constructToken(const Token& t);
- void buffer(const Token& t);
+ Token constructToken(const Token &t);
+ void buffer(const Token &t);
protected:
bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
public:
+ /**
+ * If you do not want comment tokens to be returned you can set this to
+ * true.
+ */
bool ignoreComments = false;
+ /**
+ * Creates a new CodeTokenizer.
+ *
+ * @param input a BufferedCharReader containing the input for this
+ * tokenizer, as with a regular tokenizer.
+ * @param root a TokenTreeNode representing the root of the TokenTree.
+ * Please note that you have to specify all tokenIDs here that you use
+ * in the descriptors map.
+ * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
+ * In this way you can specify the meaning of certain Tokens. Say you
+ * specified the Token "//" with the id 1 in the TokenTree. Then you could
+ * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
+ * and this CodeTokenizer would recognize the token "//" as starting a
+ * line comment.
+ */
CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
- std::map<int, CodeTokenDescriptor> descriptors)
+ std::map<int, CodeTokenDescriptor> descriptors)
: Tokenizer(input, root), descriptors(descriptors)
{
}