diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/utils/CSSParser.cpp | 45 | ||||
-rw-r--r-- | src/core/utils/CSSParser.hpp | 3 | ||||
-rw-r--r-- | src/core/utils/CodeTokenizer.cpp | 62 | ||||
-rw-r--r-- | src/core/utils/CodeTokenizer.hpp | 57 | ||||
-rw-r--r-- | src/core/utils/Tokenizer.cpp | 26 | ||||
-rw-r--r-- | src/core/utils/Tokenizer.hpp | 122 |
6 files changed, 274 insertions, 41 deletions
diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp index e66eb34..1763cc2 100644 --- a/src/core/utils/CSSParser.cpp +++ b/src/core/utils/CSSParser.cpp @@ -17,6 +17,7 @@ */ #include "BufferedCharReader.hpp" +#include "CodeTokenizer.hpp" #include "Tokenizer.hpp" #include "CSSParser.hpp" @@ -24,6 +25,7 @@ namespace ousia { namespace utils { +// CSS code tokens static const int CURLY_OPEN = 1; static const int CURLY_CLOSE = 2; static const int COLON = 3; @@ -31,8 +33,19 @@ static const int SEMICOLON = 4; static const int HASH = 5; static const int BRACKET_OPEN = 6; static const int BRACKET_CLOSE = 7; -static const int COMMENT_OPEN = 8; -static const int COMMENT_CLOSE = 9; +static const int PAREN_OPEN = 8; +static const int PAREN_CLOSE = 9; +// comments +static const int COMMENT = 100; +static const int COMMENT_OPEN = 101; +static const int COMMENT_CLOSE = 102; +// strings +static const int STRING = 200; +static const int SINGLE_QUOTE = 201; +static const int DOUBLE_QUOTE = 202; +static const int ESCAPE = 203; +// general syntax +static const int LINEBREAK = 300; static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, {"}", CURLY_CLOSE}, @@ -41,16 +54,28 @@ static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, {"#", HASH}, {"[", BRACKET_OPEN}, {"]", BRACKET_CLOSE}, + {"(", PAREN_OPEN}, + {")", PAREN_CLOSE}, {"/*", COMMENT_OPEN}, - {"*/", COMMENT_CLOSE}}}; - -StyleNode CSSParser::parse(BufferedCharReader &input) { - Tokenizer tokenizer {input, CSS_ROOT}; - //TODO: implement - -} - + {"*/", COMMENT_CLOSE}, + {"\\", ESCAPE}, + {"\''", SINGLE_QUOTE}, + {"\"", DOUBLE_QUOTE}, + {"\n", LINEBREAK}}}; +static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = { + {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}}, + {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}}, + {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, + {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, + {ESCAPE, {CodeTokenMode::ESCAPE, 
ESCAPE}}, + {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; +StyleNode CSSParser::parse(BufferedCharReader &input) +{ + CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS}; + tokenizer.ignoreComments = true; + // TODO: implement +} } } diff --git a/src/core/utils/CSSParser.hpp b/src/core/utils/CSSParser.hpp index 0f9cd8f..c8b772d 100644 --- a/src/core/utils/CSSParser.hpp +++ b/src/core/utils/CSSParser.hpp @@ -156,6 +156,9 @@ public: }; class CSSParser { + +private: + public: StyleNode parse(BufferedCharReader &input); }; diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp index c1376af..e5b8610 100644 --- a/src/core/utils/CodeTokenizer.cpp +++ b/src/core/utils/CodeTokenizer.cpp @@ -23,30 +23,30 @@ namespace ousia { namespace utils { -Token CodeTokenizer::constructToken(const Token& t) +Token CodeTokenizer::constructToken(const Token &t) { std::string content = buf.str(); buf.str(std::string()); - return Token{returnTokenId, content, - startToken.startColumn, startToken.startLine, - t.endColumn, t.endLine}; + return Token{returnTokenId, content, startToken.startColumn, + startToken.startLine, t.endColumn, t.endLine}; } void CodeTokenizer::buffer(const Token &t) { buf << t.content; } bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) { - if (t.startLine != t.endLine) { - throw TokenizerException( - "We did not expect a multiline token. Most likely you did not add " - "a linebreak token to your tokenizer!"); - } - auto it = descriptors.find(t.tokenId); CodeTokenMode mode = CodeTokenMode::NONE; if (it != descriptors.end()) { mode = it->second.mode; } + + if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) { + throw TokenizerException( + "We did not expect a multiline token (except linebreaks). 
Most " + "likely you did not add a linebreak token to your tokenizer!"); + } + switch (state) { case CodeTokenizerState::NORMAL: switch (mode) { @@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) case CodeTokenMode::LINE_COMMENT: state = CodeTokenizerState::IN_LINE_COMMENT; break; + case CodeTokenMode::LINEBREAK: + peeked.push_back({it->second.id, t.content, t.startColumn, + t.startLine, t.endColumn, t.endLine}); + return true; default: if (t.tokenId == TOKEN_TEXT) { int begin = -1; for (size_t c = 0; c < t.content.length(); c++) { bool isWhitespace = t.content[c] == ' ' || t.content[c] == '\t'; - if (begin >= 0 && isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - t.startColumn + begin, t.startLine, - t.startColumn + (int)c, t.endLine}); - } - if (!isWhitespace && begin < 0) { - begin = c; + if (begin < 0) { + // if we have not yet set our beginning, + // we wait for the first + // non-whitespace-character to set it. + if (!isWhitespace) { + begin = c; + } + } else { + // if we have set our beginning, we wait for the + // first whitespace character, which marks the + // end of the current word. + if (isWhitespace) { + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin, (int)c - begin), + t.startColumn + begin, t.startLine, + t.startColumn + (int)c, t.endLine}); + begin = -1; + } } } + if(begin >= 0){ + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin), + t.startColumn + begin, t.startLine, + t.endColumn, t.endLine}); + } + } else { + peeked.push_back(t); } - peeked.push_back(t); return true; } startToken = t; diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp index f26a74c..0fc0862 100644 --- a/src/core/utils/CodeTokenizer.hpp +++ b/src/core/utils/CodeTokenizer.hpp @@ -28,6 +28,17 @@ namespace ousia { namespace utils { +/* + * This enum contains all special Token the CodeTokenizer supports, namely: + * + * 1.) 
An ambiguous token - in most programming languages single-quotes ' or + * double-quotes " - to delimit string tokens. + * 2.) A start token for line comments, which would e.g. be // in Java. + * 3.) A start token for a block comment + * 4.) An end token for a block comment. + * 5.) A linebreak token + * 6.) The escape token, which would e.g. be \ in Java. + */ enum class CodeTokenMode { STRING_START_END, LINE_COMMENT, @@ -38,6 +49,12 @@ enum class CodeTokenMode { NONE }; +/** + * A CodeTokenDescriptor defines the id the user likes to have returned for + * a Token of the mode specified, e.g. if you want to get the id 4 for a + * String Token the corresponding CodeTokenDescriptor would be initialized + * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; + */ struct CodeTokenDescriptor { CodeTokenMode mode; int id; @@ -45,7 +62,10 @@ struct CodeTokenDescriptor { CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} }; - +/** + * The CodeTokenizer is a finite state machine with the states NORMAL, being + * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. + */ enum class CodeTokenizerState { NORMAL, IN_BLOCK_COMMENT, @@ -53,6 +73,14 @@ enum class CodeTokenizerState { IN_STRING }; +/** + * The purpose of a CodeTokenizer is to make it easier to parse classical + * programming Code. It adds the following features to a regular Tokenizer: + * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens + * for the opening delimiter, the text and the closing delimiter. + * 2.) Escaping in String tokens. + * 3.) 
Comment Tokens (for line comments as well as block comments) + */ class CodeTokenizer : public Tokenizer { private: std::map<int, CodeTokenDescriptor> descriptors; @@ -62,18 +90,37 @@ private: int returnTokenId; bool escaped = false; - Token constructToken(const Token& t); - void buffer(const Token& t); + Token constructToken(const Token &t); + void buffer(const Token &t); protected: bool doPrepare(const Token &t, std::deque<Token> &peeked) override; public: + /** + * If you do not want comment tokens to be returned you can set this to + * true. + */ bool ignoreComments = false; + /** + * + * @param input a BufferedCharReader containing the input for this + *tokenizer, + * as with a regular tokenizer. + * @param root a TokenTreeNode representing the root of the TokenTree. + * Please note that you have to specify all tokenIDs here that you use + * in the descriptors map. + * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. + * In this way you can specify the meaning of certain Tokens. Say you + * specified the Token "//" with the id 1 in the TokenTree. Then you could + * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map + * and this CodeTokenizer would recognize the token "//" as starting a + * line comment. 
+ */ CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, - std::map<int, CodeTokenDescriptor> descriptors) - : Tokenizer(input, root), descriptors(descriptors) + std::map<int, CodeTokenDescriptor> descriptors) + : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) { } }; diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp index 164a30f..a0ca3aa 100644 --- a/src/core/utils/Tokenizer.cpp +++ b/src/core/utils/Tokenizer.cpp @@ -82,8 +82,8 @@ bool Tokenizer::prepare() { std::stringstream buffer; char c; - const int startColumn = input.getColumn(); - const int startLine = input.getLine(); + int startColumn = input.getColumn(); + int startLine = input.getLine(); bool bufEmpty = true; while (input.peek(&c)) { if (root.children.find(c) != root.children.end()) { @@ -118,9 +118,10 @@ bool Tokenizer::prepare() break; } } + //reset the peek pointer to the last valid position. + input.resetPeek(); // check if we did indeed find a special token. if (match != TOKEN_NONE) { - input.resetPeek(); if (bufEmpty) { // if we did not have text before, construct that token. if (doPrepare( @@ -128,8 +129,11 @@ bool Tokenizer::prepare() input.getColumn(), input.getLine()}, peeked)) { return true; + } else { + startColumn = input.getColumn(); + startLine = input.getLine(); + continue; } - } else { // otherwise we return the text before the token. if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, @@ -137,8 +141,20 @@ bool Tokenizer::prepare() input.getLine()}, peeked)) { return true; - } + } else{ + //we need to clear the buffer here. After all the token + //corresponding to this buffer segment is already + //constructed. + buffer.str(std::string()); + bufEmpty = true; + startColumn = input.getColumn(); + startLine = input.getLine(); + continue; + } } + } else{ + //if we found nothing, read at least one character. 
+ input.peek(&c); } } buffer << c; diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp index eb8eed4..2debc75 100644 --- a/src/core/utils/Tokenizer.hpp +++ b/src/core/utils/Tokenizer.hpp @@ -28,6 +28,11 @@ namespace ousia { namespace utils { +/** + * This exception is currently only thrown if errors are made during the + * initialization of the Tokenizer. Have a closer look at the documentation + * of the TokenTreeNode constructor for more information. + */ class TokenizerException : public std::exception { public: const std::string msg; @@ -37,17 +42,82 @@ public: virtual const char *what() const noexcept override { return msg.c_str(); } }; +/** + * The Tokenizer internally uses a TokenTree to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * The TokenTree is a construct that structures all special tokens this + * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then + * the TokenTree would look like this: + * + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * + * Every node in the TokenTree is a valid end state that has a $ attached to it. + * During the search algorithm the Tokenizer goes through the tree and stores + * the last valid position. If a character follows that does not lead to a new + * node in the TokenTree the search ends (and starts again at this character). + * The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + * + * It is also not allowed that tokens have common middle parts but varying + * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and + * the input string "abc". 
In that case we start looking for "abd" at the + * start, won't find it, when we hit "c" and start the scanning process + * anew. Thus the "bc" token is not found. + * + * For most (well-behaved) tokenization schemes this is not the case, + * though. + */ class TokenTreeNode { public: const std::map<char, TokenTreeNode> children; const int tokenId; + /** + * The TokenTreeNode constructor builds a TokenTree from the given token + * specifications. The node returned by this constructor then is the root of + * said TokenTree. + * @param inputs Specifications of tokens in map form. Each specification + * is a tuple of the text that should be matched and some unique ID (>= 0) + * that is returned to you if that Token is found in the text. + * An example for such a map would be + * { + * { "#" , 1}, + * { "##", 2}, + * { "/" , 3} + * } + * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE + * (-1) and TOKEN_TEXT (-2). + */ TokenTreeNode(const std::map<std::string, int> &inputs); }; +/** + * This is a reserved constant for the empty token. + */ static const int TOKEN_NONE = -1; +/** + * This is a reserved constant for every part of the input text that is not a + * specified token. + */ static const int TOKEN_TEXT = -2; +/** + * A token for us is identified by an integer tokenID (either one of the + * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). + * Additionally we return the matched text (which should only be really + * interesting in case of TOKEN_TEXT tokens) and the position in the input text. + */ struct Token { int tokenId; std::string content; @@ -70,6 +140,25 @@ struct Token { Token() : tokenId(TOKEN_NONE) {} }; +/** + * A Tokenizer has the purpose of subdividing an input text into tokens. In our + * definition here we distinguish between two kinds of tokens: + * 1.) User-specified tokens that match a fixed text. + * 2.) Any other text between those tokens. 
+ * The user might want to specify the tokens '#{' and '#}' for example, because + * they have some meaning in her code. The user sets the IDs to 1 and 2. + * Given the input text + * "some text #{ special command #} some text" + * the tokenizer would return the tokens: + * 1.) "some text " with the id TOKEN_TEXT (-2). + * 2.) "#{" with the id 1. + * 3.) " special command " with the id TOKEN_TEXT (-2). + * 4.) "#}" with the id 2. + * 5.) " some text" with the id TOKEN_TEXT (-2). + * This makes the subsequent parsing of files of a specific type easier. + * Note that in case of tokens that are prefixes of other tokens the + * longest possible match is returned. + */ class Tokenizer { private: BufferedCharReader &input; @@ -95,14 +184,45 @@ protected: virtual bool doPrepare(const Token &t, std::deque<Token> &peeked); public: + /** + * @param input The input of a Tokenizer is given in the form of a + * BufferedCharReader. Please refer to the respective documentation. + * @param root This is meant to be the root of a TokenTree giving the + * specification of user-defined tokens this Tokenizer should recognize. + * The Tokenizer promises to not change the TokenTree such that you can + * re-use the same specification for multiple inputs. + * Please refer to the TokenTreeNode documentation for more information. + */ Tokenizer(BufferedCharReader &input, const TokenTreeNode &root); + /** + * The next method consumes one Token from the input stream and gives + * it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. + * @return true if a next token was found and false if the input is at its + * end. + */ bool next(Token &t); - + /** + * The peek method does not consume the next Token but buffers it and + * shows it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. 
+ * @return true if a next token was found and false if the input is at its + * end. + */ bool peek(Token &t); + /** + * Resets the peek pointer to the current position in the stream (to the + * beginning of the buffer). + */ void resetPeek(); + /** + * Clears the peek buffer, such that all peeked Tokens are consumed. + */ void consumePeek(); }; } |