diff options
-rw-r--r-- | src/core/utils/CodeTokenizer.cpp | 62 | ||||
-rw-r--r-- | src/core/utils/CodeTokenizer.hpp | 11 | ||||
-rw-r--r-- | test/core/utils/CodeTokenizerTest.cpp | 74 |
3 files changed, 122 insertions, 25 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp index c1376af..e5b8610 100644 --- a/src/core/utils/CodeTokenizer.cpp +++ b/src/core/utils/CodeTokenizer.cpp @@ -23,30 +23,30 @@ namespace ousia { namespace utils { -Token CodeTokenizer::constructToken(const Token& t) +Token CodeTokenizer::constructToken(const Token &t) { std::string content = buf.str(); buf.str(std::string()); - return Token{returnTokenId, content, - startToken.startColumn, startToken.startLine, - t.endColumn, t.endLine}; + return Token{returnTokenId, content, startToken.startColumn, + startToken.startLine, t.endColumn, t.endLine}; } void CodeTokenizer::buffer(const Token &t) { buf << t.content; } bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) { - if (t.startLine != t.endLine) { - throw TokenizerException( - "We did not expect a multiline token. Most likely you did not add " - "a linebreak token to your tokenizer!"); - } - auto it = descriptors.find(t.tokenId); CodeTokenMode mode = CodeTokenMode::NONE; if (it != descriptors.end()) { mode = it->second.mode; } + + if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) { + throw TokenizerException( + "We did not expect a multiline token (except linebreaks). 
Most " + "likely you did not add a linebreak token to your tokenizer!"); + } + switch (state) { case CodeTokenizerState::NORMAL: switch (mode) { @@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) case CodeTokenMode::LINE_COMMENT: state = CodeTokenizerState::IN_LINE_COMMENT; break; + case CodeTokenMode::LINEBREAK: + peeked.push_back({it->second.id, t.content, t.startColumn, + t.startLine, t.endColumn, t.endLine}); + return true; default: if (t.tokenId == TOKEN_TEXT) { int begin = -1; for (size_t c = 0; c < t.content.length(); c++) { bool isWhitespace = t.content[c] == ' ' || t.content[c] == '\t'; - if (begin >= 0 && isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - t.startColumn + begin, t.startLine, - t.startColumn + (int)c, t.endLine}); - } - if (!isWhitespace && begin < 0) { - begin = c; + if (begin < 0) { + // if we have not yet set our beginning, + // we wait for the first + // non-whitespace-character to set it. + if (!isWhitespace) { + begin = c; + } + } else { + // if we have set our beginning, we wait for the + // first whitespace character, which marks the + // end of the current word. + if (isWhitespace) { + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin, (int)c - begin), + t.startColumn + begin, t.startLine, + t.startColumn + (int)c, t.endLine}); + begin = -1; + } } } + if(begin >= 0){ + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin), + t.startColumn + begin, t.startLine, + t.endColumn, t.endLine}); + } + } else { + peeked.push_back(t); } - peeked.push_back(t); return true; } startToken = t; diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp index 18cf02a..fda4493 100644 --- a/src/core/utils/CodeTokenizer.hpp +++ b/src/core/utils/CodeTokenizer.hpp @@ -36,7 +36,7 @@ namespace utils { * 2.) A start token for line comments, which would e.g. be // in Java. * 3.) A start token for a block comment * 4.) 
An end token for a block comment. - * 5.) The linebreak token (this does not have to be specified by the user) + * 5.) A linebreak token * 6.) The escape token, which would e.g. be \ in Java. */ enum class CodeTokenMode { @@ -50,8 +50,11 @@ enum class CodeTokenMode { }; /** - * A CodeTokenDescriptor draws the connection between an id returned by the - * underlying Tokenizer and the mode this token represents. + * A CodeTokenDescriptor defines the id the user likes to have returned for + * a Token of the mode specified, e.g. if you want to get the id 4 for a + * String Token the corresponding CodeTokenDescriptor would be initialized + * with + * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; */ struct CodeTokenDescriptor { CodeTokenMode mode; @@ -118,7 +121,7 @@ public: */ CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, std::map<int, CodeTokenDescriptor> descriptors) - : Tokenizer(input, root), descriptors(descriptors) + : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) { } }; diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp index d0f9a17..0b9d7b3 100644 --- a/test/core/utils/CodeTokenizerTest.cpp +++ b/test/core/utils/CodeTokenizerTest.cpp @@ -22,9 +22,81 @@ namespace ousia { namespace utils { + +static const int BLOCK_COMMENT = 30; +static const int LINE_COMMENT = 31; +static const int STRING = 20; +static const int ESCAPE = 21; +static const int LINEBREAK = 21; +static const int CURLY_OPEN = 40; +static const int CURLY_CLOSE = 41; + TEST(CodeTokenizer, testTokenizer) { - + BufferedCharReader reader; + reader.feed("/**\n"); // 1 + reader.feed(" * Some Block Comment\n"); // 2 + reader.feed(" */\n"); // 3 + reader.feed("var my_string = 'My \\'String\\'';\n"); // 4 + reader.feed("// and a line comment\n"); // 5 + reader.feed("var my_obj = { a = 4;}"); // 6 + // 123456789012345678901234567890123456789012345678901234567890123456789 + // 0 1 2 3 4 5 6 + 
TokenTreeNode root{{{"/*", 1}, + {"*/", 2}, + {"//", 3}, + {"'", 4}, + {"\\", 5}, + {"{", CURLY_OPEN}, + {"}", CURLY_CLOSE}, + {"\n", 6}}}; + std::map<int, CodeTokenDescriptor> descriptors{ + // the block comment start Token has the id 1 and if the Tokenizer + // returns a Block Comment Token that should have the id BLOCK_COMMENT (30). + {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, + {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, + {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, + {4, {CodeTokenMode::STRING_START_END, STRING}}, + {5, {CodeTokenMode::ESCAPE, ESCAPE}}, + {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; + + std::vector<Token> expected = { + {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3}, + {LINEBREAK, "\n", 4, 3, 1, 4}, + {TOKEN_TEXT, "var", 1, 4, 4, 4}, + {TOKEN_TEXT, "my_string", 5, 4, 14, 4}, + {TOKEN_TEXT, "=", 15, 4, 16, 4}, + {STRING, "My 'String'", 17, 4, 32, 4}, + {TOKEN_TEXT, ";", 32, 4, 33, 4}, + {LINEBREAK, "\n", 33, 4, 1, 5}, + //this is slightly counter-intuitive but makes sense if you think about + //it: As a line comment is ended by a line break the line break is + //technically still a part of the line comment and thus the ending + //is in the next line. + {LINE_COMMENT, " and a line comment", 1, 5, 1, 6}, + {TOKEN_TEXT, "var", 1, 6, 4, 6}, + {TOKEN_TEXT, "my_obj", 5, 6, 11, 6}, + {TOKEN_TEXT, "=", 12, 6, 13, 6}, + {CURLY_OPEN, "{", 14, 6, 15, 6}, + {TOKEN_TEXT, "a", 16, 6, 17, 6}, + {TOKEN_TEXT, "=", 18, 6, 19, 6}, + {TOKEN_TEXT, "4;", 20, 6, 22, 6}, + {CURLY_CLOSE, "}", 22, 6, 23, 6}, + }; + + CodeTokenizer tokenizer{reader, root, descriptors}; + + Token t; + for (auto &te : expected) { + ASSERT_TRUE(tokenizer.next(t)); + ASSERT_EQ(te.tokenId, t.tokenId); + ASSERT_EQ(te.content, t.content); + ASSERT_EQ(te.startColumn, t.startColumn); + ASSERT_EQ(te.startLine, t.startLine); + ASSERT_EQ(te.endColumn, t.endColumn); + ASSERT_EQ(te.endLine, t.endLine); + } + ASSERT_FALSE(tokenizer.next(t)); } } } |