From 45c6acf226f7867f847a9085d84960d337b1a2ae Mon Sep 17 00:00:00 2001
From: Benjamin Paassen
Date: Wed, 19 Nov 2014 19:44:29 +0100
Subject: 1.) fixed the word tokenization mechanism in the CodeTokenizer which
 returned rubbish previously. 2.) Allowed multiline tokens if the mode is
 LINEBREAK (obviously). 3.) returned LINEBREAK tokens in normal mode. 4.) added
 a CodeTokenizer test.

---
 test/core/utils/CodeTokenizerTest.cpp | 74 ++++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

(limited to 'test/core/utils/CodeTokenizerTest.cpp')

diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
 
 namespace ousia {
 namespace utils {
+
+static const int BLOCK_COMMENT = 30;  // id of returned block comment tokens
+static const int LINE_COMMENT = 31;   // id of returned line comment tokens
+static const int STRING = 20;         // id of returned string tokens
+static const int ESCAPE = 21;         // id attached to the escape descriptor
+static const int LINEBREAK = 22;      // distinct from ESCAPE so a mix-up fails
+static const int CURLY_OPEN = 40;     // id of the "{" token
+static const int CURLY_CLOSE = 41;    // id of the "}" token
+
 TEST(CodeTokenizer, testTokenizer)
 {
-	
+	BufferedCharReader reader;
+	reader.feed("/**\n");                                 // 1
+	reader.feed(" * Some Block Comment\n");               // 2
+	reader.feed(" */\n");                                 // 3
+	reader.feed("var my_string = 'My \\'String\\'';\n");  // 4
+	reader.feed("// and a line comment\n");               // 5
+	reader.feed("var my_obj = { a = 4;}");                // 6
+	//           123456789012345678901234567890123456789012345678901234567890123456789
+	//           0        1         2         3         4         5         6
+	TokenTreeNode root{{{"/*", 1},
+	                    {"*/", 2},
+	                    {"//", 3},
+	                    {"'", 4},
+	                    {"\\", 5},
+	                    {"{", CURLY_OPEN},
+	                    {"}", CURLY_CLOSE},
+	                    {"\n", 6}}};
+	std::map<int, CodeTokenDescriptor> descriptors{
+	    // The block comment start token has the tokenizer id 1; a block
+	    // comment token returned by the CodeTokenizer has the id BLOCK_COMMENT.
+	    {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+	    {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+	    {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+	    {4, {CodeTokenMode::STRING_START_END, STRING}},
+	    {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+	    {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+	std::vector<Token> expected = {
+	    {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+	    {LINEBREAK, "\n", 4, 3, 1, 4},
+	    {TOKEN_TEXT, "var", 1, 4, 4, 4},
+	    {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+	    {TOKEN_TEXT, "=", 15, 4, 16, 4},
+	    {STRING, "My 'String'", 17, 4, 32, 4},
+	    {TOKEN_TEXT, ";", 32, 4, 33, 4},
+	    {LINEBREAK, "\n", 33, 4, 1, 5},
+	    // This is slightly counter-intuitive but makes sense if you think
+	    // about it: as a line comment is ended by a line break, the line break
+	    // is technically still a part of the line comment and thus the ending
+	    // is in the next line.
+	    {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+	    {TOKEN_TEXT, "var", 1, 6, 4, 6},
+	    {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+	    {TOKEN_TEXT, "=", 12, 6, 13, 6},
+	    {CURLY_OPEN, "{", 14, 6, 15, 6},
+	    {TOKEN_TEXT, "a", 16, 6, 17, 6},
+	    {TOKEN_TEXT, "=", 18, 6, 19, 6},
+	    {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+	    {CURLY_CLOSE, "}", 22, 6, 23, 6},
+	};
+
+	CodeTokenizer tokenizer{reader, root, descriptors};
+
+	Token t;
+	for (auto &te : expected) {
+		ASSERT_TRUE(tokenizer.next(t));
+		ASSERT_EQ(te.tokenId, t.tokenId);
+		ASSERT_EQ(te.content, t.content);
+		ASSERT_EQ(te.startColumn, t.startColumn);
+		ASSERT_EQ(te.startLine, t.startLine);
+		ASSERT_EQ(te.endColumn, t.endColumn);
+		ASSERT_EQ(te.endLine, t.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(t));  // input exhausted: no further tokens
 }
 }
 }
-- 
cgit v1.2.3