summary refs log tree commit diff
path: root/test
diff options
context:
space:
mode:
author Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
committer Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
commit 45c6acf226f7867f847a9085d84960d337b1a2ae (patch)
tree d248b1155a87b89af0f877284ac69254b9024962 /test
parent ec31aae293f88e36190aa32169a97a776873567a (diff)
1.) Fixed the word tokenization mechanism in the CodeTokenizer, which previously returned rubbish. 2.) Allowed multiline tokens if the mode is LINEBREAK (obviously). 3.) Returned LINEBREAK tokens in normal mode. 4.) Added a CodeTokenizer test.
Diffstat (limited to 'test')
-rw-r--r-- test/core/utils/CodeTokenizerTest.cpp 74
1 files changed, 73 insertions, 1 deletions
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
namespace ousia {
namespace utils {
+
+static const int BLOCK_COMMENT = 30; // id attached to the block comment descriptors
+static const int LINE_COMMENT = 31; // id attached to the line comment descriptor
+static const int STRING = 20; // id attached to the string delimiter descriptor
+static const int ESCAPE = 21; // id attached to the escape descriptor
+static const int LINEBREAK = 22; // must differ from ESCAPE, otherwise the tokenId assertion cannot tell them apart
+static const int CURLY_OPEN = 40; // used directly as the token tree id of "{"
+static const int CURLY_CLOSE = 41; // used directly as the token tree id of "}"
+
TEST(CodeTokenizer, testTokenizer)
{
-
+ BufferedCharReader reader; // holds the six-line code snippet fed below
+ reader.feed("/**\n"); // 1
+ reader.feed(" * Some Block Comment\n"); // 2
+ reader.feed(" */\n"); // 3
+ reader.feed("var my_string = 'My \\'String\\'';\n"); // 4
+ reader.feed("// and a line comment\n"); // 5
+ reader.feed("var my_obj = { a = 4;}"); // 6
+ // 123456789012345678901234567890123456789012345678901234567890123456789
+ // 0 1 2 3 4 5 6
+ TokenTreeNode root{{{"/*", 1},
+ {"*/", 2},
+ {"//", 3},
+ {"'", 4},
+ {"\\", 5},
+ {"{", CURLY_OPEN},
+ {"}", CURLY_CLOSE},
+ {"\n", 6}}};
+ std::map<int, CodeTokenDescriptor> descriptors{
+ // The block comment start token has the id 1; if the tokenizer returns
+ // a block comment token, that token should carry the id BLOCK_COMMENT.
+ {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+ {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+ {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+ {4, {CodeTokenMode::STRING_START_END, STRING}},
+ {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+ {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+ std::vector<Token> expected = { // presumably {tokenId, content, startColumn, startLine, endColumn, endLine} - verify against Token
+ {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+ {LINEBREAK, "\n", 4, 3, 1, 4},
+ {TOKEN_TEXT, "var", 1, 4, 4, 4},
+ {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+ {TOKEN_TEXT, "=", 15, 4, 16, 4},
+ {STRING, "My 'String'", 17, 4, 32, 4},
+ {TOKEN_TEXT, ";", 32, 4, 33, 4},
+ {LINEBREAK, "\n", 33, 4, 1, 5},
+ // This is slightly counter-intuitive but makes sense if you think about
+ // it: a line comment is ended by a line break, so the line break is
+ // technically still part of the line comment and thus the comment ends
+ // on the next line.
+ {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+ {TOKEN_TEXT, "var", 1, 6, 4, 6},
+ {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+ {TOKEN_TEXT, "=", 12, 6, 13, 6},
+ {CURLY_OPEN, "{", 14, 6, 15, 6},
+ {TOKEN_TEXT, "a", 16, 6, 17, 6},
+ {TOKEN_TEXT, "=", 18, 6, 19, 6},
+ {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+ {CURLY_CLOSE, "}", 22, 6, 23, 6},
+ };
+
+ CodeTokenizer tokenizer{reader, root, descriptors};
+
+ Token t;
+ for (auto &te : expected) { // compare each produced token with its expected counterpart, field by field
+ ASSERT_TRUE(tokenizer.next(t));
+ ASSERT_EQ(te.tokenId, t.tokenId);
+ ASSERT_EQ(te.content, t.content);
+ ASSERT_EQ(te.startColumn, t.startColumn);
+ ASSERT_EQ(te.startLine, t.startLine);
+ ASSERT_EQ(te.endColumn, t.endColumn);
+ ASSERT_EQ(te.endLine, t.endLine);
+ }
+ ASSERT_FALSE(tokenizer.next(t)); // no further token may follow the expected sequence
}
}
}