1.) Fixed the word tokenization mechanism in the CodeTokenizer, which previously returned rubbish. 2.) Allowed multiline tokens if the mode is LINEBREAK (obviously). 3.) Returned LINEBREAK tokens in normal mode. 4.) Added a CodeTokenizer test.

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-11-19 19:44:29 +0100
commit: 45c6acf226f7867f847a9085d84960d337b1a2ae (patch)
tree: d248b1155a87b89af0f877284ac69254b9024962 /test
parent: ec31aae293f88e36190aa32169a97a776873567a (diff)
1 files changed, 73 insertions, 1 deletions
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
 
 namespace ousia {
 namespace utils {
+
+static const int BLOCK_COMMENT = 30;  // token ids this test expects the CodeTokenizer to emit
+static const int LINE_COMMENT = 31;
+static const int STRING = 20;
+static const int ESCAPE = 21;
+static const int LINEBREAK = 22;  // distinct from ESCAPE so tokenId comparisons can tell the two apart
+static const int CURLY_OPEN = 40;
+static const int CURLY_CLOSE = 41;
+
+TEST(CodeTokenizer, testTokenizer)  // feeds a six-line snippet and compares each emitted token to a fixed list
 {
-	
+	BufferedCharReader reader;
+	reader.feed("/**\n");                                 // 1
+	reader.feed(" * Some Block Comment\n");               // 2
+	reader.feed(" */\n");                                 // 3
+	reader.feed("var my_string = 'My \\'String\\'';\n");  // 4
+	reader.feed("// and a line comment\n");               // 5
+	reader.feed("var my_obj = { a = 4;}");                // 6
+	//           123456789012345678901234567890123456789012345678901234567890123456789
+	//           0        1         2         3         4         5         6
+	TokenTreeNode root{{{"/*", 1},
+	                    {"*/", 2},
+	                    {"//", 3},
+	                    {"'", 4},
+	                    {"\\", 5},
+	                    {"{", CURLY_OPEN},
+	                    {"}", CURLY_CLOSE},
+	                    {"\n", 6}}};
+	std::map<int, CodeTokenDescriptor> descriptors{
+	    // The block comment start token has the id 1; when the tokenizer
+	    // returns a block comment token, it should carry the id BLOCK_COMMENT.
+	    {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+	    {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+	    {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+	    {4, {CodeTokenMode::STRING_START_END, STRING}},
+	    {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+	    {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+	std::vector<Token> expected = {  // NOTE(review): order looks like {id, content, startColumn, startLine, endColumn, endLine} - confirm
+	    {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+	    {LINEBREAK, "\n", 4, 3, 1, 4},
+	    {TOKEN_TEXT, "var", 1, 4, 4, 4},
+	    {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+	    {TOKEN_TEXT, "=", 15, 4, 16, 4},
+	    {STRING, "My 'String'", 17, 4, 32, 4},
+	    {TOKEN_TEXT, ";", 32, 4, 33, 4},
+	    {LINEBREAK, "\n", 33, 4, 1, 5},
+		// This is slightly counter-intuitive but makes sense on reflection:
+		// a line comment is ended by a line break, so the line break is
+		// technically still part of the line comment and the comment's end
+		// position therefore lies in the next line.
+	    {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+	    {TOKEN_TEXT, "var", 1, 6, 4, 6},
+	    {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+	    {TOKEN_TEXT, "=", 12, 6, 13, 6},
+	    {CURLY_OPEN, "{", 14, 6, 15, 6},
+	    {TOKEN_TEXT, "a", 16, 6, 17, 6},
+	    {TOKEN_TEXT, "=", 18, 6, 19, 6},
+	    {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+	    {CURLY_CLOSE, "}", 22, 6, 23, 6},
+	};
+
+	CodeTokenizer tokenizer{reader, root, descriptors};
+
+	Token t;
+	for (auto &te : expected) {
+		ASSERT_TRUE(tokenizer.next(t));
+		ASSERT_EQ(te.tokenId, t.tokenId);
+		ASSERT_EQ(te.content, t.content);
+		ASSERT_EQ(te.startColumn, t.startColumn);
+		ASSERT_EQ(te.startLine, t.startLine);
+		ASSERT_EQ(te.endColumn, t.endColumn);
+		ASSERT_EQ(te.endLine, t.endLine);
+	}
+	ASSERT_FALSE(tokenizer.next(t));  // no further token may follow the last expected one
 }
 }
 }
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-19 19:44:29 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-11-19 19:44:29 +0100
commit	45c6acf226f7867f847a9085d84960d337b1a2ae (patch)
tree	d248b1155a87b89af0f877284ac69254b9024962 /test
parent	ec31aae293f88e36190aa32169a97a776873567a (diff)