summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/core/utils/CodeTokenizer.cpp62
-rw-r--r--src/core/utils/CodeTokenizer.hpp11
-rw-r--r--test/core/utils/CodeTokenizerTest.cpp74
3 files changed, 122 insertions, 25 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
namespace ousia {
namespace utils {
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
{
std::string content = buf.str();
buf.str(std::string());
- return Token{returnTokenId, content,
- startToken.startColumn, startToken.startLine,
- t.endColumn, t.endLine};
+ return Token{returnTokenId, content, startToken.startColumn,
+ startToken.startLine, t.endColumn, t.endLine};
}
void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
{
- if (t.startLine != t.endLine) {
- throw TokenizerException(
- "We did not expect a multiline token. Most likely you did not add "
- "a linebreak token to your tokenizer!");
- }
-
auto it = descriptors.find(t.tokenId);
CodeTokenMode mode = CodeTokenMode::NONE;
if (it != descriptors.end()) {
mode = it->second.mode;
}
+
+ if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+ throw TokenizerException(
+ "We did not expect a multiline token (except linebreaks). Most "
+ "likely you did not add a linebreak token to your tokenizer!");
+ }
+
switch (state) {
case CodeTokenizerState::NORMAL:
switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
case CodeTokenMode::LINE_COMMENT:
state = CodeTokenizerState::IN_LINE_COMMENT;
break;
+ case CodeTokenMode::LINEBREAK:
+ peeked.push_back({it->second.id, t.content, t.startColumn,
+ t.startLine, t.endColumn, t.endLine});
+ return true;
default:
if (t.tokenId == TOKEN_TEXT) {
int begin = -1;
for (size_t c = 0; c < t.content.length(); c++) {
bool isWhitespace =
t.content[c] == ' ' || t.content[c] == '\t';
- if (begin >= 0 && isWhitespace) {
- peeked.push_back(Token{
- TOKEN_TEXT,
- t.content.substr(begin, (int)c - begin),
- t.startColumn + begin, t.startLine,
- t.startColumn + (int)c, t.endLine});
- }
- if (!isWhitespace && begin < 0) {
- begin = c;
+ if (begin < 0) {
+ // if we have not yet set our beginning,
+ // we wait for the first
+ // non-whitespace-character to set it.
+ if (!isWhitespace) {
+ begin = c;
+ }
+ } else {
+ // if we have set our beginning, we wait for the
+ // first whitespace character, which marks the
+ // end of the current word.
+ if (isWhitespace) {
+ peeked.push_back(Token{
+ TOKEN_TEXT,
+ t.content.substr(begin, (int)c - begin),
+ t.startColumn + begin, t.startLine,
+ t.startColumn + (int)c, t.endLine});
+ begin = -1;
+ }
}
}
+ if(begin >= 0){
+ peeked.push_back(Token{
+ TOKEN_TEXT,
+ t.content.substr(begin),
+ t.startColumn + begin, t.startLine,
+ t.endColumn, t.endLine});
+ }
+ } else {
+ peeked.push_back(t);
}
- peeked.push_back(t);
return true;
}
startToken = t;
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
index 18cf02a..fda4493 100644
--- a/src/core/utils/CodeTokenizer.hpp
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -36,7 +36,7 @@ namespace utils {
* 2.) A start token for line comments, which would e.g. be // in Java.
* 3.) A start token for a block comment
* 4.) An end token for a block comment.
- * 5.) The linebreak token (this does not have to be specified by the user)
+ * 5.) A linebreak token
* 6.) The escape token, which would e.g. be \ in java.
*/
enum class CodeTokenMode {
@@ -50,8 +50,11 @@ enum class CodeTokenMode {
};
/**
- * A CodeTokenDescriptor draws the connection between an id returned by the
- * underlying Tokenizer and the mode this token represents.
+ * A CodeTokenDescriptor defines the id the user likes to have returned for
+ * a Token of the mode specified, e.g. if you want to get the id 4 for a
+ * String Token the corresponding CodeTokenDescriptor would be initialized
+ * with
+ * CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
*/
struct CodeTokenDescriptor {
CodeTokenMode mode;
@@ -118,7 +121,7 @@ public:
*/
CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
std::map<int, CodeTokenDescriptor> descriptors)
- : Tokenizer(input, root), descriptors(descriptors)
+ : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
{
}
};
diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp
index d0f9a17..0b9d7b3 100644
--- a/test/core/utils/CodeTokenizerTest.cpp
+++ b/test/core/utils/CodeTokenizerTest.cpp
@@ -22,9 +22,81 @@
namespace ousia {
namespace utils {
+
+static const int BLOCK_COMMENT = 30;
+static const int LINE_COMMENT = 31;
+static const int STRING = 20;
+static const int ESCAPE = 21;
+static const int LINEBREAK = 21;
+static const int CURLY_OPEN = 40;
+static const int CURLY_CLOSE = 41;
+
TEST(CodeTokenizer, testTokenizer)
{
-
+ BufferedCharReader reader;
+ reader.feed("/**\n"); // 1
+ reader.feed(" * Some Block Comment\n"); // 2
+ reader.feed(" */\n"); // 3
+ reader.feed("var my_string = 'My \\'String\\'';\n"); // 4
+ reader.feed("// and a line comment\n"); // 5
+ reader.feed("var my_obj = { a = 4;}"); // 6
+ // 123456789012345678901234567890123456789012345678901234567890123456789
+ // 0 1 2 3 4 5 6
+ TokenTreeNode root{{{"/*", 1},
+ {"*/", 2},
+ {"//", 3},
+ {"'", 4},
+ {"\\", 5},
+ {"{", CURLY_OPEN},
+ {"}", CURLY_CLOSE},
+ {"\n", 6}}};
+ std::map<int, CodeTokenDescriptor> descriptors{
+ // the block comment start Token has the id 1 and if the Tokenizer
+ // returns a Block Comment Token that should have the id BLOCK_COMMENT.
+ {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}},
+ {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}},
+ {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}},
+ {4, {CodeTokenMode::STRING_START_END, STRING}},
+ {5, {CodeTokenMode::ESCAPE, ESCAPE}},
+ {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+ std::vector<Token> expected = {
+ {BLOCK_COMMENT, "*\n * Some Block Comment\n ", 1, 1, 4, 3},
+ {LINEBREAK, "\n", 4, 3, 1, 4},
+ {TOKEN_TEXT, "var", 1, 4, 4, 4},
+ {TOKEN_TEXT, "my_string", 5, 4, 14, 4},
+ {TOKEN_TEXT, "=", 15, 4, 16, 4},
+ {STRING, "My 'String'", 17, 4, 32, 4},
+ {TOKEN_TEXT, ";", 32, 4, 33, 4},
+ {LINEBREAK, "\n", 33, 4, 1, 5},
+ //this is slightly counter-intuitive but makes sense if you think about
+ //it: As a line comment is ended by a line break the line break is
+ //technically still a part of the line comment and thus the ending
+ //is in the next line.
+ {LINE_COMMENT, " and a line comment", 1, 5, 1, 6},
+ {TOKEN_TEXT, "var", 1, 6, 4, 6},
+ {TOKEN_TEXT, "my_obj", 5, 6, 11, 6},
+ {TOKEN_TEXT, "=", 12, 6, 13, 6},
+ {CURLY_OPEN, "{", 14, 6, 15, 6},
+ {TOKEN_TEXT, "a", 16, 6, 17, 6},
+ {TOKEN_TEXT, "=", 18, 6, 19, 6},
+ {TOKEN_TEXT, "4;", 20, 6, 22, 6},
+ {CURLY_CLOSE, "}", 22, 6, 23, 6},
+ };
+
+ CodeTokenizer tokenizer{reader, root, descriptors};
+
+ Token t;
+ for (auto &te : expected) {
+ ASSERT_TRUE(tokenizer.next(t));
+ ASSERT_EQ(te.tokenId, t.tokenId);
+ ASSERT_EQ(te.content, t.content);
+ ASSERT_EQ(te.startColumn, t.startColumn);
+ ASSERT_EQ(te.startLine, t.startLine);
+ ASSERT_EQ(te.endColumn, t.endColumn);
+ ASSERT_EQ(te.endLine, t.endLine);
+ }
+ ASSERT_FALSE(tokenizer.next(t));
}
}
}