diff options
author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-11-14 17:41:03 +0100 |
---|---|---|
committer | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-11-14 17:41:03 +0100 |
commit | 165cf9a5c6ab03dab64d5eb5a5577f8c216bb832 (patch) | |
tree | 8ab8fd9baa491d92d9f21f2e9257450adab4f75f | |
parent | e3cf0a9d726c9d76f4938590691336fbf2e9f6d5 (diff) |
implemented tokenizer test and started implementing CodeTokenizer under supervision of Maester Stoeckel.
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | build/.empty | 0 | ||||
-rw-r--r-- | src/core/utils/CSSParser.cpp | 10 | ||||
-rw-r--r-- | src/core/utils/CodeTokenizer.cpp | 144 | ||||
-rw-r--r-- | src/core/utils/CodeTokenizer.hpp | 83 | ||||
-rw-r--r-- | src/core/utils/Tokenizer.cpp | 36 | ||||
-rw-r--r-- | src/core/utils/Tokenizer.hpp | 23 | ||||
-rw-r--r-- | test/core/utils/CodeTokenizerTest.cpp | 30 | ||||
-rw-r--r-- | test/core/utils/TokenizerTest.cpp | 34 |
9 files changed, 342 insertions, 20 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c04de7..c375e79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,7 @@ ADD_LIBRARY(ousia_core src/core/script/ScriptEngine src/core/script/Variant src/core/utils/BufferedCharReader + src/core/utils/CodeTokenizer src/core/utils/CSSParser src/core/utils/Tokenizer src/core/utils/Utils @@ -141,6 +142,7 @@ IF(TEST) test/core/script/ObjectTest test/core/script/VariantTest test/core/utils/BufferedCharReaderTest + test/core/utils/CodeTokenizerTest test/core/utils/CSSParserTest test/core/utils/RangeSetTest test/core/utils/TokenizerTest diff --git a/build/.empty b/build/.empty deleted file mode 100644 index e69de29..0000000 --- a/build/.empty +++ /dev/null diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp index 1639152..e66eb34 100644 --- a/src/core/utils/CSSParser.cpp +++ b/src/core/utils/CSSParser.cpp @@ -44,11 +44,11 @@ static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, {"/*", COMMENT_OPEN}, {"*/", COMMENT_CLOSE}}}; -//StyleNode CSSParser::parse(BufferedCharReader &input) { -// Tokenizer tokenizer {input, CSS_ROOT}; -// //TODO: implement -// -//} +StyleNode CSSParser::parse(BufferedCharReader &input) { + Tokenizer tokenizer {input, CSS_ROOT}; + //TODO: implement + +} diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp new file mode 100644 index 0000000..c1376af --- /dev/null +++ b/src/core/utils/CodeTokenizer.cpp @@ -0,0 +1,144 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <cassert> + +#include "CodeTokenizer.hpp" + +namespace ousia { +namespace utils { + +Token CodeTokenizer::constructToken(const Token& t) +{ + std::string content = buf.str(); + buf.str(std::string()); + return Token{returnTokenId, content, + startToken.startColumn, startToken.startLine, + t.endColumn, t.endLine}; +} + +void CodeTokenizer::buffer(const Token &t) { buf << t.content; } + +bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) +{ + if (t.startLine != t.endLine) { + throw TokenizerException( + "We did not expect a multiline token. Most likely you did not add " + "a linebreak token to your tokenizer!"); + } + + auto it = descriptors.find(t.tokenId); + CodeTokenMode mode = CodeTokenMode::NONE; + if (it != descriptors.end()) { + mode = it->second.mode; + } + switch (state) { + case CodeTokenizerState::NORMAL: + switch (mode) { + case CodeTokenMode::STRING_START_END: + state = CodeTokenizerState::IN_STRING; + break; + case CodeTokenMode::BLOCK_COMMENT_START: + state = CodeTokenizerState::IN_BLOCK_COMMENT; + break; + case CodeTokenMode::LINE_COMMENT: + state = CodeTokenizerState::IN_LINE_COMMENT; + break; + default: + if (t.tokenId == TOKEN_TEXT) { + int begin = -1; + for (size_t c = 0; c < t.content.length(); c++) { + bool isWhitespace = + t.content[c] == ' ' || t.content[c] == '\t'; + if (begin >= 0 && isWhitespace) { + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin, (int)c - begin), + t.startColumn + begin, t.startLine, + t.startColumn + (int)c, t.endLine}); + } + if (!isWhitespace && begin < 0) { + begin = c; + } + } + } + peeked.push_back(t); + return true; + } + startToken = t; + returnTokenId = it->second.id; + return false; + case CodeTokenizerState::IN_LINE_COMMENT: + switch (mode) { + case CodeTokenMode::LINEBREAK: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_BLOCK_COMMENT: + switch (mode) { + case CodeTokenMode::BLOCK_COMMENT_END: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_STRING: + switch (mode) { + case CodeTokenMode::ESCAPE: + if (escaped) { + buffer(t); + } + escaped = !escaped; + return false; + case CodeTokenMode::STRING_START_END: + if (escaped) { + buffer(t); + escaped = false; + return false; + } else { + peeked.push_back(constructToken(t)); + state = CodeTokenizerState::NORMAL; + return true; + } + default: + if (escaped) { + // TODO: handle escaped characters? + escaped = false; + } + buffer(t); + return false; + } + } + assert(false); +} +} +} diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp new file mode 100644 index 0000000..f26a74c --- /dev/null +++ b/src/core/utils/CodeTokenizer.hpp @@ -0,0 +1,83 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_ +#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_ + +#include <map> +#include <sstream> + +#include "BufferedCharReader.hpp" +#include "Tokenizer.hpp" + +namespace ousia { +namespace utils { + +enum class CodeTokenMode { + STRING_START_END, + LINE_COMMENT, + BLOCK_COMMENT_START, + BLOCK_COMMENT_END, + LINEBREAK, + ESCAPE, + NONE +}; + +struct CodeTokenDescriptor { + CodeTokenMode mode; + int id; + + CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} +}; + + +enum class CodeTokenizerState { + NORMAL, + IN_BLOCK_COMMENT, + IN_LINE_COMMENT, + IN_STRING +}; + +class CodeTokenizer : public Tokenizer { +private: + std::map<int, CodeTokenDescriptor> descriptors; + CodeTokenizerState state; + std::stringstream buf; + Token startToken; + int returnTokenId; + bool escaped = false; + + Token constructToken(const Token& t); + void buffer(const Token& t); + +protected: + bool doPrepare(const Token &t, std::deque<Token> &peeked) override; + +public: + bool ignoreComments = false; + + CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, + std::map<int, CodeTokenDescriptor> descriptors) + : Tokenizer(input, root), descriptors(descriptors) + { + } +}; +} +} + +#endif diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp index 2c36438..164a30f 100644 --- a/src/core/utils/Tokenizer.cpp +++ b/src/core/utils/Tokenizer.cpp @@ -96,6 +96,7 @@ bool Tokenizer::prepare() tBuf << c; n = &(n->children.at(c)); if (n->tokenId != TOKEN_NONE) { + match = n->tokenId; // from here on we found a token. If we have something // in our buffer already, we end the search now. if (!bufEmpty) { @@ -111,7 +112,7 @@ bool Tokenizer::prepare() // if we are at the end we break off the search. break; } - if (n->children.find(c) == root.children.end()) { + if (n->children.find(c) == n->children.end()) { // if we do not find a possible continuation anymore, // break off the search. break; @@ -122,16 +123,21 @@ bool Tokenizer::prepare() input.resetPeek(); if (bufEmpty) { // if we did not have text before, construct that token. - peeked.push_back(Token{match, tBuf.str(), startColumn, - startLine, input.getColumn(), - input.getLine()}); - return true; + if (doPrepare( + Token{match, tBuf.str(), startColumn, startLine, + input.getColumn(), input.getLine()}, + peeked)) { + return true; + } + } else { // otherwise we return the text before the token. - peeked.push_back(Token{TOKEN_TEXT, buffer.str(), - startColumn, startLine, - input.getColumn(), input.getLine()}); - return true; + if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, + startLine, input.getColumn(), + input.getLine()}, + peeked)) { + return true; + } } } } @@ -140,13 +146,19 @@ bool Tokenizer::prepare() input.consumePeek(); } if (!bufEmpty) { - peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, - input.getColumn(), input.getLine()}); - return true; + return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, + input.getColumn(), input.getLine()}, + peeked); } return false; } +bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked) +{ + peeked.push_back(t); + return true; +} + bool Tokenizer::next(Token &t) { if (peeked.empty()) { diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp index 924b670..eb8eed4 100644 --- a/src/core/utils/Tokenizer.hpp +++ b/src/core/utils/Tokenizer.hpp @@ -45,6 +45,9 @@ public: TokenTreeNode(const std::map<std::string, int> &inputs); }; +static const int TOKEN_NONE = -1; +static const int TOKEN_TEXT = -2; + struct Token { int tokenId; std::string content; @@ -63,10 +66,9 @@ struct Token { endLine(endLine) { } -}; -static const int TOKEN_NONE = -1; -static const int TOKEN_TEXT = -2; + Token() : tokenId(TOKEN_NONE) {} +}; class Tokenizer { private: @@ -77,6 +79,21 @@ private: bool prepare(); +protected: + /** + * This method is an interface to build multiple tokens from a single one in + * derived classes. This might be interesting if you want to implement + * further logic on text tokens or similar applications. + * + * @param t a Token the "basic" tokenizer found. + * @param peeked a reference to the deque containing all temporary Tokens. + * You are supposed to append your tokens there. In the trivial case you just + * put the given Token on top of the deque. + * @return false if no token was appended to the deque (meaning that you want + * to ignore the given token explicitly) and true in all other cases. + */ + virtual bool doPrepare(const Token &t, std::deque<Token> &peeked); + public: Tokenizer(BufferedCharReader &input, const TokenTreeNode &root); diff --git a/test/core/utils/CodeTokenizerTest.cpp b/test/core/utils/CodeTokenizerTest.cpp new file mode 100644 index 0000000..d0f9a17 --- /dev/null +++ b/test/core/utils/CodeTokenizerTest.cpp @@ -0,0 +1,30 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <gtest/gtest.h> + +#include <core/utils/CodeTokenizer.hpp> + +namespace ousia { +namespace utils { +TEST(CodeTokenizer, testTokenizer) +{ + +} +} +} diff --git a/test/core/utils/TokenizerTest.cpp b/test/core/utils/TokenizerTest.cpp index f441fd8..ba06c33 100644 --- a/test/core/utils/TokenizerTest.cpp +++ b/test/core/utils/TokenizerTest.cpp @@ -18,6 +18,8 @@ #include <gtest/gtest.h> +#include <core/utils/BufferedCharReader.hpp> + #include <core/utils/Tokenizer.hpp> namespace ousia { @@ -59,5 +61,37 @@ TEST(TokenTreeNode, testConstructor) ASSERT_EQ(4, abd.tokenId); ASSERT_EQ(0, abd.children.size()); } + +TEST(Tokenizer, testTokenization) +{ + TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; + + BufferedCharReader reader; + reader.feed("Test/Test /* Block Comment */"); + // 12345678901234567890123456789 + // 0 1 2 + + std::vector<Token> expected = { + {TOKEN_TEXT, "Test", 1, 1, 5, 1}, + {1, "/", 5, 1, 6, 1}, + {TOKEN_TEXT, "Test ", 6, 1, 11, 1}, + {2, "/*", 11, 1, 13, 1}, + {TOKEN_TEXT, " Block Comment ", 13, 1, 28, 1}, + {3, "*/", 28, 1, 30, 1}}; + + Tokenizer tokenizer{reader, root}; + + Token t; + for (auto &te : expected) { + ASSERT_TRUE(tokenizer.next(t)); + ASSERT_EQ(te.tokenId, t.tokenId); + ASSERT_EQ(te.content, t.content); + ASSERT_EQ(te.startColumn, t.startColumn); + ASSERT_EQ(te.startLine, t.startLine); + ASSERT_EQ(te.endColumn, t.endColumn); + ASSERT_EQ(te.endLine, t.endLine); + } + ASSERT_FALSE(tokenizer.next(t)); +} } } |