From 72c1845961e77f7625db47ebd3de129aa90f4f5d Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Fri, 31 Oct 2014 14:56:13 +0000 Subject: finished first draft of tokenizer code. git-svn-id: file:///var/local/svn/basicwriter@90 daaaf23c-2e50-4459-9457-1e69db5a47bf --- CMakeLists.txt | 24 ++-- src/core/utils/BufferedCharReader.cpp | 4 +- src/core/utils/BufferedCharReader.hpp | 6 +- src/core/utils/Tokenizer.cpp | 113 +++++++++++++++- src/core/utils/Tokenizer.hpp | 48 ++++--- test/core/utils/BufferedCharReader.cpp | 198 ---------------------------- test/core/utils/BufferedCharReaderTest.cpp | 203 +++++++++++++++++++++++++++++ 7 files changed, 364 insertions(+), 232 deletions(-) delete mode 100644 test/core/utils/BufferedCharReader.cpp create mode 100644 test/core/utils/BufferedCharReaderTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 726e1a3..8d06b4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,18 +96,19 @@ ADD_DEFINITIONS( # ousia_script library (containing the bindings needed for script engines) ADD_LIBRARY(ousia_core - src/core/script/Function.cpp - src/core/script/Object.cpp - src/core/script/ScriptEngine.cpp - src/core/script/Variant.cpp - src/core/utils/Tokenizer.cpp - src/core/utils/Utils.cpp + src/core/script/Function + src/core/script/Object + src/core/script/ScriptEngine + src/core/script/Variant + src/core/utils/BufferedCharReader + src/core/utils/Tokenizer + src/core/utils/Utils ) # ousia_plugin_mozjs library ADD_LIBRARY(ousia_plugin_mozjs - src/plugins/mozjs/MozJsScriptEngine.cpp + src/plugins/mozjs/MozJsScriptEngine ) TARGET_LINK_LIBRARIES(ousia_plugin_mozjs @@ -130,12 +131,13 @@ IF(test) # Add all unit test files ADD_EXECUTABLE(ousia_test_core - test/core/utils/RangeSetTest - test/core/utils/TokenizerTest - test/core/utils/UtilsTest test/core/script/FunctionTest test/core/script/ObjectTest test/core/script/VariantTest + test/core/utils/BufferedCharReaderTest + test/core/utils/RangeSetTest + test/core/utils/TokenizerTest + test/core/utils/UtilsTest ) TARGET_LINK_LIBRARIES(ousia_test_core @@ -156,7 +158,7 @@ IF(test) # Add all unit test files ADD_EXECUTABLE(ousia_test_plugin_mozjs - test/plugins/mozjs/MozJsScriptEngineTest.cpp + test/plugins/mozjs/MozJsScriptEngineTest ) TARGET_LINK_LIBRARIES(ousia_test_plugin_mozjs diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp index 0377015..c13628f 100644 --- a/src/core/utils/BufferedCharReader.cpp +++ b/src/core/utils/BufferedCharReader.cpp @@ -1,6 +1,6 @@ /* - SCAENEA IDL Compiler (scidlc) - Copyright (C) 2014 Andreas Stöckel + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp index 86f43b5..b13cde6 100644 --- a/src/core/utils/BufferedCharReader.hpp +++ b/src/core/utils/BufferedCharReader.hpp @@ -1,6 +1,6 @@ /* - SCAENEA IDL Compiler (scidlc) - Copyright (C) 2014 Andreas Stöckel + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -236,5 +236,5 @@ public: } } -#endif /* _OUSISA_UTILS_BUFFERED_CHAR_READER_H_ */ +#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */ diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp index 38f7585..2c36438 100644 --- a/src/core/utils/Tokenizer.cpp +++ b/src/core/utils/Tokenizer.cpp @@ -16,6 +16,8 @@ along with this program. If not, see . */ +#include + #include "Tokenizer.hpp" namespace ousia { @@ -52,10 +54,10 @@ static std::map buildChildren( static int buildId(const std::map &inputs) { - int tokenId = -1; + int tokenId = TOKEN_NONE; for (auto &e : inputs) { if (e.first.empty()) { - if (tokenId != -1) { + if (tokenId != TOKEN_NONE) { throw TokenizerException{std::string{"Ambigous token found: "} + std::to_string(e.second)}; } else { @@ -68,8 +70,115 @@ static int buildId(const std::map &inputs) TokenTreeNode::TokenTreeNode(const std::map &inputs) : children(buildChildren(inputs)), tokenId(buildId(inputs)) +{ +} + +Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root) + : input(input), root(root) +{ +} + +bool Tokenizer::prepare() +{ + std::stringstream buffer; + char c; + const int startColumn = input.getColumn(); + const int startLine = input.getLine(); + bool bufEmpty = true; + while (input.peek(&c)) { + if (root.children.find(c) != root.children.end()) { + // if there might be a special token, keep peeking forward + // until we find the token (or we don't). + TokenTreeNode const *n = &root; + std::stringstream tBuf; + int match = TOKEN_NONE; + while (true) { + tBuf << c; + n = &(n->children.at(c)); + if (n->tokenId != TOKEN_NONE) { + // from here on we found a token. If we have something + // in our buffer already, we end the search now. + if (!bufEmpty) { + break; + } else { + // if we want to return this token ( = we have nothing + // in our buffer yet) we look greedily for the longest + // possible token we can construct. + input.consumePeek(); + } + } + if (!input.peek(&c)) { + // if we are at the end we break off the search. + break; + } + if (n->children.find(c) == root.children.end()) { + // if we do not find a possible continuation anymore, + // break off the search. + break; + } + } + // check if we did indeed find a special token. + if (match != TOKEN_NONE) { + input.resetPeek(); + if (bufEmpty) { + // if we did not have text before, construct that token. + peeked.push_back(Token{match, tBuf.str(), startColumn, + startLine, input.getColumn(), + input.getLine()}); + return true; + } else { + // otherwise we return the text before the token. + peeked.push_back(Token{TOKEN_TEXT, buffer.str(), + startColumn, startLine, + input.getColumn(), input.getLine()}); + return true; + } + } + } + buffer << c; + bufEmpty = false; + input.consumePeek(); + } + if (!bufEmpty) { + peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, + input.getColumn(), input.getLine()}); + return true; + } + return false; +} +bool Tokenizer::next(Token &t) { + if (peeked.empty()) { + if (!prepare()) { + return false; + } + } + t = peeked.front(); + peeked.pop_front(); + resetPeek(); + return true; +} + +bool Tokenizer::peek(Token &t) +{ + if (peekCursor >= peeked.size()) { + if (!prepare()) { + return false; + } + } + t = peeked[peekCursor]; + return true; +} + +void Tokenizer::resetPeek() { peekCursor = 0; } + +void Tokenizer::consumePeek() +{ + while (peekCursor > 0) { + peeked.pop_front(); + peekCursor--; + } } } } diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp index 24c4f30..924b670 100644 --- a/src/core/utils/Tokenizer.hpp +++ b/src/core/utils/Tokenizer.hpp @@ -21,7 +21,9 @@ #include #include -#include +#include + +#include "BufferedCharReader.hpp" namespace ousia { namespace utils { @@ -44,33 +46,47 @@ public: }; struct Token { - const int tokenId; - const std::string content; - const int column; - const int line; - - Token(int tokenId, std::string content, int column, int line) - : tokenId(tokenId), content(content), column(column), line(line) + int tokenId; + std::string content; + int startColumn; + int startLine; + int endColumn; + int endLine; + + Token(int tokenId, std::string content, int startColumn, int startLine, + int endColumn, int endLine) + : tokenId(tokenId), + content(content), + startColumn(startColumn), + startLine(startLine), + endColumn(endColumn), + endLine(endLine) { } }; +static const int TOKEN_NONE = -1; +static const int TOKEN_TEXT = -2; + class Tokenizer { private: - const std::istream &input; - const TokenTreeNode root; - const std::queue peekQueue; + BufferedCharReader &input; + const TokenTreeNode &root; + std::deque peeked; + unsigned int peekCursor = 0; + + bool prepare(); public: - Tokenizer(const TokenTreeNode &root, std::istream &input); + Tokenizer(BufferedCharReader &input, const TokenTreeNode &root); - bool hasNext(); + bool next(Token &t); - const Token &next(); + bool peek(Token &t); - const Token &peek(); + void resetPeek(); - void reset(); + void consumePeek(); }; } } diff --git a/test/core/utils/BufferedCharReader.cpp b/test/core/utils/BufferedCharReader.cpp deleted file mode 100644 index f8f668c..0000000 --- a/test/core/utils/BufferedCharReader.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - SCAENEA IDL Compiler (scidlc) - Copyright (C) 2014 Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include -#include - -#include "gtest/gtest.h" - -#include "BufferedCharReader.hpp" - -TEST(BufferedCharReaderTest, SimpleReadTest) -{ - const std::string testStr("this is a test"); - char c; - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - reader.feed(testStr); - reader.close(); - - // Try to read the test string - std::string res; - while (!reader.atEnd()) { - ASSERT_TRUE(reader.read(&c)); - res.append(&c, 1); - } - - // The two strings must equal - ASSERT_STREQ(testStr.c_str(), res.c_str()) ; - - // We must now be at line 1, column 15 - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(testStr.size() + 1, reader.getColumn()); - - // If we call either read or peek, false is returned - ASSERT_FALSE(reader.read(&c)); - ASSERT_FALSE(reader.peek(&c)); -} - -TEST(BufferedCharReaderTest, SimplePeekTest) -{ - const std::string testStr("this is a test"); - char c; - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - reader.feed(testStr); - reader.close(); - - // Try to read the test string - std::string res; - while (reader.peek(&c)) { - res.append(&c, 1); - } - - // Peeking does not trigger the "atEnd" flag - ASSERT_FALSE(reader.atEnd()); - - // The two strings must equal - ASSERT_STREQ(testStr.c_str(), res.c_str()); - - // We must now be at line 1, column 1 and NOT at the end of the stream - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(1, reader.getColumn()); - ASSERT_FALSE(reader.atEnd()); - - // If we consume the peek, we must be at line 1, column 15 and we should be - // at the end of the stream - reader.consumePeek(); - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(testStr.size() + 1, reader.getColumn()); - ASSERT_TRUE(reader.atEnd()); - - // If we call either read or peek, false is returned - ASSERT_FALSE(reader.read(&c)); - ASSERT_FALSE(reader.peek(&c)); -} - -TEST(BufferedCharReaderTest, SplittedPeakTest) -{ - const std::string testStr("this is a test"); - char c; - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - - // Try to peek the test string, feed char after char into the reader - std::string res; - for (unsigned int i = 0; i < testStr.length(); i++) { - reader.feed(std::string(&testStr[i], 1)); - while (reader.peek(&c)) { - res.append(&c, 1); - } - } - reader.close(); - - // Consume the peeked data - ASSERT_FALSE(reader.atEnd()); - reader.consumePeek(); - ASSERT_TRUE(reader.atEnd()); - - // The two strings must equal - ASSERT_STREQ(testStr.c_str(), res.c_str()) ; - - // We must now be at line 1, column 15 - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(testStr.size() + 1, reader.getColumn()); - - // If we call either read or peek, false is returned - ASSERT_FALSE(reader.read(&c)); - ASSERT_FALSE(reader.peek(&c)); -} - -TEST(BufferedCharReaderTest, RowColumnCounterTest) -{ - const std::string testStr("1\n\r2\n3\r\n\n4"); - char c; - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - reader.feed(testStr); - reader.close(); - - // We should currently be in line 1, column 1 - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(1, reader.getColumn()); - - // Read two characters - for (int i = 0; i < 2; i++) reader.read(&c); - ASSERT_EQ(2, reader.getLine()); - ASSERT_EQ(1, reader.getColumn()); - - // Read two characters - for (int i = 0; i < 2; i++) reader.read(&c); - ASSERT_EQ(3, reader.getLine()); - ASSERT_EQ(1, reader.getColumn()); - - // Read three characters - for (int i = 0; i < 3; i++) reader.read(&c); - ASSERT_EQ(5, reader.getLine()); - ASSERT_EQ(1, reader.getColumn()); -} - -TEST(BufferedCharReaderTest, LinebreakSubstitutionTest) -{ - const std::string testStr("this\n\ris\n\rjust\na test\r\n\rtest\n\r"); - const std::string expStr("this\nis\njust\na test\n\ntest\n"); - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - reader.feed(testStr); - - // Read all characters from the test string - std::string res; - char c; - while (reader.read(&c)) { - res.append(&c, 1); - } - - // Test for equality - ASSERT_STREQ(expStr.c_str(), res.c_str()); -} - -TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test) -{ - // Create a test string with some umlauts - const std::string testStr("\x61\xc3\x96\xc3\x84\xc3\x9c\xc3\x9f"); - char c; - - // Feed a test string into the reader - scaenea::compiler::BufferedCharReader reader; - reader.feed(testStr); - reader.close(); - - // Read all bytes - while (reader.read(&c)); - - // The sequence above equals 5 UTF-8 characters (so after reading all the - // cursor is at position 6) - ASSERT_EQ(1, reader.getLine()); - ASSERT_EQ(6, reader.getColumn()); -} - diff --git a/test/core/utils/BufferedCharReaderTest.cpp b/test/core/utils/BufferedCharReaderTest.cpp new file mode 100644 index 0000000..69c0974 --- /dev/null +++ b/test/core/utils/BufferedCharReaderTest.cpp @@ -0,0 +1,203 @@ +/* + SCAENEA IDL Compiler (scidlc) + Copyright (C) 2014 Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include "gtest/gtest.h" + +#include + +namespace ousia{ +namespace utils{ + +TEST(BufferedCharReaderTest, SimpleReadTest) +{ + const std::string testStr("this is a test"); + char c; + + // Feed a test string into the reader + BufferedCharReader reader; + reader.feed(testStr); + reader.close(); + + // Try to read the test string + std::string res; + while (!reader.atEnd()) { + ASSERT_TRUE(reader.read(&c)); + res.append(&c, 1); + } + + // The two strings must equal + ASSERT_STREQ(testStr.c_str(), res.c_str()) ; + + // We must now be at line 1, column 15 + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(testStr.size() + 1, reader.getColumn()); + + // If we call either read or peek, false is returned + ASSERT_FALSE(reader.read(&c)); + ASSERT_FALSE(reader.peek(&c)); +} + +TEST(BufferedCharReaderTest, SimplePeekTest) +{ + const std::string testStr("this is a test"); + char c; + + // Feed a test string into the reader + BufferedCharReader reader; + reader.feed(testStr); + reader.close(); + + // Try to read the test string + std::string res; + while (reader.peek(&c)) { + res.append(&c, 1); + } + + // Peeking does not trigger the "atEnd" flag + ASSERT_FALSE(reader.atEnd()); + + // The two strings must equal + ASSERT_STREQ(testStr.c_str(), res.c_str()); + + // We must now be at line 1, column 1 and NOT at the end of the stream + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(1, reader.getColumn()); + ASSERT_FALSE(reader.atEnd()); + + // If we consume the peek, we must be at line 1, column 15 and we should be + // at the end of the stream + reader.consumePeek(); + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(testStr.size() + 1, reader.getColumn()); + ASSERT_TRUE(reader.atEnd()); + + // If we call either read or peek, false is returned + ASSERT_FALSE(reader.read(&c)); + ASSERT_FALSE(reader.peek(&c)); +} + +TEST(BufferedCharReaderTest, SplittedPeakTest) +{ + const std::string testStr("this is a test"); + char c; + + // Feed a test string into the reader + BufferedCharReader reader; + + // Try to peek the test string, feed char after char into the reader + std::string res; + for (unsigned int i = 0; i < testStr.length(); i++) { + reader.feed(std::string(&testStr[i], 1)); + while (reader.peek(&c)) { + res.append(&c, 1); + } + } + reader.close(); + + // Consume the peeked data + ASSERT_FALSE(reader.atEnd()); + reader.consumePeek(); + ASSERT_TRUE(reader.atEnd()); + + // The two strings must equal + ASSERT_STREQ(testStr.c_str(), res.c_str()) ; + + // We must now be at line 1, column 15 + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(testStr.size() + 1, reader.getColumn()); + + // If we call either read or peek, false is returned + ASSERT_FALSE(reader.read(&c)); + ASSERT_FALSE(reader.peek(&c)); +} + +TEST(BufferedCharReaderTest, RowColumnCounterTest) +{ + const std::string testStr("1\n\r2\n3\r\n\n4"); + char c; + + // Feed a test string into the reader + BufferedCharReader reader; + reader.feed(testStr); + reader.close(); + + // We should currently be in line 1, column 1 + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(1, reader.getColumn()); + + // Read two characters + for (int i = 0; i < 2; i++) reader.read(&c); + ASSERT_EQ(2, reader.getLine()); + ASSERT_EQ(1, reader.getColumn()); + + // Read two characters + for (int i = 0; i < 2; i++) reader.read(&c); + ASSERT_EQ(3, reader.getLine()); + ASSERT_EQ(1, reader.getColumn()); + + // Read three characters + for (int i = 0; i < 3; i++) reader.read(&c); + ASSERT_EQ(5, reader.getLine()); + ASSERT_EQ(1, reader.getColumn()); +} + +TEST(BufferedCharReaderTest, LinebreakSubstitutionTest) +{ + const std::string testStr("this\n\ris\n\rjust\na test\r\n\rtest\n\r"); + const std::string expStr("this\nis\njust\na test\n\ntest\n"); + + // Feed a test string into the reader + BufferedCharReader reader; + reader.feed(testStr); + + // Read all characters from the test string + std::string res; + char c; + while (reader.read(&c)) { + res.append(&c, 1); + } + + // Test for equality + ASSERT_STREQ(expStr.c_str(), res.c_str()); +} + +TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test) +{ + // Create a test string with some umlauts + const std::string testStr("\x61\xc3\x96\xc3\x84\xc3\x9c\xc3\x9f"); + char c; + + // Feed a test string into the reader + BufferedCharReader reader; + reader.feed(testStr); + reader.close(); + + // Read all bytes + while (reader.read(&c)); + + // The sequence above equals 5 UTF-8 characters (so after reading all the + // cursor is at position 6) + ASSERT_EQ(1, reader.getLine()); + ASSERT_EQ(6, reader.getColumn()); +} + +} +} -- cgit v1.2.3