summary refs log tree commit diff
diff options
context:
space:
mode:
authorBenjamin Paassen <bpaassen@techfak.uni-bielefeld.de>2014-10-31 14:56:13 +0000
committerbenjamin <benjamin@daaaf23c-2e50-4459-9457-1e69db5a47bf>2014-10-31 14:56:13 +0000
commit72c1845961e77f7625db47ebd3de129aa90f4f5d (patch)
tree790cfaba53fee7b02038bc7513d5bf62b974a4c7
parent9e233b3f13daebb9ac4c5cae0da073d0c6f782c0 (diff)
finished first draft of tokenizer code.
git-svn-id: file:///var/local/svn/basicwriter@90 daaaf23c-2e50-4459-9457-1e69db5a47bf
-rw-r--r--  CMakeLists.txt                                                               24
-rw-r--r--  src/core/utils/BufferedCharReader.cpp                                         4
-rw-r--r--  src/core/utils/BufferedCharReader.hpp                                         6
-rw-r--r--  src/core/utils/Tokenizer.cpp                                                113
-rw-r--r--  src/core/utils/Tokenizer.hpp                                                 48
-rw-r--r--  test/core/utils/BufferedCharReaderTest.cpp (renamed from test/core/utils/BufferedCharReader.cpp)  19
6 files changed, 173 insertions(+), 41 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 726e1a3..8d06b4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,18 +96,19 @@ ADD_DEFINITIONS(
# ousia_script library (containing the bindings needed for script engines)
ADD_LIBRARY(ousia_core
- src/core/script/Function.cpp
- src/core/script/Object.cpp
- src/core/script/ScriptEngine.cpp
- src/core/script/Variant.cpp
- src/core/utils/Tokenizer.cpp
- src/core/utils/Utils.cpp
+ src/core/script/Function
+ src/core/script/Object
+ src/core/script/ScriptEngine
+ src/core/script/Variant
+ src/core/utils/BufferedCharReader
+ src/core/utils/Tokenizer
+ src/core/utils/Utils
)
# ousia_plugin_mozjs library
ADD_LIBRARY(ousia_plugin_mozjs
- src/plugins/mozjs/MozJsScriptEngine.cpp
+ src/plugins/mozjs/MozJsScriptEngine
)
TARGET_LINK_LIBRARIES(ousia_plugin_mozjs
@@ -130,12 +131,13 @@ IF(test)
# Add all unit test files
ADD_EXECUTABLE(ousia_test_core
- test/core/utils/RangeSetTest
- test/core/utils/TokenizerTest
- test/core/utils/UtilsTest
test/core/script/FunctionTest
test/core/script/ObjectTest
test/core/script/VariantTest
+ test/core/utils/BufferedCharReaderTest
+ test/core/utils/RangeSetTest
+ test/core/utils/TokenizerTest
+ test/core/utils/UtilsTest
)
TARGET_LINK_LIBRARIES(ousia_test_core
@@ -156,7 +158,7 @@ IF(test)
# Add all unit test files
ADD_EXECUTABLE(ousia_test_plugin_mozjs
- test/plugins/mozjs/MozJsScriptEngineTest.cpp
+ test/plugins/mozjs/MozJsScriptEngineTest
)
TARGET_LINK_LIBRARIES(ousia_test_plugin_mozjs
diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp
index 0377015..c13628f 100644
--- a/src/core/utils/BufferedCharReader.cpp
+++ b/src/core/utils/BufferedCharReader.cpp
@@ -1,6 +1,6 @@
/*
- SCAENEA IDL Compiler (scidlc)
- Copyright (C) 2014 Andreas Stöckel
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp
index 86f43b5..b13cde6 100644
--- a/src/core/utils/BufferedCharReader.hpp
+++ b/src/core/utils/BufferedCharReader.hpp
@@ -1,6 +1,6 @@
/*
- SCAENEA IDL Compiler (scidlc)
- Copyright (C) 2014 Andreas Stöckel
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -236,5 +236,5 @@ public:
}
}
-#endif /* _OUSISA_UTILS_BUFFERED_CHAR_READER_H_ */
+#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 38f7585..2c36438 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -16,6 +16,8 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <sstream>
+
#include "Tokenizer.hpp"
namespace ousia {
@@ -52,10 +54,10 @@ static std::map<char, TokenTreeNode> buildChildren(
static int buildId(const std::map<std::string, int> &inputs)
{
- int tokenId = -1;
+ int tokenId = TOKEN_NONE;
for (auto &e : inputs) {
if (e.first.empty()) {
- if (tokenId != -1) {
+ if (tokenId != TOKEN_NONE) {
throw TokenizerException{std::string{"Ambigous token found: "} +
std::to_string(e.second)};
} else {
@@ -68,8 +70,115 @@ static int buildId(const std::map<std::string, int> &inputs)
TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
: children(buildChildren(inputs)), tokenId(buildId(inputs))
+{
+}
+
+Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root)
+ : input(input), root(root)
+{
+}
+
+bool Tokenizer::prepare()
+{
+ std::stringstream buffer;
+ char c;
+ const int startColumn = input.getColumn();
+ const int startLine = input.getLine();
+ bool bufEmpty = true;
+ while (input.peek(&c)) {
+ if (root.children.find(c) != root.children.end()) {
+ // if there might be a special token, keep peeking forward
+ // until we find the token (or we don't).
+ TokenTreeNode const *n = &root;
+ std::stringstream tBuf;
+ int match = TOKEN_NONE;
+ while (true) {
+ tBuf << c;
+ n = &(n->children.at(c));
+ if (n->tokenId != TOKEN_NONE) {
+ // from here on we found a token. If we have something
+ // in our buffer already, we end the search now.
+ if (!bufEmpty) {
+ break;
+ } else {
+ // if we want to return this token ( = we have nothing
+ // in our buffer yet) we look greedily for the longest
+ // possible token we can construct.
+ input.consumePeek();
+ }
+ }
+ if (!input.peek(&c)) {
+ // if we are at the end we break off the search.
+ break;
+ }
+ if (n->children.find(c) == root.children.end()) {
+ // if we do not find a possible continuation anymore,
+ // break off the search.
+ break;
+ }
+ }
+ // check if we did indeed find a special token.
+ if (match != TOKEN_NONE) {
+ input.resetPeek();
+ if (bufEmpty) {
+ // if we did not have text before, construct that token.
+ peeked.push_back(Token{match, tBuf.str(), startColumn,
+ startLine, input.getColumn(),
+ input.getLine()});
+ return true;
+ } else {
+ // otherwise we return the text before the token.
+ peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
+ startColumn, startLine,
+ input.getColumn(), input.getLine()});
+ return true;
+ }
+ }
+ }
+ buffer << c;
+ bufEmpty = false;
+ input.consumePeek();
+ }
+ if (!bufEmpty) {
+ peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+ input.getColumn(), input.getLine()});
+ return true;
+ }
+ return false;
+}
+bool Tokenizer::next(Token &t)
{
+ if (peeked.empty()) {
+ if (!prepare()) {
+ return false;
+ }
+ }
+ t = peeked.front();
+ peeked.pop_front();
+ resetPeek();
+ return true;
+}
+
+bool Tokenizer::peek(Token &t)
+{
+ if (peekCursor >= peeked.size()) {
+ if (!prepare()) {
+ return false;
+ }
+ }
+ t = peeked[peekCursor];
+ return true;
+}
+
+void Tokenizer::resetPeek() { peekCursor = 0; }
+
+void Tokenizer::consumePeek()
+{
+ while (peekCursor > 0) {
+ peeked.pop_front();
+ peekCursor--;
+ }
}
}
}
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 24c4f30..924b670 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -21,7 +21,9 @@
#include <istream>
#include <map>
-#include <queue>
+#include <deque>
+
+#include "BufferedCharReader.hpp"
namespace ousia {
namespace utils {
@@ -44,33 +46,47 @@ public:
};
struct Token {
- const int tokenId;
- const std::string content;
- const int column;
- const int line;
-
- Token(int tokenId, std::string content, int column, int line)
- : tokenId(tokenId), content(content), column(column), line(line)
+ int tokenId;
+ std::string content;
+ int startColumn;
+ int startLine;
+ int endColumn;
+ int endLine;
+
+ Token(int tokenId, std::string content, int startColumn, int startLine,
+ int endColumn, int endLine)
+ : tokenId(tokenId),
+ content(content),
+ startColumn(startColumn),
+ startLine(startLine),
+ endColumn(endColumn),
+ endLine(endLine)
{
}
};
+static const int TOKEN_NONE = -1;
+static const int TOKEN_TEXT = -2;
+
class Tokenizer {
private:
- const std::istream &input;
- const TokenTreeNode root;
- const std::queue<Token> peekQueue;
+ BufferedCharReader &input;
+ const TokenTreeNode &root;
+ std::deque<Token> peeked;
+ unsigned int peekCursor = 0;
+
+ bool prepare();
public:
- Tokenizer(const TokenTreeNode &root, std::istream &input);
+ Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
- bool hasNext();
+ bool next(Token &t);
- const Token &next();
+ bool peek(Token &t);
- const Token &peek();
+ void resetPeek();
- void reset();
+ void consumePeek();
};
}
}
diff --git a/test/core/utils/BufferedCharReader.cpp b/test/core/utils/BufferedCharReaderTest.cpp
index f8f668c..69c0974 100644
--- a/test/core/utils/BufferedCharReader.cpp
+++ b/test/core/utils/BufferedCharReaderTest.cpp
@@ -21,7 +21,10 @@
#include "gtest/gtest.h"
-#include "BufferedCharReader.hpp"
+#include <core/utils/BufferedCharReader.hpp>
+
+namespace ousia{
+namespace utils{
TEST(BufferedCharReaderTest, SimpleReadTest)
{
@@ -29,7 +32,7 @@ TEST(BufferedCharReaderTest, SimpleReadTest)
char c;
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
reader.feed(testStr);
reader.close();
@@ -58,7 +61,7 @@ TEST(BufferedCharReaderTest, SimplePeekTest)
char c;
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
reader.feed(testStr);
reader.close();
@@ -97,7 +100,7 @@ TEST(BufferedCharReaderTest, SplittedPeakTest)
char c;
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
// Try to peek the test string, feed char after char into the reader
std::string res;
@@ -132,7 +135,7 @@ TEST(BufferedCharReaderTest, RowColumnCounterTest)
char c;
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
reader.feed(testStr);
reader.close();
@@ -162,7 +165,7 @@ TEST(BufferedCharReaderTest, LinebreakSubstitutionTest)
const std::string expStr("this\nis\njust\na test\n\ntest\n");
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
reader.feed(testStr);
// Read all characters from the test string
@@ -183,7 +186,7 @@ TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test)
char c;
// Feed a test string into the reader
- scaenea::compiler::BufferedCharReader reader;
+ BufferedCharReader reader;
reader.feed(testStr);
reader.close();
@@ -196,3 +199,5 @@ TEST(BufferedCharReaderTest, RowColumnCounterUTF8Test)
ASSERT_EQ(6, reader.getColumn());
}
+}
+}