Diffstat (limited to 'src')
-rw-r--r--  src/core/utils/BufferedCharReader.cpp    4
-rw-r--r--  src/core/utils/BufferedCharReader.hpp    6
-rw-r--r--  src/core/utils/Tokenizer.cpp            113
-rw-r--r--  src/core/utils/Tokenizer.hpp             48
4 files changed, 148 insertions, 23 deletions
diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp
index 0377015..c13628f 100644
--- a/src/core/utils/BufferedCharReader.cpp
+++ b/src/core/utils/BufferedCharReader.cpp
@@ -1,6 +1,6 @@
/*
- SCAENEA IDL Compiler (scidlc)
- Copyright (C) 2014 Andreas Stöckel
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp
index 86f43b5..b13cde6 100644
--- a/src/core/utils/BufferedCharReader.hpp
+++ b/src/core/utils/BufferedCharReader.hpp
@@ -1,6 +1,6 @@
/*
- SCAENEA IDL Compiler (scidlc)
- Copyright (C) 2014 Andreas Stöckel
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -236,5 +236,5 @@ public:
}
}
-#endif /* _OUSISA_UTILS_BUFFERED_CHAR_READER_H_ */
+#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 38f7585..2c36438 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -16,6 +16,8 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <sstream>
+
#include "Tokenizer.hpp"
namespace ousia {
@@ -52,10 +54,10 @@ static std::map<char, TokenTreeNode> buildChildren(
static int buildId(const std::map<std::string, int> &inputs)
{
- int tokenId = -1;
+ int tokenId = TOKEN_NONE;
for (auto &e : inputs) {
if (e.first.empty()) {
- if (tokenId != -1) {
+ if (tokenId != TOKEN_NONE) {
throw TokenizerException{std::string{"Ambigous token found: "} +
std::to_string(e.second)};
} else {
@@ -68,8 +70,115 @@ static int buildId(const std::map<std::string, int> &inputs)
TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
: children(buildChildren(inputs)), tokenId(buildId(inputs))
+{
+}
+
+Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root)
+ : input(input), root(root)
+{
+}
+
+bool Tokenizer::prepare()
+{
+ std::stringstream buffer;
+ char c;
+ const int startColumn = input.getColumn();
+ const int startLine = input.getLine();
+ bool bufEmpty = true;
+ while (input.peek(&c)) {
+ if (root.children.find(c) != root.children.end()) {
+ // if there might be a special token, keep peeking forward
+ // until we find the token (or we don't).
+ TokenTreeNode const *n = &root;
+ std::stringstream tBuf;
+ int match = TOKEN_NONE;
+ while (true) {
+ tBuf << c;
+ n = &(n->children.at(c));
+ if (n->tokenId != TOKEN_NONE) {
+ // we found a token: remember its id; if the buffer already has text, end the search now.
+ match = n->tokenId;
+ if (!bufEmpty) {
+ break;
+ } else {
+ // if we want to return this token (i.e. we have nothing
+ // in our buffer yet) we look greedily for the longest
+ // possible token we can construct.
+ input.consumePeek();
+ }
+ }
+ if (!input.peek(&c)) {
+ // if we are at the end we break off the search.
+ break;
+ }
+ if (n->children.find(c) == n->children.end()) {
+ // if we do not find a possible continuation anymore,
+ // break off the search.
+ break;
+ }
+ }
+ // check if we did indeed find a special token.
+ if (match != TOKEN_NONE) {
+ input.resetPeek();
+ if (bufEmpty) {
+ // if we did not have text before, construct that token.
+ peeked.push_back(Token{match, tBuf.str(), startColumn,
+ startLine, input.getColumn(),
+ input.getLine()});
+ return true;
+ } else {
+ // otherwise we return the text before the token.
+ peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
+ startColumn, startLine,
+ input.getColumn(), input.getLine()});
+ return true;
+ }
+ }
+ }
+ buffer << c;
+ bufEmpty = false;
+ input.consumePeek();
+ }
+ if (!bufEmpty) {
+ peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+ input.getColumn(), input.getLine()});
+ return true;
+ }
+ return false;
+}
+bool Tokenizer::next(Token &t)
{
+ if (peeked.empty()) {
+ if (!prepare()) {
+ return false;
+ }
+ }
+ t = peeked.front();
+ peeked.pop_front();
+ resetPeek();
+ return true;
+}
+
+bool Tokenizer::peek(Token &t)
+{
+ if (peekCursor >= peeked.size()) {
+ if (!prepare()) {
+ return false;
+ }
+ }
+ t = peeked[peekCursor++]; // advance the cursor so repeated peeks move forward
+ return true;
+}
+
+void Tokenizer::resetPeek() { peekCursor = 0; }
+
+void Tokenizer::consumePeek()
+{
+ while (peekCursor > 0) {
+ peeked.pop_front();
+ peekCursor--;
+ }
}
}
}
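To make the new Tokenizer::prepare() hunk above easier to follow, here is a minimal usage sketch. It is illustrative only: the Tokenizer, Token, TokenTreeNode and constant names follow this commit, but the default construction of the BufferedCharReader and the feed()/close() calls that fill it are assumptions, since that part of its interface is not visible in this diff.

// Sketch of the greedy, longest-match behaviour implemented by
// Tokenizer::prepare(); the BufferedCharReader construction and
// feed()/close() calls are assumed, not taken from this commit.
#include <iostream>
#include <map>
#include <string>

#include "BufferedCharReader.hpp"
#include "Tokenizer.hpp"

using namespace ousia::utils;

int main()
{
    // "<" and "<!--" share a prefix; prepare() keeps peeking and prefers
    // the longest token, so "<!--" should come back as a single token.
    const std::map<std::string, int> tokens{{"<", 1}, {"<!--", 2}, {"-->", 3}};
    TokenTreeNode root{tokens};

    BufferedCharReader reader;
    reader.feed("text <!-- comment --> more");  // assumed feeding API
    reader.close();                             // assumed

    Tokenizer tokenizer{reader, root};
    Token t{TOKEN_NONE, "", 0, 0, 0, 0};
    while (tokenizer.next(t)) {
        // Expected sequence: TOKEN_TEXT("text "), 2("<!--"),
        // TOKEN_TEXT(" comment "), 3("-->"), TOKEN_TEXT(" more").
        std::cout << t.tokenId << ": \"" << t.content << "\"" << std::endl;
    }
    return 0;
}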
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 24c4f30..924b670 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -21,7 +21,9 @@
#include <istream>
#include <map>
-#include <queue>
+#include <deque>
+
+#include "BufferedCharReader.hpp"
namespace ousia {
namespace utils {
@@ -44,33 +46,47 @@ public:
};
struct Token {
- const int tokenId;
- const std::string content;
- const int column;
- const int line;
-
- Token(int tokenId, std::string content, int column, int line)
- : tokenId(tokenId), content(content), column(column), line(line)
+ int tokenId;
+ std::string content;
+ int startColumn;
+ int startLine;
+ int endColumn;
+ int endLine;
+
+ Token(int tokenId, std::string content, int startColumn, int startLine,
+ int endColumn, int endLine)
+ : tokenId(tokenId),
+ content(content),
+ startColumn(startColumn),
+ startLine(startLine),
+ endColumn(endColumn),
+ endLine(endLine)
{
}
};
+static const int TOKEN_NONE = -1;
+static const int TOKEN_TEXT = -2;
+
class Tokenizer {
private:
- const std::istream &input;
- const TokenTreeNode root;
- const std::queue<Token> peekQueue;
+ BufferedCharReader &input;
+ const TokenTreeNode &root;
+ std::deque<Token> peeked;
+ unsigned int peekCursor = 0;
+
+ bool prepare();
public:
- Tokenizer(const TokenTreeNode &root, std::istream &input);
+ Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
- bool hasNext();
+ bool next(Token &t);
- const Token &next();
+ bool peek(Token &t);
- const Token &peek();
+ void resetPeek();
- void reset();
+ void consumePeek();
};
}
}
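The new peek()/resetPeek()/consumePeek() trio replaces the old single peek queue with an explicit cursor over a deque of already produced tokens: peek() walks forward without consuming anything, resetPeek() rewinds the cursor so next() starts at the front again, and consumePeek() commits everything peeked so far. A hypothetical two-token lookahead helper written against the interface declared above (the assignTokenId parameter and the assignment rule are invented for illustration) might look like this:

// Hypothetical lookahead helper; not part of this commit.
#include "Tokenizer.hpp"

using namespace ousia::utils;

// Returns true and consumes two tokens if the upcoming input is a text
// token followed by the given assignment token; otherwise the tokenizer
// is left exactly where it was.
bool startsAssignment(Tokenizer &tokenizer, int assignTokenId)
{
    Token name{TOKEN_NONE, "", 0, 0, 0, 0};
    Token op{TOKEN_NONE, "", 0, 0, 0, 0};

    // Look two tokens ahead without consuming any input.
    if (!tokenizer.peek(name) || !tokenizer.peek(op)) {
        tokenizer.resetPeek();  // rewind; nothing has been consumed
        return false;
    }
    if (name.tokenId == TOKEN_TEXT && op.tokenId == assignTokenId) {
        tokenizer.consumePeek();  // commit both peeked tokens
        return true;
    }
    tokenizer.resetPeek();  // mismatch: next() will see the same tokens again
    return false;
}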