summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/core/utils/CSSParser.cpp 10
-rw-r--r-- src/core/utils/CodeTokenizer.cpp 144
-rw-r--r-- src/core/utils/CodeTokenizer.hpp 83
-rw-r--r-- src/core/utils/Tokenizer.cpp 36
-rw-r--r-- src/core/utils/Tokenizer.hpp 23
5 files changed, 276 insertions, 20 deletions
diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp
index 1639152..e66eb34 100644
--- a/src/core/utils/CSSParser.cpp
+++ b/src/core/utils/CSSParser.cpp
@@ -44,11 +44,11 @@ static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
{"/*", COMMENT_OPEN},
{"*/", COMMENT_CLOSE}}};
-//StyleNode CSSParser::parse(BufferedCharReader &input) {
-// Tokenizer tokenizer {input, CSS_ROOT};
-// //TODO: implement
-//
-//}
+StyleNode CSSParser::parse(BufferedCharReader &input) { // Stub: so far it only sets up a Tokenizer over input using the CSS_ROOT token tree.
+ Tokenizer tokenizer {input, CSS_ROOT};
+ //TODO: implement. NOTE(review): no StyleNode is returned yet, so control flows off the end of this non-void function; calling it is undefined behavior until a return statement is added.
+
+}
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
new file mode 100644
index 0000000..c1376af
--- /dev/null
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -0,0 +1,144 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cassert>
+
+#include "CodeTokenizer.hpp"
+
+namespace ousia {
+namespace utils {
+
+/**
+ * Builds the merged token for a finished string or comment and empties the
+ * content buffer afterwards.
+ *
+ * @param t the token that ended the string or comment; it provides the end
+ * position of the merged token.
+ * @return a token with the id stored in returnTokenId and the buffered
+ * content, starting at the start position of startToken and ending at the end
+ * position of t.
+ */
+Token CodeTokenizer::constructToken(const Token& t)
+{
+    std::string merged = buf.str();
+    // Reset the buffer so the next string or comment starts out empty.
+    buf.str(std::string());
+    Token result{returnTokenId, merged, startToken.startColumn,
+                 startToken.startLine, t.endColumn, t.endLine};
+    return result;
+}
+
+void CodeTokenizer::buffer(const Token &t) { buf << t.content; } // Appends the content of t to the buffer that collects the current string or comment.
+
+/**
+ * Refines the tokens found by the underlying Tokenizer: strings and comments
+ * are merged into single tokens carrying the id of the descriptor of their
+ * start token, and plain text is split into whitespace-separated words.
+ *
+ * @param t the token found by the basic tokenizer.
+ * @param peeked the deque the resulting tokens are appended to.
+ * @return true if at least one token was appended to peeked, false if t was
+ * only buffered or ignored.
+ * @throws TokenizerException if t spans more than one line.
+ */
+bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
+{
+    if (t.startLine != t.endLine) {
+        throw TokenizerException(
+            "We did not expect a multiline token. Most likely you did not add "
+            "a linebreak token to your tokenizer!");
+    }
+
+    // Tokens without a descriptor have no special meaning (mode NONE).
+    auto it = descriptors.find(t.tokenId);
+    CodeTokenMode mode = CodeTokenMode::NONE;
+    if (it != descriptors.end()) {
+        mode = it->second.mode;
+    }
+    switch (state) {
+        case CodeTokenizerState::NORMAL:
+            switch (mode) {
+                case CodeTokenMode::STRING_START_END:
+                    state = CodeTokenizerState::IN_STRING;
+                    break;
+                case CodeTokenMode::BLOCK_COMMENT_START:
+                    state = CodeTokenizerState::IN_BLOCK_COMMENT;
+                    break;
+                case CodeTokenMode::LINE_COMMENT:
+                    state = CodeTokenizerState::IN_LINE_COMMENT;
+                    break;
+                default: {
+                    if (t.tokenId != TOKEN_TEXT) {
+                        peeked.push_back(t);
+                        return true;
+                    }
+                    // Split the text into words. Each word is emitted exactly
+                    // once: "begin" is reset after every word, and the end of
+                    // the content (c == length) is treated like whitespace so
+                    // that the last word is emitted as well. The unsplit
+                    // token is not appended in addition to its words.
+                    const int length = static_cast<int>(t.content.length());
+                    bool appended = false;
+                    int begin = -1;
+                    for (int c = 0; c <= length; c++) {
+                        const bool isWhitespace = c == length ||
+                                                  t.content[c] == ' ' ||
+                                                  t.content[c] == '\t';
+                        if (begin < 0) {
+                            // Waiting for the first character of a word.
+                            if (!isWhitespace) {
+                                begin = c;
+                            }
+                        } else if (isWhitespace) {
+                            // The current word ends right before position c.
+                            peeked.push_back(Token{
+                                TOKEN_TEXT, t.content.substr(begin, c - begin),
+                                t.startColumn + begin, t.startLine,
+                                c == length ? t.endColumn : t.startColumn + c,
+                                t.endLine});
+                            begin = -1;
+                            appended = true;
+                        }
+                    }
+                    // Whitespace-only text yields no token at all.
+                    return appended;
+                }
+            }
+            // A string or comment starts here: remember where it begins and
+            // which token id the merged token will get. Nothing is emitted
+            // until the matching end token arrives.
+            startToken = t;
+            returnTokenId = it->second.id;
+            return false;
+        case CodeTokenizerState::IN_LINE_COMMENT:
+            switch (mode) {
+                case CodeTokenMode::LINEBREAK:
+                    // The linebreak ends the line comment.
+                    state = CodeTokenizerState::NORMAL;
+                    if (!ignoreComments) {
+                        peeked.push_back(constructToken(t));
+                    }
+                    return !ignoreComments;
+                default:
+                    if (!ignoreComments) {
+                        buffer(t);
+                    }
+                    return false;
+            }
+        case CodeTokenizerState::IN_BLOCK_COMMENT:
+            switch (mode) {
+                case CodeTokenMode::BLOCK_COMMENT_END:
+                    state = CodeTokenizerState::NORMAL;
+                    if (!ignoreComments) {
+                        peeked.push_back(constructToken(t));
+                    }
+                    return !ignoreComments;
+                default:
+                    if (!ignoreComments) {
+                        buffer(t);
+                    }
+                    return false;
+            }
+        case CodeTokenizerState::IN_STRING:
+            switch (mode) {
+                case CodeTokenMode::ESCAPE:
+                    // An escaped escape token is literal content; an
+                    // unescaped one only marks the next token as escaped.
+                    if (escaped) {
+                        buffer(t);
+                    }
+                    escaped = !escaped;
+                    return false;
+                case CodeTokenMode::STRING_START_END:
+                    if (escaped) {
+                        // An escaped delimiter belongs to the string content.
+                        buffer(t);
+                        escaped = false;
+                        return false;
+                    } else {
+                        peeked.push_back(constructToken(t));
+                        state = CodeTokenizerState::NORMAL;
+                        return true;
+                    }
+                default:
+                    if (escaped) {
+                        // TODO: handle escaped characters?
+                        escaped = false;
+                    }
+                    buffer(t);
+                    return false;
+            }
+    }
+    // Only reachable if state holds a value outside of the enumeration.
+    assert(false);
+    // Keeps the function well-defined when assertions are compiled out.
+    return false;
+}
+}
+}
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
new file mode 100644
index 0000000..f26a74c
--- /dev/null
+++ b/src/core/utils/CodeTokenizer.hpp
@@ -0,0 +1,83 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_
+#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_
+
+#include <map>
+#include <sstream>
+
+#include "BufferedCharReader.hpp"
+#include "Tokenizer.hpp"
+
+namespace ousia {
+namespace utils {
+
+enum class CodeTokenMode { // Role a token id plays for the CodeTokenizer.
+ STRING_START_END, // opens a string in normal state and closes it again when not escaped
+ LINE_COMMENT, // starts a line comment
+ BLOCK_COMMENT_START, // starts a block comment
+ BLOCK_COMMENT_END, // ends a block comment
+ LINEBREAK, // ends a line comment
+ ESCAPE, // toggles the escape flag while inside a string
+ NONE // used for tokens that have no descriptor
+};
+
+struct CodeTokenDescriptor { // Describes how the CodeTokenizer treats one token id of the underlying tokenizer.
+ CodeTokenMode mode; // role of the token
+ int id; // token id given to the merged string/comment token that this token starts
+
+ CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
+};
+
+
+enum class CodeTokenizerState { // Internal state of the CodeTokenizer between two calls of doPrepare.
+ NORMAL, // outside of strings and comments
+ IN_BLOCK_COMMENT, // after a block comment start, until the block comment end token
+ IN_LINE_COMMENT, // after a line comment token, until the next linebreak token
+ IN_STRING // after a string delimiter, until the next unescaped string delimiter
+};
+
+/**
+ * Tokenizer that post-processes the tokens of the basic Tokenizer: strings
+ * and comments are merged into single tokens and plain text is split into
+ * words (see doPrepare).
+ */
+class CodeTokenizer : public Tokenizer {
+private:
+    // Maps token ids of the underlying tokenizer to their role and to the id
+    // of the merged token they start.
+    std::map<int, CodeTokenDescriptor> descriptors;
+    // Current state. It has to start out as NORMAL because doPrepare reads it
+    // before it is ever assigned.
+    CodeTokenizerState state = CodeTokenizerState::NORMAL;
+    // Collects the content of the string or comment currently being read.
+    std::stringstream buf;
+    // Token that started the current string or comment.
+    Token startToken;
+    // Id of the merged token that will be emitted for the current string or
+    // comment; TOKEN_NONE until the first start token has been seen.
+    int returnTokenId = TOKEN_NONE;
+    // True if the previous token inside a string was an unescaped escape
+    // token.
+    bool escaped = false;
+
+    Token constructToken(const Token& t);
+    void buffer(const Token& t);
+
+protected:
+    bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
+
+public:
+    // If true, comments are dropped instead of being emitted as tokens.
+    bool ignoreComments = false;
+
+    /**
+     * @param input the reader the tokens are read from.
+     * @param root the token tree handed to the basic Tokenizer.
+     * @param descriptors maps token ids from the token tree to their role.
+     */
+    CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
+                  std::map<int, CodeTokenDescriptor> descriptors)
+        : Tokenizer(input, root), descriptors(descriptors)
+    {
+    }
+};
+}
+}
+
+#endif
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 2c36438..164a30f 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -96,6 +96,7 @@ bool Tokenizer::prepare()
tBuf << c;
n = &(n->children.at(c));
if (n->tokenId != TOKEN_NONE) {
+ match = n->tokenId;
// from here on we found a token. If we have something
// in our buffer already, we end the search now.
if (!bufEmpty) {
@@ -111,7 +112,7 @@ bool Tokenizer::prepare()
// if we are at the end we break off the search.
break;
}
- if (n->children.find(c) == root.children.end()) {
+ if (n->children.find(c) == n->children.end()) {
// if we do not find a possible continuation anymore,
// break off the search.
break;
@@ -122,16 +123,21 @@ bool Tokenizer::prepare()
input.resetPeek();
if (bufEmpty) {
// if we did not have text before, construct that token.
- peeked.push_back(Token{match, tBuf.str(), startColumn,
- startLine, input.getColumn(),
- input.getLine()});
- return true;
+ if (doPrepare(
+ Token{match, tBuf.str(), startColumn, startLine,
+ input.getColumn(), input.getLine()},
+ peeked)) {
+ return true;
+ }
+
} else {
// otherwise we return the text before the token.
- peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
- startColumn, startLine,
- input.getColumn(), input.getLine()});
- return true;
+ if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn,
+ startLine, input.getColumn(),
+ input.getLine()},
+ peeked)) {
+ return true;
+ }
}
}
}
@@ -140,13 +146,19 @@ bool Tokenizer::prepare()
input.consumePeek();
}
if (!bufEmpty) {
- peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
- input.getColumn(), input.getLine()});
- return true;
+ return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+ input.getColumn(), input.getLine()},
+ peeked);
}
return false;
}
+/**
+ * Default implementation: passes the given token through unchanged.
+ *
+ * @param t the token found by the tokenizer.
+ * @param peeked the deque of pending tokens; a copy of t is appended to its
+ * end.
+ * @return always true, because a token is always appended.
+ */
+bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
+{
+    peeked.emplace_back(t);
+    return true;
+}
+
bool Tokenizer::next(Token &t)
{
if (peeked.empty()) {
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 924b670..eb8eed4 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -45,6 +45,9 @@ public:
TokenTreeNode(const std::map<std::string, int> &inputs);
};
+static const int TOKEN_NONE = -1;
+static const int TOKEN_TEXT = -2;
+
struct Token {
int tokenId;
std::string content;
@@ -63,10 +66,9 @@ struct Token {
endLine(endLine)
{
}
-};
-static const int TOKEN_NONE = -1;
-static const int TOKEN_TEXT = -2;
+ Token() : tokenId(TOKEN_NONE) {}
+};
class Tokenizer {
private:
@@ -77,6 +79,21 @@ private:
bool prepare();
+protected:
+ /**
+ * This method is an interface to build multiple tokens from a single one in
+ * derived classes. This might be interesting if you want to implement
+ * further logic on text tokens or similar applications.
+ *
+ * @param t a Token the "basic" tokenizer found.
+ * @param peeked a reference to the deque containing all temporary Tokens.
+ * You are supposed to append your tokens there. In the trivial case you just
+ * append the given Token to the end of the deque.
+ * @return false if no token was appended to the deque (meaning that you want
+ * to ignore the given token explicitly) and true in all other cases.
+ */
+ virtual bool doPrepare(const Token &t, std::deque<Token> &peeked);
+
public:
Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);