summary | refs | log | tree | commit | diff
path: root/src/core/utils/CodeTokenizer.cpp
diff options
context:
space:
mode:
author Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-11-20 17:40:36 +0100
committer Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2014-11-20 17:40:36 +0100
commit 6c8ee8084a8fa8317be69f5578d9b1052aee3b70 (patch)
tree fe673a403e75d3478e1995fba2e40ec268f1cb8b /src/core/utils/CodeTokenizer.cpp
parent adf0b5eaef95484a8d3b8ad1e6e6765018658bdc (diff)
parent d2f14ec9b2d54c8addc03fef147be15327dd8623 (diff)
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/core/utils/CodeTokenizer.cpp')
-rw-r--r-- src/core/utils/CodeTokenizer.cpp 62
1 files changed, 42 insertions, 20 deletions
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
index c1376af..e5b8610 100644
--- a/src/core/utils/CodeTokenizer.cpp
+++ b/src/core/utils/CodeTokenizer.cpp
@@ -23,30 +23,30 @@
namespace ousia {
namespace utils {
-Token CodeTokenizer::constructToken(const Token& t)
+Token CodeTokenizer::constructToken(const Token &t)
{
std::string content = buf.str();
buf.str(std::string());
- return Token{returnTokenId, content,
- startToken.startColumn, startToken.startLine,
- t.endColumn, t.endLine};
+ return Token{returnTokenId, content, startToken.startColumn,
+ startToken.startLine, t.endColumn, t.endLine};
}
void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
{
- if (t.startLine != t.endLine) {
- throw TokenizerException(
- "We did not expect a multiline token. Most likely you did not add "
- "a linebreak token to your tokenizer!");
- }
-
auto it = descriptors.find(t.tokenId);
CodeTokenMode mode = CodeTokenMode::NONE;
if (it != descriptors.end()) {
mode = it->second.mode;
}
+
+ if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
+ throw TokenizerException(
+ "We did not expect a multiline token (except linebreaks). Most "
+ "likely you did not add a linebreak token to your tokenizer!");
+ }
+
switch (state) {
case CodeTokenizerState::NORMAL:
switch (mode) {
@@ -59,25 +59,47 @@ bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
case CodeTokenMode::LINE_COMMENT:
state = CodeTokenizerState::IN_LINE_COMMENT;
break;
+ case CodeTokenMode::LINEBREAK:
+ peeked.push_back({it->second.id, t.content, t.startColumn,
+ t.startLine, t.endColumn, t.endLine});
+ return true;
default:
if (t.tokenId == TOKEN_TEXT) {
int begin = -1;
for (size_t c = 0; c < t.content.length(); c++) {
bool isWhitespace =
t.content[c] == ' ' || t.content[c] == '\t';
- if (begin >= 0 && isWhitespace) {
- peeked.push_back(Token{
- TOKEN_TEXT,
- t.content.substr(begin, (int)c - begin),
- t.startColumn + begin, t.startLine,
- t.startColumn + (int)c, t.endLine});
- }
- if (!isWhitespace && begin < 0) {
- begin = c;
+ if (begin < 0) {
+ // if we have not yet set our beginning,
+ // we wait for the first
+ // non-whitespace-character to set it.
+ if (!isWhitespace) {
+ begin = c;
+ }
+ } else {
+ // if we have set our beginning, we wait for the
+ // first whitespace character, which marks the
+ // end of the current word.
+ if (isWhitespace) {
+ peeked.push_back(Token{
+ TOKEN_TEXT,
+ t.content.substr(begin, (int)c - begin),
+ t.startColumn + begin, t.startLine,
+ t.startColumn + (int)c, t.endLine});
+ begin = -1;
+ }
}
}
+ if(begin >= 0){
+ peeked.push_back(Token{
+ TOKEN_TEXT,
+ t.content.substr(begin),
+ t.startColumn + begin, t.startLine,
+ t.endColumn, t.endLine});
+ }
+ } else {
+ peeked.push_back(t);
}
- peeked.push_back(t);
return true;
}
startToken = t;