/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cassert>
#include <deque>
#include <string>

#include "CodeTokenizer.hpp"
namespace ousia {
/**
 * Builds the token for the string or comment that has been collected in the
 * internal buffer and empties the buffer afterwards.
 *
 * The resulting token carries the id stored in returnTokenId and the current
 * buffer content. Its location uses the source id of the given token and
 * reaches from the start of startToken to the end of the given token.
 *
 * @param t is the token that terminates the collected sequence.
 * @return the newly constructed token.
 */
Token CodeTokenizer::constructToken(const Token &t)
{
	// Span from the start of the opening token to the end of the closing one.
	const SourceLocation span{t.location.getSourceId(),
	                          startToken.location.getStart(),
	                          t.location.getEnd()};

	// Take a snapshot of the collected text, then reset the buffer so the next
	// string or comment starts from an empty state.
	const std::string collected = buf.str();
	buf.str(std::string());

	return Token{returnTokenId, collected, span};
}
/**
 * Appends the content of the given token to the internal buffer.
 *
 * @param t is the token whose content should be collected.
 */
void CodeTokenizer::buffer(const Token &t)
{
	buf << t.content;
}
bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked)
{
auto it = descriptors.find(t.tokenId);
CodeTokenMode mode = CodeTokenMode::NONE;
if (it != descriptors.end()) {
mode = it->second.mode;
}
switch (state) {
case CodeTokenizerState::NORMAL:
switch (mode) {
case CodeTokenMode::STRING_START_END:
state = CodeTokenizerState::IN_STRING;
break;
case CodeTokenMode::BLOCK_COMMENT_START:
state = CodeTokenizerState::IN_BLOCK_COMMENT;
break;
case CodeTokenMode::LINE_COMMENT:
state = CodeTokenizerState::IN_LINE_COMMENT;
break;
case CodeTokenMode::LINEBREAK:
if (!ignoreLinebreaks) {
peeked.push_back(
{it->second.id, t.content, t.location});
}
return !ignoreLinebreaks;
default:
bool empty = true;
if (t.tokenId == TOKEN_TEXT) {
int begin = -1;
for (size_t c = 0; c < t.content.length(); c++) {
bool isWhitespace =
t.content[c] == ' ' || t.content[c] == '\t';
if (begin < 0) {
// if we have not yet set our beginning,
// we wait for the first
// non-whitespace-character to set it.
if (!isWhitespace) {
begin = c;
}
} else {
// if we have set our beginning, we wait for the
// first whitespace character, which marks the
// end of the current word.
if (isWhitespace) {
peeked.push_back(Token{
TOKEN_TEXT,
t.content.substr(begin, (int)c - begin),
SourceLocation{
t.location.getSourceId(),
t.location.getStart() + begin,
t.location.getStart() + c}});
begin = -1;
empty = false;
}
}
}
if (begin >= 0) {
peeked.push_back(Token{
TOKEN_TEXT, t.content.substr(begin),
SourceLocation{t.location.getSourceId(),
t.location.getStart() + begin,
t.location.getEnd()}});
empty = false;
}
} else {
empty = false;
peeked.push_back(t);
}
return !empty;
}
startToken = t;
returnTokenId = it->second.id;
return false;
case CodeTokenizerState::IN_LINE_COMMENT:
switch (mode) {
case CodeTokenMode::LINEBREAK:
state = CodeTokenizerState::NORMAL;
if (!ignoreComments) {
peeked.push_back(constructToken(t));
}
return !ignoreComments;
default:
if (!ignoreComments) {
buffer(t);
}
return false;
}
case CodeTokenizerState::IN_BLOCK_COMMENT:
switch (mode) {
case CodeTokenMode::BLOCK_COMMENT_END:
state = CodeTokenizerState::NORMAL;
if (!ignoreComments) {
peeked.push_back(constructToken(t));
}
return !ignoreComments;
default:
if (!ignoreComments) {
buffer(t);
}
return false;
}
case CodeTokenizerState::IN_STRING:
switch (mode) {
case CodeTokenMode::ESCAPE:
if (escaped) {
buffer(t);
}
escaped = !escaped;
return false;
case CodeTokenMode::STRING_START_END:
if (escaped) {
buffer(t);
escaped = false;
return false;
} else {
peeked.push_back(constructToken(t));
state = CodeTokenizerState::NORMAL;
return true;
}
default:
if (escaped) {
// TODO: handle escaped characters?
escaped = false;
}
buffer(t);
return false;
}
}
assert(false);
}
}