/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <deque>
#include <map>
#include <sstream>
#include <string>

#include "Tokenizer.hpp"
namespace ousia {
static std::map buildChildren(
const std::map &inputs)
{
std::map children;
std::map> nexts;
for (auto &e : inputs) {
const std::string &s = e.first;
const int id = e.second;
if (s.empty()) {
continue;
}
char start = s[0];
const std::string suffix = s.substr(1);
if (nexts.find(start) != nexts.end()) {
nexts[start].insert(std::make_pair(suffix, id));
} else {
nexts.insert(std::make_pair(
start, std::map{{suffix, id}}));
}
}
for (auto &n : nexts) {
children.insert(std::make_pair(n.first, TokenTreeNode{n.second}));
}
return children;
}
/**
 * Extracts the token id associated with the empty string from the given
 * token-to-id map, i.e. the id of the token that ends exactly at this node
 * of the token tree.
 *
 * @param inputs is a map from token string to its numeric token id.
 * @return the id registered for the empty string, or TOKEN_NONE if none
 * exists.
 * @throws TokenizerException if an empty-string token is found while
 * tokenId is already set (defensive check against ambiguous tokens).
 */
static int buildId(const std::map<std::string, int> &inputs)
{
    int tokenId = TOKEN_NONE;
    for (const auto &e : inputs) {
        if (e.first.empty()) {
            if (tokenId != TOKEN_NONE) {
                // fixed typo in the original message ("Ambigous")
                throw TokenizerException{std::string{"Ambiguous token found: "} +
                                         std::to_string(e.second)};
            } else {
                tokenId = e.second;
            }
        }
    }
    return tokenId;
}
/**
 * Constructs a TokenTreeNode (and, recursively, its whole subtree) from a
 * map of token strings to token ids: children are built per distinct first
 * character, and tokenId is the id of the token ending at this node (or
 * TOKEN_NONE).
 *
 * @param inputs is a map from token string to its numeric token id.
 */
TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
    : children(buildChildren(inputs)), tokenId(buildId(inputs))
{
}
/**
 * Constructs a Tokenizer reading from the given character stream.
 *
 * @param input is the CharReader the tokens are read from; only a reference
 * is stored, so the reader must outlive this Tokenizer.
 * @param root is the root of the token tree describing the special tokens;
 * also stored by reference.
 */
Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root)
    : input(input), root(root)
{
}
/**
 * Reads from the input until either one special token or one segment of
 * plain text has been identified, and hands the corresponding Token to
 * doPrepare (which by default appends it to the peek queue).
 *
 * Plain characters are accumulated in "buffer" until a character is seen
 * that starts a path in the token tree; the tree is then walked greedily to
 * find the longest matching special token.
 *
 * @return true as soon as doPrepare accepted a token, false if the end of
 * the input was reached without producing one.
 */
bool Tokenizer::prepare()
{
    std::stringstream buffer;
    char c;
    SourcePosition start = input.getOffset();
    bool bufEmpty = true;
    while (input.peek(c)) {
        if (root.children.find(c) != root.children.end()) {
            // if there might be a special token, keep peeking forward
            // until we find the token (or we don't).
            TokenTreeNode const *n = &root;
            std::stringstream tBuf;
            int match = TOKEN_NONE;
            while (true) {
                tBuf << c;
                n = &(n->children.at(c));
                if (n->tokenId != TOKEN_NONE) {
                    // remember the longest token found so far.
                    match = n->tokenId;
                    // from here on we found a token. If we have something
                    // in our buffer already, we end the search now.
                    if (!bufEmpty) {
                        break;
                    } else {
                        // if we want to return this token ( = we have nothing
                        // in our buffer yet) we look greedily for the longest
                        // possible token we can construct.
                        input.consumePeek();
                    }
                }
                if (!input.peek(c)) {
                    // if we are at the end we break off the search.
                    break;
                }
                if (n->children.find(c) == n->children.end()) {
                    // if we do not find a possible continuation anymore,
                    // break off the search.
                    break;
                }
            }
            // reset the peek pointer to the last valid position.
            input.resetPeek();
            // check if we did indeed find a special token.
            if (match != TOKEN_NONE) {
                if (bufEmpty) {
                    // if we did not have text before, construct that token.
                    if (doPrepare(
                            Token{match, tBuf.str(), input.getLocation(start)},
                            peeked)) {
                        return true;
                    } else {
                        // token was filtered out; restart after it.
                        start = input.getOffset();
                        continue;
                    }
                } else {
                    // otherwise we return the text before the token.
                    if (doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)},
                                  peeked)) {
                        return true;
                    } else{
                        // we need to clear the buffer here. After all the token
                        // corresponding to this buffer segment is already
                        // constructed.
                        buffer.str(std::string());
                        bufEmpty = true;
                        start = input.getOffset();
                        continue;
                    }
                }
            } else{
                // if we found nothing, read at least one character.
                // (after resetPeek this re-reads the first peeked character)
                input.peek(c);
            }
        }
        // plain text (or the failed prefix of a special token): append the
        // character to the buffer and consume it from the input.
        buffer << c;
        bufEmpty = false;
        input.consumePeek();
    }
    // flush any trailing plain text as a final TOKEN_TEXT token.
    if (!bufEmpty) {
        return doPrepare(Token{TOKEN_TEXT, buffer.str(), input.getLocation(start)},
                         peeked);
    }
    return false;
}
bool Tokenizer::doPrepare(const Token &t, std::deque &peeked)
{
peeked.push_back(t);
return true;
}
/**
 * Retrieves the next token from the stream and removes it permanently,
 * resetting the peek cursor in the process.
 *
 * @param t is set to the retrieved token on success.
 * @return true if a token was available, false at end of input.
 */
bool Tokenizer::next(Token &t)
{
    // refill the queue if needed; bail out at end of input.
    if (peeked.empty() && !prepare()) {
        return false;
    }
    t = peeked.front();
    peeked.pop_front();
    resetPeek();
    return true;
}
/**
 * Looks at the next token without consuming it; successive calls advance
 * an internal cursor so the stream can be peeked arbitrarily far ahead.
 *
 * @param t is set to the peeked token on success.
 * @return true if a token was available at the cursor, false at end of
 * input.
 */
bool Tokenizer::peek(Token &t)
{
    // either a token is already queued at the cursor, or we try to read one.
    const bool available = peekCursor < peeked.size() || prepare();
    if (!available) {
        return false;
    }
    t = peeked[peekCursor++];
    return true;
}
// Rewinds the peek cursor so peeking restarts at the first unconsumed token.
void Tokenizer::resetPeek()
{
    peekCursor = 0;
}
// Permanently removes all tokens that were read via peek() from the queue,
// leaving the cursor at the front of the remaining tokens.
void Tokenizer::consumePeek()
{
    for (; peekCursor > 0; --peekCursor) {
        peeked.pop_front();
    }
}
}