/* Ousía Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include "Tokenizer.hpp" namespace ousia { static std::map buildChildren( const std::map &inputs) { std::map children; std::map> nexts; for (auto &e : inputs) { const std::string &s = e.first; const int id = e.second; if (s.empty()) { continue; } char start = s[0]; const std::string suffix = s.substr(1); if (nexts.find(start) != nexts.end()) { nexts[start].insert(std::make_pair(suffix, id)); } else { nexts.insert(std::make_pair( start, std::map{{suffix, id}})); } } for (auto &n : nexts) { children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); } return children; } static int buildId(const std::map &inputs) { int tokenId = TOKEN_NONE; for (auto &e : inputs) { if (e.first.empty()) { if (tokenId != TOKEN_NONE) { throw TokenizerException{std::string{"Ambigous token found: "} + std::to_string(e.second)}; } else { tokenId = e.second; } } } return tokenId; } TokenTreeNode::TokenTreeNode(const std::map &inputs) : children(buildChildren(inputs)), tokenId(buildId(inputs)) { } Tokenizer::Tokenizer(CharReader &input, const TokenTreeNode &root) : input(input), root(root) { } bool Tokenizer::prepare() { std::stringstream buffer; char c; int startColumn = input.getColumn(); int startLine = input.getLine(); bool bufEmpty = true; while (input.peek(c)) { if (root.children.find(c) != root.children.end()) { // if there might be a special token, keep peeking forward // until we find the token (or we don't). TokenTreeNode const *n = &root; std::stringstream tBuf; int match = TOKEN_NONE; while (true) { tBuf << c; n = &(n->children.at(c)); if (n->tokenId != TOKEN_NONE) { match = n->tokenId; // from here on we found a token. If we have something // in our buffer already, we end the search now. if (!bufEmpty) { break; } else { // if we want to return this token ( = we have nothing // in our buffer yet) we look greedily for the longest // possible token we can construct. input.consumePeek(); } } if (!input.peek(c)) { // if we are at the end we break off the search. break; } if (n->children.find(c) == n->children.end()) { // if we do not find a possible continuation anymore, // break off the search. break; } } //reset the peek pointer to the last valid position. input.resetPeek(); // check if we did indeed find a special token. if (match != TOKEN_NONE) { if (bufEmpty) { // if we did not have text before, construct that token. if (doPrepare( Token{match, tBuf.str(), startColumn, startLine, input.getColumn(), input.getLine()}, peeked)) { return true; } else { startColumn = input.getColumn(); startLine = input.getLine(); continue; } } else { // otherwise we return the text before the token. if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, input.getColumn(), input.getLine()}, peeked)) { return true; } else{ //we need to clear the buffer here. After all the token //corresponding to this buffer segment is already //constructed. buffer.str(std::string()); bufEmpty = true; startColumn = input.getColumn(); startLine = input.getLine(); continue; } } } else{ //if we found nothing, read at least one character. input.peek(c); } } buffer << c; bufEmpty = false; input.consumePeek(); } if (!bufEmpty) { return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, input.getColumn(), input.getLine()}, peeked); } return false; } bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) { peeked.push_back(t); return true; } bool Tokenizer::next(Token &t) { if (peeked.empty()) { if (!prepare()) { return false; } } t = peeked.front(); peeked.pop_front(); resetPeek(); return true; } bool Tokenizer::peek(Token &t) { if (peekCursor >= peeked.size()) { if (!prepare()) { return false; } } t = peeked[peekCursor]; peekCursor++; return true; } void Tokenizer::resetPeek() { peekCursor = 0; } void Tokenizer::consumePeek() { while (peekCursor > 0) { peeked.pop_front(); peekCursor--; } } }