From 73cc54cbf494d9da61b640035f25ad9c5eb86d84 Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Fri, 31 Oct 2014 12:39:39 +0000 Subject: added tokenizer class code. git-svn-id: file:///var/local/svn/basicwriter@87 daaaf23c-2e50-4459-9457-1e69db5a47bf --- src/core/utils/Tokenizer.cpp | 76 ++++++++++++++++++++++++++++++++++++++++++ src/core/utils/Tokenizer.hpp | 78 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 src/core/utils/Tokenizer.cpp create mode 100644 src/core/utils/Tokenizer.hpp diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp new file mode 100644 index 0000000..1a84f0c --- /dev/null +++ b/src/core/utils/Tokenizer.cpp @@ -0,0 +1,76 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "Tokenizer.hpp" + +namespace ousia { +namespace utils { + +static std::unordered_map buildChildren( + const std::map &inputs) +{ + std::std::unordered_map children; + std::unordered_map> nexts; + + for (auto &e : inputs) { + const std::string &s = e.first; + const int id = e.second; + if (s.empty()) { + continue; + } + char start = s[0]; + const std::string suffix = s.substr(1); + if (nexts.find(start) != nexts.end()) { + nexts[start].insert(std::make_pair(suffix, id)); + } else { + nexts.insert(std::make_pair( + start, std::map{{suffix, id}})); + } + } + + for (auto &n : nexts) { + children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); + } + + return children; +} + +static int buildId(const std::map &inputs) +{ + int tokenId = -1; + for (auto &e : inputs) { + if (e.first.empty()) { + if (tokenId != -1) { + throw TokenizerException{std::string{"Ambigous token found: "} + + e.second}; + } else { + tokenId = e.second; + } + } + } + return tokenId; +} + +TokenTreeNode::TokenTreeNode(const std::map &inputs) + : children(buildChildren(inputs), tokenId(buildId(inputs))) +{ +} + +} +} + diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp new file mode 100644 index 0000000..1d0db43 --- /dev/null +++ b/src/core/utils/Tokenizer.hpp @@ -0,0 +1,78 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef _OUSIA_UTILS_TOKENIZER_HPP_ +#define _OUSIA_UTILS_TOKENIZER_HPP_ + +#include +#include +#include + +namespace ousia { +namespace utils { + +class TokenizerException : public std::exception { +public: + const std::string msg; + + ArgumentValidatorError(const std::string &msg) : msg(msg){}; + + virtual const char *what() const noexcept override { return msg.c_str(); } +}; + +class TokenTreeNode { +public: + const std::unordered_map children; + const int tokenId; + + TokenTreeNode(const std::map &inputs); +}; + +struct Token { + const int tokenId; + const std::string content; + const int column; + const int line; + + Token(int tokenId, std::string content, int column, int line) + : tokenId(tokenId), content(content), column(column), line(line) + { + } +}; + +class Tokenizer { +private: + const std::istream &input; + const TokenTreeNode root; + const std::queue peek; + +public: + Tokenizer(const TokenTreeNode &root, std::istream &input); + + bool hasNext(); + + const Token &next(); + + const Token &peek(); + + void reset(); +}; +} +} + +#endif -- cgit v1.2.3