diff options
author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-11-21 10:51:11 +0100 |
---|---|---|
committer | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-11-21 10:51:11 +0100 |
commit | ca2040ba16459f5b3dfe8a86a14680eecea7ef7d (patch) | |
tree | 516b475ba0c32002235fbfbc9bc951945e142523 /src/core/utils/CodeTokenizer.hpp | |
parent | 4dedb85d2408c90dea1d3c36d8f3a985a4b26024 (diff) |
According to my lords holy command I enacted his graces order of moving all peasants and lordlings, big and small, young and old, to our common land and inner sanctuary to be safe from outer threats by those usurpers.
Diffstat (limited to 'src/core/utils/CodeTokenizer.hpp')
-rw-r--r-- | src/core/utils/CodeTokenizer.hpp | 130 |
1 files changed, 0 insertions, 130 deletions
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp deleted file mode 100644 index 0fc0862..0000000 --- a/src/core/utils/CodeTokenizer.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_ -#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_ - -#include <map> -#include <sstream> - -#include "BufferedCharReader.hpp" -#include "Tokenizer.hpp" - -namespace ousia { -namespace utils { - -/* - * This enum contains all special Token the CodeTokenizer supports, namely: - * - * 1.) An ambigous Tokens - in post programming languages single-quotes ' or - * double-quotes " - to delimit string tokens. - * 2.) A start token for line comments, which would e.g. be // in Java. - * 3.) A start token for a block comment - * 4.) An end token for a block comment. - * 5.) A linebreak token - * 6.) The escape token, which would e.g. be \ in java. - */ -enum class CodeTokenMode { - STRING_START_END, - LINE_COMMENT, - BLOCK_COMMENT_START, - BLOCK_COMMENT_END, - LINEBREAK, - ESCAPE, - NONE -}; - -/** - * A CodeTokenDescriptor defines the id the user likes to have returned for - * a Token of the mode specified, e.g. if you want to get the id 4 for a - * String Token the corresponding CodeTokenDescriptor would be inizialized - * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; - */ -struct CodeTokenDescriptor { - CodeTokenMode mode; - int id; - - CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} -}; - -/** - * The CodeTokenizer is a finite state machine with the states NORMAL, being - * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. - */ -enum class CodeTokenizerState { - NORMAL, - IN_BLOCK_COMMENT, - IN_LINE_COMMENT, - IN_STRING -}; - -/** - * The purpose of a CodeTokenizer is to make it easier to parse classical - * programming Code. It adds the following features to a regular Tokenizer: - * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens - * for the opening delimiter, the text and the closing delimiter. - * 2.) Escaping in String tokens. - * 3.) Comment Tokens (for line comments as well as block comments) - */ -class CodeTokenizer : public Tokenizer { -private: - std::map<int, CodeTokenDescriptor> descriptors; - CodeTokenizerState state; - std::stringstream buf; - Token startToken; - int returnTokenId; - bool escaped = false; - - Token constructToken(const Token &t); - void buffer(const Token &t); - -protected: - bool doPrepare(const Token &t, std::deque<Token> &peeked) override; - -public: - /** - * If you do not want comment tokens to be returned you can set this to - * true. - */ - bool ignoreComments = false; - - /** - * - * @param input a BufferedCharReader containing the input for this - *tokenizer, - * as with a regular tokenizer. - * @param root a TokenTreeNode representing the root of the TokenTree. - * Please note that you have to specify all tokenIDs here that you use - * in the descriptors map. - * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. - * In this way you can specify the meaning of certain Tokens. Say you - * specified the Token "//" with the id 1 in the TokenTree. Then you could - * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map - * and this CodeTokenizer would recognize the token "//" as starting a - * line comment. - */ - CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, - std::map<int, CodeTokenDescriptor> descriptors) - : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) - { - } -}; -} -} - -#endif |