summaryrefslogtreecommitdiff
path: root/src/core/CodeTokenizer.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CodeTokenizer.hpp')
-rw-r--r--src/core/CodeTokenizer.hpp136
1 files changed, 0 insertions, 136 deletions
diff --git a/src/core/CodeTokenizer.hpp b/src/core/CodeTokenizer.hpp
deleted file mode 100644
index 154f949..0000000
--- a/src/core/CodeTokenizer.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file CodeTokenizer.hpp
- *
- * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de)
- */
-#ifndef _OUSIA_CODE_TOKENIZER_HPP_
-#define _OUSIA_CODE_TOKENIZER_HPP_
-
-#include <map>
-#include <sstream>
-
-#include <core/common/CharReader.hpp>
-#include "Tokenizer.hpp"
-
-namespace ousia {
-
-/*
- * This enum contains all special tokens the CodeTokenizer supports, namely:
- *
- * 1.) An ambiguous token - in most programming languages single quotes ' or
- * double quotes " - that delimits both the start and the end of a string token.
- * 2.) A start token for line comments, which would e.g. be // in Java.
- * 3.) A start token for a block comment.
- * 4.) An end token for a block comment.
- * 5.) A linebreak token.
- * 6.) The escape token, which would e.g. be \ in Java.
- */
-enum class CodeTokenMode {
- STRING_START_END, // delimiter that both opens and closes a string, e.g. ' or "
- LINE_COMMENT, // token that starts a line comment, e.g. // in Java
- BLOCK_COMMENT_START, // token that starts a block comment
- BLOCK_COMMENT_END, // token that ends a block comment
- LINEBREAK, // linebreak token
- ESCAPE, // escape token, e.g. \ in Java
- NONE // NOTE(review): not covered by the list above; presumably "no special meaning" - confirm
-};
-
-/**
- * A CodeTokenDescriptor defines the id the user would like to have returned
- * for a token of the specified mode. For example, if you want to get the id 4
- * for a string token, the corresponding CodeTokenDescriptor would be
- * initialized with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
- */
-struct CodeTokenDescriptor {
- CodeTokenMode mode; // special meaning assigned to the token
- int id; // id the user wants returned for tokens of this mode
-
- CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} // stores both arguments unchanged
-};
-
-/**
- * The CodeTokenizer is a finite state machine with the states NORMAL,
- * IN_BLOCK_COMMENT, IN_LINE_COMMENT and IN_STRING.
- */
-enum class CodeTokenizerState {
- NORMAL, // state the CodeTokenizer constructor starts in
- IN_BLOCK_COMMENT, // currently inside a block comment
- IN_LINE_COMMENT, // currently inside a line comment
- IN_STRING // currently inside a string
-};
-
-/**
- * The purpose of a CodeTokenizer is to make it easier to parse classical
- * programming Code. It adds the following features to a regular Tokenizer:
- * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens
- * for the opening delimiter, the text and the closing delimiter.
- * 2.) Escaping in String tokens.
- * 3.) Comment Tokens (for line comments as well as block comments)
- */
-class CodeTokenizer : public Tokenizer {
-private:
- std::map<int, CodeTokenDescriptor> descriptors; // token id -> descriptor, copied from the constructor argument
- CodeTokenizerState state; // current state of the state machine; set to NORMAL by the constructor
- std::stringstream buf; // presumably collects the text of the string/comment being assembled - confirm in the .cpp
- Token startToken; // presumably the token that opened the current string/comment - confirm in the .cpp
- int returnTokenId; // NOTE(review): no initializer here and not set in the constructor's init list
- bool escaped = false; // starts false; presumably set after an ESCAPE token was seen - confirm in the .cpp
-
- Token constructToken(const Token &t); // defined outside this header; behaviour not visible here
- void buffer(const Token &t); // defined outside this header; presumably appends t to buf - confirm
-
-protected:
- bool doPrepare(const Token &t, std::deque<Token> &peeked) override; // overrides the Tokenizer hook; defined outside this header
-
-public:
- /**
- * If you do not want comment tokens to be returned you can set this to
- * true.
- */
- bool ignoreComments = false;
- /**
- * If you do not want linebreaks to be returned you can set this to true.
- */
- bool ignoreLinebreaks = false;
-
- /**
- *
- * @param input a CharReader containing the input for this tokenizer, as
- * with a regular tokenizer.
- * @param root a TokenTreeNode representing the root of the TokenTree.
- * Please note that you have to specify all tokenIDs here that you use
- * in the descriptors map.
- * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
- * In this way you can specify the meaning of certain Tokens. Say you
- * specified the Token "//" with the id 1 in the TokenTree. Then you could
- * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
- * and this CodeTokenizer would recognize the token "//" as starting a
- * line comment.
- */
- CodeTokenizer(CharReader &input, const TokenTreeNode &root,
- std::map<int, CodeTokenDescriptor> descriptors) // NOTE(review): taken by value, then copied again into the member below
- : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) // forwards input/root to the base class; starts in NORMAL
- {
- }
-};
-}
-
-#endif