Diffstat (limited to 'src')
-rw-r--r--  src/core/utils/BufferedCharReader.cpp |   4
-rw-r--r--  src/core/utils/BufferedCharReader.hpp |   6
-rw-r--r--  src/core/utils/Tokenizer.cpp          | 113
-rw-r--r--  src/core/utils/Tokenizer.hpp          |  48
4 files changed, 148 insertions, 23 deletions
diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp
index 0377015..c13628f 100644
--- a/src/core/utils/BufferedCharReader.cpp
+++ b/src/core/utils/BufferedCharReader.cpp
@@ -1,6 +1,6 @@
 /*
-    SCAENEA IDL Compiler (scidlc)
-    Copyright (C) 2014  Andreas Stöckel
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp
index 86f43b5..b13cde6 100644
--- a/src/core/utils/BufferedCharReader.hpp
+++ b/src/core/utils/BufferedCharReader.hpp
@@ -1,6 +1,6 @@
 /*
-    SCAENEA IDL Compiler (scidlc)
-    Copyright (C) 2014  Andreas Stöckel
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -236,5 +236,5 @@ public:
 }
 }
 
-#endif /* _OUSISA_UTILS_BUFFERED_CHAR_READER_H_ */
+#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
index 38f7585..2c36438 100644
--- a/src/core/utils/Tokenizer.cpp
+++ b/src/core/utils/Tokenizer.cpp
@@ -16,6 +16,8 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include <sstream>
+
 #include "Tokenizer.hpp"
 
 namespace ousia {
@@ -52,10 +54,10 @@ static std::map<char, TokenTreeNode> buildChildren(
 
 static int buildId(const std::map<std::string, int> &inputs)
 {
-	int tokenId = -1;
+	int tokenId = TOKEN_NONE;
 	for (auto &e : inputs) {
 		if (e.first.empty()) {
-			if (tokenId != -1) {
+			if (tokenId != TOKEN_NONE) {
 				throw TokenizerException{std::string{"Ambigous token found: "} +
 				                         std::to_string(e.second)};
 			} else {
@@ -68,8 +70,115 @@ static int buildId(const std::map<std::string, int> &inputs)
 
 TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
     : children(buildChildren(inputs)), tokenId(buildId(inputs))
+{
+}
+
+Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root)
+    : input(input), root(root)
+{
+}
+
+bool Tokenizer::prepare()
+{
+	std::stringstream buffer;
+	char c;
+	const int startColumn = input.getColumn();
+	const int startLine = input.getLine();
+	bool bufEmpty = true;
+	while (input.peek(&c)) {
+		if (root.children.find(c) != root.children.end()) {
+			// if there might be a special token, keep peeking forward
+			// until we find the token (or we don't).
+			TokenTreeNode const *n = &root;
+			std::stringstream tBuf;
+			int match = TOKEN_NONE;
+			while (true) {
+				tBuf << c;
+				n = &(n->children.at(c));
+				if (n->tokenId != TOKEN_NONE) {
+					// from here on we found a token; remember its id. If we
+					// have something in our buffer already, we end the
+					// search now.
+					match = n->tokenId;
+					if (!bufEmpty) {
+						break;
+					} else {
+						// if we want to return this token (= we have nothing
+						// in our buffer yet) we look greedily for the longest
+						// possible token we can construct.
+						input.consumePeek();
+					}
+				}
+				if (!input.peek(&c)) {
+					// if we are at the end we break off the search.
+					break;
+				}
+				if (n->children.find(c) == n->children.end()) {
+					// if we do not find a possible continuation anymore,
+					// break off the search.
+					break;
+				}
+			}
+			// check if we did indeed find a special token.
+			if (match != TOKEN_NONE) {
+				input.resetPeek();
+				if (bufEmpty) {
+					// if we did not have text before, construct that token.
+					peeked.push_back(Token{match, tBuf.str(), startColumn,
+					                       startLine, input.getColumn(),
+					                       input.getLine()});
+					return true;
+				} else {
+					// otherwise we return the text before the token.
+					peeked.push_back(Token{TOKEN_TEXT, buffer.str(),
+					                       startColumn, startLine,
+					                       input.getColumn(), input.getLine()});
+					return true;
+				}
+			}
+		}
+		buffer << c;
+		bufEmpty = false;
+		input.consumePeek();
+	}
+	if (!bufEmpty) {
+		peeked.push_back(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
+		                       input.getColumn(), input.getLine()});
+		return true;
+	}
+	return false;
+}
+
+bool Tokenizer::next(Token &t)
 {
+	if (peeked.empty()) {
+		if (!prepare()) {
+			return false;
+		}
+	}
+	t = peeked.front();
+	peeked.pop_front();
+	resetPeek();
+	return true;
+}
+
+bool Tokenizer::peek(Token &t)
+{
+	if (peekCursor >= peeked.size()) {
+		if (!prepare()) {
+			return false;
+		}
+	}
+	t = peeked[peekCursor];
+	peekCursor++;
+	return true;
+}
+
+void Tokenizer::resetPeek() { peekCursor = 0; }
+
+void Tokenizer::consumePeek()
+{
+	while (peekCursor > 0) {
+		peeked.pop_front();
+		peekCursor--;
+	}
 }
 }
 }
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
index 24c4f30..924b670 100644
--- a/src/core/utils/Tokenizer.hpp
+++ b/src/core/utils/Tokenizer.hpp
@@ -21,7 +21,9 @@
 
 #include <istream>
 #include <map>
-#include <queue>
+#include <deque>
+
+#include "BufferedCharReader.hpp"
 
 namespace ousia {
 namespace utils {
@@ -44,33 +46,47 @@
 };
 
 struct Token {
-	const int tokenId;
-	const std::string content;
-	const int column;
-	const int line;
-
-	Token(int tokenId, std::string content, int column, int line)
-	    : tokenId(tokenId), content(content), column(column), line(line)
+	int tokenId;
+	std::string content;
+	int startColumn;
+	int startLine;
+	int endColumn;
+	int endLine;
+
+	Token(int tokenId, std::string content, int startColumn, int startLine,
+	      int endColumn, int endLine)
+	    : tokenId(tokenId),
+	      content(content),
+	      startColumn(startColumn),
+	      startLine(startLine),
+	      endColumn(endColumn),
+	      endLine(endLine)
 	{
 	}
 };
 
+static const int TOKEN_NONE = -1;
+static const int TOKEN_TEXT = -2;
+
 class Tokenizer {
 private:
-	const std::istream &input;
-	const TokenTreeNode root;
-	const std::queue<Token> peekQueue;
+	BufferedCharReader &input;
+	const TokenTreeNode &root;
+	std::deque<Token> peeked;
+	unsigned int peekCursor = 0;
+
+	bool prepare();
 
 public:
-	Tokenizer(const TokenTreeNode &root, std::istream &input);
+	Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
 
-	bool hasNext();
+	bool next(Token &t);
 
-	const Token &next();
+	bool peek(Token &t);
 
-	const Token &peek();
+	void resetPeek();
 
-	void reset();
+	void consumePeek();
 };
 }
 }
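
Not part of the commit above, but as an illustration of the new interface: the sketch below shows how the Tokenizer introduced here could be driven, assuming a BufferedCharReader that has already been constructed and filled elsewhere (its construction API is not part of this diff). The token ids TOKEN_OPEN and TOKEN_CLOSE and the helpers dumpTokens and startsWithOpen are invented for the example.

#include <iostream>

#include "BufferedCharReader.hpp"
#include "Tokenizer.hpp"

using namespace ousia::utils;

// Token ids invented for this sketch; non-negative values are free for the
// caller, TOKEN_NONE (-1) and TOKEN_TEXT (-2) are reserved by the Tokenizer.
static const int TOKEN_OPEN = 1;   // "<"
static const int TOKEN_CLOSE = 2;  // ">"

// Prints every token read from an already constructed and filled reader.
void dumpTokens(BufferedCharReader &reader)
{
	// The token tree maps each special token string to its id; input that
	// matches no entry is returned as TOKEN_TEXT tokens.
	TokenTreeNode root{{{"<", TOKEN_OPEN}, {">", TOKEN_CLOSE}}};
	Tokenizer tokenizer{reader, root};

	Token t{TOKEN_NONE, "", 0, 0, 0, 0};
	while (tokenizer.next(t)) {
		std::cout << t.tokenId << ": \"" << t.content << "\" at "
		          << t.startLine << ":" << t.startColumn << std::endl;
	}
}

// Lookahead example: peek() walks forward over the queued tokens without
// consuming them, resetPeek() rewinds the peek cursor, and consumePeek()
// would instead commit everything peeked so far.
bool startsWithOpen(Tokenizer &tokenizer)
{
	Token t{TOKEN_NONE, "", 0, 0, 0, 0};
	bool result = tokenizer.peek(t) && t.tokenId == TOKEN_OPEN;
	tokenizer.resetPeek();
	return result;
}

The deque-plus-cursor design means tokens produced by prepare() are never thrown away: peek() only advances a cursor over the queued tokens, resetPeek() rewinds it, consumePeek() commits the peeked tokens, and next() pops from the front of the queue.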
