summaryrefslogtreecommitdiff
path: root/src/core/utils
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/utils')
-rw-r--r--src/core/utils/BufferedCharReader.cpp216
-rw-r--r--src/core/utils/BufferedCharReader.hpp240
-rw-r--r--src/core/utils/CSSParser.cpp81
-rw-r--r--src/core/utils/CSSParser.hpp167
-rw-r--r--src/core/utils/CodeTokenizer.cpp166
-rw-r--r--src/core/utils/CodeTokenizer.hpp130
-rw-r--r--src/core/utils/RangeSet.hpp326
-rw-r--r--src/core/utils/Tokenizer.cpp212
-rw-r--r--src/core/utils/Tokenizer.hpp231
-rw-r--r--src/core/utils/Utils.cpp39
-rw-r--r--src/core/utils/Utils.hpp65
11 files changed, 0 insertions, 1873 deletions
diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp
deleted file mode 100644
index c13628f..0000000
--- a/src/core/utils/BufferedCharReader.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "BufferedCharReader.hpp"
-
-namespace ousia {
-namespace utils {
-
-// Constants used within the linebreak statemachine.
-static const uint8_t LB_STATE_NONE = 0x00;
-static const uint8_t LB_STATE_ONE = 0x01;
-static const uint8_t LB_STATE_LF = 0x10;
-static const uint8_t LB_STATE_CR = 0x20;
-static const uint8_t LB_STATE_MASK_CNT = 0x0F;
-static const uint8_t LB_STATE_MASK_TYPE = 0xF0;
-
-/*******************************************************************************
- * Struct BufferedCharReader::ReadCursor
- ******************************************************************************/
-
-BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) :
- destructive(destructive)
-{
- reset();
-}
-
-void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor)
-{
- this->line = cursor.line;
- this->column = cursor.column;
- this->bufferElem = cursor.bufferElem;
- this->bufferPos = cursor.bufferPos;
- this->lbState = cursor.lbState;
-}
-
-void BufferedCharReader::ReadCursor::reset()
-{
- this->line = 1;
- this->column = 1;
- this->bufferElem = 0;
- this->bufferPos = 0;
- this->lbState = LB_STATE_NONE;
-}
-
-/*******************************************************************************
- * Class BufferedCharReader
- ******************************************************************************/
-
-BufferedCharReader::BufferedCharReader() :
- readCursor(true), peekCursor(false)
-{
- reset();
-}
-
-void BufferedCharReader::reset()
-{
- readCursor.reset();
- peekCursor.reset();
- buffer.clear();
- closed = false;
-}
-
-bool BufferedCharReader::feed(const std::string &data)
-{
- // Abort if the BufferedCharReader was closed
- if (closed) {
- return false;
- }
-
- // Append the data onto the queue
- buffer.push_back(data);
- return true;
-}
-
-void BufferedCharReader::close()
-{
- closed = true;
-}
-
-bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c)
-{
- // Handle line breaks, inserts breakes after the following character
- // combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n
- if ((*c == '\n') || (*c == '\r')) {
- // Determine the type of the current linebreak character
- const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR;
-
- // Read the last count and the last type from the state
- const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT;
- const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE;
-
- // Set the current linebreak type and counter in the state
- cursor->lbState = ((lastCount + 1) & 1) | type;
-
- // If either this is the first instance of this character or the same
- // return character is repeated
- if (!lastCount || (lastType == type)) {
- *c = '\n';
- return true;
- }
- return false;
- }
-
- // Find the state
- cursor->lbState = LB_STATE_NONE;
- return true;
-}
-
-bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor,
- char *c)
-{
- bool hasChar = false;
- while (!hasChar) {
- // Abort if the current buffer element does not point to a valid entry
- // in the buffer -- we must wait until another data block has been fed
- // into the buffer
- if (cursor->bufferElem >= buffer.size()) {
- return false;
- }
-
- // Fetch the current element the peek pointer points to
- const std::string &data = buffer[cursor->bufferElem];
-
- // Handle the "no data" case -- either in a destructive or
- // non-destructive manner.
- if (cursor->bufferPos >= data.length()) {
- if (cursor->destructive) {
- buffer.pop_front();
- } else {
- cursor->bufferElem++;
- }
- cursor->bufferPos = 0;
- continue;
- }
-
- // Read the character, advance the buffer position
- *c = *(data.data() + cursor->bufferPos);
- cursor->bufferPos++;
-
- // Substitute linebreaks with a single LF (0x0A)
- hasChar = substituteLinebreaks(cursor, c);
- }
-
- // Update the position counter
- if (*c == '\n') {
- cursor->line++;
- cursor->column = 1;
- } else {
- // Ignore UTF-8 continuation bytes
- if (!((*c & 0x80) && !(*c & 0x40))) {
- cursor->column++;
- }
- }
-
- return true;
-}
-
-bool BufferedCharReader::peek(char *c)
-{
- return readCharacterAtCursor(&peekCursor, c);
-}
-
-bool BufferedCharReader::read(char *c)
-{
- resetPeek();
- return readCharacterAtCursor(&readCursor, c);
-}
-
-void BufferedCharReader::consumePeek()
-{
- // Remove all no longer needed buffer elements
- for (unsigned int i = 0; i < peekCursor.bufferElem; i++) {
- buffer.pop_front();
- }
- peekCursor.bufferElem = 0;
-
- // Copy the peek cursor to the read cursor
- readCursor.assign(peekCursor);
-}
-
-void BufferedCharReader::resetPeek()
-{
- // Reset the peek cursor to the read cursor
- peekCursor.assign(readCursor);
-}
-
-bool BufferedCharReader::atEnd()
-{
- if (closed) {
- if (buffer.size() <= 0) {
- return true;
- } else if (buffer.size() == 1) {
- return buffer[0].size() == readCursor.bufferPos;
- }
- }
- return false;
-}
-
-}
-}
-
diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp
deleted file mode 100644
index b13cde6..0000000
--- a/src/core/utils/BufferedCharReader.hpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _OUSIA_UTILS_BUFFERED_CHAR_READER_H_
-#define _OUSIA_UTILS_BUFFERED_CHAR_READER_H_
-
-#include <deque>
-#include <string>
-#include <cstdint>
-
-namespace ousia {
-namespace utils {
-
-/**
- * The BufferedCharReader class is used for storing incomming data that
- * is fed into the pipeline as well as reading/peeking single characters
- * from that buffer. Additionally it counts the current column/row
- * (with correct handling for UTF-8) and contains an internal state
- * machine that handles the detection of linebreaks.
- *
- * Additionally the BufferedCharReader performs the following tasks:
- * 1. Convert the incomming character encoding to UTF-8 (TODO: implement)
- * 2. Convert arbitrary linebreaks to a single "\n"
- */
-class BufferedCharReader {
-
-private:
-
- /**
- * The ReadCursor structure is responsible for representing the read
- * position within the text an all state machine states belonging to the
- * cursor. There are two types of read cursors: destructive and
- * non-destructive read cursors.
- */
- struct ReadCursor {
- /**
- * Specifies whether this is a destructive cursor (bytes are discarded
- * once they were read from the buffer).
- */
- const bool destructive;
-
- /**
- * The line the cursor currently points to.
- */
- unsigned int line;
-
- /**
- * The column the cursor currently points to.
- */
- unsigned int column;
-
- /**
- * The index of the element in the data buffer we're currently reading
- * from.
- */
- unsigned int bufferElem;
-
- /**
- * The byte position within this data buffer.
- */
- unsigned int bufferPos;
-
- /**
- * State variable used in the internal state machine of the
- * line feed detection.
- */
- uint8_t lbState;
-
- /**
- * Constructor of the ReadCursor structure.
- *
- * @param destructive specifies whether the ReadCursor is destructive
- * (consumes all read characters, as used in the "read cursor") or
- * non-destructive (as used in the "peek cursor").
- */
- ReadCursor(const bool destructive);
-
- /**
- * Copys the data from another ReadCursor without overriding the
- * "destructive" flag.
- */
- void assign(const ReadCursor &cursor);
-
- /**
- * Resets the cursor without changing the "destructive" flag.
- */
- void reset();
- };
-
- /**
- * Queue containing the data that has been fed into the char reader.
- */
- std::deque<std::string> buffer;
-
- /**
- * The read and the peek cursor.
- */
- ReadCursor readCursor, peekCursor;
-
- /**
- * Determines whether the reader has been closed.
- */
- bool closed;
-
- /**
- * Substitute any combination of linebreaks in the incomming code with "\n".
- * Returns true if the current character is meant as output, false
- * otherwise.
- */
- bool substituteLinebreaks(ReadCursor *cursor, char *c);
-
- /**
- * Reads a character from the input buffer and advances the given read
- * cursor.
- *
- * @param cursor is a reference to the read cursor that should be used
- * for reading.
- * @param hasChar is set to true, if a character is available, false if
- * no character is available (e.g. because line breaks are substituted or
- * the end of a buffer boundary is reached -- in this case this function
- * should be called again with the same parameters.)
- * @param c is a output parameter, which will be set to the read character.
- * @param returns true if there was enough data in the buffer, false
- * otherwise.
- */
- bool readCharacterAtCursor(ReadCursor *cursor, char *c);
-
- /**
- * Function that is called for each read character -- updates the row and
- * column count.
- */
- void updatePositionCounters(const char c);
-
-public:
-
- /**
- * Constructor of the buffered char reader class.
- */
- BufferedCharReader();
-
- /**
- * Resets the reader to its initial state.
- */
- void reset();
-
- /**
- * Feeds new data into the internal buffer of the BufferedCharReader
- * class.
- *
- * @param data is a string containing the data that should be
- * appended to the internal buffer.
- * @return true if the operation was successful, false otherwise (e.g.
- * because the reader is closed).
- */
- bool feed(const std::string &data);
-
- /**
- * Marks the end of the input, allowing successors in the pipeline
- * to react properly (e.g. creating the end of stream token).
- */
- void close();
-
- /**
- * Peeks a single character. If called multiple times, returns the
- * character after the previously peeked character.
- *
- * @param c is a reference to the character to which the result should be
- * writtern.
- * @return true if the character was successfully read, false if there are
- * no more characters to be read in the buffer.
- */
- bool peek(char *c);
-
- /**
- * Reads a character from the input data. If "peek" was called
- * beforehand resets the peek pointer.
- *
- * @param c is a reference to the character to which the result should be
- * writtern.
- * @return true if the character was successfully read, false if there are
- * no more characters to be read in the buffer.
- */
- bool read(char *c);
-
- /**
- * Advances the read pointer to the peek pointer -- so if the "peek"
- * function was called, "read" will now return the character after
- * the last peeked character.
- */
- void consumePeek();
-
- /**
- * Resets the peek pointer to the "read" pointer.
- */
- void resetPeek();
-
- /**
- * Returns true if there are no more characters as the stream was
- * closed.
- */
- bool atEnd();
-
- /**
- * Returns the current line (starting with one).
- */
- inline int getLine()
- {
- return readCursor.line;
- }
-
- /**
- * Returns the current column (starting with one).
- */
- inline int getColumn()
- {
- return readCursor.column;
- }
-
-};
-
-}
-}
-
-#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */
-
diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp
deleted file mode 100644
index 1763cc2..0000000
--- a/src/core/utils/CSSParser.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "BufferedCharReader.hpp"
-#include "CodeTokenizer.hpp"
-#include "Tokenizer.hpp"
-
-#include "CSSParser.hpp"
-
-namespace ousia {
-namespace utils {
-
-// CSS code tokens
-static const int CURLY_OPEN = 1;
-static const int CURLY_CLOSE = 2;
-static const int COLON = 3;
-static const int SEMICOLON = 4;
-static const int HASH = 5;
-static const int BRACKET_OPEN = 6;
-static const int BRACKET_CLOSE = 7;
-static const int PAREN_OPEN = 8;
-static const int PAREN_CLOSE = 9;
-// comments
-static const int COMMENT = 100;
-static const int COMMENT_OPEN = 101;
-static const int COMMENT_CLOSE = 102;
-// strings
-static const int STRING = 200;
-static const int SINGLE_QUOTE = 201;
-static const int DOUBLE_QUOTE = 202;
-static const int ESCAPE = 203;
-// general syntax
-static const int LINEBREAK = 300;
-
-static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
- {"}", CURLY_CLOSE},
- {":", COLON},
- {";", SEMICOLON},
- {"#", HASH},
- {"[", BRACKET_OPEN},
- {"]", BRACKET_CLOSE},
- {"(", PAREN_OPEN},
- {")", PAREN_CLOSE},
- {"/*", COMMENT_OPEN},
- {"*/", COMMENT_CLOSE},
- {"\\", ESCAPE},
- {"\''", SINGLE_QUOTE},
- {"\"", DOUBLE_QUOTE},
- {"\n", LINEBREAK}}};
-
-static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = {
- {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}},
- {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}},
- {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
- {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
- {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}},
- {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
-
-StyleNode CSSParser::parse(BufferedCharReader &input)
-{
- CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS};
- tokenizer.ignoreComments = true;
- // TODO: implement
-}
-}
-}
diff --git a/src/core/utils/CSSParser.hpp b/src/core/utils/CSSParser.hpp
deleted file mode 100644
index c8b772d..0000000
--- a/src/core/utils/CSSParser.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _OUSIA_UTILS_CSS_PARSER_HPP_
-#define _OUSIA_UTILS_CSS_PARSER_HPP_
-
-#include <istream>
-#include <map>
-#include <vector>
-#include <tuple>
-
-#include <core/Managed.hpp>
-#include <core/dom/Node.hpp>
-
-#include "BufferedCharReader.hpp"
-
-namespace ousia {
-namespace utils {
-
-/*
- * The Specificity or Precedence of a CSS RuleSet, which decides which
- * rules are applied when different RuleSets contain conflicting information.
- *
- * The Specificity is calculated using the official W3C recommendation
- * http://www.w3.org/TR/CSS2/cascade.html#specificity
- *
- * Note that we do not need to use the integer 'a', since we do not allow
- * local style definitions for single nodes.
- */
-struct Specificity {
- int b;
- int c;
- int d;
-
- Specificity(int b, int c, int d) : b(b), c(c), d(d) {}
-};
-
-bool operator<(const Specificity &x, const Specificity &y)
-{
- return std::tie(x.b, x.c, x.d) < std::tie(y.b, y.c, y.d);
-}
-
-bool operator>(const Specificity &x, const Specificity &y)
-{
- return std::tie(x.b, x.c, x.d) > std::tie(y.b, y.c, y.d);
-}
-
-bool operator==(const Specificity &x, const Specificity &y)
-{
- return std::tie(x.b, x.c, x.d) == std::tie(y.b, y.c, y.d);
-}
-
-class RuleSet : public Managed {
-private:
- const std::map<std::string, std::string> values;
- const Specificity specificity;
-
-public:
- RuleSet(Manager &mgr, std::map<std::string, std::string> values,
- Specificity specificity)
- : Managed(mgr), values(std::move(values)), specificity(specificity)
- {
- }
-
- const std::map<std::string, std::string> &getValues() const
- {
- return values;
- }
-
- const Specificity &getSpecificity() const { return specificity; }
-};
-
-class PseudoSelector {
-private:
- const std::string name;
- const std::vector<std::string> args;
- const bool generative;
-
-public:
- PseudoSelector(std::string name, std::vector<std::string> args,
- bool generative)
- : name(std::move(name)), args(std::move(args)), generative(generative)
- {
- }
-
- const std::string &getName() const { return name; }
-
- const std::vector<std::string> &getArgs() const { return args; }
-
- const bool &isGenerative() const { return generative; }
-};
-
-enum class SelectionOperator { DESCENDANT, DIRECT_DESCENDANT };
-
-class StyleNode : public dom::Node {
-public:
- class StyleEdge : public Managed {
- private:
- Owned<StyleNode> target;
- const SelectionOperator selectionOperator;
-
- public:
- StyleEdge(Manager &mgr, Handle<StyleNode> target,
- SelectionOperator selectionOperator)
- : Managed(mgr),
- target(acquire(target)),
- selectionOperator(selectionOperator)
- {
- }
-
- Rooted<StyleNode> getTarget() const { return target; }
-
- const SelectionOperator &getSelectionOperator() const
- {
- return selectionOperator;
- }
- };
-
-private:
- const PseudoSelector pseudoSelector;
- std::vector<Owned<StyleEdge>> edges;
- const std::vector<Owned<RuleSet>> ruleSets;
-
-public:
- StyleNode(Manager &mgr, std::string name,
- PseudoSelector pseudoSelector,
- const std::vector<Handle<StyleEdge>> &edges,
- const std::vector<Handle<RuleSet>> &ruleSets)
- : dom::Node(mgr, std::move(name)),
- pseudoSelector(std::move(pseudoSelector)),
- edges(acquire(edges)),
- ruleSets(acquire(ruleSets))
- {
- }
-
- const PseudoSelector &getPseudoSelector() const { return pseudoSelector; }
-
- const std::vector<Owned<StyleEdge>> &getEdges() const { return edges; }
-
- const std::vector<Owned<RuleSet>> &getRuleSets() const { return ruleSets; }
-};
-
-class CSSParser {
-
-private:
-
-public:
- StyleNode parse(BufferedCharReader &input);
-};
-}
-}
-#endif
diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp
deleted file mode 100644
index e5b8610..0000000
--- a/src/core/utils/CodeTokenizer.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <cassert>
-
-#include "CodeTokenizer.hpp"
-
-namespace ousia {
-namespace utils {
-
-Token CodeTokenizer::constructToken(const Token &t)
-{
- std::string content = buf.str();
- buf.str(std::string());
- return Token{returnTokenId, content, startToken.startColumn,
- startToken.startLine, t.endColumn, t.endLine};
-}
-
-void CodeTokenizer::buffer(const Token &t) { buf << t.content; }
-
-bool CodeTokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
-{
- auto it = descriptors.find(t.tokenId);
- CodeTokenMode mode = CodeTokenMode::NONE;
- if (it != descriptors.end()) {
- mode = it->second.mode;
- }
-
- if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) {
- throw TokenizerException(
- "We did not expect a multiline token (except linebreaks). Most "
- "likely you did not add a linebreak token to your tokenizer!");
- }
-
- switch (state) {
- case CodeTokenizerState::NORMAL:
- switch (mode) {
- case CodeTokenMode::STRING_START_END:
- state = CodeTokenizerState::IN_STRING;
- break;
- case CodeTokenMode::BLOCK_COMMENT_START:
- state = CodeTokenizerState::IN_BLOCK_COMMENT;
- break;
- case CodeTokenMode::LINE_COMMENT:
- state = CodeTokenizerState::IN_LINE_COMMENT;
- break;
- case CodeTokenMode::LINEBREAK:
- peeked.push_back({it->second.id, t.content, t.startColumn,
- t.startLine, t.endColumn, t.endLine});
- return true;
- default:
- if (t.tokenId == TOKEN_TEXT) {
- int begin = -1;
- for (size_t c = 0; c < t.content.length(); c++) {
- bool isWhitespace =
- t.content[c] == ' ' || t.content[c] == '\t';
- if (begin < 0) {
- // if we have not yet set our beginning,
- // we wait for the first
- // non-whitespace-character to set it.
- if (!isWhitespace) {
- begin = c;
- }
- } else {
- // if we have set our beginning, we wait for the
- // first whitespace character, which marks the
- // end of the current word.
- if (isWhitespace) {
- peeked.push_back(Token{
- TOKEN_TEXT,
- t.content.substr(begin, (int)c - begin),
- t.startColumn + begin, t.startLine,
- t.startColumn + (int)c, t.endLine});
- begin = -1;
- }
- }
- }
- if(begin >= 0){
- peeked.push_back(Token{
- TOKEN_TEXT,
- t.content.substr(begin),
- t.startColumn + begin, t.startLine,
- t.endColumn, t.endLine});
- }
- } else {
- peeked.push_back(t);
- }
- return true;
- }
- startToken = t;
- returnTokenId = it->second.id;
- return false;
- case CodeTokenizerState::IN_LINE_COMMENT:
- switch (mode) {
- case CodeTokenMode::LINEBREAK:
- state = CodeTokenizerState::NORMAL;
- if (!ignoreComments) {
- peeked.push_back(constructToken(t));
- }
- return !ignoreComments;
- default:
- if (!ignoreComments) {
- buffer(t);
- }
- return false;
- }
- case CodeTokenizerState::IN_BLOCK_COMMENT:
- switch (mode) {
- case CodeTokenMode::BLOCK_COMMENT_END:
- state = CodeTokenizerState::NORMAL;
- if (!ignoreComments) {
- peeked.push_back(constructToken(t));
- }
- return !ignoreComments;
- default:
- if (!ignoreComments) {
- buffer(t);
- }
- return false;
- }
- case CodeTokenizerState::IN_STRING:
- switch (mode) {
- case CodeTokenMode::ESCAPE:
- if (escaped) {
- buffer(t);
- }
- escaped = !escaped;
- return false;
- case CodeTokenMode::STRING_START_END:
- if (escaped) {
- buffer(t);
- escaped = false;
- return false;
- } else {
- peeked.push_back(constructToken(t));
- state = CodeTokenizerState::NORMAL;
- return true;
- }
- default:
- if (escaped) {
- // TODO: handle escaped characters?
- escaped = false;
- }
- buffer(t);
- return false;
- }
- }
- assert(false);
-}
-}
-}
diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp
deleted file mode 100644
index 0fc0862..0000000
--- a/src/core/utils/CodeTokenizer.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_
-#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_
-
-#include <map>
-#include <sstream>
-
-#include "BufferedCharReader.hpp"
-#include "Tokenizer.hpp"
-
-namespace ousia {
-namespace utils {
-
-/*
- * This enum contains all special Token the CodeTokenizer supports, namely:
- *
- * 1.) An ambigous Tokens - in post programming languages single-quotes ' or
- * double-quotes " - to delimit string tokens.
- * 2.) A start token for line comments, which would e.g. be // in Java.
- * 3.) A start token for a block comment
- * 4.) An end token for a block comment.
- * 5.) A linebreak token
- * 6.) The escape token, which would e.g. be \ in java.
- */
-enum class CodeTokenMode {
- STRING_START_END,
- LINE_COMMENT,
- BLOCK_COMMENT_START,
- BLOCK_COMMENT_END,
- LINEBREAK,
- ESCAPE,
- NONE
-};
-
-/**
- * A CodeTokenDescriptor defines the id the user likes to have returned for
- * a Token of the mode specified, e.g. if you want to get the id 4 for a
- * String Token the corresponding CodeTokenDescriptor would be inizialized
- * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4};
- */
-struct CodeTokenDescriptor {
- CodeTokenMode mode;
- int id;
-
- CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {}
-};
-
-/**
- * The CodeTokenizer is a finite state machine with the states NORMAL, being
- * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING.
- */
-enum class CodeTokenizerState {
- NORMAL,
- IN_BLOCK_COMMENT,
- IN_LINE_COMMENT,
- IN_STRING
-};
-
-/**
- * The purpose of a CodeTokenizer is to make it easier to parse classical
- * programming Code. It adds the following features to a regular Tokenizer:
- * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens
- * for the opening delimiter, the text and the closing delimiter.
- * 2.) Escaping in String tokens.
- * 3.) Comment Tokens (for line comments as well as block comments)
- */
-class CodeTokenizer : public Tokenizer {
-private:
- std::map<int, CodeTokenDescriptor> descriptors;
- CodeTokenizerState state;
- std::stringstream buf;
- Token startToken;
- int returnTokenId;
- bool escaped = false;
-
- Token constructToken(const Token &t);
- void buffer(const Token &t);
-
-protected:
- bool doPrepare(const Token &t, std::deque<Token> &peeked) override;
-
-public:
- /**
- * If you do not want comment tokens to be returned you can set this to
- * true.
- */
- bool ignoreComments = false;
-
- /**
- *
- * @param input a BufferedCharReader containing the input for this
- *tokenizer,
- * as with a regular tokenizer.
- * @param root a TokenTreeNode representing the root of the TokenTree.
- * Please note that you have to specify all tokenIDs here that you use
- * in the descriptors map.
- * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors.
- * In this way you can specify the meaning of certain Tokens. Say you
- * specified the Token "//" with the id 1 in the TokenTree. Then you could
- * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map
- * and this CodeTokenizer would recognize the token "//" as starting a
- * line comment.
- */
- CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root,
- std::map<int, CodeTokenDescriptor> descriptors)
- : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL)
- {
- }
-};
-}
-}
-
-#endif
diff --git a/src/core/utils/RangeSet.hpp b/src/core/utils/RangeSet.hpp
deleted file mode 100644
index 841d476..0000000
--- a/src/core/utils/RangeSet.hpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _OUSIA_MODEL_RANGE_SET_HPP_
-#define _OUSIA_MODEL_RANGE_SET_HPP_
-
-#include <limits>
-#include <set>
-
-namespace ousia {
-namespace model {
-/**
- * The Range structure represents an interval of numerical values of type T.
- */
template <typename T>
struct Range {
	/**
	 * Start is the start value of the range.
	 */
	T start;

	/**
	 * End is the end value of the range (inclusively).
	 */
	T end;

	/**
	 * Default constructor of the range class. The range is initialized as
	 * invalid, with start being set to the maximum possible value of the
	 * numerical type T, and end being set to the smallest possible value.
	 * Note that "lowest()" is used rather than "min()": for floating point
	 * types min() returns the smallest positive value, not the most
	 * negative one.
	 */
	Range()
	    : start(std::numeric_limits<T>::max()),
	      end(std::numeric_limits<T>::lowest())
	{
		// Do nothing here
	}

	/**
	 * Copies the given start and end value. The given values are not checked
	 * for validity. Use the "isValid" method for that purpose.
	 *
	 * @param start is the minimum value the range still covers.
	 * @param end is the maximum value the range still covers.
	 */
	Range(const T &start, const T &end) : start(start), end(end)
	{
		// Do nothing here
	}

	/**
	 * Creates a range that covers exactly one element, namely the value given
	 * as parameter n.
	 */
	Range(const T &n) : start(n), end(n)
	{
		// Do nothing here
	}

	/**
	 * Returns true if this range is valid, e.g. its start value is smaller or
	 * equal to its end value.
	 *
	 * @return true if start is smaller or equal to end, false otherwise.
	 */
	bool isValid() const { return start <= end; }

	/**
	 * Checks whether the given value lies inside the range.
	 *
	 * @param v is the value that is being checked.
	 * @return true if the value lies within the range, false otherwise.
	 */
	bool inRange(T v) const { return (v >= start) && (v <= end); }

	/**
	 * Checks whether the given range overlaps with another range. Note that
	 * this check is only meaningful if both ranges are valid.
	 *
	 * @param r is the range that should be checked for overlapping with this
	 * range.
	 */
	bool overlaps(const Range<T> &r) const
	{
		return (((r.start >= start) || (r.end >= start)) &&
		        ((r.start <= end) || (r.end <= end)));
	}

	/**
	 * Returns true if the two given ranges are neighbours (their limits only
	 * differ in the smallest representable difference between them).
	 */
	bool neighbours(const Range<T> &r) const
	{
		constexpr T eps = std::numeric_limits<T>::is_integer
		                      ? 1
		                      : std::numeric_limits<T>::epsilon();
		return ((r.start > end) && ((r.start - eps) <= end)) ||
		       ((r.end < start) && ((r.end + eps) >= start));
	}

	/**
	 * Checks whether the given range completely covers this range.
	 */
	bool coveredBy(const Range<T> &r) const
	{
		return (r.start <= start) && (r.end >= end);
	}

	/**
	 * Checks whether this range completely covers the given range.
	 */
	bool covers(const Range<T> &r) const { return r.coveredBy(*this); }

	/**
	 * Calculates the union of the two ranges -- note that this operation is
	 * only valid if the ranges overlap. Use the RangeSet class if you cannot
	 * guarantee that.
	 */
	Range<T> merge(const Range<T> &r) const
	{
		return Range(std::min(start, r.start), std::max(end, r.end));
	}

	/**
	 * Returns a range that spans the complete set defined by the given
	 * type T.
	 */
	static Range<T> typeRange()
	{
		return Range(std::numeric_limits<T>::lowest(),
		             std::numeric_limits<T>::max());
	}

	/**
	 * Returns a range that spans the complete set defined by the given
	 * type T up to a given value.
	 *
	 * @param till is the value up to which the range should be defined (till
	 * is included in the set).
	 */
	static Range<T> typeRangeUntil(const T &till)
	{
		return Range(std::numeric_limits<T>::lowest(), till);
	}

	/**
	 * Returns a range that spans the complete set defined by the given
	 * type T from a given value.
	 *
	 * @param from is the value from which the range should be defined (from
	 * is included in the set).
	 */
	static Range<T> typeRangeFrom(const T &from)
	{
		return Range(from, std::numeric_limits<T>::max());
	}
};
-
-/**
- * RangeComp is a comperator used to order to sort the ranges within the
- * ranges list. Sorts by the start element.
- */
-template<typename T>
-struct RangeComp {
- bool operator() (const Range<T>& lhs, const Range<T>& rhs) const
- {
- return lhs.start < rhs.start;
- }
-};
-
-/**
- * RangeSet represents a set of ranges of the given numerical type and is thus
- * capable of representing any possible subset of the given numerical type T.
- */
-template<typename T>
-class RangeSet {
-
-protected:
- /**
- * Set of ranges used internally.
- */
- std::set<Range<T>, RangeComp<T>> ranges;
-
- /**
- * Returns an iterator to the first element in the ranges list that overlaps
- * with the given range.
- *
- * @param r is the range for which the first overlapping element should be
- * found.
- * @return an iterator pointing to the first overlapping element or to the
- * end of the list if no such element was found.
- */
- typename std::set<Range<T>, RangeComp<T>>::iterator firstOverlapping(
- const Range<T> &r, const bool allowNeighbours)
- {
- // Find the element with the next larger start value compared to the
- // start value given in r.
- auto it = ranges.upper_bound(r);
-
- // Go back one element
- if (it != ranges.begin()) {
- it--;
- }
-
- // Iterate until an overlapping element is found
- while (!(it->overlaps(r) || (allowNeighbours && it->neighbours(r)))
- && (it != ranges.end())) {
- it++;
- }
- return it;
- }
-
-public:
- /**
- * Calculates the union of this range set and the given range.
- *
- * @param range is the range that should be merged into this range set.
- */
- void merge(Range<T> r)
- {
- // Calculate a new range that covers both the new range and all old
- // ranges in the set -- delete all old elements on the way
- auto it = firstOverlapping(r, true);
- while ((it->overlaps(r) || it->neighbours(r)) && it != ranges.end()) {
- r = r.merge(*it);
- it = ranges.erase(it);
- }
-
- // Insert the new range
- ranges.insert(r);
- }
-
- /**
- * Calculates the union of this range set and the given range set.
- *
- * @param ranges is another range set for which the union with this set
- * should be calculated.
- */
- void merge(const RangeSet<T> &s)
- {
- for (Range<T> &r : s.ranges) {
- merge(r);
- }
- }
-
- /**
- * Checks whether this range set S contains the given range R:
- * S u R = R
- * (The intersection between R and S equals the given range)
- *
- * @param r is the range for which the containment should be checked.
- * @return true if the above condition is met, false otherwise.
- */
- bool contains(const Range<T> &r)
- {
- auto it = firstOverlapping(r, false);
- if (it != ranges.end()) {
- return (*it).covers(r);
- }
- return false;
- }
-
- /**
- * Checks whether this range set S1 contains the given range set S2:
- *
- * @param s is the range for which the containment should be checked.
- * @return true if the above condition is met, false otherwise.
- */
- bool contains(const RangeSet<T> &s)
- {
- bool res = true;
- for (Range<T> &r : s.ranges) {
- res = res && contains(r);
- }
- return res;
- }
-
- /**
- * Empties the set.
- */
- void clear()
- {
- ranges.clear();
- }
-
- /**
- * Returns the current list of ranges as a const reference.
- */
- const std::set<Range<T>, RangeComp<T>>& getRanges()
- {
- return this->ranges;
- }
-
-};
-
-}
-}
-
-#endif /* _OUSIA_MODEL_RANGE_SET_HPP_ */
-
diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp
deleted file mode 100644
index a0ca3aa..0000000
--- a/src/core/utils/Tokenizer.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <sstream>
-
-#include "Tokenizer.hpp"
-
-namespace ousia {
-namespace utils {
-
-static std::map<char, TokenTreeNode> buildChildren(
- const std::map<std::string, int> &inputs)
-{
- std::map<char, TokenTreeNode> children;
- std::map<char, std::map<std::string, int>> nexts;
-
- for (auto &e : inputs) {
- const std::string &s = e.first;
- const int id = e.second;
- if (s.empty()) {
- continue;
- }
- char start = s[0];
- const std::string suffix = s.substr(1);
- if (nexts.find(start) != nexts.end()) {
- nexts[start].insert(std::make_pair(suffix, id));
- } else {
- nexts.insert(std::make_pair(
- start, std::map<std::string, int>{{suffix, id}}));
- }
- }
-
- for (auto &n : nexts) {
- children.insert(std::make_pair(n.first, TokenTreeNode{n.second}));
- }
-
- return children;
-}
-
-static int buildId(const std::map<std::string, int> &inputs)
-{
- int tokenId = TOKEN_NONE;
- for (auto &e : inputs) {
- if (e.first.empty()) {
- if (tokenId != TOKEN_NONE) {
- throw TokenizerException{std::string{"Ambigous token found: "} +
- std::to_string(e.second)};
- } else {
- tokenId = e.second;
- }
- }
- }
- return tokenId;
-}
-
/**
 * Builds one node of the token tree: the children are constructed
 * recursively from the suffixes of the given specifications (buildChildren),
 * and tokenId is the id of a token ending exactly at this node, or
 * TOKEN_NONE (buildId).
 */
TokenTreeNode::TokenTreeNode(const std::map<std::string, int> &inputs)
    : children(buildChildren(inputs)), tokenId(buildId(inputs))
{
}
-
/**
 * Constructs a Tokenizer reading from the given BufferedCharReader and using
 * the given TokenTreeNode as root of the token specification tree. Both
 * arguments are stored as references, not copied, so they must outlive this
 * Tokenizer instance.
 */
Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root)
    : input(input), root(root)
{
}
-
/**
 * Scans the input until at least one new token has been appended to the
 * "peeked" deque (via doPrepare) or the input is exhausted.
 *
 * Plain characters are accumulated in "buffer". Whenever a character might
 * start a user-defined token (it is a child of the token tree root), the
 * tree is descended greedily to find the longest matching token. If a match
 * is found, any accumulated text is emitted first as a TOKEN_TEXT token;
 * otherwise the matched special token itself is emitted.
 *
 * @return true if a token was appended to the peek buffer, false if the
 * input ended without producing a further token.
 */
bool Tokenizer::prepare()
{
	std::stringstream buffer;
	char c;
	// Remember where the current (text) token started in the input.
	int startColumn = input.getColumn();
	int startLine = input.getLine();
	bool bufEmpty = true;
	while (input.peek(&c)) {
		if (root.children.find(c) != root.children.end()) {
			// if there might be a special token, keep peeking forward
			// until we find the token (or we don't).
			TokenTreeNode const *n = &root;
			std::stringstream tBuf;
			int match = TOKEN_NONE;
			while (true) {
				tBuf << c;
				n = &(n->children.at(c));
				if (n->tokenId != TOKEN_NONE) {
					match = n->tokenId;
					// from here on we found a token. If we have something
					// in our buffer already, we end the search now.
					if (!bufEmpty) {
						break;
					} else {
						// if we want to return this token ( = we have nothing
						// in our buffer yet) we look greedily for the longest
						// possible token we can construct.
						input.consumePeek();
					}
				}
				if (!input.peek(&c)) {
					// if we are at the end we break off the search.
					break;
				}
				if (n->children.find(c) == n->children.end()) {
					// if we do not find a possible continuation anymore,
					// break off the search.
					break;
				}
			}
			//reset the peek pointer to the last valid position.
			input.resetPeek();
			// check if we did indeed find a special token.
			if (match != TOKEN_NONE) {
				if (bufEmpty) {
					// if we did not have text before, construct that token.
					if (doPrepare(
					        Token{match, tBuf.str(), startColumn, startLine,
					              input.getColumn(), input.getLine()},
					        peeked)) {
						return true;
					} else {
						// doPrepare swallowed the token; restart the scan
						// at the current input position.
						startColumn = input.getColumn();
						startLine = input.getLine();
						continue;
					}
				} else {
					// otherwise we return the text before the token.
					if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn,
					                    startLine, input.getColumn(),
					                    input.getLine()},
					              peeked)) {
						return true;
					} else{
						//we need to clear the buffer here. After all the token
						//corresponding to this buffer segment is already
						//constructed.
						buffer.str(std::string());
						bufEmpty = true;
						startColumn = input.getColumn();
						startLine = input.getLine();
						continue;
					}
				}
			} else{
				//if we found nothing, read at least one character.
				input.peek(&c);
			}
		}
		// Plain text character: append it to the buffer and consume it.
		buffer << c;
		bufEmpty = false;
		input.consumePeek();
	}
	// End of input: flush any remaining text as a final TOKEN_TEXT token.
	if (!bufEmpty) {
		return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine,
		                       input.getColumn(), input.getLine()},
		                 peeked);
	}
	return false;
}
-
/**
 * Default implementation of the token hook: the token found by the basic
 * tokenizer is handed through to the peek buffer unchanged.
 *
 * @param t the token the basic tokenizer found.
 * @param peeked the deque of prepared tokens; t is appended to it.
 * @return always true, since exactly one token was appended.
 */
bool Tokenizer::doPrepare(const Token &t, std::deque<Token> &peeked)
{
	peeked.push_back(t);
	return true;
}
-
-bool Tokenizer::next(Token &t)
-{
- if (peeked.empty()) {
- if (!prepare()) {
- return false;
- }
- }
- t = peeked.front();
- peeked.pop_front();
- resetPeek();
- return true;
-}
-
-bool Tokenizer::peek(Token &t)
-{
- if (peekCursor >= peeked.size()) {
- if (!prepare()) {
- return false;
- }
- }
- t = peeked[peekCursor];
- return true;
-}
-
// Resets the peek cursor to the front of the peek buffer.
void Tokenizer::resetPeek() { peekCursor = 0; }
-
-void Tokenizer::consumePeek()
-{
- while (peekCursor > 0) {
- peeked.pop_front();
- peekCursor--;
- }
-}
-}
-}
diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp
deleted file mode 100644
index 2debc75..0000000
--- a/src/core/utils/Tokenizer.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _OUSIA_UTILS_TOKENIZER_HPP_
-#define _OUSIA_UTILS_TOKENIZER_HPP_
-
-#include <istream>
-#include <map>
-#include <deque>
-
-#include "BufferedCharReader.hpp"
-
-namespace ousia {
-namespace utils {
-
-/**
- * This exception is currently only thrown if errors are made during the
- * initialization of the Tokenizer. Have a closer look at the documentation
- * of the TokenTreeNode constructor for more information.
- */
class TokenizerException : public std::exception {
public:
	/**
	 * Message describing the cause of the exception.
	 */
	const std::string msg;

	/**
	 * @param msg is the message describing the cause of the exception.
	 * Marked explicit so a plain string cannot silently convert into an
	 * exception object.
	 */
	explicit TokenizerException(const std::string &msg) : msg(msg) {}

	const char *what() const noexcept override { return msg.c_str(); }
};
-
-/**
- * The Tokenizer internally uses a TokenTree to be efficiently able to identify
- * the longest consecutive token in the text. This is equivalent to a prefix
- * trie.
- *
- * The TokenTree is a construct that structures all special tokens this
- * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then
- * the TokenTree would look like this:
- *
- * a
- * | \
- * a $
- * | \
- * b c
- * | |
- * $ $
- *
- * Every node in the TokenTree is a valid end state that has a $ attached to it.
- * During the search algorithm the Tokenizer goes through the tree and stores
- * the last valid position. If a character follows that does not lead to a new
- * node in the TokenTree the search ends (and starts again at this character).
- * The token corresponding to the last valid position is returned.
- *
- * This allows us to uniquely identify the matching token given a certain
- * input text. Note that this is a greedy matching approach that does not
- * work if you're using truly ambiguous tokens (that have the same text).
- *
- * It is also not allowed that tokens have common middle parts but varying
- * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and
- * the input string "abc". In that case we start looking for "abd" at the
- * start, won't find it, and when we hit "c" we start the scanning process
- * anew. Thus the "bc" token is not found.
- *
- * For most (well-behaved) tokenization schemes this is not the case,
- * though.
- */
class TokenTreeNode {
public:
	/**
	 * Maps the next input character to the child node reached by consuming
	 * that character.
	 */
	const std::map<char, TokenTreeNode> children;

	/**
	 * Id of a token that ends exactly at this node, or TOKEN_NONE if no
	 * token ends here.
	 */
	const int tokenId;

	/**
	 * The TokenTreeNode constructor builds a TokenTree from the given token
	 * specifications. The node returned by this constructor then is the root of
	 * said TokenTree.
	 * @param inputs Specifications of tokens in map form. Each specification
	 * is a tuple of the text that should be matched and some unique ID (>= 0)
	 * that is returned to you if that Token is found in the text.
	 * An example for such a map would be
	 * {
	 *  { "#" , 1},
	 *  { "##", 2},
	 *  { "/" , 3}
	 * }
	 * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE
	 * (-1) and TOKEN_TEXT (-2).
	 */
	TokenTreeNode(const std::map<std::string, int> &inputs);
};
-
-/**
- * This is a reserved constant for the empty token.
- */
-static const int TOKEN_NONE = -1;
-/**
- * This is a reserved constant for every part of the input text that is not a
- * specified token.
- */
-static const int TOKEN_TEXT = -2;
-
-/**
- * A token for us is identified by an integer tokenID (either one of the
- * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants).
- * Additionally we return the matched text (which should only be really
- * interesting in case of TOKEN_TEXT tokens) and the position in the input text.
- */
-struct Token {
- int tokenId;
- std::string content;
- int startColumn;
- int startLine;
- int endColumn;
- int endLine;
-
- Token(int tokenId, std::string content, int startColumn, int startLine,
- int endColumn, int endLine)
- : tokenId(tokenId),
- content(content),
- startColumn(startColumn),
- startLine(startLine),
- endColumn(endColumn),
- endLine(endLine)
- {
- }
-
- Token() : tokenId(TOKEN_NONE) {}
-};
-
-/**
- * A Tokenizer has the purpose of subdividing an input text into tokens. In our
- * definition here we distinguish between two kinds of tokens:
- * 1.) User-specified tokens that match a fixed text.
- * 2.) Any other text between those tokens.
- * The user might want to specify the tokens '#{' and '#}' for example, because
- * they have some meaning in her code. The user sets the IDs to 1 and 2.
- * Given the input text
- * "some text #{ special command #} some text"
- * the tokenizer would return the tokens:
- * 1.) "some text " with the id TOKEN_TEXT (-2).
- * 2.) "#{" with the id 1.
- * 3.) " special command " with the id TOKEN_TEXT (-2).
- * 4.) "#}" with the id 2.
- * 5.) " some text" with the id TOKEN_TEXT (-2).
- * This makes the subsequent parsing of files of a specific type easier.
- * Note that in case of tokens that are prefixes of other tokens the
- * longest possible match is returned.
- */
-class Tokenizer {
-private:
- BufferedCharReader &input;
- const TokenTreeNode &root;
- std::deque<Token> peeked;
- unsigned int peekCursor = 0;
-
- bool prepare();
-
-protected:
- /**
- * This method is an interface to build multiple tokens from a single one in
- * derived classes. This might be interesting if you want to implement
- * further logic on text tokens or similar applications.
- *
- * @param t a Token the "basic" tokenizer found.
- * @param peeked a reference to the deque containing all temporary Tokens.
- * You are supposed to append your tokens there. In the trivial case you just
- * put the given Token on top of the deque.
- * @return false if no token was appended to the deque (meaning that you want
- * to ignore the given token explicitly) and true in all other cases.
- */
- virtual bool doPrepare(const Token &t, std::deque<Token> &peeked);
-
-public:
- /**
- * @param input The input of a Tokenizer is given in the form of a
- * BufferedCharReader. Please refer to the respective documentation.
- * @param root This is meant to be the root of a TokenTree giving the
- * specification of user-defined tokens this Tokenizer should recognize.
- * The Tokenizer promises to not change the TokenTree such that you can
- * re-use the same specification for multiple inputs.
- * Please refer to the TokenTreeNode documentation for more information.
- */
- Tokenizer(BufferedCharReader &input, const TokenTreeNode &root);
-
- /**
- * The next method consumes one Token from the input stream and gives
- * it to the user (stored in the input argument).
- *
- * @param t a Token reference that is set to the next found token.
- * @return true if a next token was found and false if the input is at its
- * end.
- */
- bool next(Token &t);
- /**
- * The peek method does not consume the next Token but buffers it and
- * shows it to the user (stored in the input argument).
- *
- * @param t a Token reference that is set to the next found token.
- * @return true if a next token was found and false if the input is at its
- * end.
- */
- bool peek(Token &t);
-
- /**
- * Resets the peek pointer to the current position in the stream (to the
- * beginning of the buffer).
- */
- void resetPeek();
-
- /**
- * Clears the peek buffer, such that all peeked Tokens are consumed.
- */
- void consumePeek();
-};
-}
-}
-
-#endif
diff --git a/src/core/utils/Utils.cpp b/src/core/utils/Utils.cpp
deleted file mode 100644
index 184fdd0..0000000
--- a/src/core/utils/Utils.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "Utils.hpp"
-
-namespace ousia {
-
-bool Utils::isIdentifier(const std::string &name)
-{
- bool first = true;
- for (char c : name) {
- if (first && !(isAlphabetic(c) || c == '_')) {
- return false;
- }
- if (first && !(isAlphanumeric(c) || c == '_' || c == '-')) {
- return false;
- }
- first = false;
- }
- return true;
-}
-
-}
-
diff --git a/src/core/utils/Utils.hpp b/src/core/utils/Utils.hpp
deleted file mode 100644
index 2fcd794..0000000
--- a/src/core/utils/Utils.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- Ousía
- Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef _OUSIA_UTILS_H_
-#define _OUSIA_UTILS_H_
-
-#include <string>
-
-namespace ousia {
-
class Utils {
public:
	/**
	 * Returns true if the given character is a latin letter, i.e. lies in
	 * the set [A-Za-z].
	 */
	static bool isAlphabetic(const char c)
	{
		return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
	}

	/**
	 * Returns true if the given character is a decimal digit, i.e. lies in
	 * the set [0-9].
	 */
	static bool isNumeric(const char c) { return '0' <= c && c <= '9'; }

	/**
	 * Returns true if the given character is a latin letter or a decimal
	 * digit, i.e. lies in the set [A-Za-z0-9].
	 */
	static bool isAlphanumeric(const char c)
	{
		return isAlphabetic(c) || isNumeric(c);
	}

	/**
	 * Returns true if the given string matches [A-Za-z_][A-Za-z0-9_-]*.
	 */
	static bool isIdentifier(const std::string &name);
};
-
-}
-
-#endif /* _OUSIA_UTILS_H_ */
-