From ca2040ba16459f5b3dfe8a86a14680eecea7ef7d Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Fri, 21 Nov 2014 10:51:11 +0100 Subject: According to my lords holy command I enacted his graces order of moving all peasants and lordlings, big and small, young and old, to our common land and inner sanctuary to be safe from outer threats by those usurpers. --- src/core/BufferedCharReader.cpp | 216 ++++++++++++++ src/core/BufferedCharReader.hpp | 240 ++++++++++++++++ src/core/CSSParser.cpp | 81 ++++++ src/core/CSSParser.hpp | 167 +++++++++++ src/core/CodeTokenizer.cpp | 166 +++++++++++ src/core/CodeTokenizer.hpp | 130 +++++++++ src/core/Node.cpp | 145 ++++++++++ src/core/Node.hpp | 518 ++++++++++++++++++++++++++++++++++ src/core/RangeSet.hpp | 326 +++++++++++++++++++++ src/core/Tokenizer.cpp | 212 ++++++++++++++ src/core/Tokenizer.hpp | 231 +++++++++++++++ src/core/Typesystem.hpp | 87 ++++++ src/core/Utils.cpp | 39 +++ src/core/Utils.hpp | 65 +++++ src/core/dom/Node.cpp | 145 ---------- src/core/dom/Node.hpp | 518 ---------------------------------- src/core/dom/Typesystem.hpp | 87 ------ src/core/utils/BufferedCharReader.cpp | 216 -------------- src/core/utils/BufferedCharReader.hpp | 240 ---------------- src/core/utils/CSSParser.cpp | 81 ------ src/core/utils/CSSParser.hpp | 167 ----------- src/core/utils/CodeTokenizer.cpp | 166 ----------- src/core/utils/CodeTokenizer.hpp | 130 --------- src/core/utils/RangeSet.hpp | 326 --------------------- src/core/utils/Tokenizer.cpp | 212 -------------- src/core/utils/Tokenizer.hpp | 231 --------------- src/core/utils/Utils.cpp | 39 --- src/core/utils/Utils.hpp | 65 ----- 28 files changed, 2623 insertions(+), 2623 deletions(-) create mode 100644 src/core/BufferedCharReader.cpp create mode 100644 src/core/BufferedCharReader.hpp create mode 100644 src/core/CSSParser.cpp create mode 100644 src/core/CSSParser.hpp create mode 100644 src/core/CodeTokenizer.cpp create mode 100644 src/core/CodeTokenizer.hpp create mode 100644 src/core/Node.cpp create mode 100644 src/core/Node.hpp create mode 100644 src/core/RangeSet.hpp create mode 100644 src/core/Tokenizer.cpp create mode 100644 src/core/Tokenizer.hpp create mode 100644 src/core/Typesystem.hpp create mode 100644 src/core/Utils.cpp create mode 100644 src/core/Utils.hpp delete mode 100644 src/core/dom/Node.cpp delete mode 100644 src/core/dom/Node.hpp delete mode 100644 src/core/dom/Typesystem.hpp delete mode 100644 src/core/utils/BufferedCharReader.cpp delete mode 100644 src/core/utils/BufferedCharReader.hpp delete mode 100644 src/core/utils/CSSParser.cpp delete mode 100644 src/core/utils/CSSParser.hpp delete mode 100644 src/core/utils/CodeTokenizer.cpp delete mode 100644 src/core/utils/CodeTokenizer.hpp delete mode 100644 src/core/utils/RangeSet.hpp delete mode 100644 src/core/utils/Tokenizer.cpp delete mode 100644 src/core/utils/Tokenizer.hpp delete mode 100644 src/core/utils/Utils.cpp delete mode 100644 src/core/utils/Utils.hpp (limited to 'src/core') diff --git a/src/core/BufferedCharReader.cpp b/src/core/BufferedCharReader.cpp new file mode 100644 index 0000000..c13628f --- /dev/null +++ b/src/core/BufferedCharReader.cpp @@ -0,0 +1,216 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "BufferedCharReader.hpp" + +namespace ousia { +namespace utils { + +// Constants used within the linebreak statemachine. +static const uint8_t LB_STATE_NONE = 0x00; +static const uint8_t LB_STATE_ONE = 0x01; +static const uint8_t LB_STATE_LF = 0x10; +static const uint8_t LB_STATE_CR = 0x20; +static const uint8_t LB_STATE_MASK_CNT = 0x0F; +static const uint8_t LB_STATE_MASK_TYPE = 0xF0; + +/******************************************************************************* + * Struct BufferedCharReader::ReadCursor + ******************************************************************************/ + +BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) : + destructive(destructive) +{ + reset(); +} + +void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor) +{ + this->line = cursor.line; + this->column = cursor.column; + this->bufferElem = cursor.bufferElem; + this->bufferPos = cursor.bufferPos; + this->lbState = cursor.lbState; +} + +void BufferedCharReader::ReadCursor::reset() +{ + this->line = 1; + this->column = 1; + this->bufferElem = 0; + this->bufferPos = 0; + this->lbState = LB_STATE_NONE; +} + +/******************************************************************************* + * Class BufferedCharReader + ******************************************************************************/ + +BufferedCharReader::BufferedCharReader() : + readCursor(true), peekCursor(false) +{ + reset(); +} + +void BufferedCharReader::reset() +{ + readCursor.reset(); + peekCursor.reset(); + buffer.clear(); + closed = false; +} + +bool BufferedCharReader::feed(const std::string &data) +{ + // Abort if the BufferedCharReader was closed + if (closed) { + return false; + } + + // Append the data onto the queue + buffer.push_back(data); + return true; +} + +void BufferedCharReader::close() +{ + closed = true; +} + +bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) +{ + // Handle line breaks, inserts breakes after the following character + // combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n + if ((*c == '\n') || (*c == '\r')) { + // Determine the type of the current linebreak character + const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR; + + // Read the last count and the last type from the state + const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT; + const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE; + + // Set the current linebreak type and counter in the state + cursor->lbState = ((lastCount + 1) & 1) | type; + + // If either this is the first instance of this character or the same + // return character is repeated + if (!lastCount || (lastType == type)) { + *c = '\n'; + return true; + } + return false; + } + + // Find the state + cursor->lbState = LB_STATE_NONE; + return true; +} + +bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, + char *c) +{ + bool hasChar = false; + while (!hasChar) { + // Abort if the current buffer element does not point to a valid entry + // in the buffer -- we must wait until another data block has been fed + // into the buffer + if (cursor->bufferElem >= buffer.size()) { + return false; + } + + // Fetch the current element the peek pointer points to + const std::string &data = buffer[cursor->bufferElem]; + + // Handle the "no data" case -- either in a destructive or + // non-destructive manner. + if (cursor->bufferPos >= data.length()) { + if (cursor->destructive) { + buffer.pop_front(); + } else { + cursor->bufferElem++; + } + cursor->bufferPos = 0; + continue; + } + + // Read the character, advance the buffer position + *c = *(data.data() + cursor->bufferPos); + cursor->bufferPos++; + + // Substitute linebreaks with a single LF (0x0A) + hasChar = substituteLinebreaks(cursor, c); + } + + // Update the position counter + if (*c == '\n') { + cursor->line++; + cursor->column = 1; + } else { + // Ignore UTF-8 continuation bytes + if (!((*c & 0x80) && !(*c & 0x40))) { + cursor->column++; + } + } + + return true; +} + +bool BufferedCharReader::peek(char *c) +{ + return readCharacterAtCursor(&peekCursor, c); +} + +bool BufferedCharReader::read(char *c) +{ + resetPeek(); + return readCharacterAtCursor(&readCursor, c); +} + +void BufferedCharReader::consumePeek() +{ + // Remove all no longer needed buffer elements + for (unsigned int i = 0; i < peekCursor.bufferElem; i++) { + buffer.pop_front(); + } + peekCursor.bufferElem = 0; + + // Copy the peek cursor to the read cursor + readCursor.assign(peekCursor); +} + +void BufferedCharReader::resetPeek() +{ + // Reset the peek cursor to the read cursor + peekCursor.assign(readCursor); +} + +bool BufferedCharReader::atEnd() +{ + if (closed) { + if (buffer.size() <= 0) { + return true; + } else if (buffer.size() == 1) { + return buffer[0].size() == readCursor.bufferPos; + } + } + return false; +} + +} +} + diff --git a/src/core/BufferedCharReader.hpp b/src/core/BufferedCharReader.hpp new file mode 100644 index 0000000..b13cde6 --- /dev/null +++ b/src/core/BufferedCharReader.hpp @@ -0,0 +1,240 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ +#define _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ + +#include +#include +#include + +namespace ousia { +namespace utils { + +/** + * The BufferedCharReader class is used for storing incomming data that + * is fed into the pipeline as well as reading/peeking single characters + * from that buffer. Additionally it counts the current column/row + * (with correct handling for UTF-8) and contains an internal state + * machine that handles the detection of linebreaks. + * + * Additionally the BufferedCharReader performs the following tasks: + * 1. Convert the incomming character encoding to UTF-8 (TODO: implement) + * 2. Convert arbitrary linebreaks to a single "\n" + */ +class BufferedCharReader { + +private: + + /** + * The ReadCursor structure is responsible for representing the read + * position within the text an all state machine states belonging to the + * cursor. There are two types of read cursors: destructive and + * non-destructive read cursors. + */ + struct ReadCursor { + /** + * Specifies whether this is a destructive cursor (bytes are discarded + * once they were read from the buffer). + */ + const bool destructive; + + /** + * The line the cursor currently points to. + */ + unsigned int line; + + /** + * The column the cursor currently points to. + */ + unsigned int column; + + /** + * The index of the element in the data buffer we're currently reading + * from. + */ + unsigned int bufferElem; + + /** + * The byte position within this data buffer. + */ + unsigned int bufferPos; + + /** + * State variable used in the internal state machine of the + * line feed detection. + */ + uint8_t lbState; + + /** + * Constructor of the ReadCursor structure. + * + * @param destructive specifies whether the ReadCursor is destructive + * (consumes all read characters, as used in the "read cursor") or + * non-destructive (as used in the "peek cursor"). + */ + ReadCursor(const bool destructive); + + /** + * Copys the data from another ReadCursor without overriding the + * "destructive" flag. + */ + void assign(const ReadCursor &cursor); + + /** + * Resets the cursor without changing the "destructive" flag. + */ + void reset(); + }; + + /** + * Queue containing the data that has been fed into the char reader. + */ + std::deque buffer; + + /** + * The read and the peek cursor. + */ + ReadCursor readCursor, peekCursor; + + /** + * Determines whether the reader has been closed. + */ + bool closed; + + /** + * Substitute any combination of linebreaks in the incomming code with "\n". + * Returns true if the current character is meant as output, false + * otherwise. + */ + bool substituteLinebreaks(ReadCursor *cursor, char *c); + + /** + * Reads a character from the input buffer and advances the given read + * cursor. + * + * @param cursor is a reference to the read cursor that should be used + * for reading. + * @param hasChar is set to true, if a character is available, false if + * no character is available (e.g. because line breaks are substituted or + * the end of a buffer boundary is reached -- in this case this function + * should be called again with the same parameters.) + * @param c is a output parameter, which will be set to the read character. + * @param returns true if there was enough data in the buffer, false + * otherwise. + */ + bool readCharacterAtCursor(ReadCursor *cursor, char *c); + + /** + * Function that is called for each read character -- updates the row and + * column count. + */ + void updatePositionCounters(const char c); + +public: + + /** + * Constructor of the buffered char reader class. + */ + BufferedCharReader(); + + /** + * Resets the reader to its initial state. + */ + void reset(); + + /** + * Feeds new data into the internal buffer of the BufferedCharReader + * class. + * + * @param data is a string containing the data that should be + * appended to the internal buffer. + * @return true if the operation was successful, false otherwise (e.g. + * because the reader is closed). + */ + bool feed(const std::string &data); + + /** + * Marks the end of the input, allowing successors in the pipeline + * to react properly (e.g. creating the end of stream token). + */ + void close(); + + /** + * Peeks a single character. If called multiple times, returns the + * character after the previously peeked character. + * + * @param c is a reference to the character to which the result should be + * writtern. + * @return true if the character was successfully read, false if there are + * no more characters to be read in the buffer. + */ + bool peek(char *c); + + /** + * Reads a character from the input data. If "peek" was called + * beforehand resets the peek pointer. + * + * @param c is a reference to the character to which the result should be + * writtern. + * @return true if the character was successfully read, false if there are + * no more characters to be read in the buffer. + */ + bool read(char *c); + + /** + * Advances the read pointer to the peek pointer -- so if the "peek" + * function was called, "read" will now return the character after + * the last peeked character. + */ + void consumePeek(); + + /** + * Resets the peek pointer to the "read" pointer. + */ + void resetPeek(); + + /** + * Returns true if there are no more characters as the stream was + * closed. + */ + bool atEnd(); + + /** + * Returns the current line (starting with one). + */ + inline int getLine() + { + return readCursor.line; + } + + /** + * Returns the current column (starting with one). + */ + inline int getColumn() + { + return readCursor.column; + } + +}; + +} +} + +#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */ + diff --git a/src/core/CSSParser.cpp b/src/core/CSSParser.cpp new file mode 100644 index 0000000..1763cc2 --- /dev/null +++ b/src/core/CSSParser.cpp @@ -0,0 +1,81 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "BufferedCharReader.hpp" +#include "CodeTokenizer.hpp" +#include "Tokenizer.hpp" + +#include "CSSParser.hpp" + +namespace ousia { +namespace utils { + +// CSS code tokens +static const int CURLY_OPEN = 1; +static const int CURLY_CLOSE = 2; +static const int COLON = 3; +static const int SEMICOLON = 4; +static const int HASH = 5; +static const int BRACKET_OPEN = 6; +static const int BRACKET_CLOSE = 7; +static const int PAREN_OPEN = 8; +static const int PAREN_CLOSE = 9; +// comments +static const int COMMENT = 100; +static const int COMMENT_OPEN = 101; +static const int COMMENT_CLOSE = 102; +// strings +static const int STRING = 200; +static const int SINGLE_QUOTE = 201; +static const int DOUBLE_QUOTE = 202; +static const int ESCAPE = 203; +// general syntax +static const int LINEBREAK = 300; + +static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, + {"}", CURLY_CLOSE}, + {":", COLON}, + {";", SEMICOLON}, + {"#", HASH}, + {"[", BRACKET_OPEN}, + {"]", BRACKET_CLOSE}, + {"(", PAREN_OPEN}, + {")", PAREN_CLOSE}, + {"/*", COMMENT_OPEN}, + {"*/", COMMENT_CLOSE}, + {"\\", ESCAPE}, + {"\''", SINGLE_QUOTE}, + {"\"", DOUBLE_QUOTE}, + {"\n", LINEBREAK}}}; + +static const std::map CSS_DESCRIPTORS = { + {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}}, + {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}}, + {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, + {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, + {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}}, + {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; + +StyleNode CSSParser::parse(BufferedCharReader &input) +{ + CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS}; + tokenizer.ignoreComments = true; + // TODO: implement +} +} +} diff --git a/src/core/CSSParser.hpp b/src/core/CSSParser.hpp new file mode 100644 index 0000000..c8b772d --- /dev/null +++ b/src/core/CSSParser.hpp @@ -0,0 +1,167 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_UTILS_CSS_PARSER_HPP_ +#define _OUSIA_UTILS_CSS_PARSER_HPP_ + +#include +#include +#include +#include + +#include +#include + +#include "BufferedCharReader.hpp" + +namespace ousia { +namespace utils { + +/* + * The Specificity or Precedence of a CSS RuleSet, which decides which + * rules are applied when different RuleSets contain conflicting information. + * + * The Specificity is calculated using the official W3C recommendation + * http://www.w3.org/TR/CSS2/cascade.html#specificity + * + * Note that we do not need to use the integer 'a', since we do not allow + * local style definitions for single nodes. + */ +struct Specificity { + int b; + int c; + int d; + + Specificity(int b, int c, int d) : b(b), c(c), d(d) {} +}; + +bool operator<(const Specificity &x, const Specificity &y) +{ + return std::tie(x.b, x.c, x.d) < std::tie(y.b, y.c, y.d); +} + +bool operator>(const Specificity &x, const Specificity &y) +{ + return std::tie(x.b, x.c, x.d) > std::tie(y.b, y.c, y.d); +} + +bool operator==(const Specificity &x, const Specificity &y) +{ + return std::tie(x.b, x.c, x.d) == std::tie(y.b, y.c, y.d); +} + +class RuleSet : public Managed { +private: + const std::map values; + const Specificity specificity; + +public: + RuleSet(Manager &mgr, std::map values, + Specificity specificity) + : Managed(mgr), values(std::move(values)), specificity(specificity) + { + } + + const std::map &getValues() const + { + return values; + } + + const Specificity &getSpecificity() const { return specificity; } +}; + +class PseudoSelector { +private: + const std::string name; + const std::vector args; + const bool generative; + +public: + PseudoSelector(std::string name, std::vector args, + bool generative) + : name(std::move(name)), args(std::move(args)), generative(generative) + { + } + + const std::string &getName() const { return name; } + + const std::vector &getArgs() const { return args; } + + const bool &isGenerative() const { return generative; } +}; + +enum class SelectionOperator { DESCENDANT, DIRECT_DESCENDANT }; + +class StyleNode : public dom::Node { +public: + class StyleEdge : public Managed { + private: + Owned target; + const SelectionOperator selectionOperator; + + public: + StyleEdge(Manager &mgr, Handle target, + SelectionOperator selectionOperator) + : Managed(mgr), + target(acquire(target)), + selectionOperator(selectionOperator) + { + } + + Rooted getTarget() const { return target; } + + const SelectionOperator &getSelectionOperator() const + { + return selectionOperator; + } + }; + +private: + const PseudoSelector pseudoSelector; + std::vector> edges; + const std::vector> ruleSets; + +public: + StyleNode(Manager &mgr, std::string name, + PseudoSelector pseudoSelector, + const std::vector> &edges, + const std::vector> &ruleSets) + : dom::Node(mgr, std::move(name)), + pseudoSelector(std::move(pseudoSelector)), + edges(acquire(edges)), + ruleSets(acquire(ruleSets)) + { + } + + const PseudoSelector &getPseudoSelector() const { return pseudoSelector; } + + const std::vector> &getEdges() const { return edges; } + + const std::vector> &getRuleSets() const { return ruleSets; } +}; + +class CSSParser { + +private: + +public: + StyleNode parse(BufferedCharReader &input); +}; +} +} +#endif diff --git a/src/core/CodeTokenizer.cpp b/src/core/CodeTokenizer.cpp new file mode 100644 index 0000000..e5b8610 --- /dev/null +++ b/src/core/CodeTokenizer.cpp @@ -0,0 +1,166 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include "CodeTokenizer.hpp" + +namespace ousia { +namespace utils { + +Token CodeTokenizer::constructToken(const Token &t) +{ + std::string content = buf.str(); + buf.str(std::string()); + return Token{returnTokenId, content, startToken.startColumn, + startToken.startLine, t.endColumn, t.endLine}; +} + +void CodeTokenizer::buffer(const Token &t) { buf << t.content; } + +bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked) +{ + auto it = descriptors.find(t.tokenId); + CodeTokenMode mode = CodeTokenMode::NONE; + if (it != descriptors.end()) { + mode = it->second.mode; + } + + if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) { + throw TokenizerException( + "We did not expect a multiline token (except linebreaks). Most " + "likely you did not add a linebreak token to your tokenizer!"); + } + + switch (state) { + case CodeTokenizerState::NORMAL: + switch (mode) { + case CodeTokenMode::STRING_START_END: + state = CodeTokenizerState::IN_STRING; + break; + case CodeTokenMode::BLOCK_COMMENT_START: + state = CodeTokenizerState::IN_BLOCK_COMMENT; + break; + case CodeTokenMode::LINE_COMMENT: + state = CodeTokenizerState::IN_LINE_COMMENT; + break; + case CodeTokenMode::LINEBREAK: + peeked.push_back({it->second.id, t.content, t.startColumn, + t.startLine, t.endColumn, t.endLine}); + return true; + default: + if (t.tokenId == TOKEN_TEXT) { + int begin = -1; + for (size_t c = 0; c < t.content.length(); c++) { + bool isWhitespace = + t.content[c] == ' ' || t.content[c] == '\t'; + if (begin < 0) { + // if we have not yet set our beginning, + // we wait for the first + // non-whitespace-character to set it. + if (!isWhitespace) { + begin = c; + } + } else { + // if we have set our beginning, we wait for the + // first whitespace character, which marks the + // end of the current word. + if (isWhitespace) { + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin, (int)c - begin), + t.startColumn + begin, t.startLine, + t.startColumn + (int)c, t.endLine}); + begin = -1; + } + } + } + if(begin >= 0){ + peeked.push_back(Token{ + TOKEN_TEXT, + t.content.substr(begin), + t.startColumn + begin, t.startLine, + t.endColumn, t.endLine}); + } + } else { + peeked.push_back(t); + } + return true; + } + startToken = t; + returnTokenId = it->second.id; + return false; + case CodeTokenizerState::IN_LINE_COMMENT: + switch (mode) { + case CodeTokenMode::LINEBREAK: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_BLOCK_COMMENT: + switch (mode) { + case CodeTokenMode::BLOCK_COMMENT_END: + state = CodeTokenizerState::NORMAL; + if (!ignoreComments) { + peeked.push_back(constructToken(t)); + } + return !ignoreComments; + default: + if (!ignoreComments) { + buffer(t); + } + return false; + } + case CodeTokenizerState::IN_STRING: + switch (mode) { + case CodeTokenMode::ESCAPE: + if (escaped) { + buffer(t); + } + escaped = !escaped; + return false; + case CodeTokenMode::STRING_START_END: + if (escaped) { + buffer(t); + escaped = false; + return false; + } else { + peeked.push_back(constructToken(t)); + state = CodeTokenizerState::NORMAL; + return true; + } + default: + if (escaped) { + // TODO: handle escaped characters? + escaped = false; + } + buffer(t); + return false; + } + } + assert(false); +} +} +} diff --git a/src/core/CodeTokenizer.hpp b/src/core/CodeTokenizer.hpp new file mode 100644 index 0000000..0fc0862 --- /dev/null +++ b/src/core/CodeTokenizer.hpp @@ -0,0 +1,130 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_ +#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_ + +#include +#include + +#include "BufferedCharReader.hpp" +#include "Tokenizer.hpp" + +namespace ousia { +namespace utils { + +/* + * This enum contains all special Token the CodeTokenizer supports, namely: + * + * 1.) An ambigous Tokens - in post programming languages single-quotes ' or + * double-quotes " - to delimit string tokens. + * 2.) A start token for line comments, which would e.g. be // in Java. + * 3.) A start token for a block comment + * 4.) An end token for a block comment. + * 5.) A linebreak token + * 6.) The escape token, which would e.g. be \ in java. + */ +enum class CodeTokenMode { + STRING_START_END, + LINE_COMMENT, + BLOCK_COMMENT_START, + BLOCK_COMMENT_END, + LINEBREAK, + ESCAPE, + NONE +}; + +/** + * A CodeTokenDescriptor defines the id the user likes to have returned for + * a Token of the mode specified, e.g. if you want to get the id 4 for a + * String Token the corresponding CodeTokenDescriptor would be inizialized + * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; + */ +struct CodeTokenDescriptor { + CodeTokenMode mode; + int id; + + CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} +}; + +/** + * The CodeTokenizer is a finite state machine with the states NORMAL, being + * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. + */ +enum class CodeTokenizerState { + NORMAL, + IN_BLOCK_COMMENT, + IN_LINE_COMMENT, + IN_STRING +}; + +/** + * The purpose of a CodeTokenizer is to make it easier to parse classical + * programming Code. It adds the following features to a regular Tokenizer: + * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens + * for the opening delimiter, the text and the closing delimiter. + * 2.) Escaping in String tokens. + * 3.) Comment Tokens (for line comments as well as block comments) + */ +class CodeTokenizer : public Tokenizer { +private: + std::map descriptors; + CodeTokenizerState state; + std::stringstream buf; + Token startToken; + int returnTokenId; + bool escaped = false; + + Token constructToken(const Token &t); + void buffer(const Token &t); + +protected: + bool doPrepare(const Token &t, std::deque &peeked) override; + +public: + /** + * If you do not want comment tokens to be returned you can set this to + * true. + */ + bool ignoreComments = false; + + /** + * + * @param input a BufferedCharReader containing the input for this + *tokenizer, + * as with a regular tokenizer. + * @param root a TokenTreeNode representing the root of the TokenTree. + * Please note that you have to specify all tokenIDs here that you use + * in the descriptors map. + * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. + * In this way you can specify the meaning of certain Tokens. Say you + * specified the Token "//" with the id 1 in the TokenTree. Then you could + * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map + * and this CodeTokenizer would recognize the token "//" as starting a + * line comment. + */ + CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, + std::map descriptors) + : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) + { + } +}; +} +} + +#endif diff --git a/src/core/Node.cpp b/src/core/Node.cpp new file mode 100644 index 0000000..c9651fb --- /dev/null +++ b/src/core/Node.cpp @@ -0,0 +1,145 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Node.hpp" + +namespace ousia { +namespace dom { + +/* Class Node */ + +void Node::setName(std::string name) +{ + // Call the name change event + NameChangeEvent ev{this->name, name}; + triggerEvent(ev); + + // Set the new name + this->name = std::move(name); +} + +void Node::path(std::vector &p) const +{ + if (!isRoot()) { + parent->path(p); + } + p.push_back(name); +} + +std::vector Node::path() const +{ + std::vector res; + path(res); + return res; +} + +void Node::doResolve(std::vector> &res, + const std::vector &path, Filter filter, + void *filterData, unsigned idx, VisitorSet &visited) +{ + // Do nothing in the default implementation +} + +int Node::resolve(std::vector> &res, + const std::vector &path, Filter filter, + void *filterData, unsigned idx, VisitorSet &visited, + const std::string *alias) +{ + // Abort if this node was already visited for this path index + std::pair recKey = std::make_pair(this, idx); + if (visited.find(recKey) != visited.end()) { + return res.size(); + } + visited.insert(recKey); + + // Check whether the we can continue the path + if (path[idx] == name || (alias && *alias == name)) { + // If we have reached the end of the path and the node is successfully + // tested by the filter function, add it to the result. Otherwise + // continue searching along the path + if (idx + 1 == path.size()) { + if (!filter || filter(this, filterData)) { + res.push_back(this); + } + } else { + doResolve(res, path, filter, filterData, idx + 1, visited); + } + } + + // Restart the search from here in order to find all possible nodes that can + // be matched to the given path + doResolve(res, path, filter, filterData, 0, visited); + + return res.size(); +} + +std::vector> Node::resolve(const std::vector &path, + Filter filter = nullptr, + void *filterData = nullptr) +{ + std::vector> res; + VisitorSet visited; + resolve(res, path, filter, filterData, 0, visited, nullptr); + return res; +} + +int Node::registerEventHandler(EventType type, EventHandler handler, + Handle owner, + bool includeChildren) +{ + const int id = handlerIdCounter++; + handlers.insert(std::make_pair( + type, + EventHandlerDescriptor{id, handler, owner, this, includeChildren})); + return id; +} + +bool Node::unregisterEventHandler(int id) { + for (auto it = handlers.begin(); it != handlers.end(); it++) { + if (it->second.id == id) { + handlers.erase(it); + return true; + } + } + return false; +} + +bool Node::triggerEvent(Event &event, bool fromChild) { + bool res = false; + // Iterate over all event handlers + const auto range = handlers.equal_range(event.type); + for (auto it = range.first; it != range.second; it++) { + // Fetch a reference to the descriptor, check whether it should be + // called for bubbled events + EventHandlerDescriptor descr = it->second; + if (!fromChild || descr.includeChildren) { + descr.handler(event, descr.owner); + res = true; + } + } + + // If possible, let the event bubble up to the parent node + if (event.canBubble() && !parent.isNull()) { + res = parent->triggerEvent(event, true) | res; + } + return res; +} + +} +} + diff --git a/src/core/Node.hpp b/src/core/Node.hpp new file mode 100644 index 0000000..249d1f2 --- /dev/null +++ b/src/core/Node.hpp @@ -0,0 +1,518 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_DOM_NODE_HPP_ +#define _OUSIA_DOM_NODE_HPP_ + +#include +#include +#include +#include +#include + +#include + +namespace ousia { +namespace dom { + +/* Forward declarations */ +class Node; +class Event; + +/** + * EventType is an enum containing all possible node events. New event types + * should be added here. + */ +enum class EventType : int { + /** + * Generic update event which may be triggered if some important property + * of the node is changed. + */ + UPDATE, + + /** + * The NAME_CHANGE event is used to inform listeners that the name of the + * node has changed. + */ + NAME_CHANGE, + + /** + * The ADD_CHILD event is used to inform listeners that the node got a new + * child in any of its child node lists. + */ + ADD_CHILD, + + /** + * The DELETE_CHILD event is used to inform listeners that the node got a + * new child in any of its child node lists. + */ + DELETE_CHILD +}; + +/** + * Definition of the EventHandler function. + * + * @param event is a reference to the object holding the event data. + * @param owner is a reference to the managed object that was given in the + * registerEventHandler function. + */ +using EventHandler = void (*)(const Event &event, Handle owner); + +/** + * The Event class and its child classes are responsible for containing the + * actual event data which further describes the event to the event handlers. + * Instances of this class and its children must be declared on the stack or as + * a temporary. + */ +class Event { +private: + /** + * True as long as the event can bubble up the node hirarchy. + */ + mutable bool bubble; + +public: + /** + * Contains the actual event type of this class. + */ + const EventType type; + + /** + * Node on which the event was triggered. + */ + Rooted sender; + + /** + * Constructor of the Event class. + * + * @param type is an element from the EventType enum. + * @param bubble if set to true, the event can bubble up the node hirarchy. + */ + Event(EventType type, bool bubble = true) : bubble(bubble), type(type){}; + + /** + * Delete the copy constructor. + */ + Event(const Event &) = delete; + + /** + * Delete the assignment operator. + */ + Event &operator=(const Event &) = delete; + + /** + * Stops the propagation of this event to the parent element. + */ + void stopPropagation() const { bubble = false; } + + /** + * Returns true if the event can still bubble. + */ + bool canBubble() const { return bubble; } +}; + +/** + * Event used when the name of a node has changed. + */ +class NameChangeEvent : public Event { +public: + /** + * Reference to a string containing the old name of the node. + */ + const std::string &oldName; + + /** + * Reference to a string containing the new name of the node. + */ + const std::string &newName; + + /** + * Constructor of the NameChangeEvent class. + * + * @param oldName is a reference to a string containing the old name of the + * node. + * @param newName is a reference to a string containing the new name of the + * node. + * @param bubble if set to true, the event can bubble up the node hirarchy. + */ + NameChangeEvent(const std::string &oldName, const std::string &newName, + bool bubble = true) + : Event(EventType::NAME_CHANGE, bubble), + oldName(oldName), + newName(newName) + { + } +}; + +/** + * Struct containing the data which describes a single registered event handler. + * Note that the event type (e.g. which type of event this element was + * registered for) is stored outside the EventHandlerDescriptor (in the map + * storing the registered event handlers). + */ +struct EventHandlerDescriptor { + /** + * Unique id of the event handler. + */ + const int id; + + /** + * Reference to the event handler containing the events. + */ + const EventHandler handler; + + /** + * Reference to the managed element which owns the event handler. The object + * which owns the Owned handler is given in the constructor. + */ + const Owned owner; + + /** + * Set to true, if this event handler listens to bubbled events comming from + * child nodes. + */ + const bool includeChildren; + + /** + * Constructor of the EventHandlerDescriptor struct. + * + * @param id is the node-unique id of the EventHandlerDescriptor. + * @param handler is the function pointer which is going to be called once + * the associated event handler has fired. + * @param owner is a user-specified object which owns the method that is + * going to be called. This can be used to make sure that the method which + * handles the events has access to its owned object as long as the event + * handler lives. + * @param parent is the parent element this descriptor belongs to. The + * a handle to the "owner" object will be created on behalf of the parent. + * @param includeChildren is set to true if the event handler should handle + * events comming from child elements. + */ + EventHandlerDescriptor(int id, EventHandler handler, Handle owner, + Managed *parent, bool includeChildren) + : id(id), + handler(handler), + owner(owner, parent), + includeChildren(includeChildren) + { + } +}; + +/** + * The Node class builds the base class for any Node within the DOM graph. A + * node may either be a descriptive node (such as a domain description etc.) + * or a document element. Each node is identified by acharacteristic name and + * a parent element. Note that the node name is not required to be unique. Nodes + * without parent are considered root nodes. + */ +class Node : public Managed { +public: + /** + * The Filter function is used when resolving names to Node instances. The + * filter tests whether the given node meets the requirements for inclusion + * in the result list. + * + * @param node is the node which should be tested. + * @param data is user-defined data passed to the filter. + * @return true if the node should be included in the result set, false + * otherwise. + */ + using Filter = bool (*)(Handle node, void *data); + + /** + * Hash functional used to convert pairs of nodes and int to hashes which + * can be used within a unordered_set. + */ + struct VisitorHash { + size_t operator()(const std::pair &p) const + { + const std::hash nodeHash; + const std::hash intHash; + return nodeHash(p.first) + 37 * intHash(p.second); + } + }; + + /** + * Alias for the VisitorSet class which represents all nodes which have been + * visited in the name resolution process. The map stores pairs of node + * pointers and integers, indicating for which path start id the node has + * already been visited. + */ + using VisitorSet = + std::unordered_set, VisitorHash>; + +private: + /** + * Name of the node. As names are always looked up relative to a node, + * names are not required to be unique. + */ + std::string name; + + /** + * Reference to a parent node instace. + */ + Owned parent; + + /** + * Current id counter. The id counter may be used to create ids which are + * unique inside the realm of this manager instance. + */ + int handlerIdCounter = 0; + + /** + * Multimap containing all registered event handlers for this node. + */ + std::multimap handlers; + + /** + * Private version of the "path" function used to construct the path. Calls + * the path function of the parent node and adds the own name to the given + * vector. + * + * @param p is the list the path should be constructed in. + */ + void path(std::vector &p) const; + +protected: + /** + * Function which should be overwritten by derived classes in order to + * resolve node names to a list of possible nodes. The implementations of + * this function do not need to do anything but call the "resovle" function + * of any child instance of NamedNode. + * + * @param res is the result list containing all possible nodes matching the + * name specified in the path. + * @param path is a list specifying a path of node names which is meant to + * specify a certain named node. + * @param idx is the current index in the path. + * @param visited is a map which is used to prevent unwanted recursion. + * @param filter is a callback function which may check whether a certain + * node should be in the result set. If nullptr is given, all nodes matching + * the path are included. The filter function can be used to restrict the + * type of matched functions. + * @param filterData is user-defined data that should be passed to the + * filter. + */ + virtual void doResolve(std::vector> &res, + const std::vector &path, Filter filter, + void *filterData, unsigned idx, VisitorSet &visited); + +public: + /** + * Initializes the node with empty name and parent. + * + * @param mgr is a reference to the Manager instace the node belongs to. + */ + Node(Manager &mgr, Handle parent = nullptr) + : Managed(mgr), parent(acquire(parent)) + { + } + + /** + * Constructs a new node with the given name and the given parent element. + * + * @param mgr is a reference to the Manager instace the node belongs to. + * @param name is the name of the Node. + * @param parent is a handle pointing at the parent node. + */ + Node(Manager &mgr, std::string name, Handle parent = nullptr) + : Managed(mgr), name(name), parent(acquire(parent)) + { + } + + /** + * Sets the name of the node to the given name. Note: The name set here may + * be invalid (contain spaces, colons or other special characters). However, + * in this case the node will not be reachable as reference from a input + * document. This behaviour allows for gracefully degradation in error + * cases. + * + * @param name is the name that should be assigned to the node. + */ + void setName(std::string name); + + /** + * Returns the name of the node. + */ + std::string getName() const { return name; } + + /** + * Specifies whether the node has a name, e.g. whether the current name is + * not empty. + * + * @return true if the name of this node is not empty, false otherwise. + */ + bool hasName() const { return !name.empty(); } + + /** + * Sets the parent node. + * + * @param parent is a Handle to the parent node. + */ + void setParent(Handle parent) { this->parent = acquire(parent); } + + /** + * Returns a handle to the parent node of the Node instance. + * + * @return a handle to the root node. + */ + Rooted getParent() const { return parent; } + + /** + * Returns true, if the node does not have a parent. Root nodes may either + * be the root element of the complete DOM tree + * + * @return true if the node is a root node (has no parent) or false if the + * node is no root node (has a parent). + */ + bool isRoot() const { return parent.isNull(); } + + /** + * Returns the vector containing the complete path to this node (including + * the name of the parent nodes). + * + * @return a vector containing the path (starting with the root node) to + * this node as a list of names. + */ + std::vector path() const; + + /** + * Function which resolves a name path to a list of possible nodes. + * + * @param res is the result list containing all possible nodes matching the + * name specified in the path. + * @param path is a list specifying a path of node names which is meant to + * specify a certain named node. + * @param filter is a callback function which may check whether a certain + * node should be in the result set. If nullptr is given, all nodes matching + * the path are included. The filter function can be used to restrict the + * type of matched functions. + * @param filterData is user-defined data that should be passed to the + * filter. + * @param idx is the current index in the path. + * @param visited is a map which is used to prevent unwanted recursion. + * @param alias is a pointer at a string which contains an alternative name + * for this node. If nullptr is given, not such alternative name is + * provided. + * @return the number of elements in the result list. + */ + int resolve(std::vector> &res, + const std::vector &path, Filter filter, + void *filterData, unsigned idx, VisitorSet &visited, + const std::string *alias); + + /** + * Function which resolves a name path to a list of possible nodes starting + * from this node. + * + * @param path is a list specifying a path of node names meant to specify a + * certain named node. + * @param filter is a callback function which may check whether a certain + * node should be in the result set. If nullptr is given, all nodes matching + * the path are included. The filter function can e.g. be used to restrict + * the type of matched functions. + * @param filterData is user-defined data that should be passed to the + * filter. + * @return a vector containing all found node references. + */ + std::vector> resolve(const std::vector &path, + Filter filter, void *filterData); + + /** + * Function which resolves a name path to a list of possible nodes starting + * from this node. + * + * @param path is a list specifying a path of node names meant to specify a + * certain named node. + * @return a vector containing all found node references. + */ + std::vector> resolve(const std::vector &path) + { + return resolve(path, nullptr, nullptr); + } + + /** + * Function which resolves a single name to a list of possible nodes + * starting from this node. + * + * @param name is the name which should be resolved. + * @param filter is a callback function which may check whether a certain + * node should be in the result set. If nullptr is given, all nodes matching + * the path are included. The filter function can e.g. be used to restrict + * the type of matched functions. + * @param filterData is user-defined data that should be passed to the + * filter. + * @return a vector containing all found node references. + */ + std::vector> resolve(const char *, Filter filter, + void *filterData) + { + return resolve(std::vector{name}, filter, filterData); + } + + /** + * Function which resolves a single name to a list of possible nodes + * starting from this node. + * + * @param name is the name which should be resolved. + * @return a vector containing all found node references. + */ + std::vector> resolve(const std::string &name) + { + return resolve(std::vector{name}, nullptr, nullptr); + } + + /** + * Registers a new event handler for listening to the given event type. + * + * @param type is the event type the handler should listen to. + * @param handler is the handler that should be called. + * @param owner is an object the handler belongs to. May be nullptr. + * @param includeChildren if set to true, the event handler is also called + * if the same event is triggered on one of the child nodes. + * @return a unique event handler. + */ + int registerEventHandler(EventType type, EventHandler handler, + Handle owner = nullptr, + bool includeChildren = false); + + /** + * Unregisters the given event handler from the node. Note that removing an + * event handler has linear time. + * + * @param id is the unique event handler id. + * @return true if the given event handler was successfully unregistered. + */ + bool unregisterEventHandler(int id); + + /** + * Triggers an event on this node. + * + * @param event is a pointer at the event that should be triggered. The + * calling function has ownership over the given event. + * @param fromChild is set to true if the triggerEvent function is called + * from a child node. + * @return true if any event handler was found. + */ + bool triggerEvent(Event &event, bool fromChild = false); +}; +} +} + +#endif /* _OUSIA_DOM_NODE_HPP_ */ + diff --git a/src/core/RangeSet.hpp b/src/core/RangeSet.hpp new file mode 100644 index 0000000..841d476 --- /dev/null +++ b/src/core/RangeSet.hpp @@ -0,0 +1,326 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_MODEL_RANGE_SET_HPP_ +#define _OUSIA_MODEL_RANGE_SET_HPP_ + +#include +#include + +namespace ousia { +namespace model { +/** + * The Range structure represents an interval of numerical values of type T. + */ +template +struct Range { + /** + * Start is the start value of the range. + */ + T start; + + /** + * End is the end value of the range (inclusively). + */ + T end; + + /** + * Default constructor of the range class. The range is initialized as + * invalid, with start being set to the maximum possible value of the + * numerical type T, and end being set to the minimum possible value. + */ + Range() : + start(std::numeric_limits::max()), + end(std::numeric_limits::min()) + { + // Do nothing here + } + + /** + * Copies the given start and end value. The given values are not checked + * for validity. Use the "isValid" + * + * @param start is the minimum value the range still covers. + * @param end is the maximum value the range still covers. + */ + Range(const T &start, const T &end) : + start(start), end(end) + { + // Do nothing here + } + + /** + * Creates a range that covers exactly one element, namely the value given + * as parameter n. + */ + Range(const T &n) : + start(n), end(n) + { + // Do nothing here + } + + /** + * Returns true if this range is valid, e.g. its start value is smaller or + * equal to its end value. + * + * @return true if start is smaller or equal to end, false otherwise. + */ + bool isValid() const + { + return start <= end; + } + + /** + * Checks whether the given value lies inside the range. + * + * @param v is the value that is being checked. + * @return true if the value lies within the range, false otherwise. + */ + bool inRange(T v) const + { + return (v >= start) && (v <= end); + } + + /** + * Checks whether the given range overlaps with another range. Not that + * this check is only meaningful if both ranges are valid. + * + * @param r is the range that should be checked for overlapping with this + * range. + */ + bool overlaps(const Range &r) const + { + return (((r.start >= start) || (r.end >= start)) + && ((r.start <= end) || (r.end <= end))); + } + + /** + * Returns true if the two given ranges are neighbours (their limits only + * differ in the smallest representable difference between them). + */ + bool neighbours(const Range &r) const + { + constexpr T eps = std::numeric_limits::is_integer + ? 1 : std::numeric_limits::epsilon(); + return ((r.start > end) && ((r.start - eps) <= end)) + || ((r.end < start) && ((r.end + eps) >= start)); + } + + /** + * Checks whether the given range completely covers this range. + */ + bool coveredBy(const Range &r) const + { + return (r.start <= start) && (r.end >= end); + } + + /** + * Checks whether this range completely covers the given range. + */ + bool covers(const Range &r) const + { + return r.coveredBy(*this); + } + + /** + * Calculates the union of the two ranges -- not that this operation is only + * valid if the ranges overlapp. Use the RangeSet class if you cannot + * guarantee that. + */ + Range merge(const Range &r) const + { + return Range(std::min(start, r.start), std::max(end, r.end)); + } + + /** + * Returns a range that represents the spans the complete set defined by the + * given type T. + */ + static Range typeRange() + { + return Range(std::numeric_limits::min(), + std::numeric_limits::max()); + } + + /** + * Returns a range that represents the spans the complete set defined by the + * given type T up to a given value. + * + * @param till is the value up to which the range should be defined (till is + * included in the set). + */ + static Range typeRangeUntil(const T &till) + { + return Range(std::numeric_limits::min(), till); + } + + /** + * Returns a range that represents the spans the complete set defined by the + * given type T up to a given value. + * + * @param from is the value from which the range should be defined (from is + * included in the set). + */ + static Range typeRangeFrom(const T &from) + { + return Range(from, std::numeric_limits::max()); + } +}; + +/** + * RangeComp is a comperator used to order to sort the ranges within the + * ranges list. Sorts by the start element. + */ +template +struct RangeComp { + bool operator() (const Range& lhs, const Range& rhs) const + { + return lhs.start < rhs.start; + } +}; + +/** + * RangeSet represents a set of ranges of the given numerical type and is thus + * capable of representing any possible subset of the given numerical type T. + */ +template +class RangeSet { + +protected: + /** + * Set of ranges used internally. + */ + std::set, RangeComp> ranges; + + /** + * Returns an iterator to the first element in the ranges list that overlaps + * with the given range. + * + * @param r is the range for which the first overlapping element should be + * found. + * @return an iterator pointing to the first overlapping element or to the + * end of the list if no such element was found. + */ + typename std::set, RangeComp>::iterator firstOverlapping( + const Range &r, const bool allowNeighbours) + { + // Find the element with the next larger start value compared to the + // start value given in r. + auto it = ranges.upper_bound(r); + + // Go back one element + if (it != ranges.begin()) { + it--; + } + + // Iterate until an overlapping element is found + while (!(it->overlaps(r) || (allowNeighbours && it->neighbours(r))) + && (it != ranges.end())) { + it++; + } + return it; + } + +public: + /** + * Calculates the union of this range set and the given range. + * + * @param range is the range that should be merged into this range set. + */ + void merge(Range r) + { + // Calculate a new range that covers both the new range and all old + // ranges in the set -- delete all old elements on the way + auto it = firstOverlapping(r, true); + while ((it->overlaps(r) || it->neighbours(r)) && it != ranges.end()) { + r = r.merge(*it); + it = ranges.erase(it); + } + + // Insert the new range + ranges.insert(r); + } + + /** + * Calculates the union of this range set and the given range set. + * + * @param ranges is another range set for which the union with this set + * should be calculated. + */ + void merge(const RangeSet &s) + { + for (Range &r : s.ranges) { + merge(r); + } + } + + /** + * Checks whether this range set S contains the given range R: + * S u R = R + * (The intersection between R and S equals the given range) + * + * @param r is the range for which the containment should be checked. + * @return true if the above condition is met, false otherwise. + */ + bool contains(const Range &r) + { + auto it = firstOverlapping(r, false); + if (it != ranges.end()) { + return (*it).covers(r); + } + return false; + } + + /** + * Checks whether this range set S1 contains the given range set S2: + * + * @param s is the range for which the containment should be checked. + * @return true if the above condition is met, false otherwise. + */ + bool contains(const RangeSet &s) + { + bool res = true; + for (Range &r : s.ranges) { + res = res && contains(r); + } + return res; + } + + /** + * Empties the set. + */ + void clear() + { + ranges.clear(); + } + + /** + * Returns the current list of ranges as a const reference. + */ + const std::set, RangeComp>& getRanges() + { + return this->ranges; + } + +}; + +} +} + +#endif /* _OUSIA_MODEL_RANGE_SET_HPP_ */ + diff --git a/src/core/Tokenizer.cpp b/src/core/Tokenizer.cpp new file mode 100644 index 0000000..a0ca3aa --- /dev/null +++ b/src/core/Tokenizer.cpp @@ -0,0 +1,212 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include "Tokenizer.hpp" + +namespace ousia { +namespace utils { + +static std::map buildChildren( + const std::map &inputs) +{ + std::map children; + std::map> nexts; + + for (auto &e : inputs) { + const std::string &s = e.first; + const int id = e.second; + if (s.empty()) { + continue; + } + char start = s[0]; + const std::string suffix = s.substr(1); + if (nexts.find(start) != nexts.end()) { + nexts[start].insert(std::make_pair(suffix, id)); + } else { + nexts.insert(std::make_pair( + start, std::map{{suffix, id}})); + } + } + + for (auto &n : nexts) { + children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); + } + + return children; +} + +static int buildId(const std::map &inputs) +{ + int tokenId = TOKEN_NONE; + for (auto &e : inputs) { + if (e.first.empty()) { + if (tokenId != TOKEN_NONE) { + throw TokenizerException{std::string{"Ambigous token found: "} + + std::to_string(e.second)}; + } else { + tokenId = e.second; + } + } + } + return tokenId; +} + +TokenTreeNode::TokenTreeNode(const std::map &inputs) + : children(buildChildren(inputs)), tokenId(buildId(inputs)) +{ +} + +Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root) + : input(input), root(root) +{ +} + +bool Tokenizer::prepare() +{ + std::stringstream buffer; + char c; + int startColumn = input.getColumn(); + int startLine = input.getLine(); + bool bufEmpty = true; + while (input.peek(&c)) { + if (root.children.find(c) != root.children.end()) { + // if there might be a special token, keep peeking forward + // until we find the token (or we don't). + TokenTreeNode const *n = &root; + std::stringstream tBuf; + int match = TOKEN_NONE; + while (true) { + tBuf << c; + n = &(n->children.at(c)); + if (n->tokenId != TOKEN_NONE) { + match = n->tokenId; + // from here on we found a token. If we have something + // in our buffer already, we end the search now. + if (!bufEmpty) { + break; + } else { + // if we want to return this token ( = we have nothing + // in our buffer yet) we look greedily for the longest + // possible token we can construct. + input.consumePeek(); + } + } + if (!input.peek(&c)) { + // if we are at the end we break off the search. + break; + } + if (n->children.find(c) == n->children.end()) { + // if we do not find a possible continuation anymore, + // break off the search. + break; + } + } + //reset the peek pointer to the last valid position. + input.resetPeek(); + // check if we did indeed find a special token. + if (match != TOKEN_NONE) { + if (bufEmpty) { + // if we did not have text before, construct that token. + if (doPrepare( + Token{match, tBuf.str(), startColumn, startLine, + input.getColumn(), input.getLine()}, + peeked)) { + return true; + } else { + startColumn = input.getColumn(); + startLine = input.getLine(); + continue; + } + } else { + // otherwise we return the text before the token. + if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, + startLine, input.getColumn(), + input.getLine()}, + peeked)) { + return true; + } else{ + //we need to clear the buffer here. After all the token + //corresponding to this buffer segment is already + //constructed. + buffer.str(std::string()); + bufEmpty = true; + startColumn = input.getColumn(); + startLine = input.getLine(); + continue; + } + } + } else{ + //if we found nothing, read at least one character. + input.peek(&c); + } + } + buffer << c; + bufEmpty = false; + input.consumePeek(); + } + if (!bufEmpty) { + return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, + input.getColumn(), input.getLine()}, + peeked); + } + return false; +} + +bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) +{ + peeked.push_back(t); + return true; +} + +bool Tokenizer::next(Token &t) +{ + if (peeked.empty()) { + if (!prepare()) { + return false; + } + } + t = peeked.front(); + peeked.pop_front(); + resetPeek(); + return true; +} + +bool Tokenizer::peek(Token &t) +{ + if (peekCursor >= peeked.size()) { + if (!prepare()) { + return false; + } + } + t = peeked[peekCursor]; + return true; +} + +void Tokenizer::resetPeek() { peekCursor = 0; } + +void Tokenizer::consumePeek() +{ + while (peekCursor > 0) { + peeked.pop_front(); + peekCursor--; + } +} +} +} diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp new file mode 100644 index 0000000..2debc75 --- /dev/null +++ b/src/core/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_UTILS_TOKENIZER_HPP_ +#define _OUSIA_UTILS_TOKENIZER_HPP_ + +#include +#include +#include + +#include "BufferedCharReader.hpp" + +namespace ousia { +namespace utils { + +/** + * This exception is currently only thrown if errors are made during the + * initialization of the Tokenizer. Have a closer look at the documentation + * of the TokenTreeNode constructor for more information. + */ +class TokenizerException : public std::exception { +public: + const std::string msg; + + TokenizerException(const std::string &msg) : msg(msg){}; + + virtual const char *what() const noexcept override { return msg.c_str(); } +}; + +/** + * The Tokenizer internally uses a TokenTree to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * The TokenTree is a construct that structures all special tokens this + * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then + * the TokenTree would look like this: + * + * a + * | \ + * a $ + * | \ + * b c + * | | + * $ $ + * + * Every node in the TokenTree is a valid end state that has a $ attached to it. + * During the search algorithm the Tokenizer goes through the tree and stores + * the last valid position. If a character follows that does not lead to a new + * node in the TokenTree the search ends (and starts again at this character). + * The token corresponding to the last valid position is returned. + * + * This allows us to uniquely identify the matching token given a certain + * input text. Note that this is a greedy matching approach that does not + * work if you're using truly ambiguous tokens (that have the same text). + * + * It is also not allowed that tokens have common middle parts but varying + * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and + * the input string "abc". In that case we start looking for "abd" at the + * start, won't find it, wenn we hit "c" and start the scanning process + * anew. Thus the "bc" token is not found. + * + * For most (well-behaved) tokenization schemes this is not the case, + * though. + */ +class TokenTreeNode { +public: + const std::map children; + const int tokenId; + + /** + * The TokenTreeNode constructor builds a TokenTree from the given token + * specifications. The node returned by this constructor then is the root of + * said TokenTree. + * @param inputs Specifications of tokens in map form. Each specification + * is a tuple of the text that should be matched and some unique ID (>= 0) + * that is returned to you if that Token is found in the text. + * An example for such a map would be + * { + * { "#" , 1}, + * { "##", 2}, + * { "/" , 3} + * } + * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE + * (-1) and TOKEN_TEXT (-2). + */ + TokenTreeNode(const std::map &inputs); +}; + +/** + * This is a reserved constant for the empty token. + */ +static const int TOKEN_NONE = -1; +/** + * This is a reserved constant for every part of the input text that is not a + * specified token. + */ +static const int TOKEN_TEXT = -2; + +/** + * A token for us is identified by an integer tokenID (either one of the + * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). + * Additionally we return the matched text (which should only be really + * interesting in case of TOKEN_TEXT tokens) and the position in the input text. + */ +struct Token { + int tokenId; + std::string content; + int startColumn; + int startLine; + int endColumn; + int endLine; + + Token(int tokenId, std::string content, int startColumn, int startLine, + int endColumn, int endLine) + : tokenId(tokenId), + content(content), + startColumn(startColumn), + startLine(startLine), + endColumn(endColumn), + endLine(endLine) + { + } + + Token() : tokenId(TOKEN_NONE) {} +}; + +/** + * A Tokenizer has the purpose of subdividing an input text into tokens. In our + * definition here we distinguish between two kinds of tokens: + * 1.) User-specified tokens that match a fixed text. + * 2.) Any other text between those tokens. + * The user might want to specify the tokens '#{' and '#}' for example, because + * they have some meaning in her code. The user sets the IDs to 1 and 2. + * Given the input text + * "some text #{ special command #} some text" + * the tokenizer would return the tokens: + * 1.) "some text " with the id TOKEN_TEXT (-2). + * 2.) "#{" with the id 1. + * 3.) " special command " with the id TOKEN_TEXT (-2). + * 4.) "#}" with the id 2. + * 5.) " some text" with the id TOKEN_TEXT (-2). + * This makes the subsequent parsing of files of a specific type easier. + * Note that in case of tokens with that are prefixes of other tokens the + * longest possible match is returned. + */ +class Tokenizer { +private: + BufferedCharReader &input; + const TokenTreeNode &root; + std::deque peeked; + unsigned int peekCursor = 0; + + bool prepare(); + +protected: + /** + * This method is an interface to build multiple tokens from a single one in + * derived classes. This might be interesting if you want to implement + * further logic on text tokens or similar applications. + * + * @param t a Token the "basic" tokenizer found. + * @param peeked a reference to the deque containing all temporary Tokens. + * You are supposed to append your tokens there. In the trivial case you just + * put the given Token on top of the deque. + * @return false if no token was appended to the deque (meaning that you want + * to ignore the given token explicitly) and true in all other cases. + */ + virtual bool doPrepare(const Token &t, std::deque &peeked); + +public: + /** + * @param input The input of a Tokenizer is given in the form of a + * BufferedCharReader. Please refer to the respective documentation. + * @param root This is meant to be the root of a TokenTree giving the + * specification of user-defined tokens this Tokenizer should recognize. + * The Tokenizer promises to not change the TokenTree such that you can + * re-use the same specification for multiple inputs. + * Please refer to the TokenTreeNode documentation for more information. + */ + Tokenizer(BufferedCharReader &input, const TokenTreeNode &root); + + /** + * The next method consumes one Token from the input stream and gives + * it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. + * @return true if a next token was found and false if the input is at its + * end. + */ + bool next(Token &t); + /** + * The peek method does not consume the next Token but buffers it and + * shows it to the user (stored in the input argument). + * + * @param t a Token reference that is set to the next found token. + * @return true if a next token was found and false if the input is at its + * end. + */ + bool peek(Token &t); + + /** + * Resets the peek pointer to the current position in the stream (to the + * beginning of the buffer). + */ + void resetPeek(); + + /** + * Clears the peek buffer, such that all peeked Tokens are consumed. + */ + void consumePeek(); +}; +} +} + +#endif diff --git a/src/core/Typesystem.hpp b/src/core/Typesystem.hpp new file mode 100644 index 0000000..201aed2 --- /dev/null +++ b/src/core/Typesystem.hpp @@ -0,0 +1,87 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_DOM_TYPESYSTEM_HPP_ +#define _OUSIA_DOM_TYPESYSTEM_HPP_ + +#include +#include + +#include + +#include "Node.hpp" + +namespace ousia { +namespace dom { + +class Type; + +class TypeInstance : public Managed { +public: + const Owned type; + + TypeInstance(Manager &mgr, Handle type) + : Managed(mgr), type(acquire(type)) + { + } +}; + +class Type : public Node { +public: + using Node::Node; + + virtual bool isFinal() const { return true; } + + virtual bool isPrimitive() const { return true; } + + virtual Rooted create() = 0; + + virtual Rooted parse(const std::string &str) = 0; +}; + +class Typesystem : public Node { +private: + const std::vector> types; + const std::vector> constants; + +protected: + void doResolve(std::vector> &res, + const std::vector &path, Filter filter, + void *filterData, unsigned idx, + VisitorSet &visited) override; + +public: + using Node::Node; + + const &std::vector> getTypes() { return types; } + + const &std::vector> getConstants() { return constants; } + + void addType(Handle type) { + types.push_back(acquire(type)); + } + + void addConstant(Handle ) { + + } +}; +} +} + +#endif /* _OUSIA_DOM_TYPESYSTEM_HPP_ */ + diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp new file mode 100644 index 0000000..184fdd0 --- /dev/null +++ b/src/core/Utils.cpp @@ -0,0 +1,39 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Utils.hpp" + +namespace ousia { + +bool Utils::isIdentifier(const std::string &name) +{ + bool first = true; + for (char c : name) { + if (first && !(isAlphabetic(c) || c == '_')) { + return false; + } + if (first && !(isAlphanumeric(c) || c == '_' || c == '-')) { + return false; + } + first = false; + } + return true; +} + +} + diff --git a/src/core/Utils.hpp b/src/core/Utils.hpp new file mode 100644 index 0000000..2fcd794 --- /dev/null +++ b/src/core/Utils.hpp @@ -0,0 +1,65 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + + +#ifndef _OUSIA_UTILS_H_ +#define _OUSIA_UTILS_H_ + +#include + +namespace ousia { + +class Utils { + +public: + + /** + * Returns true if the given character is in [A-Za-z] + */ + static bool isAlphabetic(const char c) + { + return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); + } + + /** + * Returns true if the given character is in [0-9] + */ + static bool isNumeric(const char c) + { + return (c >= '0') && (c <= '9'); + } + + /** + * Returns true if the given character is in [A-Za-z0-9] + */ + static bool isAlphanumeric(const char c) + { + return isAlphabetic(c) || isNumeric(c); + } + + /** + * Returns true if the given character is in [A-Za-z_][A-Za-z0-9_-]* + */ + static bool isIdentifier(const std::string &name); + +}; + +} + +#endif /* _OUSIA_UTILS_H_ */ + diff --git a/src/core/dom/Node.cpp b/src/core/dom/Node.cpp deleted file mode 100644 index c9651fb..0000000 --- a/src/core/dom/Node.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "Node.hpp" - -namespace ousia { -namespace dom { - -/* Class Node */ - -void Node::setName(std::string name) -{ - // Call the name change event - NameChangeEvent ev{this->name, name}; - triggerEvent(ev); - - // Set the new name - this->name = std::move(name); -} - -void Node::path(std::vector &p) const -{ - if (!isRoot()) { - parent->path(p); - } - p.push_back(name); -} - -std::vector Node::path() const -{ - std::vector res; - path(res); - return res; -} - -void Node::doResolve(std::vector> &res, - const std::vector &path, Filter filter, - void *filterData, unsigned idx, VisitorSet &visited) -{ - // Do nothing in the default implementation -} - -int Node::resolve(std::vector> &res, - const std::vector &path, Filter filter, - void *filterData, unsigned idx, VisitorSet &visited, - const std::string *alias) -{ - // Abort if this node was already visited for this path index - std::pair recKey = std::make_pair(this, idx); - if (visited.find(recKey) != visited.end()) { - return res.size(); - } - visited.insert(recKey); - - // Check whether the we can continue the path - if (path[idx] == name || (alias && *alias == name)) { - // If we have reached the end of the path and the node is successfully - // tested by the filter function, add it to the result. Otherwise - // continue searching along the path - if (idx + 1 == path.size()) { - if (!filter || filter(this, filterData)) { - res.push_back(this); - } - } else { - doResolve(res, path, filter, filterData, idx + 1, visited); - } - } - - // Restart the search from here in order to find all possible nodes that can - // be matched to the given path - doResolve(res, path, filter, filterData, 0, visited); - - return res.size(); -} - -std::vector> Node::resolve(const std::vector &path, - Filter filter = nullptr, - void *filterData = nullptr) -{ - std::vector> res; - VisitorSet visited; - resolve(res, path, filter, filterData, 0, visited, nullptr); - return res; -} - -int Node::registerEventHandler(EventType type, EventHandler handler, - Handle owner, - bool includeChildren) -{ - const int id = handlerIdCounter++; - handlers.insert(std::make_pair( - type, - EventHandlerDescriptor{id, handler, owner, this, includeChildren})); - return id; -} - -bool Node::unregisterEventHandler(int id) { - for (auto it = handlers.begin(); it != handlers.end(); it++) { - if (it->second.id == id) { - handlers.erase(it); - return true; - } - } - return false; -} - -bool Node::triggerEvent(Event &event, bool fromChild) { - bool res = false; - // Iterate over all event handlers - const auto range = handlers.equal_range(event.type); - for (auto it = range.first; it != range.second; it++) { - // Fetch a reference to the descriptor, check whether it should be - // called for bubbled events - EventHandlerDescriptor descr = it->second; - if (!fromChild || descr.includeChildren) { - descr.handler(event, descr.owner); - res = true; - } - } - - // If possible, let the event bubble up to the parent node - if (event.canBubble() && !parent.isNull()) { - res = parent->triggerEvent(event, true) | res; - } - return res; -} - -} -} - diff --git a/src/core/dom/Node.hpp b/src/core/dom/Node.hpp deleted file mode 100644 index 249d1f2..0000000 --- a/src/core/dom/Node.hpp +++ /dev/null @@ -1,518 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_DOM_NODE_HPP_ -#define _OUSIA_DOM_NODE_HPP_ - -#include -#include -#include -#include -#include - -#include - -namespace ousia { -namespace dom { - -/* Forward declarations */ -class Node; -class Event; - -/** - * EventType is an enum containing all possible node events. New event types - * should be added here. - */ -enum class EventType : int { - /** - * Generic update event which may be triggered if some important property - * of the node is changed. - */ - UPDATE, - - /** - * The NAME_CHANGE event is used to inform listeners that the name of the - * node has changed. - */ - NAME_CHANGE, - - /** - * The ADD_CHILD event is used to inform listeners that the node got a new - * child in any of its child node lists. - */ - ADD_CHILD, - - /** - * The DELETE_CHILD event is used to inform listeners that the node got a - * new child in any of its child node lists. - */ - DELETE_CHILD -}; - -/** - * Definition of the EventHandler function. - * - * @param event is a reference to the object holding the event data. - * @param owner is a reference to the managed object that was given in the - * registerEventHandler function. - */ -using EventHandler = void (*)(const Event &event, Handle owner); - -/** - * The Event class and its child classes are responsible for containing the - * actual event data which further describes the event to the event handlers. - * Instances of this class and its children must be declared on the stack or as - * a temporary. - */ -class Event { -private: - /** - * True as long as the event can bubble up the node hirarchy. - */ - mutable bool bubble; - -public: - /** - * Contains the actual event type of this class. - */ - const EventType type; - - /** - * Node on which the event was triggered. - */ - Rooted sender; - - /** - * Constructor of the Event class. - * - * @param type is an element from the EventType enum. - * @param bubble if set to true, the event can bubble up the node hirarchy. - */ - Event(EventType type, bool bubble = true) : bubble(bubble), type(type){}; - - /** - * Delete the copy constructor. - */ - Event(const Event &) = delete; - - /** - * Delete the assignment operator. - */ - Event &operator=(const Event &) = delete; - - /** - * Stops the propagation of this event to the parent element. - */ - void stopPropagation() const { bubble = false; } - - /** - * Returns true if the event can still bubble. - */ - bool canBubble() const { return bubble; } -}; - -/** - * Event used when the name of a node has changed. - */ -class NameChangeEvent : public Event { -public: - /** - * Reference to a string containing the old name of the node. - */ - const std::string &oldName; - - /** - * Reference to a string containing the new name of the node. - */ - const std::string &newName; - - /** - * Constructor of the NameChangeEvent class. - * - * @param oldName is a reference to a string containing the old name of the - * node. - * @param newName is a reference to a string containing the new name of the - * node. - * @param bubble if set to true, the event can bubble up the node hirarchy. - */ - NameChangeEvent(const std::string &oldName, const std::string &newName, - bool bubble = true) - : Event(EventType::NAME_CHANGE, bubble), - oldName(oldName), - newName(newName) - { - } -}; - -/** - * Struct containing the data which describes a single registered event handler. - * Note that the event type (e.g. which type of event this element was - * registered for) is stored outside the EventHandlerDescriptor (in the map - * storing the registered event handlers). - */ -struct EventHandlerDescriptor { - /** - * Unique id of the event handler. - */ - const int id; - - /** - * Reference to the event handler containing the events. - */ - const EventHandler handler; - - /** - * Reference to the managed element which owns the event handler. The object - * which owns the Owned handler is given in the constructor. - */ - const Owned owner; - - /** - * Set to true, if this event handler listens to bubbled events comming from - * child nodes. - */ - const bool includeChildren; - - /** - * Constructor of the EventHandlerDescriptor struct. - * - * @param id is the node-unique id of the EventHandlerDescriptor. - * @param handler is the function pointer which is going to be called once - * the associated event handler has fired. - * @param owner is a user-specified object which owns the method that is - * going to be called. This can be used to make sure that the method which - * handles the events has access to its owned object as long as the event - * handler lives. - * @param parent is the parent element this descriptor belongs to. The - * a handle to the "owner" object will be created on behalf of the parent. - * @param includeChildren is set to true if the event handler should handle - * events comming from child elements. - */ - EventHandlerDescriptor(int id, EventHandler handler, Handle owner, - Managed *parent, bool includeChildren) - : id(id), - handler(handler), - owner(owner, parent), - includeChildren(includeChildren) - { - } -}; - -/** - * The Node class builds the base class for any Node within the DOM graph. A - * node may either be a descriptive node (such as a domain description etc.) - * or a document element. Each node is identified by acharacteristic name and - * a parent element. Note that the node name is not required to be unique. Nodes - * without parent are considered root nodes. - */ -class Node : public Managed { -public: - /** - * The Filter function is used when resolving names to Node instances. The - * filter tests whether the given node meets the requirements for inclusion - * in the result list. - * - * @param node is the node which should be tested. - * @param data is user-defined data passed to the filter. - * @return true if the node should be included in the result set, false - * otherwise. - */ - using Filter = bool (*)(Handle node, void *data); - - /** - * Hash functional used to convert pairs of nodes and int to hashes which - * can be used within a unordered_set. - */ - struct VisitorHash { - size_t operator()(const std::pair &p) const - { - const std::hash nodeHash; - const std::hash intHash; - return nodeHash(p.first) + 37 * intHash(p.second); - } - }; - - /** - * Alias for the VisitorSet class which represents all nodes which have been - * visited in the name resolution process. The map stores pairs of node - * pointers and integers, indicating for which path start id the node has - * already been visited. - */ - using VisitorSet = - std::unordered_set, VisitorHash>; - -private: - /** - * Name of the node. As names are always looked up relative to a node, - * names are not required to be unique. - */ - std::string name; - - /** - * Reference to a parent node instace. - */ - Owned parent; - - /** - * Current id counter. The id counter may be used to create ids which are - * unique inside the realm of this manager instance. - */ - int handlerIdCounter = 0; - - /** - * Multimap containing all registered event handlers for this node. - */ - std::multimap handlers; - - /** - * Private version of the "path" function used to construct the path. Calls - * the path function of the parent node and adds the own name to the given - * vector. - * - * @param p is the list the path should be constructed in. - */ - void path(std::vector &p) const; - -protected: - /** - * Function which should be overwritten by derived classes in order to - * resolve node names to a list of possible nodes. The implementations of - * this function do not need to do anything but call the "resovle" function - * of any child instance of NamedNode. - * - * @param res is the result list containing all possible nodes matching the - * name specified in the path. - * @param path is a list specifying a path of node names which is meant to - * specify a certain named node. - * @param idx is the current index in the path. - * @param visited is a map which is used to prevent unwanted recursion. - * @param filter is a callback function which may check whether a certain - * node should be in the result set. If nullptr is given, all nodes matching - * the path are included. The filter function can be used to restrict the - * type of matched functions. - * @param filterData is user-defined data that should be passed to the - * filter. - */ - virtual void doResolve(std::vector> &res, - const std::vector &path, Filter filter, - void *filterData, unsigned idx, VisitorSet &visited); - -public: - /** - * Initializes the node with empty name and parent. - * - * @param mgr is a reference to the Manager instace the node belongs to. - */ - Node(Manager &mgr, Handle parent = nullptr) - : Managed(mgr), parent(acquire(parent)) - { - } - - /** - * Constructs a new node with the given name and the given parent element. - * - * @param mgr is a reference to the Manager instace the node belongs to. - * @param name is the name of the Node. - * @param parent is a handle pointing at the parent node. - */ - Node(Manager &mgr, std::string name, Handle parent = nullptr) - : Managed(mgr), name(name), parent(acquire(parent)) - { - } - - /** - * Sets the name of the node to the given name. Note: The name set here may - * be invalid (contain spaces, colons or other special characters). However, - * in this case the node will not be reachable as reference from a input - * document. This behaviour allows for gracefully degradation in error - * cases. - * - * @param name is the name that should be assigned to the node. - */ - void setName(std::string name); - - /** - * Returns the name of the node. - */ - std::string getName() const { return name; } - - /** - * Specifies whether the node has a name, e.g. whether the current name is - * not empty. - * - * @return true if the name of this node is not empty, false otherwise. - */ - bool hasName() const { return !name.empty(); } - - /** - * Sets the parent node. - * - * @param parent is a Handle to the parent node. - */ - void setParent(Handle parent) { this->parent = acquire(parent); } - - /** - * Returns a handle to the parent node of the Node instance. - * - * @return a handle to the root node. - */ - Rooted getParent() const { return parent; } - - /** - * Returns true, if the node does not have a parent. Root nodes may either - * be the root element of the complete DOM tree - * - * @return true if the node is a root node (has no parent) or false if the - * node is no root node (has a parent). - */ - bool isRoot() const { return parent.isNull(); } - - /** - * Returns the vector containing the complete path to this node (including - * the name of the parent nodes). - * - * @return a vector containing the path (starting with the root node) to - * this node as a list of names. - */ - std::vector path() const; - - /** - * Function which resolves a name path to a list of possible nodes. - * - * @param res is the result list containing all possible nodes matching the - * name specified in the path. - * @param path is a list specifying a path of node names which is meant to - * specify a certain named node. - * @param filter is a callback function which may check whether a certain - * node should be in the result set. If nullptr is given, all nodes matching - * the path are included. The filter function can be used to restrict the - * type of matched functions. - * @param filterData is user-defined data that should be passed to the - * filter. - * @param idx is the current index in the path. - * @param visited is a map which is used to prevent unwanted recursion. - * @param alias is a pointer at a string which contains an alternative name - * for this node. If nullptr is given, not such alternative name is - * provided. - * @return the number of elements in the result list. - */ - int resolve(std::vector> &res, - const std::vector &path, Filter filter, - void *filterData, unsigned idx, VisitorSet &visited, - const std::string *alias); - - /** - * Function which resolves a name path to a list of possible nodes starting - * from this node. - * - * @param path is a list specifying a path of node names meant to specify a - * certain named node. - * @param filter is a callback function which may check whether a certain - * node should be in the result set. If nullptr is given, all nodes matching - * the path are included. The filter function can e.g. be used to restrict - * the type of matched functions. - * @param filterData is user-defined data that should be passed to the - * filter. - * @return a vector containing all found node references. - */ - std::vector> resolve(const std::vector &path, - Filter filter, void *filterData); - - /** - * Function which resolves a name path to a list of possible nodes starting - * from this node. - * - * @param path is a list specifying a path of node names meant to specify a - * certain named node. - * @return a vector containing all found node references. - */ - std::vector> resolve(const std::vector &path) - { - return resolve(path, nullptr, nullptr); - } - - /** - * Function which resolves a single name to a list of possible nodes - * starting from this node. - * - * @param name is the name which should be resolved. - * @param filter is a callback function which may check whether a certain - * node should be in the result set. If nullptr is given, all nodes matching - * the path are included. The filter function can e.g. be used to restrict - * the type of matched functions. - * @param filterData is user-defined data that should be passed to the - * filter. - * @return a vector containing all found node references. - */ - std::vector> resolve(const char *, Filter filter, - void *filterData) - { - return resolve(std::vector{name}, filter, filterData); - } - - /** - * Function which resolves a single name to a list of possible nodes - * starting from this node. - * - * @param name is the name which should be resolved. - * @return a vector containing all found node references. - */ - std::vector> resolve(const std::string &name) - { - return resolve(std::vector{name}, nullptr, nullptr); - } - - /** - * Registers a new event handler for listening to the given event type. - * - * @param type is the event type the handler should listen to. - * @param handler is the handler that should be called. - * @param owner is an object the handler belongs to. May be nullptr. - * @param includeChildren if set to true, the event handler is also called - * if the same event is triggered on one of the child nodes. - * @return a unique event handler. - */ - int registerEventHandler(EventType type, EventHandler handler, - Handle owner = nullptr, - bool includeChildren = false); - - /** - * Unregisters the given event handler from the node. Note that removing an - * event handler has linear time. - * - * @param id is the unique event handler id. - * @return true if the given event handler was successfully unregistered. - */ - bool unregisterEventHandler(int id); - - /** - * Triggers an event on this node. - * - * @param event is a pointer at the event that should be triggered. The - * calling function has ownership over the given event. - * @param fromChild is set to true if the triggerEvent function is called - * from a child node. - * @return true if any event handler was found. - */ - bool triggerEvent(Event &event, bool fromChild = false); -}; -} -} - -#endif /* _OUSIA_DOM_NODE_HPP_ */ - diff --git a/src/core/dom/Typesystem.hpp b/src/core/dom/Typesystem.hpp deleted file mode 100644 index 201aed2..0000000 --- a/src/core/dom/Typesystem.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_DOM_TYPESYSTEM_HPP_ -#define _OUSIA_DOM_TYPESYSTEM_HPP_ - -#include -#include - -#include - -#include "Node.hpp" - -namespace ousia { -namespace dom { - -class Type; - -class TypeInstance : public Managed { -public: - const Owned type; - - TypeInstance(Manager &mgr, Handle type) - : Managed(mgr), type(acquire(type)) - { - } -}; - -class Type : public Node { -public: - using Node::Node; - - virtual bool isFinal() const { return true; } - - virtual bool isPrimitive() const { return true; } - - virtual Rooted create() = 0; - - virtual Rooted parse(const std::string &str) = 0; -}; - -class Typesystem : public Node { -private: - const std::vector> types; - const std::vector> constants; - -protected: - void doResolve(std::vector> &res, - const std::vector &path, Filter filter, - void *filterData, unsigned idx, - VisitorSet &visited) override; - -public: - using Node::Node; - - const &std::vector> getTypes() { return types; } - - const &std::vector> getConstants() { return constants; } - - void addType(Handle type) { - types.push_back(acquire(type)); - } - - void addConstant(Handle ) { - - } -}; -} -} - -#endif /* _OUSIA_DOM_TYPESYSTEM_HPP_ */ - diff --git a/src/core/utils/BufferedCharReader.cpp b/src/core/utils/BufferedCharReader.cpp deleted file mode 100644 index c13628f..0000000 --- a/src/core/utils/BufferedCharReader.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "BufferedCharReader.hpp" - -namespace ousia { -namespace utils { - -// Constants used within the linebreak statemachine. -static const uint8_t LB_STATE_NONE = 0x00; -static const uint8_t LB_STATE_ONE = 0x01; -static const uint8_t LB_STATE_LF = 0x10; -static const uint8_t LB_STATE_CR = 0x20; -static const uint8_t LB_STATE_MASK_CNT = 0x0F; -static const uint8_t LB_STATE_MASK_TYPE = 0xF0; - -/******************************************************************************* - * Struct BufferedCharReader::ReadCursor - ******************************************************************************/ - -BufferedCharReader::ReadCursor::ReadCursor(const bool destructive) : - destructive(destructive) -{ - reset(); -} - -void BufferedCharReader::ReadCursor::assign(const ReadCursor &cursor) -{ - this->line = cursor.line; - this->column = cursor.column; - this->bufferElem = cursor.bufferElem; - this->bufferPos = cursor.bufferPos; - this->lbState = cursor.lbState; -} - -void BufferedCharReader::ReadCursor::reset() -{ - this->line = 1; - this->column = 1; - this->bufferElem = 0; - this->bufferPos = 0; - this->lbState = LB_STATE_NONE; -} - -/******************************************************************************* - * Class BufferedCharReader - ******************************************************************************/ - -BufferedCharReader::BufferedCharReader() : - readCursor(true), peekCursor(false) -{ - reset(); -} - -void BufferedCharReader::reset() -{ - readCursor.reset(); - peekCursor.reset(); - buffer.clear(); - closed = false; -} - -bool BufferedCharReader::feed(const std::string &data) -{ - // Abort if the BufferedCharReader was closed - if (closed) { - return false; - } - - // Append the data onto the queue - buffer.push_back(data); - return true; -} - -void BufferedCharReader::close() -{ - closed = true; -} - -bool BufferedCharReader::substituteLinebreaks(ReadCursor *cursor, char *c) -{ - // Handle line breaks, inserts breakes after the following character - // combinations: \n, \r, \n\r, \r\n TODO: Change behaviour to \n, \n\r, \r\n - if ((*c == '\n') || (*c == '\r')) { - // Determine the type of the current linebreak character - const uint8_t type = (*c == '\n') ? LB_STATE_LF : LB_STATE_CR; - - // Read the last count and the last type from the state - const uint8_t lastCount = cursor->lbState & LB_STATE_MASK_CNT; - const uint8_t lastType = cursor->lbState & LB_STATE_MASK_TYPE; - - // Set the current linebreak type and counter in the state - cursor->lbState = ((lastCount + 1) & 1) | type; - - // If either this is the first instance of this character or the same - // return character is repeated - if (!lastCount || (lastType == type)) { - *c = '\n'; - return true; - } - return false; - } - - // Find the state - cursor->lbState = LB_STATE_NONE; - return true; -} - -bool BufferedCharReader::readCharacterAtCursor(ReadCursor *cursor, - char *c) -{ - bool hasChar = false; - while (!hasChar) { - // Abort if the current buffer element does not point to a valid entry - // in the buffer -- we must wait until another data block has been fed - // into the buffer - if (cursor->bufferElem >= buffer.size()) { - return false; - } - - // Fetch the current element the peek pointer points to - const std::string &data = buffer[cursor->bufferElem]; - - // Handle the "no data" case -- either in a destructive or - // non-destructive manner. - if (cursor->bufferPos >= data.length()) { - if (cursor->destructive) { - buffer.pop_front(); - } else { - cursor->bufferElem++; - } - cursor->bufferPos = 0; - continue; - } - - // Read the character, advance the buffer position - *c = *(data.data() + cursor->bufferPos); - cursor->bufferPos++; - - // Substitute linebreaks with a single LF (0x0A) - hasChar = substituteLinebreaks(cursor, c); - } - - // Update the position counter - if (*c == '\n') { - cursor->line++; - cursor->column = 1; - } else { - // Ignore UTF-8 continuation bytes - if (!((*c & 0x80) && !(*c & 0x40))) { - cursor->column++; - } - } - - return true; -} - -bool BufferedCharReader::peek(char *c) -{ - return readCharacterAtCursor(&peekCursor, c); -} - -bool BufferedCharReader::read(char *c) -{ - resetPeek(); - return readCharacterAtCursor(&readCursor, c); -} - -void BufferedCharReader::consumePeek() -{ - // Remove all no longer needed buffer elements - for (unsigned int i = 0; i < peekCursor.bufferElem; i++) { - buffer.pop_front(); - } - peekCursor.bufferElem = 0; - - // Copy the peek cursor to the read cursor - readCursor.assign(peekCursor); -} - -void BufferedCharReader::resetPeek() -{ - // Reset the peek cursor to the read cursor - peekCursor.assign(readCursor); -} - -bool BufferedCharReader::atEnd() -{ - if (closed) { - if (buffer.size() <= 0) { - return true; - } else if (buffer.size() == 1) { - return buffer[0].size() == readCursor.bufferPos; - } - } - return false; -} - -} -} - diff --git a/src/core/utils/BufferedCharReader.hpp b/src/core/utils/BufferedCharReader.hpp deleted file mode 100644 index b13cde6..0000000 --- a/src/core/utils/BufferedCharReader.hpp +++ /dev/null @@ -1,240 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ -#define _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ - -#include -#include -#include - -namespace ousia { -namespace utils { - -/** - * The BufferedCharReader class is used for storing incomming data that - * is fed into the pipeline as well as reading/peeking single characters - * from that buffer. Additionally it counts the current column/row - * (with correct handling for UTF-8) and contains an internal state - * machine that handles the detection of linebreaks. - * - * Additionally the BufferedCharReader performs the following tasks: - * 1. Convert the incomming character encoding to UTF-8 (TODO: implement) - * 2. Convert arbitrary linebreaks to a single "\n" - */ -class BufferedCharReader { - -private: - - /** - * The ReadCursor structure is responsible for representing the read - * position within the text an all state machine states belonging to the - * cursor. There are two types of read cursors: destructive and - * non-destructive read cursors. - */ - struct ReadCursor { - /** - * Specifies whether this is a destructive cursor (bytes are discarded - * once they were read from the buffer). - */ - const bool destructive; - - /** - * The line the cursor currently points to. - */ - unsigned int line; - - /** - * The column the cursor currently points to. - */ - unsigned int column; - - /** - * The index of the element in the data buffer we're currently reading - * from. - */ - unsigned int bufferElem; - - /** - * The byte position within this data buffer. - */ - unsigned int bufferPos; - - /** - * State variable used in the internal state machine of the - * line feed detection. - */ - uint8_t lbState; - - /** - * Constructor of the ReadCursor structure. - * - * @param destructive specifies whether the ReadCursor is destructive - * (consumes all read characters, as used in the "read cursor") or - * non-destructive (as used in the "peek cursor"). - */ - ReadCursor(const bool destructive); - - /** - * Copys the data from another ReadCursor without overriding the - * "destructive" flag. - */ - void assign(const ReadCursor &cursor); - - /** - * Resets the cursor without changing the "destructive" flag. - */ - void reset(); - }; - - /** - * Queue containing the data that has been fed into the char reader. - */ - std::deque buffer; - - /** - * The read and the peek cursor. - */ - ReadCursor readCursor, peekCursor; - - /** - * Determines whether the reader has been closed. - */ - bool closed; - - /** - * Substitute any combination of linebreaks in the incomming code with "\n". - * Returns true if the current character is meant as output, false - * otherwise. - */ - bool substituteLinebreaks(ReadCursor *cursor, char *c); - - /** - * Reads a character from the input buffer and advances the given read - * cursor. - * - * @param cursor is a reference to the read cursor that should be used - * for reading. - * @param hasChar is set to true, if a character is available, false if - * no character is available (e.g. because line breaks are substituted or - * the end of a buffer boundary is reached -- in this case this function - * should be called again with the same parameters.) - * @param c is a output parameter, which will be set to the read character. - * @param returns true if there was enough data in the buffer, false - * otherwise. - */ - bool readCharacterAtCursor(ReadCursor *cursor, char *c); - - /** - * Function that is called for each read character -- updates the row and - * column count. - */ - void updatePositionCounters(const char c); - -public: - - /** - * Constructor of the buffered char reader class. - */ - BufferedCharReader(); - - /** - * Resets the reader to its initial state. - */ - void reset(); - - /** - * Feeds new data into the internal buffer of the BufferedCharReader - * class. - * - * @param data is a string containing the data that should be - * appended to the internal buffer. - * @return true if the operation was successful, false otherwise (e.g. - * because the reader is closed). - */ - bool feed(const std::string &data); - - /** - * Marks the end of the input, allowing successors in the pipeline - * to react properly (e.g. creating the end of stream token). - */ - void close(); - - /** - * Peeks a single character. If called multiple times, returns the - * character after the previously peeked character. - * - * @param c is a reference to the character to which the result should be - * writtern. - * @return true if the character was successfully read, false if there are - * no more characters to be read in the buffer. - */ - bool peek(char *c); - - /** - * Reads a character from the input data. If "peek" was called - * beforehand resets the peek pointer. - * - * @param c is a reference to the character to which the result should be - * writtern. - * @return true if the character was successfully read, false if there are - * no more characters to be read in the buffer. - */ - bool read(char *c); - - /** - * Advances the read pointer to the peek pointer -- so if the "peek" - * function was called, "read" will now return the character after - * the last peeked character. - */ - void consumePeek(); - - /** - * Resets the peek pointer to the "read" pointer. - */ - void resetPeek(); - - /** - * Returns true if there are no more characters as the stream was - * closed. - */ - bool atEnd(); - - /** - * Returns the current line (starting with one). - */ - inline int getLine() - { - return readCursor.line; - } - - /** - * Returns the current column (starting with one). - */ - inline int getColumn() - { - return readCursor.column; - } - -}; - -} -} - -#endif /* _OUSIA_UTILS_BUFFERED_CHAR_READER_H_ */ - diff --git a/src/core/utils/CSSParser.cpp b/src/core/utils/CSSParser.cpp deleted file mode 100644 index 1763cc2..0000000 --- a/src/core/utils/CSSParser.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "BufferedCharReader.hpp" -#include "CodeTokenizer.hpp" -#include "Tokenizer.hpp" - -#include "CSSParser.hpp" - -namespace ousia { -namespace utils { - -// CSS code tokens -static const int CURLY_OPEN = 1; -static const int CURLY_CLOSE = 2; -static const int COLON = 3; -static const int SEMICOLON = 4; -static const int HASH = 5; -static const int BRACKET_OPEN = 6; -static const int BRACKET_CLOSE = 7; -static const int PAREN_OPEN = 8; -static const int PAREN_CLOSE = 9; -// comments -static const int COMMENT = 100; -static const int COMMENT_OPEN = 101; -static const int COMMENT_CLOSE = 102; -// strings -static const int STRING = 200; -static const int SINGLE_QUOTE = 201; -static const int DOUBLE_QUOTE = 202; -static const int ESCAPE = 203; -// general syntax -static const int LINEBREAK = 300; - -static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, - {"}", CURLY_CLOSE}, - {":", COLON}, - {";", SEMICOLON}, - {"#", HASH}, - {"[", BRACKET_OPEN}, - {"]", BRACKET_CLOSE}, - {"(", PAREN_OPEN}, - {")", PAREN_CLOSE}, - {"/*", COMMENT_OPEN}, - {"*/", COMMENT_CLOSE}, - {"\\", ESCAPE}, - {"\''", SINGLE_QUOTE}, - {"\"", DOUBLE_QUOTE}, - {"\n", LINEBREAK}}}; - -static const std::map CSS_DESCRIPTORS = { - {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}}, - {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}}, - {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, - {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, - {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}}, - {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; - -StyleNode CSSParser::parse(BufferedCharReader &input) -{ - CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS}; - tokenizer.ignoreComments = true; - // TODO: implement -} -} -} diff --git a/src/core/utils/CSSParser.hpp b/src/core/utils/CSSParser.hpp deleted file mode 100644 index c8b772d..0000000 --- a/src/core/utils/CSSParser.hpp +++ /dev/null @@ -1,167 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_UTILS_CSS_PARSER_HPP_ -#define _OUSIA_UTILS_CSS_PARSER_HPP_ - -#include -#include -#include -#include - -#include -#include - -#include "BufferedCharReader.hpp" - -namespace ousia { -namespace utils { - -/* - * The Specificity or Precedence of a CSS RuleSet, which decides which - * rules are applied when different RuleSets contain conflicting information. - * - * The Specificity is calculated using the official W3C recommendation - * http://www.w3.org/TR/CSS2/cascade.html#specificity - * - * Note that we do not need to use the integer 'a', since we do not allow - * local style definitions for single nodes. - */ -struct Specificity { - int b; - int c; - int d; - - Specificity(int b, int c, int d) : b(b), c(c), d(d) {} -}; - -bool operator<(const Specificity &x, const Specificity &y) -{ - return std::tie(x.b, x.c, x.d) < std::tie(y.b, y.c, y.d); -} - -bool operator>(const Specificity &x, const Specificity &y) -{ - return std::tie(x.b, x.c, x.d) > std::tie(y.b, y.c, y.d); -} - -bool operator==(const Specificity &x, const Specificity &y) -{ - return std::tie(x.b, x.c, x.d) == std::tie(y.b, y.c, y.d); -} - -class RuleSet : public Managed { -private: - const std::map values; - const Specificity specificity; - -public: - RuleSet(Manager &mgr, std::map values, - Specificity specificity) - : Managed(mgr), values(std::move(values)), specificity(specificity) - { - } - - const std::map &getValues() const - { - return values; - } - - const Specificity &getSpecificity() const { return specificity; } -}; - -class PseudoSelector { -private: - const std::string name; - const std::vector args; - const bool generative; - -public: - PseudoSelector(std::string name, std::vector args, - bool generative) - : name(std::move(name)), args(std::move(args)), generative(generative) - { - } - - const std::string &getName() const { return name; } - - const std::vector &getArgs() const { return args; } - - const bool &isGenerative() const { return generative; } -}; - -enum class SelectionOperator { DESCENDANT, DIRECT_DESCENDANT }; - -class StyleNode : public dom::Node { -public: - class StyleEdge : public Managed { - private: - Owned target; - const SelectionOperator selectionOperator; - - public: - StyleEdge(Manager &mgr, Handle target, - SelectionOperator selectionOperator) - : Managed(mgr), - target(acquire(target)), - selectionOperator(selectionOperator) - { - } - - Rooted getTarget() const { return target; } - - const SelectionOperator &getSelectionOperator() const - { - return selectionOperator; - } - }; - -private: - const PseudoSelector pseudoSelector; - std::vector> edges; - const std::vector> ruleSets; - -public: - StyleNode(Manager &mgr, std::string name, - PseudoSelector pseudoSelector, - const std::vector> &edges, - const std::vector> &ruleSets) - : dom::Node(mgr, std::move(name)), - pseudoSelector(std::move(pseudoSelector)), - edges(acquire(edges)), - ruleSets(acquire(ruleSets)) - { - } - - const PseudoSelector &getPseudoSelector() const { return pseudoSelector; } - - const std::vector> &getEdges() const { return edges; } - - const std::vector> &getRuleSets() const { return ruleSets; } -}; - -class CSSParser { - -private: - -public: - StyleNode parse(BufferedCharReader &input); -}; -} -} -#endif diff --git a/src/core/utils/CodeTokenizer.cpp b/src/core/utils/CodeTokenizer.cpp deleted file mode 100644 index e5b8610..0000000 --- a/src/core/utils/CodeTokenizer.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include "CodeTokenizer.hpp" - -namespace ousia { -namespace utils { - -Token CodeTokenizer::constructToken(const Token &t) -{ - std::string content = buf.str(); - buf.str(std::string()); - return Token{returnTokenId, content, startToken.startColumn, - startToken.startLine, t.endColumn, t.endLine}; -} - -void CodeTokenizer::buffer(const Token &t) { buf << t.content; } - -bool CodeTokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - auto it = descriptors.find(t.tokenId); - CodeTokenMode mode = CodeTokenMode::NONE; - if (it != descriptors.end()) { - mode = it->second.mode; - } - - if (t.startLine != t.endLine && mode != CodeTokenMode::LINEBREAK) { - throw TokenizerException( - "We did not expect a multiline token (except linebreaks). Most " - "likely you did not add a linebreak token to your tokenizer!"); - } - - switch (state) { - case CodeTokenizerState::NORMAL: - switch (mode) { - case CodeTokenMode::STRING_START_END: - state = CodeTokenizerState::IN_STRING; - break; - case CodeTokenMode::BLOCK_COMMENT_START: - state = CodeTokenizerState::IN_BLOCK_COMMENT; - break; - case CodeTokenMode::LINE_COMMENT: - state = CodeTokenizerState::IN_LINE_COMMENT; - break; - case CodeTokenMode::LINEBREAK: - peeked.push_back({it->second.id, t.content, t.startColumn, - t.startLine, t.endColumn, t.endLine}); - return true; - default: - if (t.tokenId == TOKEN_TEXT) { - int begin = -1; - for (size_t c = 0; c < t.content.length(); c++) { - bool isWhitespace = - t.content[c] == ' ' || t.content[c] == '\t'; - if (begin < 0) { - // if we have not yet set our beginning, - // we wait for the first - // non-whitespace-character to set it. - if (!isWhitespace) { - begin = c; - } - } else { - // if we have set our beginning, we wait for the - // first whitespace character, which marks the - // end of the current word. - if (isWhitespace) { - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin, (int)c - begin), - t.startColumn + begin, t.startLine, - t.startColumn + (int)c, t.endLine}); - begin = -1; - } - } - } - if(begin >= 0){ - peeked.push_back(Token{ - TOKEN_TEXT, - t.content.substr(begin), - t.startColumn + begin, t.startLine, - t.endColumn, t.endLine}); - } - } else { - peeked.push_back(t); - } - return true; - } - startToken = t; - returnTokenId = it->second.id; - return false; - case CodeTokenizerState::IN_LINE_COMMENT: - switch (mode) { - case CodeTokenMode::LINEBREAK: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_BLOCK_COMMENT: - switch (mode) { - case CodeTokenMode::BLOCK_COMMENT_END: - state = CodeTokenizerState::NORMAL; - if (!ignoreComments) { - peeked.push_back(constructToken(t)); - } - return !ignoreComments; - default: - if (!ignoreComments) { - buffer(t); - } - return false; - } - case CodeTokenizerState::IN_STRING: - switch (mode) { - case CodeTokenMode::ESCAPE: - if (escaped) { - buffer(t); - } - escaped = !escaped; - return false; - case CodeTokenMode::STRING_START_END: - if (escaped) { - buffer(t); - escaped = false; - return false; - } else { - peeked.push_back(constructToken(t)); - state = CodeTokenizerState::NORMAL; - return true; - } - default: - if (escaped) { - // TODO: handle escaped characters? - escaped = false; - } - buffer(t); - return false; - } - } - assert(false); -} -} -} diff --git a/src/core/utils/CodeTokenizer.hpp b/src/core/utils/CodeTokenizer.hpp deleted file mode 100644 index 0fc0862..0000000 --- a/src/core/utils/CodeTokenizer.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_UTILS_CODE_TOKENIZER_HPP_ -#define _OUSIA_UTILS_CODE_TOKENIZER_HPP_ - -#include -#include - -#include "BufferedCharReader.hpp" -#include "Tokenizer.hpp" - -namespace ousia { -namespace utils { - -/* - * This enum contains all special Token the CodeTokenizer supports, namely: - * - * 1.) An ambigous Tokens - in post programming languages single-quotes ' or - * double-quotes " - to delimit string tokens. - * 2.) A start token for line comments, which would e.g. be // in Java. - * 3.) A start token for a block comment - * 4.) An end token for a block comment. - * 5.) A linebreak token - * 6.) The escape token, which would e.g. be \ in java. - */ -enum class CodeTokenMode { - STRING_START_END, - LINE_COMMENT, - BLOCK_COMMENT_START, - BLOCK_COMMENT_END, - LINEBREAK, - ESCAPE, - NONE -}; - -/** - * A CodeTokenDescriptor defines the id the user likes to have returned for - * a Token of the mode specified, e.g. if you want to get the id 4 for a - * String Token the corresponding CodeTokenDescriptor would be inizialized - * with CodeTokenDescriptor myDesc {CodeTokenMode::STRING_START_END, 4}; - */ -struct CodeTokenDescriptor { - CodeTokenMode mode; - int id; - - CodeTokenDescriptor(CodeTokenMode mode, int id) : mode(mode), id(id) {} -}; - -/** - * The CodeTokenizer is a finite state machine with the states NORMAL, being - * IN_BLOCK_COMMENT, being IN_LINE_COMMENT or being IN_STRING. - */ -enum class CodeTokenizerState { - NORMAL, - IN_BLOCK_COMMENT, - IN_LINE_COMMENT, - IN_STRING -}; - -/** - * The purpose of a CodeTokenizer is to make it easier to parse classical - * programming Code. It adds the following features to a regular Tokenizer: - * 1.) String tokens (e.g. "string" in Java Code) instead of 3 separate tokens - * for the opening delimiter, the text and the closing delimiter. - * 2.) Escaping in String tokens. - * 3.) Comment Tokens (for line comments as well as block comments) - */ -class CodeTokenizer : public Tokenizer { -private: - std::map descriptors; - CodeTokenizerState state; - std::stringstream buf; - Token startToken; - int returnTokenId; - bool escaped = false; - - Token constructToken(const Token &t); - void buffer(const Token &t); - -protected: - bool doPrepare(const Token &t, std::deque &peeked) override; - -public: - /** - * If you do not want comment tokens to be returned you can set this to - * true. - */ - bool ignoreComments = false; - - /** - * - * @param input a BufferedCharReader containing the input for this - *tokenizer, - * as with a regular tokenizer. - * @param root a TokenTreeNode representing the root of the TokenTree. - * Please note that you have to specify all tokenIDs here that you use - * in the descriptors map. - * @param descriptors a map mapping tokenIDs to CodeTokenDescriptors. - * In this way you can specify the meaning of certain Tokens. Say you - * specified the Token "//" with the id 1 in the TokenTree. Then you could - * add the entry "1" with the Mode "LINE_COMMENT" to the descriptors map - * and this CodeTokenizer would recognize the token "//" as starting a - * line comment. - */ - CodeTokenizer(BufferedCharReader &input, const TokenTreeNode &root, - std::map descriptors) - : Tokenizer(input, root), descriptors(descriptors), state(CodeTokenizerState::NORMAL) - { - } -}; -} -} - -#endif diff --git a/src/core/utils/RangeSet.hpp b/src/core/utils/RangeSet.hpp deleted file mode 100644 index 841d476..0000000 --- a/src/core/utils/RangeSet.hpp +++ /dev/null @@ -1,326 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_MODEL_RANGE_SET_HPP_ -#define _OUSIA_MODEL_RANGE_SET_HPP_ - -#include -#include - -namespace ousia { -namespace model { -/** - * The Range structure represents an interval of numerical values of type T. - */ -template -struct Range { - /** - * Start is the start value of the range. - */ - T start; - - /** - * End is the end value of the range (inclusively). - */ - T end; - - /** - * Default constructor of the range class. The range is initialized as - * invalid, with start being set to the maximum possible value of the - * numerical type T, and end being set to the minimum possible value. - */ - Range() : - start(std::numeric_limits::max()), - end(std::numeric_limits::min()) - { - // Do nothing here - } - - /** - * Copies the given start and end value. The given values are not checked - * for validity. Use the "isValid" - * - * @param start is the minimum value the range still covers. - * @param end is the maximum value the range still covers. - */ - Range(const T &start, const T &end) : - start(start), end(end) - { - // Do nothing here - } - - /** - * Creates a range that covers exactly one element, namely the value given - * as parameter n. - */ - Range(const T &n) : - start(n), end(n) - { - // Do nothing here - } - - /** - * Returns true if this range is valid, e.g. its start value is smaller or - * equal to its end value. - * - * @return true if start is smaller or equal to end, false otherwise. - */ - bool isValid() const - { - return start <= end; - } - - /** - * Checks whether the given value lies inside the range. - * - * @param v is the value that is being checked. - * @return true if the value lies within the range, false otherwise. - */ - bool inRange(T v) const - { - return (v >= start) && (v <= end); - } - - /** - * Checks whether the given range overlaps with another range. Not that - * this check is only meaningful if both ranges are valid. - * - * @param r is the range that should be checked for overlapping with this - * range. - */ - bool overlaps(const Range &r) const - { - return (((r.start >= start) || (r.end >= start)) - && ((r.start <= end) || (r.end <= end))); - } - - /** - * Returns true if the two given ranges are neighbours (their limits only - * differ in the smallest representable difference between them). - */ - bool neighbours(const Range &r) const - { - constexpr T eps = std::numeric_limits::is_integer - ? 1 : std::numeric_limits::epsilon(); - return ((r.start > end) && ((r.start - eps) <= end)) - || ((r.end < start) && ((r.end + eps) >= start)); - } - - /** - * Checks whether the given range completely covers this range. - */ - bool coveredBy(const Range &r) const - { - return (r.start <= start) && (r.end >= end); - } - - /** - * Checks whether this range completely covers the given range. - */ - bool covers(const Range &r) const - { - return r.coveredBy(*this); - } - - /** - * Calculates the union of the two ranges -- not that this operation is only - * valid if the ranges overlapp. Use the RangeSet class if you cannot - * guarantee that. - */ - Range merge(const Range &r) const - { - return Range(std::min(start, r.start), std::max(end, r.end)); - } - - /** - * Returns a range that represents the spans the complete set defined by the - * given type T. - */ - static Range typeRange() - { - return Range(std::numeric_limits::min(), - std::numeric_limits::max()); - } - - /** - * Returns a range that represents the spans the complete set defined by the - * given type T up to a given value. - * - * @param till is the value up to which the range should be defined (till is - * included in the set). - */ - static Range typeRangeUntil(const T &till) - { - return Range(std::numeric_limits::min(), till); - } - - /** - * Returns a range that represents the spans the complete set defined by the - * given type T up to a given value. - * - * @param from is the value from which the range should be defined (from is - * included in the set). - */ - static Range typeRangeFrom(const T &from) - { - return Range(from, std::numeric_limits::max()); - } -}; - -/** - * RangeComp is a comperator used to order to sort the ranges within the - * ranges list. Sorts by the start element. - */ -template -struct RangeComp { - bool operator() (const Range& lhs, const Range& rhs) const - { - return lhs.start < rhs.start; - } -}; - -/** - * RangeSet represents a set of ranges of the given numerical type and is thus - * capable of representing any possible subset of the given numerical type T. - */ -template -class RangeSet { - -protected: - /** - * Set of ranges used internally. - */ - std::set, RangeComp> ranges; - - /** - * Returns an iterator to the first element in the ranges list that overlaps - * with the given range. - * - * @param r is the range for which the first overlapping element should be - * found. - * @return an iterator pointing to the first overlapping element or to the - * end of the list if no such element was found. - */ - typename std::set, RangeComp>::iterator firstOverlapping( - const Range &r, const bool allowNeighbours) - { - // Find the element with the next larger start value compared to the - // start value given in r. - auto it = ranges.upper_bound(r); - - // Go back one element - if (it != ranges.begin()) { - it--; - } - - // Iterate until an overlapping element is found - while (!(it->overlaps(r) || (allowNeighbours && it->neighbours(r))) - && (it != ranges.end())) { - it++; - } - return it; - } - -public: - /** - * Calculates the union of this range set and the given range. - * - * @param range is the range that should be merged into this range set. - */ - void merge(Range r) - { - // Calculate a new range that covers both the new range and all old - // ranges in the set -- delete all old elements on the way - auto it = firstOverlapping(r, true); - while ((it->overlaps(r) || it->neighbours(r)) && it != ranges.end()) { - r = r.merge(*it); - it = ranges.erase(it); - } - - // Insert the new range - ranges.insert(r); - } - - /** - * Calculates the union of this range set and the given range set. - * - * @param ranges is another range set for which the union with this set - * should be calculated. - */ - void merge(const RangeSet &s) - { - for (Range &r : s.ranges) { - merge(r); - } - } - - /** - * Checks whether this range set S contains the given range R: - * S u R = R - * (The intersection between R and S equals the given range) - * - * @param r is the range for which the containment should be checked. - * @return true if the above condition is met, false otherwise. - */ - bool contains(const Range &r) - { - auto it = firstOverlapping(r, false); - if (it != ranges.end()) { - return (*it).covers(r); - } - return false; - } - - /** - * Checks whether this range set S1 contains the given range set S2: - * - * @param s is the range for which the containment should be checked. - * @return true if the above condition is met, false otherwise. - */ - bool contains(const RangeSet &s) - { - bool res = true; - for (Range &r : s.ranges) { - res = res && contains(r); - } - return res; - } - - /** - * Empties the set. - */ - void clear() - { - ranges.clear(); - } - - /** - * Returns the current list of ranges as a const reference. - */ - const std::set, RangeComp>& getRanges() - { - return this->ranges; - } - -}; - -} -} - -#endif /* _OUSIA_MODEL_RANGE_SET_HPP_ */ - diff --git a/src/core/utils/Tokenizer.cpp b/src/core/utils/Tokenizer.cpp deleted file mode 100644 index a0ca3aa..0000000 --- a/src/core/utils/Tokenizer.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include "Tokenizer.hpp" - -namespace ousia { -namespace utils { - -static std::map buildChildren( - const std::map &inputs) -{ - std::map children; - std::map> nexts; - - for (auto &e : inputs) { - const std::string &s = e.first; - const int id = e.second; - if (s.empty()) { - continue; - } - char start = s[0]; - const std::string suffix = s.substr(1); - if (nexts.find(start) != nexts.end()) { - nexts[start].insert(std::make_pair(suffix, id)); - } else { - nexts.insert(std::make_pair( - start, std::map{{suffix, id}})); - } - } - - for (auto &n : nexts) { - children.insert(std::make_pair(n.first, TokenTreeNode{n.second})); - } - - return children; -} - -static int buildId(const std::map &inputs) -{ - int tokenId = TOKEN_NONE; - for (auto &e : inputs) { - if (e.first.empty()) { - if (tokenId != TOKEN_NONE) { - throw TokenizerException{std::string{"Ambigous token found: "} + - std::to_string(e.second)}; - } else { - tokenId = e.second; - } - } - } - return tokenId; -} - -TokenTreeNode::TokenTreeNode(const std::map &inputs) - : children(buildChildren(inputs)), tokenId(buildId(inputs)) -{ -} - -Tokenizer::Tokenizer(BufferedCharReader &input, const TokenTreeNode &root) - : input(input), root(root) -{ -} - -bool Tokenizer::prepare() -{ - std::stringstream buffer; - char c; - int startColumn = input.getColumn(); - int startLine = input.getLine(); - bool bufEmpty = true; - while (input.peek(&c)) { - if (root.children.find(c) != root.children.end()) { - // if there might be a special token, keep peeking forward - // until we find the token (or we don't). - TokenTreeNode const *n = &root; - std::stringstream tBuf; - int match = TOKEN_NONE; - while (true) { - tBuf << c; - n = &(n->children.at(c)); - if (n->tokenId != TOKEN_NONE) { - match = n->tokenId; - // from here on we found a token. If we have something - // in our buffer already, we end the search now. - if (!bufEmpty) { - break; - } else { - // if we want to return this token ( = we have nothing - // in our buffer yet) we look greedily for the longest - // possible token we can construct. - input.consumePeek(); - } - } - if (!input.peek(&c)) { - // if we are at the end we break off the search. - break; - } - if (n->children.find(c) == n->children.end()) { - // if we do not find a possible continuation anymore, - // break off the search. - break; - } - } - //reset the peek pointer to the last valid position. - input.resetPeek(); - // check if we did indeed find a special token. - if (match != TOKEN_NONE) { - if (bufEmpty) { - // if we did not have text before, construct that token. - if (doPrepare( - Token{match, tBuf.str(), startColumn, startLine, - input.getColumn(), input.getLine()}, - peeked)) { - return true; - } else { - startColumn = input.getColumn(); - startLine = input.getLine(); - continue; - } - } else { - // otherwise we return the text before the token. - if (doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, - startLine, input.getColumn(), - input.getLine()}, - peeked)) { - return true; - } else{ - //we need to clear the buffer here. After all the token - //corresponding to this buffer segment is already - //constructed. - buffer.str(std::string()); - bufEmpty = true; - startColumn = input.getColumn(); - startLine = input.getLine(); - continue; - } - } - } else{ - //if we found nothing, read at least one character. - input.peek(&c); - } - } - buffer << c; - bufEmpty = false; - input.consumePeek(); - } - if (!bufEmpty) { - return doPrepare(Token{TOKEN_TEXT, buffer.str(), startColumn, startLine, - input.getColumn(), input.getLine()}, - peeked); - } - return false; -} - -bool Tokenizer::doPrepare(const Token &t, std::deque &peeked) -{ - peeked.push_back(t); - return true; -} - -bool Tokenizer::next(Token &t) -{ - if (peeked.empty()) { - if (!prepare()) { - return false; - } - } - t = peeked.front(); - peeked.pop_front(); - resetPeek(); - return true; -} - -bool Tokenizer::peek(Token &t) -{ - if (peekCursor >= peeked.size()) { - if (!prepare()) { - return false; - } - } - t = peeked[peekCursor]; - return true; -} - -void Tokenizer::resetPeek() { peekCursor = 0; } - -void Tokenizer::consumePeek() -{ - while (peekCursor > 0) { - peeked.pop_front(); - peekCursor--; - } -} -} -} diff --git a/src/core/utils/Tokenizer.hpp b/src/core/utils/Tokenizer.hpp deleted file mode 100644 index 2debc75..0000000 --- a/src/core/utils/Tokenizer.hpp +++ /dev/null @@ -1,231 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_UTILS_TOKENIZER_HPP_ -#define _OUSIA_UTILS_TOKENIZER_HPP_ - -#include -#include -#include - -#include "BufferedCharReader.hpp" - -namespace ousia { -namespace utils { - -/** - * This exception is currently only thrown if errors are made during the - * initialization of the Tokenizer. Have a closer look at the documentation - * of the TokenTreeNode constructor for more information. - */ -class TokenizerException : public std::exception { -public: - const std::string msg; - - TokenizerException(const std::string &msg) : msg(msg){}; - - virtual const char *what() const noexcept override { return msg.c_str(); } -}; - -/** - * The Tokenizer internally uses a TokenTree to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * The TokenTree is a construct that structures all special tokens this - * Tokenizer recognizes. Consider the Tokens "aab", "a" and "aac". Then - * the TokenTree would look like this: - * - * a - * | \ - * a $ - * | \ - * b c - * | | - * $ $ - * - * Every node in the TokenTree is a valid end state that has a $ attached to it. - * During the search algorithm the Tokenizer goes through the tree and stores - * the last valid position. If a character follows that does not lead to a new - * node in the TokenTree the search ends (and starts again at this character). - * The token corresponding to the last valid position is returned. - * - * This allows us to uniquely identify the matching token given a certain - * input text. Note that this is a greedy matching approach that does not - * work if you're using truly ambiguous tokens (that have the same text). - * - * It is also not allowed that tokens have common middle parts but varying - * pre- and suffixes. Consider the example of two tokens "abd" and "bc" and - * the input string "abc". In that case we start looking for "abd" at the - * start, won't find it, wenn we hit "c" and start the scanning process - * anew. Thus the "bc" token is not found. - * - * For most (well-behaved) tokenization schemes this is not the case, - * though. - */ -class TokenTreeNode { -public: - const std::map children; - const int tokenId; - - /** - * The TokenTreeNode constructor builds a TokenTree from the given token - * specifications. The node returned by this constructor then is the root of - * said TokenTree. - * @param inputs Specifications of tokens in map form. Each specification - * is a tuple of the text that should be matched and some unique ID (>= 0) - * that is returned to you if that Token is found in the text. - * An example for such a map would be - * { - * { "#" , 1}, - * { "##", 2}, - * { "/" , 3} - * } - * Note that IDs below zero are reserved for system Ids, mainly TOKEN_NONE - * (-1) and TOKEN_TEXT (-2). - */ - TokenTreeNode(const std::map &inputs); -}; - -/** - * This is a reserved constant for the empty token. - */ -static const int TOKEN_NONE = -1; -/** - * This is a reserved constant for every part of the input text that is not a - * specified token. - */ -static const int TOKEN_TEXT = -2; - -/** - * A token for us is identified by an integer tokenID (either one of the - * constants TOKEN_NONE or TOKEN_TEXT or one of the user-defined constants). - * Additionally we return the matched text (which should only be really - * interesting in case of TOKEN_TEXT tokens) and the position in the input text. - */ -struct Token { - int tokenId; - std::string content; - int startColumn; - int startLine; - int endColumn; - int endLine; - - Token(int tokenId, std::string content, int startColumn, int startLine, - int endColumn, int endLine) - : tokenId(tokenId), - content(content), - startColumn(startColumn), - startLine(startLine), - endColumn(endColumn), - endLine(endLine) - { - } - - Token() : tokenId(TOKEN_NONE) {} -}; - -/** - * A Tokenizer has the purpose of subdividing an input text into tokens. In our - * definition here we distinguish between two kinds of tokens: - * 1.) User-specified tokens that match a fixed text. - * 2.) Any other text between those tokens. - * The user might want to specify the tokens '#{' and '#}' for example, because - * they have some meaning in her code. The user sets the IDs to 1 and 2. - * Given the input text - * "some text #{ special command #} some text" - * the tokenizer would return the tokens: - * 1.) "some text " with the id TOKEN_TEXT (-2). - * 2.) "#{" with the id 1. - * 3.) " special command " with the id TOKEN_TEXT (-2). - * 4.) "#}" with the id 2. - * 5.) " some text" with the id TOKEN_TEXT (-2). - * This makes the subsequent parsing of files of a specific type easier. - * Note that in case of tokens with that are prefixes of other tokens the - * longest possible match is returned. - */ -class Tokenizer { -private: - BufferedCharReader &input; - const TokenTreeNode &root; - std::deque peeked; - unsigned int peekCursor = 0; - - bool prepare(); - -protected: - /** - * This method is an interface to build multiple tokens from a single one in - * derived classes. This might be interesting if you want to implement - * further logic on text tokens or similar applications. - * - * @param t a Token the "basic" tokenizer found. - * @param peeked a reference to the deque containing all temporary Tokens. - * You are supposed to append your tokens there. In the trivial case you just - * put the given Token on top of the deque. - * @return false if no token was appended to the deque (meaning that you want - * to ignore the given token explicitly) and true in all other cases. - */ - virtual bool doPrepare(const Token &t, std::deque &peeked); - -public: - /** - * @param input The input of a Tokenizer is given in the form of a - * BufferedCharReader. Please refer to the respective documentation. - * @param root This is meant to be the root of a TokenTree giving the - * specification of user-defined tokens this Tokenizer should recognize. - * The Tokenizer promises to not change the TokenTree such that you can - * re-use the same specification for multiple inputs. - * Please refer to the TokenTreeNode documentation for more information. - */ - Tokenizer(BufferedCharReader &input, const TokenTreeNode &root); - - /** - * The next method consumes one Token from the input stream and gives - * it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool next(Token &t); - /** - * The peek method does not consume the next Token but buffers it and - * shows it to the user (stored in the input argument). - * - * @param t a Token reference that is set to the next found token. - * @return true if a next token was found and false if the input is at its - * end. - */ - bool peek(Token &t); - - /** - * Resets the peek pointer to the current position in the stream (to the - * beginning of the buffer). - */ - void resetPeek(); - - /** - * Clears the peek buffer, such that all peeked Tokens are consumed. - */ - void consumePeek(); -}; -} -} - -#endif diff --git a/src/core/utils/Utils.cpp b/src/core/utils/Utils.cpp deleted file mode 100644 index 184fdd0..0000000 --- a/src/core/utils/Utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "Utils.hpp" - -namespace ousia { - -bool Utils::isIdentifier(const std::string &name) -{ - bool first = true; - for (char c : name) { - if (first && !(isAlphabetic(c) || c == '_')) { - return false; - } - if (first && !(isAlphanumeric(c) || c == '_' || c == '-')) { - return false; - } - first = false; - } - return true; -} - -} - diff --git a/src/core/utils/Utils.hpp b/src/core/utils/Utils.hpp deleted file mode 100644 index 2fcd794..0000000 --- a/src/core/utils/Utils.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - - -#ifndef _OUSIA_UTILS_H_ -#define _OUSIA_UTILS_H_ - -#include - -namespace ousia { - -class Utils { - -public: - - /** - * Returns true if the given character is in [A-Za-z] - */ - static bool isAlphabetic(const char c) - { - return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); - } - - /** - * Returns true if the given character is in [0-9] - */ - static bool isNumeric(const char c) - { - return (c >= '0') && (c <= '9'); - } - - /** - * Returns true if the given character is in [A-Za-z0-9] - */ - static bool isAlphanumeric(const char c) - { - return isAlphabetic(c) || isNumeric(c); - } - - /** - * Returns true if the given character is in [A-Za-z_][A-Za-z0-9_-]* - */ - static bool isIdentifier(const std::string &name); - -}; - -} - -#endif /* _OUSIA_UTILS_H_ */ - -- cgit v1.2.3