summaryrefslogtreecommitdiff
path: root/src/plugins/css
diff options
context:
space:
mode:
authorBenjamin Paassen <bpaassen@techfak.uni-bielefeld.de>2014-12-05 12:35:58 +0100
committerBenjamin Paassen <bpaassen@techfak.uni-bielefeld.de>2014-12-05 12:35:58 +0100
commit0a4fbfe17951498f9a5f650f6da81d90f4a533b4 (patch)
tree42382d93989a35294bf127814885d873a9bde29f /src/plugins/css
parentd98fc013878ab28cb062a4f25a45199b9ee9574a (diff)
refactored CSSParser into plugins.
Diffstat (limited to 'src/plugins/css')
-rw-r--r--src/plugins/css/CSSParser.cpp319
-rw-r--r--src/plugins/css/CSSParser.hpp140
2 files changed, 459 insertions, 0 deletions
diff --git a/src/plugins/css/CSSParser.cpp b/src/plugins/css/CSSParser.cpp
new file mode 100644
index 0000000..d239359
--- /dev/null
+++ b/src/plugins/css/CSSParser.cpp
@@ -0,0 +1,319 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "CSSParser.hpp"
+
+namespace ousia {
+namespace parser {
+namespace css {
+
+// CSS code tokens
+static const int CURLY_OPEN = 1;
+static const int CURLY_CLOSE = 2;
+static const int COLON = 3;
+static const int DOUBLE_COLON = 4;
+static const int SEMICOLON = 5;
+static const int HASH = 6;
+static const int BRACKET_OPEN = 7;
+static const int BRACKET_CLOSE = 8;
+static const int PAREN_OPEN = 9;
+static const int PAREN_CLOSE = 10;
+static const int EQUALS = 11;
+static const int ARROW = 12;
+static const int COMMA = 13;
+// comments
+static const int COMMENT = 100;
+static const int COMMENT_OPEN = 101;
+static const int COMMENT_CLOSE = 102;
+// strings
+static const int STRING = 200;
+static const int DOUBLE_QUOTE = 201;
+static const int ESCAPE = 202;
+// general syntax
+static const int LINEBREAK = 300;
+
+static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
+ {"}", CURLY_CLOSE},
+ {":", COLON},
+ {"::", DOUBLE_COLON},
+ {";", SEMICOLON},
+ {"#", HASH},
+ {"[", BRACKET_OPEN},
+ {"]", BRACKET_CLOSE},
+ {"(", PAREN_OPEN},
+ {")", PAREN_CLOSE},
+ {"=", EQUALS},
+ {">", ARROW},
+ {",", COMMA},
+ {"/*", COMMENT_OPEN},
+ {"*/", COMMENT_CLOSE},
+ {"\"", DOUBLE_QUOTE},
+ {"\\", ESCAPE},
+ // linux linebreak
+ {"\n", LINEBREAK},
+ // windows linebreak
+ {"\r\n", LINEBREAK},
+ // Mac OS linebreak
+ {"\r", LINEBREAK}}};
+
+static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = {
+ {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}},
+ {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}},
+ {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
+ {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}},
+ {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
+
+Rooted<Node> CSSParser::parse(std::istream &is, ParserContext &ctx)
+{
+ BufferedCharReader input{is};
+ CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS};
+ tokenizer.ignoreComments = true;
+ Rooted<SelectorNode> root = {new SelectorNode{ctx.manager, "root"}};
+ parseDocument(root, tokenizer, ctx);
+ return root;
+}
+
+void CSSParser::parseDocument(Rooted<SelectorNode> root,
+ CodeTokenizer &tokenizer, ParserContext &ctx)
+{
+ Token t;
+ if (!tokenizer.peek(t)) {
+ return;
+ }
+ tokenizer.resetPeek();
+ std::vector<Rooted<SelectorNode>> leafList;
+ parseSelectors(root, tokenizer, leafList, ctx);
+ // TODO: Parse Ruleset
+ parseDocument(root, tokenizer, ctx);
+}
+
+void CSSParser::parseSelectors(Rooted<SelectorNode> root,
+ CodeTokenizer &tokenizer,
+ std::vector<Rooted<SelectorNode>> &leafList,
+ ParserContext &ctx)
+{
+ auto tuple = parseSelector(tokenizer, ctx);
+ // append the SelectorPath to the root node.
+ std::vector<Rooted<SelectorNode>> unmergedLeafs =
+ root->append(std::get<0>(tuple));
+ // append the leaf to the leafList.
+ switch (unmergedLeafs.size()) {
+ case 0:
+ // if the leaf could be merged we take the leaf reference from the
+ // parseSelector method.
+ leafList.push_back(std::get<1>(tuple));
+ break;
+ case 1:
+ // if the leaf could not be merged we take the existing leaf.
+ leafList.push_back(unmergedLeafs[0]);
+ break;
+ case 2:
+ // as the parseSelector is supposed to parse only a SelectorPath
+ // there should not be more than one leaf.
+ throw ParserException{
+ "Internal Error: More than one leaf in SelectorPath!", "",
+ // TODO: Line handling?
+ // tokenizer.getInput().getLine(),
+ // tokenizer.getInput().getColumn()
+ };
+ }
+ // if we find a comma, we can proceed parsing selectors.
+ Token t;
+ if (expect(COMMA, tokenizer, t, false, ctx)) {
+ parseSelectors(root, tokenizer, leafList, ctx);
+ }
+}
+
+std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> CSSParser::parseSelector(
+ CodeTokenizer &tokenizer, ParserContext &ctx)
+{
+ Rooted<SelectorNode> s = parsePrimitiveSelector(tokenizer, ctx);
+ Token t;
+ if (!tokenizer.peek(t)) {
+ // if we are at the end the found selector is the immediate child as
+ // well as the leaf.
+ return std::make_tuple(s, s);
+ }
+ switch (t.tokenId) {
+ case TOKEN_TEXT: {
+ // if we find text there is a next token in a DESCENDANT
+ // relationship (A B)
+ tokenizer.resetPeek();
+ // so we parse the rest of the subsequent SelectorPath
+ auto tuple = parseSelector(tokenizer, ctx);
+ // then we establish the DESCENDANT relationship
+ s->getEdges().push_back(new SelectorNode::SelectorEdge(
+ ctx.manager, std::get<0>(tuple)));
+ // and we return this node as well as the leaf.
+ return std::make_tuple(s, std::get<1>(tuple));
+ }
+ case ARROW: {
+ tokenizer.consumePeek();
+ // if we find an arrow there is a next token in a CHILD
+ // relationship (A > B)
+ // so we parse the rest of the subsequent SelectorPath
+ auto tuple = parseSelector(tokenizer, ctx);
+ // then we establish the DESCENDANT relationship
+ s->getEdges().push_back(new SelectorNode::SelectorEdge(
+ ctx.manager, std::get<0>(tuple),
+ SelectionOperator::DIRECT_DESCENDANT));
+ // and we return this node as well as the leaf.
+ return std::make_tuple(s, std::get<1>(tuple));
+ }
+ default:
+ // everything else is not part of the SelectorPath anymore.
+ tokenizer.resetPeek();
+ return std::make_tuple(s, s);
+ }
+}
+
+Rooted<SelectorNode> CSSParser::parsePrimitiveSelector(CodeTokenizer &tokenizer,
+ ParserContext &ctx)
+{
+ // first and foremost we expect a class name.
+ Token t;
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ const std::string name = t.content;
+ if (!tokenizer.peek(t)) {
+ // if we are at the end, we just return this selector with its name.
+ Rooted<SelectorNode> n{new SelectorNode(ctx.manager, name)};
+ return n;
+ }
+
+ bool isGenerative = false;
+
+ switch (t.tokenId) {
+ case DOUBLE_COLON:
+ // if we find a double colon we have a generative PseudoSelector.
+ isGenerative = true;
+ // this is supposed to fall through; no missing break.
+ case COLON: {
+ // if we find a colon we have a restrictive PseudoSelector.
+ tokenizer.consumePeek();
+ // get the PseudoSelector name.
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ const std::string pseudo_select_name = t.content;
+ // look for additional arguments.
+ if (!expect(PAREN_OPEN, tokenizer, t, false, ctx)) {
+ // if we don't have any, we return here.
+ Rooted<SelectorNode> n{new SelectorNode(
+ ctx.manager, name, {pseudo_select_name, isGenerative})};
+ return n;
+ }
+ // parse the argument list.
+ std::vector<std::string> args;
+ // we require at least one argument, if parantheses are used
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ args.push_back(t.content);
+ while (expect(COMMA, tokenizer, t, false, ctx)) {
+ // as long as we find commas we expect new arguments.
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ args.push_back(t.content);
+ }
+ expect(PAREN_CLOSE, tokenizer, t, true, ctx);
+ // and we return with the finished Selector.
+ Rooted<SelectorNode> n{new SelectorNode(
+ ctx.manager, name, {pseudo_select_name, args, isGenerative})};
+ return n;
+ }
+ case HASH: {
+ // a hash symbol is syntactic sugar for the PseudoSelector
+ // :has_id(id)
+ // so we expect an ID now.
+ Token t;
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ std::vector<std::string> args{t.content};
+ // and we return the finished Selector
+ Rooted<SelectorNode> n{
+ new SelectorNode(ctx.manager, name, {"has_id", args, false})};
+ return n;
+ }
+ case BRACKET_OPEN: {
+ // in case of brackets we have one of two restrictive
+ // PseudoSelectors
+ // has_attribute ([attribute_name])
+ // or
+ // has_value [attribute_name="value"]
+ // in both cases the attribute name comes first.
+ Token t;
+ expect(TOKEN_TEXT, tokenizer, t, true, ctx);
+ std::vector<std::string> args{t.content};
+ if (!expect(EQUALS, tokenizer, t, false, ctx)) {
+ // if no equals sign follows we have a has_attribute
+ // PseudoSelector
+ // we expect a closing bracket.
+ expect(BRACKET_CLOSE, tokenizer, t, true, ctx);
+ // and then we can return the result.
+ Rooted<SelectorNode> n{new SelectorNode(
+ ctx.manager, name, {"has_attribute", args, false})};
+ return n;
+ } else {
+ // with an equals sign we have a has_value PseudoSelector and
+ // expect the value next.
+ expect(STRING, tokenizer, t, true, ctx);
+ args.push_back(t.content);
+ // then we expect a closing bracket.
+ expect(BRACKET_CLOSE, tokenizer, t, true, ctx);
+ // and then we can return the result.
+ Rooted<SelectorNode> n{new SelectorNode(
+ ctx.manager, name, {"has_value", args, false})};
+ return n;
+ }
+ }
+ default:
+ // everything else is not part of the Selector anymore.
+ tokenizer.resetPeek();
+ Rooted<SelectorNode> n{new SelectorNode(ctx.manager, name)};
+ return n;
+ }
+}
+
+// TODO: Add RuleSet parsing methods.
+
+bool CSSParser::expect(int expectedType, CodeTokenizer &tokenizer, Token &t,
+ bool force, ParserContext &ctx)
+{
+ bool end = !tokenizer.peek(t);
+ if (end || t.tokenId != expectedType) {
+ if (force) {
+ if (end) {
+ throw ParserException{
+ "Unexpected end of file!", "",
+ // TODO: Line handling?
+ // tokenizer.getInput().getLine(),
+ // tokenizer.getInput().getColumn()
+ };
+ } else {
+ throw ParserException{
+ "Unexpected token!", "",
+ // TODO: Line handling?
+ // tokenizer.getInput().getLine(),
+ // tokenizer.getInput().getColumn()
+ };
+ }
+ } else {
+ tokenizer.resetPeek();
+ return false;
+ }
+ }
+ tokenizer.consumePeek();
+ return true;
+}
+}
+}
+}
diff --git a/src/plugins/css/CSSParser.hpp b/src/plugins/css/CSSParser.hpp
new file mode 100644
index 0000000..eb51dfa
--- /dev/null
+++ b/src/plugins/css/CSSParser.hpp
@@ -0,0 +1,140 @@
+/*
+ Ousía
+ Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _OUSIA_CSS_PARSER_HPP_
+#define _OUSIA_CSS_PARSER_HPP_
+
+#include <vector>
+#include <tuple>
+
+#include <core/BufferedCharReader.hpp>
+#include <core/CodeTokenizer.hpp>
+#include <core/CSS.hpp>
+#include <core/parser/Parser.hpp>
+
+namespace ousia {
+namespace parser {
+namespace css {
+
+/**
+ * This is a context free, recursive parser for a subset of the CSS3 language
+ * as defined by W3C. We allow the following grammar:
+ *
+ * DOC := SELECT RULESET DOC | epsilon
+ * SELECTORS := SELECT , SELECTORS | SELECT
+ * SELECT := SELECT' OPERATOR SELECT | SELECT'
+ * SELECT' := TYPE | TYPE:PSEUDO | TYPE::GEN_PSEUDO |
+ * TYPE:PSEUDO(ARGUMENTS) |
+ * TYPE::GEN_PSEUDO(ARGUMENTS) | TYPE#ID |
+ * TYPE[ATTRIBUTE] | TYPE[ATTRIBUTE=VALUE]
+ * TYPE := string
+ * PSEUDO := string
+ * GEN_PSEUDO := string
+ * ARGUMENTS := string , ARGUMENTS
+ * ID := string
+ * ATTRIBUTE := string
+ * VALUE := string
+ * OPERATOR := epsilon | &gt;
+ * RULESET := epsilon | { RULES }
+ * RULES := RULE RULES | epsilon
+ * RULE := KEY : VALUE ;
+ * KEY := string
+ * VALUE := type-specific parser
+ *
+ *
+ * @author Benjamin Paassen - bpaassen@techfak.uni-bielefeld.de
+ */
+class CSSParser : public Parser {
+private:
+ /**
+ * Implements the DOC Nonterminal
+ */
+ void parseDocument(Rooted<SelectorNode> root, CodeTokenizer &tokenizer,
+ ParserContext &ctx);
+ /**
+ * Implements the SELECTORS Nonterminal and adds all leaf nodes of the
+ * resulting SelectorTree to the input leafList so that a parsed RuleSet can
+ * be inserted there.
+ */
+ void parseSelectors(Rooted<SelectorNode> root, CodeTokenizer &tokenizer,
+ std::vector<Rooted<SelectorNode>> &leafList,
+ ParserContext &ctx);
+ /**
+ * Implements the SELECT Nonterminal, which in effect parses a SelectorPath
+ * of the SelectorTree and returns the beginning node of the path as first
+ * element as well as the leaf of the path as second tuple element.
+ */
+ std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> parseSelector(
+ CodeTokenizer &tokenizer, ParserContext &ctx);
+
+ /**
+ * Implements the SELECT' Nonterminal, which parses a single Selector with
+ * its PseudoSelector and returns it.
+ */
+ Rooted<SelectorNode> parsePrimitiveSelector(CodeTokenizer &tokenizer,
+ ParserContext &ctx);
+
+ // TODO: Add RuleSet parsing methods.
+
+ /**
+ * A convenience function to wrap around the tokenizer peek() function that
+ * only returns true if an instance of the expected type occurs.
+ *
+ * @param expectedType the ID of the expected type according to the
+ * CodeTokenizer specification.
+ * @param tokenizer the tokenizer for the input.
+ * @param t an empty token that gets the parsed token content
+ * if it has the expected type.
+ * @param force a flag to be set if it would be fatal for the
+ * parsing process to get the wrong type. In that case
+ * an exception is thrown.
+ * @return true iff a token of the expected type was found.
+ */
+ bool expect(int expectedType, CodeTokenizer &tokenizer, Token &t,
+ bool force, ParserContext &ctx);
+
+public:
+ /**
+ * This parses the given input as CSS content as specified by the grammar
+ * seen above. The return value is a Rooted reference to the root of the
+ * SelectorTree.
+ * TODO: The RuleSet at the respective node at the tree lists all CSS Style
+ * rules that apply.
+ *
+ * @param is is a reference to the input stream that should be parsed.
+ * @param ctx is a reference to the context that should be used while
+ * parsing the document.
+ * @return returns the root node of the resulting SelectorTree. For more
+ * information on the return conventions consult the Parser.hpp.
+ */
+ Rooted<Node> parse(std::istream &is, ParserContext &ctx) override;
+
+ /**
+ * As befits a class called CSSParser, this Parser parses CSS.
+ */
+ std::set<std::string> mimetypes()
+ {
+ std::set<std::string> out{"text/css"};
+ return out;
+ }
+};
+}
+}
+}
+
+#endif