/*
    Ousía
    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "CSSParser.hpp"

// NOTE(review): confirm the exact paths of these two project headers; they
// are expected to provide VariantReader and ParserContext, which are used
// below.
#include <core/common/VariantReader.hpp>
#include <core/parser/ParserContext.hpp>

namespace ousia {

// CSS code tokens
static const int CURLY_OPEN = 1;
static const int CURLY_CLOSE = 2;
static const int COLON = 3;
static const int DOUBLE_COLON = 4;
static const int SEMICOLON = 5;
static const int HASH = 6;
static const int BRACKET_OPEN = 7;
static const int BRACKET_CLOSE = 8;
static const int PAREN_OPEN = 9;
static const int PAREN_CLOSE = 10;
static const int EQUALS = 11;
static const int ARROW = 12;
static const int COMMA = 13;
// comments
static const int COMMENT = 100;
static const int COMMENT_OPEN = 101;
static const int COMMENT_CLOSE = 102;
// strings
static const int STRING = 200;
static const int DOUBLE_QUOTE = 201;
static const int ESCAPE = 202;
// general syntax
static const int LINEBREAK = 300;

// Maps the literal character sequences of the CSS dialect to the token ids
// declared above.
static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
                                     {"}", CURLY_CLOSE},
                                     {":", COLON},
                                     {"::", DOUBLE_COLON},
                                     {";", SEMICOLON},
                                     {"#", HASH},
                                     {"[", BRACKET_OPEN},
                                     {"]", BRACKET_CLOSE},
                                     {"(", PAREN_OPEN},
                                     {")", PAREN_CLOSE},
                                     {"=", EQUALS},
                                     {">", ARROW},
                                     {",", COMMA},
                                     {"/*", COMMENT_OPEN},
                                     {"*/", COMMENT_CLOSE},
                                     {"\"", DOUBLE_QUOTE},
                                     {"\\", ESCAPE},
                                     {"\n", LINEBREAK}}};

// Assigns a CodeTokenMode and a resulting token id to those raw tokens that
// the CodeTokenizer has to treat specially (comments, strings, escapes and
// linebreaks).
// NOTE(review): the mapped type is assumed to be CodeTokenDescriptor --
// confirm against the CodeTokenizer constructor.
static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = {
    {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}},
    {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}},
    {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
    {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}},
    {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}};

/**
 * Entry point of the parser: tokenizes the given input with the CSS token
 * tables, creates a SelectorNode named "root", pushes it onto the parser
 * scope, parses the whole document into it and pops it from the scope again.
 *
 * @param reader is the input the CodeTokenizer reads from.
 * @param ctx is the parser context supplying the manager, scope and logger.
 */
void CSSParser::doParse(CharReader &reader, ParserContext &ctx)
{
	CodeTokenizer tokenizer{reader, CSS_ROOT, CSS_DESCRIPTORS};
	tokenizer.ignoreComments = true;
	tokenizer.ignoreLinebreaks = true;

	// Create the root node and push it onto the parser scope
	Rooted<model::SelectorNode> root = {
	    new model::SelectorNode{ctx.getManager(), "root"}};
	ctx.getScope().push(root);

	// Parse the document into the root node
	parseDocument(root, tokenizer, ctx);

	// Remove the element from the parser scope
	ctx.getScope().pop();
}

/**
 * Parses a sequence of "selectors [rule set]" entries until the tokenizer
 * delivers no further token.
 *
 * This is implemented as a loop (rather than one recursive call per entry) so
 * that the call stack depth does not grow with the number of entries in the
 * document.
 *
 * @param root is the SelectorNode the parsed selector paths are appended to.
 * @param tokenizer is the token source.
 * @param ctx is the parser context.
 */
void CSSParser::parseDocument(Rooted<model::SelectorNode> root,
                              CodeTokenizer &tokenizer, ParserContext &ctx)
{
	Token t;
	while (tokenizer.peek(t)) {
		// the peek was only an end-of-input check; hand the token back.
		tokenizer.resetPeek();

		std::vector<Rooted<model::SelectorNode>> leafList;
		// parse the SelectorTree for this ruleSet.
		parseSelectors(root, tokenizer, leafList, ctx);
		// parse the RuleSet itself.
		Rooted<model::RuleSet> ruleSet = parseRuleSet(tokenizer, ctx);
		for (auto &leaf : leafList) {
			/*
			 * every leaf is an accepting node, if one considers the
			 * SelectorTree to be a finite state machine. This is relevant,
			 * if users do not use the CSS Parser to parse actual Ruleset
			 * content but to construct a SelectorTree just to identify a
			 * part of the DocumentTree.
			 */
			leaf->setAccepting(true);
			/*
			 * similarly we append the found rules to all leafs.
			 */
			leaf->getRuleSet()->merge(ruleSet);
		}
	}
}

/**
 * Parses a comma separated list of selector paths, appends each of them to
 * the given root node and collects the resulting leaf nodes.
 *
 * @param root is the SelectorNode each parsed path is appended to.
 * @param tokenizer is the token source.
 * @param leafList receives one leaf node per parsed selector path.
 * @param ctx is the parser context.
 * @throws LoggableException if appending a path yields more than one
 * unmerged leaf.
 */
void CSSParser::parseSelectors(
    Rooted<model::SelectorNode> root, CodeTokenizer &tokenizer,
    std::vector<Rooted<model::SelectorNode>> &leafList, ParserContext &ctx)
{
	Token t;
	do {
		auto tuple = parseSelector(tokenizer, ctx);
		// append the SelectorPath to the root node.
		std::vector<Rooted<model::SelectorNode>> unmergedLeafs =
		    root->append(tuple.first);
		// append the leaf to the leafList.
		switch (unmergedLeafs.size()) {
			case 0:
				// append() returned nothing: we take the leaf reference
				// from the parseSelector method.
				leafList.push_back(tuple.second);
				break;
			case 1:
				// append() returned exactly one node: we take that node.
				leafList.push_back(unmergedLeafs[0]);
				break;
			default:
				// as the parseSelector is supposed to parse only a
				// SelectorPath there should not be more than one leaf. Any
				// count above one is an error, not only exactly two.
				throw LoggableException{
				    "Internal Error: More than one leaf in SelectorPath!",
				    tokenizer.getInput()};
		}
		// if we find a comma, we can proceed parsing selectors.
	} while (expect(COMMA, tokenizer, t, false, ctx));
}

/**
 * Parses a single selector path, i.e. a chain of primitive selectors that are
 * connected either by plain juxtaposition ("A B") or by an arrow ("A > B").
 *
 * @param tokenizer is the token source.
 * @param ctx is the parser context.
 * @return a pair consisting of the first node of the path and its last node
 * (the leaf). Both are the same node if the path has only one element.
 */
std::pair<Rooted<model::SelectorNode>, Rooted<model::SelectorNode>>
CSSParser::parseSelector(CodeTokenizer &tokenizer, ParserContext &ctx)
{
	Rooted<model::SelectorNode> s = parsePrimitiveSelector(tokenizer, ctx);
	Token t;
	if (!tokenizer.peek(t)) {
		// if we are at the end the found selector is the immediate child as
		// well as the leaf.
		return std::make_pair(s, s);
	}
	switch (t.tokenId) {
		case TOKEN_TEXT: {
			// if we find text there is a next token in a DESCENDANT
			// relationship (A B)
			tokenizer.resetPeek();
			// so we parse the rest of the subsequent SelectorPath
			auto tuple = parseSelector(tokenizer, ctx);
			// then we establish the relationship using the default
			// operator of SelectorEdge
			s->getEdges().push_back(new model::SelectorNode::SelectorEdge(
			    ctx.getManager(), tuple.first));
			// and we return this node as well as the leaf.
			return std::make_pair(s, tuple.second);
		}
		case ARROW: {
			tokenizer.consumePeek();
			// if we find an arrow there is a next token in a CHILD
			// relationship (A > B)
			// so we parse the rest of the subsequent SelectorPath
			auto tuple = parseSelector(tokenizer, ctx);
			// then we establish the DIRECT_DESCENDANT relationship
			s->getEdges().push_back(new model::SelectorNode::SelectorEdge(
			    ctx.getManager(), tuple.first,
			    model::SelectionOperator::DIRECT_DESCENDANT));
			// and we return this node as well as the leaf.
			return std::make_pair(s, tuple.second);
		}
		default:
			// everything else is not part of the SelectorPath anymore.
			tokenizer.resetPeek();
			return std::make_pair(s, s);
	}
}

/**
 * Parses one primitive selector: a class name that is optionally followed by
 * a PseudoSelector in one of the forms ":name", "::name", ":name(args)",
 * "#id", "[attribute]" or "[attribute=\"value\"]".
 *
 * @param tokenizer is the token source.
 * @param ctx is the parser context.
 * @return a newly created SelectorNode describing the primitive selector.
 * @throws LoggableException (via expect) if a mandatory token is missing.
 */
Rooted<model::SelectorNode> CSSParser::parsePrimitiveSelector(
    CodeTokenizer &tokenizer, ParserContext &ctx)
{
	// first and foremost we expect a class name.
	Token t;
	expect(TOKEN_TEXT, tokenizer, t, true, ctx);
	const std::string name = t.content;
	if (!tokenizer.peek(t)) {
		// if we are at the end, we just return this selector with its name.
		Rooted<model::SelectorNode> n{
		    new model::SelectorNode(ctx.getManager(), name)};
		return n;
	}

	bool isGenerative = false;

	switch (t.tokenId) {
		case DOUBLE_COLON:
			// if we find a double colon we have a generative PseudoSelector.
			isGenerative = true;
		// intentional fall through into the COLON case; no missing break.
		case COLON: {
			// if we find a colon we have a restrictive PseudoSelector.
			tokenizer.consumePeek();
			// get the PseudoSelector name.
			expect(TOKEN_TEXT, tokenizer, t, true, ctx);
			const std::string pseudo_select_name = t.content;
			// look for additional arguments.
			if (!expect(PAREN_OPEN, tokenizer, t, false, ctx)) {
				// if we don't have any, we return here.
				Rooted<model::SelectorNode> n{new model::SelectorNode(
				    ctx.getManager(), name,
				    {pseudo_select_name, isGenerative})};
				return n;
			}
			// parse the argument list.
			Variant::arrayType args;
			// we require at least one argument, if parentheses are used
			// XXX: the arguments are read directly from the tokenizer's
			// underlying input, bypassing the tokenizer itself.
			args.push_back(VariantReader::parseGeneric(tokenizer.getInput(),
			                                           ctx.getLogger(),
			                                           {',', ')'}).second);
			while (expect(COMMA, tokenizer, t, false, ctx)) {
				// as long as we find commas we expect new arguments.
				args.push_back(
				    VariantReader::parseGeneric(tokenizer.getInput(),
				                                ctx.getLogger(),
				                                {',', ')'}).second);
			}
			expect(PAREN_CLOSE, tokenizer, t, true, ctx);
			// and we return with the finished Selector.
			Rooted<model::SelectorNode> n{new model::SelectorNode(
			    ctx.getManager(), name,
			    {pseudo_select_name, args, isGenerative})};
			return n;
		}
		case HASH: {
			// a hash symbol is syntactic sugar for the PseudoSelector
			// :has_id(id)
			// so we expect an ID now.
			// NOTE(review): unlike the COLON branch there is no explicit
			// consumePeek() here; presumably the consumePeek() inside
			// expect() also consumes the peeked '#' -- confirm against
			// CodeTokenizer.
			expect(TOKEN_TEXT, tokenizer, t, true, ctx);
			Variant::arrayType args{Variant(t.content.c_str())};
			// and we return the finished Selector
			Rooted<model::SelectorNode> n{new model::SelectorNode(
			    ctx.getManager(), name, {"has_id", args, false})};
			return n;
		}
		case BRACKET_OPEN: {
			// in case of brackets we have one of two restrictive
			// PseudoSelectors
			// has_attribute ([attribute_name])
			// or
			// has_value [attribute_name="value"]
			// in both cases the attribute name comes first.
			// NOTE(review): as in the HASH branch, the peeked '[' is not
			// consumed explicitly -- confirm against CodeTokenizer.
			expect(TOKEN_TEXT, tokenizer, t, true, ctx);
			Variant::arrayType args{Variant(t.content.c_str())};
			if (!expect(EQUALS, tokenizer, t, false, ctx)) {
				// if no equals sign follows we have a has_attribute
				// PseudoSelector
				// we expect a closing bracket.
				expect(BRACKET_CLOSE, tokenizer, t, true, ctx);
				// and then we can return the result.
				Rooted<model::SelectorNode> n{new model::SelectorNode(
				    ctx.getManager(), name, {"has_attribute", args, false})};
				return n;
			} else {
				// with an equals sign we have a has_value PseudoSelector and
				// expect the value next.
				expect(STRING, tokenizer, t, true, ctx);
				args.push_back(Variant(t.content.c_str()));
				// then we expect a closing bracket.
				expect(BRACKET_CLOSE, tokenizer, t, true, ctx);
				// and then we can return the result.
				Rooted<model::SelectorNode> n{new model::SelectorNode(
				    ctx.getManager(), name, {"has_value", args, false})};
				return n;
			}
		}
		default: {
			// everything else is not part of the Selector anymore.
			tokenizer.resetPeek();
			Rooted<model::SelectorNode> n{
			    new model::SelectorNode(ctx.getManager(), name)};
			return n;
		}
	}
}

/**
 * Parses an optional rule set of the form "{ key: value; ... }".
 *
 * @param tokenizer is the token source.
 * @param ctx is the parser context.
 * @return a newly created RuleSet; it is left without rules if the next
 * token is not an opening curly brace.
 * @throws LoggableException (via expect) if the closing curly brace is
 * missing.
 */
Rooted<model::RuleSet> CSSParser::parseRuleSet(CodeTokenizer &tokenizer,
                                               ParserContext &ctx)
{
	Rooted<model::RuleSet> ruleSet{new model::RuleSet(ctx.getManager())};
	// if we have no ruleset content, we return an empty ruleset.
	Token t;
	if (!expect(CURLY_OPEN, tokenizer, t, false, ctx)) {
		return ruleSet;
	}
	// otherwise we parse the rules.
	parseRules(tokenizer, ruleSet, ctx);
	// and we expect closing curly braces.
	expect(CURLY_CLOSE, tokenizer, t, true, ctx);
	return ruleSet;
}

/**
 * Parses rules for as long as parseRule succeeds and inserts each key/value
 * pair into the rules of the given RuleSet.
 *
 * @param tokenizer is the token source.
 * @param ruleSet is the RuleSet the parsed rules are inserted into.
 * @param ctx is the parser context.
 */
void CSSParser::parseRules(CodeTokenizer &tokenizer,
                           Rooted<model::RuleSet> ruleSet, ParserContext &ctx)
{
	std::string key;
	Variant value;
	while (parseRule(tokenizer, ctx, key, value)) {
		ruleSet->getRules().insert({key, value});
	}
}

/**
 * Parses a single rule of the form "key: value;".
 *
 * @param tokenizer is the token source.
 * @param ctx is the parser context.
 * @param key receives the rule key.
 * @param value receives the rule value.
 * @return false (leaving key and value untouched) if the next token is not
 * text, true if a rule was parsed.
 * @throws LoggableException (via expect) if the colon or the semicolon is
 * missing.
 */
bool CSSParser::parseRule(CodeTokenizer &tokenizer, ParserContext &ctx,
                          std::string &key, Variant &value)
{
	Token t;
	if (!expect(TOKEN_TEXT, tokenizer, t, false, ctx)) {
		return false;
	}
	// if we find text that is the key first.
	key = t.content;
	// then we expect a :
	expect(COLON, tokenizer, t, true, ctx);
	// then the value
	// TODO: Resolve key for appropriate parsing function here.
	// NOTE(review): only the second element of the returned pair is used;
	// the first is presumably a success flag that is ignored -- confirm.
	value = VariantReader::parseGeneric(tokenizer.getInput(), ctx.getLogger(),
	                                    {';'}).second;
	// and a ;
	expect(SEMICOLON, tokenizer, t, true, ctx);
	return true;
}

/**
 * Peeks the next token and consumes it if it has the expected type.
 *
 * @param expectedType is the token id the next token must have.
 * @param tokenizer is the token source.
 * @param t receives the peeked token.
 * @param force selects the failure behaviour: if true a LoggableException is
 * thrown, if false the peek is reset and false is returned.
 * @param ctx is the parser context (not used by this function).
 * @return true if the token matched and was consumed, false otherwise.
 * @throws LoggableException if force is set and either the input has ended
 * or the next token has a different type.
 */
bool CSSParser::expect(int expectedType, CodeTokenizer &tokenizer, Token &t,
                       bool force, ParserContext &ctx)
{
	const bool end = !tokenizer.peek(t);
	if (!end && t.tokenId == expectedType) {
		tokenizer.consumePeek();
		return true;
	}
	if (!force) {
		// optional token: hand back whatever was peeked.
		tokenizer.resetPeek();
		return false;
	}
	if (end) {
		throw LoggableException{"Unexpected end of file!",
		                        tokenizer.getInput()};
	}
	throw LoggableException{"Unexpected token!", tokenizer.getInput()};
}
}