diff options
author | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-12-04 13:17:20 +0100 |
---|---|---|
committer | Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> | 2014-12-04 13:17:20 +0100 |
commit | ae7053775ba1c013d53143d2b860fcc88d214458 (patch) | |
tree | fd7ee19b1a86c7c513b5c8e35c3243255da8b0ab /src | |
parent | 51b8d39841ea1e803b07cae65020f1b8df6811aa (diff) |
Implemented CSSParser for selectors. The code compiles. Tests are still needed, though, and there are some TODOs left.
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CSS.hpp | 1 | ||||
-rw-r--r-- | src/core/CSSParser.cpp | 274 | ||||
-rw-r--r-- | src/core/CSSParser.hpp | 86 | ||||
-rw-r--r-- | src/core/Tokenizer.hpp | 2 |
4 files changed, 341 insertions, 22 deletions
diff --git a/src/core/CSS.hpp b/src/core/CSS.hpp index e730721..1c0ed17 100644 --- a/src/core/CSS.hpp +++ b/src/core/CSS.hpp @@ -21,6 +21,7 @@ #include <map> #include <vector> +#include <tuple> #include "Managed.hpp" #include "Node.hpp" diff --git a/src/core/CSSParser.cpp b/src/core/CSSParser.cpp index 00d9c72..bad1862 100644 --- a/src/core/CSSParser.cpp +++ b/src/core/CSSParser.cpp @@ -18,7 +18,6 @@ #include "BufferedCharReader.hpp" #include "CodeTokenizer.hpp" -#include "Tokenizer.hpp" #include "CSSParser.hpp" @@ -28,52 +27,295 @@ namespace ousia { static const int CURLY_OPEN = 1; static const int CURLY_CLOSE = 2; static const int COLON = 3; -static const int SEMICOLON = 4; -static const int HASH = 5; -static const int BRACKET_OPEN = 6; -static const int BRACKET_CLOSE = 7; -static const int PAREN_OPEN = 8; -static const int PAREN_CLOSE = 9; +static const int DOUBLE_COLON = 4; +static const int SEMICOLON = 5; +static const int HASH = 6; +static const int BRACKET_OPEN = 7; +static const int BRACKET_CLOSE = 8; +static const int PAREN_OPEN = 9; +static const int PAREN_CLOSE = 10; +static const int EQUALS = 11; +static const int ARROW = 12; +static const int COMMA = 13; // comments static const int COMMENT = 100; static const int COMMENT_OPEN = 101; static const int COMMENT_CLOSE = 102; // strings static const int STRING = 200; -static const int SINGLE_QUOTE = 201; -static const int DOUBLE_QUOTE = 202; -static const int ESCAPE = 203; +static const int DOUBLE_QUOTE = 201; +static const int ESCAPE = 202; // general syntax static const int LINEBREAK = 300; static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN}, {"}", CURLY_CLOSE}, {":", COLON}, + {"::", DOUBLE_COLON}, {";", SEMICOLON}, {"#", HASH}, {"[", BRACKET_OPEN}, {"]", BRACKET_CLOSE}, {"(", PAREN_OPEN}, {")", PAREN_CLOSE}, + {"=", EQUALS}, + {">", ARROW}, + {",", COMMA}, {"/*", COMMENT_OPEN}, {"*/", COMMENT_CLOSE}, - {"\\", ESCAPE}, - {"\''", SINGLE_QUOTE}, {"\"", DOUBLE_QUOTE}, - {"\n", LINEBREAK}}}; + {"\\", 
ESCAPE}, + // linux linebreak + {"\n", LINEBREAK}, + // windows linebreak + {"\r\n", LINEBREAK}, + // Mac OS linebreak + {"\r", LINEBREAK}}}; static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = { {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}}, {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}}, - {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}}, {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}}, {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; -StyleNode CSSParser::parse(BufferedCharReader &input) +Rooted<SelectorNode> CSSParser::parse(BufferedCharReader &input) { CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS}; tokenizer.ignoreComments = true; - // TODO: implement + // TODO: Is this the correct way to retrieve the Manager? + Manager mgr; + Rooted<SelectorNode> root = {new SelectorNode{mgr, "root"}}; + parseDocument(root, tokenizer); + return root; +} + +void CSSParser::parseDocument(Rooted<SelectorNode> root, + CodeTokenizer &tokenizer) +{ + Token t; + if (!tokenizer.peek(t)) { + return; + } + tokenizer.resetPeek(); + std::vector<Rooted<SelectorNode>> leafList; + parseSelectors(root, tokenizer, leafList); + // TODO: Parse Ruleset + parseDocument(root, tokenizer); +} + +void CSSParser::parseSelectors(Rooted<SelectorNode> root, + CodeTokenizer &tokenizer, + std::vector<Rooted<SelectorNode>> &leafList) +{ + auto tuple = parseSelector(tokenizer); + // append the SelectorPath to the root node. + std::vector<Rooted<SelectorNode>> unmergedLeafs = + root->append(std::get<0>(tuple)); + // append the leaf to the leafList. + switch (unmergedLeafs.size()) { + case 0: + // if the leaf could be merged we take the leaf reference from the + // parseSelector method. + leafList.push_back(std::get<1>(tuple)); + break; + case 1: + // if the leaf could not be merged we take the existing leaf. 
+ leafList.push_back(unmergedLeafs[0]); + break; + case 2: + // as the parseSelector is supposed to parse only a SelectorPath + // there should not be more than one leaf. + throw LoggableException{ + "Internal Error: More than one leaf in SelectorPath!", "", + // TODO: Line handling? + // tokenizer.getInput().getLine(), + // tokenizer.getInput().getColumn() + }; + } + // if we find a comma, we can proceed parsing selectors. + Token t; + if (expect(COMMA, tokenizer, t, false)) { + parseSelectors(root, tokenizer, leafList); + } +} + +std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> CSSParser::parseSelector( + CodeTokenizer &tokenizer) +{ + Rooted<SelectorNode> s = parsePrimitiveSelector(tokenizer); + Token t; + if (!tokenizer.peek(t)) { + // if we are at the end the found selector is the immediate child as + // well as the leaf. + return std::make_tuple(s, s); + } + switch (t.tokenId) { + case TOKEN_TEXT: { + // if we find text there is a next token in a DESCENDANT + // relationship (A B) + tokenizer.resetPeek(); + // so we parse the rest of the subsequent SelectorPath + auto tuple = parseSelector(tokenizer); + // then we establish the DESCENDANT relationship + // TODO: Is this the correct way to retrieve the Manager? + Manager mgr; + s->getEdges().push_back( + new SelectorNode::SelectorEdge(mgr, std::get<0>(tuple))); + // and we return this node as well as the leaf. + return std::make_tuple(s, std::get<1>(tuple)); + } + case ARROW: { + tokenizer.consumePeek(); + // if we find an arrow there is a next token in a CHILD + // relationship (A > B) + // so we parse the rest of the subsequent SelectorPath + auto tuple = parseSelector(tokenizer); + // then we establish the DESCENDANT relationship + // TODO: Is this the correct way to retrieve the Manager? + Manager mgr; + s->getEdges().push_back(new SelectorNode::SelectorEdge( + mgr, std::get<0>(tuple), SelectionOperator::DIRECT_DESCENDANT)); + // and we return this node as well as the leaf. 
+ return std::make_tuple(s, std::get<1>(tuple)); + } + default: + // everything else is not part of the SelectorPath anymore. + tokenizer.resetPeek(); + return std::make_tuple(s, s); + } +} + +Rooted<SelectorNode> CSSParser::parsePrimitiveSelector(CodeTokenizer &tokenizer) +{ + // first and foremost we expect a class name. + Token t; + expect(TOKEN_TEXT, tokenizer, t, true); + const std::string name = t.content; + // TODO: Is this the correct way to retrieve the Manager? + Manager mgr; + if (!tokenizer.peek(t)) { + // if we are at the end, we just return this selector with its name. + Rooted<SelectorNode> n{new SelectorNode(mgr, name)}; + return n; + } + + bool isGenerative = false; + + switch (t.tokenId) { + case DOUBLE_COLON: + // if we find a double colon we have a generative PseudoSelector. + isGenerative = true; + case COLON: { + // if we find a colon we have a restrictive PseudoSelector. + tokenizer.consumePeek(); + // get the PseudoSelector name. + expect(TOKEN_TEXT, tokenizer, t, true); + const std::string pseudo_select_name = t.content; + // look for additional arguments. + if (!expect(PAREN_OPEN, tokenizer, t, false)) { + // if we don't have any, we return here. + Rooted<SelectorNode> n{new SelectorNode( + mgr, name, {pseudo_select_name, isGenerative})}; + return n; + } + // parse the argument list. + std::vector<std::string> args; + // we require at least one argument, if parantheses are used + expect(TOKEN_TEXT, tokenizer, t, true); + args.push_back(t.content); + while (expect(COMMA, tokenizer, t, false)) { + // as long as we find commas we expect new arguments. + expect(TOKEN_TEXT, tokenizer, t, true); + args.push_back(t.content); + } + expect(PAREN_CLOSE, tokenizer, t, true); + // and we return with the finished Selector. 
+ Rooted<SelectorNode> n{new SelectorNode( + mgr, name, {pseudo_select_name, args, isGenerative})}; + return n; + } + case HASH: { + // a hash symbol is syntactic sugar for the PseudoSelector + // :has_id(id) + // so we expect an ID now. + Token t; + expect(TOKEN_TEXT, tokenizer, t, true); + std::vector<std::string> args{t.content}; + // and we return the finished Selector + Rooted<SelectorNode> n{ + new SelectorNode(mgr, name, {"has_id", args, false})}; + return n; + } + case BRACKET_OPEN: { + // in case of brackets we have one of two restrictive + // PseudoSelectors + // has_attribute ([attribute_name]) + // or + // has_value [attribute_name="value"] + // in both cases the attribute name comes first. + Token t; + expect(TOKEN_TEXT, tokenizer, t, true); + std::vector<std::string> args{t.content}; + if (!expect(EQUALS, tokenizer, t, false)) { + // if no equals sign follows we have a has_attribute + // PseudoSelector + // we expect a closing bracket. + expect(BRACKET_CLOSE, tokenizer, t, true); + // and then we can return the result. + Rooted<SelectorNode> n{new SelectorNode( + mgr, name, {"has_attribute", args, false})}; + return n; + } else { + // with an equals sign we have a has_value PseudoSelector and + // expect the value next. + expect(STRING, tokenizer, t, true); + args.push_back(t.content); + // then we expect a closing bracket. + expect(BRACKET_CLOSE, tokenizer, t, true); + // and then we can return the result. + Rooted<SelectorNode> n{ + new SelectorNode(mgr, name, {"has_value", args, false})}; + return n; + } + } + default: + // everything else is not part of the Selector anymore. + tokenizer.resetPeek(); + Rooted<SelectorNode> n{new SelectorNode(mgr, name)}; + return n; + } +} + +// TODO: Add RuleSet parsing methods. 
+ +bool CSSParser::expect(int expectedType, CodeTokenizer &tokenizer, Token &t, + bool force) +{ + bool end = !tokenizer.peek(t); + if (end || t.tokenId != expectedType) { + if (force) { + if (end) { + throw LoggableException{ + "Unexpected end of file!", "", + // TODO: Line handling? + // tokenizer.getInput().getLine(), + // tokenizer.getInput().getColumn() + }; + } else { + throw LoggableException{ + "Unexpected token!", "", + // TODO: Line handling? + // tokenizer.getInput().getLine(), + // tokenizer.getInput().getColumn() + }; + } + } else { + tokenizer.resetPeek(); + return false; + } + } + tokenizer.consumePeek(); + return true; } } diff --git a/src/core/CSSParser.hpp b/src/core/CSSParser.hpp index 4c99a5a..c1a4c0d 100644 --- a/src/core/CSSParser.hpp +++ b/src/core/CSSParser.hpp @@ -19,24 +19,98 @@ #ifndef _OUSIA_CSS_PARSER_HPP_ #define _OUSIA_CSS_PARSER_HPP_ -#include <istream> -#include <map> #include <vector> #include <tuple> #include "BufferedCharReader.hpp" -#include "Managed.hpp" -#include "Node.hpp" #include "CSS.hpp" +#include "Exceptions.hpp" namespace ousia { +/** + * This is a context free, recursive parser for a subset of the CSS3 language + * as defined by W3C. 
We allow the following grammar: + * + * DOC := SELECTORS RULESET DOC | epsilon + * SELECTORS := SELECT , SELECTORS | SELECT + * SELECT := SELECT' OPERATOR SELECT | SELECT' + * SELECT' := TYPE | TYPE:PSEUDO | TYPE::GEN_PSEUDO | + * TYPE:PSEUDO(ARGUMENTS) | + * TYPE::GEN_PSEUDO(ARGUMENTS) | TYPE#ID | + * TYPE[ATTRIBUTE] | TYPE[ATTRIBUTE=VALUE] + * TYPE := string + * PSEUDO := string + * GEN_PSEUDO := string + * ARGUMENTS := string , ARGUMENTS | string + * ID := string + * ATTRIBUTE := string + * VALUE := string + * OPERATOR := epsilon | > + * RULESET := epsilon | { RULES } + * RULES := RULE RULES | epsilon + * RULE := KEY : VALUE ; + * KEY := string + * VALUE := type-specific parser + * + * + * @author Benjamin Paassen - bpaassen@techfak.uni-bielefeld.de + */ class CSSParser { - private: + /** + * Implements the DOC Nonterminal + */ + void parseDocument(Rooted<SelectorNode> root, CodeTokenizer &tokenizer); + /** + * Implements the SELECTORS Nonterminal and adds all leaf nodes of the + * resulting SelectorTree to the input leafList so that a parsed RuleSet can + * be inserted there. + */ + void parseSelectors(Rooted<SelectorNode> root, CodeTokenizer &tokenizer, + std::vector<Rooted<SelectorNode>> &leafList); + /** + * Implements the SELECT Nonterminal, which in effect parses a SelectorPath + * of the SelectorTree and returns the beginning node of the path as first + * element as well as the leaf of the path as second tuple element. + */ + std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> parseSelector( + CodeTokenizer &tokenizer); + + /** + * Implements the SELECT' Nonterminal, which parses a single Selector with + * its PseudoSelector and returns it. + */ + Rooted<SelectorNode> parsePrimitiveSelector(CodeTokenizer &tokenizer); + + // TODO: Add RuleSet parsing methods. + + /** + * A convenience function to wrap around the tokenizer peek() function that + * only returns true if an instance of the expected type occurs.
+ * + * @param expectedType the ID of the expected type according to the + * CodeTokenizer specification. + * @param tokenizer the tokenizer for the input. + * @param t an empty token that gets the parsed token content + * if it has the expected type. + * @param force a flag to be set if it would be fatal for the + * parsing process to get the wrong type. In that case + * an exception is thrown. + * @return true iff a token of the expected type was found. + */ + bool expect(int expectedType, CodeTokenizer &tokenizer, Token &t, + bool force); public: - StyleNode parse(BufferedCharReader &input); + /** + * This parses the given input as CSS content as specified by the grammar + * seen above. The return value is a Rooted reference to the root of the + * SelectorTree. + * TODO: The RuleSet at the respective node at the tree lists all CSS Style + * rules that apply. + */ + Rooted<SelectorNode> parse(BufferedCharReader &input); }; } diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp index f962ead..4aebf56 100644 --- a/src/core/Tokenizer.hpp +++ b/src/core/Tokenizer.hpp @@ -223,6 +223,8 @@ public: * Clears the peek buffer, such that all peeked Tokens are consumed. */ void consumePeek(); + + const BufferedCharReader &getInput() const { return input; } }; } |