Implemented CSSParser for selectors. The code compiles. Tests are still needed, though, and there are some TODOs left.

author: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-12-04 13:17:20 +0100
committer: Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2014-12-04 13:17:20 +0100
commit: ae7053775ba1c013d53143d2b860fcc88d214458 (patch)
tree: fd7ee19b1a86c7c513b5c8e35c3243255da8b0ab /src
parent: 51b8d39841ea1e803b07cae65020f1b8df6811aa (diff)
4 files changed, 341 insertions, 22 deletions
diff --git a/src/core/CSS.hpp b/src/core/CSS.hpp
index e730721..1c0ed17 100644
--- a/src/core/CSS.hpp
+++ b/src/core/CSS.hpp
@@ -21,6 +21,7 @@
 
 #include <map>
 #include <vector>
+#include <tuple>
 
 #include "Managed.hpp"
 #include "Node.hpp"
diff --git a/src/core/CSSParser.cpp b/src/core/CSSParser.cpp
index 00d9c72..bad1862 100644
--- a/src/core/CSSParser.cpp
+++ b/src/core/CSSParser.cpp
@@ -18,7 +18,6 @@
 
 #include "BufferedCharReader.hpp"
 #include "CodeTokenizer.hpp"
-#include "Tokenizer.hpp"
 
 #include "CSSParser.hpp"
 
@@ -28,52 +27,295 @@ namespace ousia {
 static const int CURLY_OPEN = 1;
 static const int CURLY_CLOSE = 2;
 static const int COLON = 3;
-static const int SEMICOLON = 4;
-static const int HASH = 5;
-static const int BRACKET_OPEN = 6;
-static const int BRACKET_CLOSE = 7;
-static const int PAREN_OPEN = 8;
-static const int PAREN_CLOSE = 9;
+static const int DOUBLE_COLON = 4;
+static const int SEMICOLON = 5;
+static const int HASH = 6;
+static const int BRACKET_OPEN = 7;
+static const int BRACKET_CLOSE = 8;
+static const int PAREN_OPEN = 9;
+static const int PAREN_CLOSE = 10;
+static const int EQUALS = 11;
+static const int ARROW = 12;
+static const int COMMA = 13;
 // comments
 static const int COMMENT = 100;
 static const int COMMENT_OPEN = 101;
 static const int COMMENT_CLOSE = 102;
 // strings
 static const int STRING = 200;
-static const int SINGLE_QUOTE = 201;
-static const int DOUBLE_QUOTE = 202;
-static const int ESCAPE = 203;
+static const int DOUBLE_QUOTE = 201;
+static const int ESCAPE = 202;
 // general syntax
 static const int LINEBREAK = 300;
 
 static const TokenTreeNode CSS_ROOT{{{"{", CURLY_OPEN},
                                      {"}", CURLY_CLOSE},
                                      {":", COLON},
+                                     {"::", DOUBLE_COLON},
                                      {";", SEMICOLON},
                                      {"#", HASH},
                                      {"[", BRACKET_OPEN},
                                      {"]", BRACKET_CLOSE},
                                      {"(", PAREN_OPEN},
                                      {")", PAREN_CLOSE},
+                                     {"=", EQUALS},
+                                     {">", ARROW},
+                                     {",", COMMA},
                                      {"/*", COMMENT_OPEN},
                                      {"*/", COMMENT_CLOSE},
-                                     {"\\", ESCAPE},
-                                     {"\''", SINGLE_QUOTE},
                                      {"\"", DOUBLE_QUOTE},
-                                     {"\n", LINEBREAK}}};
+                                     {"\\", ESCAPE},
+                                     // linux linebreak
+                                     {"\n", LINEBREAK},
+                                     // windows linebreak
+                                     {"\r\n", LINEBREAK},
+                                     // Mac OS linebreak
+                                     {"\r", LINEBREAK}}};
 
 static const std::map<int, CodeTokenDescriptor> CSS_DESCRIPTORS = {
     {COMMENT_OPEN, {CodeTokenMode::BLOCK_COMMENT_START, COMMENT}},
     {COMMENT_CLOSE, {CodeTokenMode::BLOCK_COMMENT_END, COMMENT}},
-    {SINGLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
     {DOUBLE_QUOTE, {CodeTokenMode::STRING_START_END, STRING}},
     {ESCAPE, {CodeTokenMode::ESCAPE, ESCAPE}},
     {LINEBREAK, {CodeTokenMode::LINEBREAK, LINEBREAK}}};
 
-StyleNode CSSParser::parse(BufferedCharReader &input)
+Rooted<SelectorNode> CSSParser::parse(BufferedCharReader &input)
 {
 	CodeTokenizer tokenizer{input, CSS_ROOT, CSS_DESCRIPTORS};
 	tokenizer.ignoreComments = true;
-	// TODO: implement
+	// TODO: Is this the correct way to retrieve the Manager?
+	Manager mgr;  // NOTE(review): local; goes out of scope on return while root is returned - confirm lifetime
+	Rooted<SelectorNode> root = {new SelectorNode{mgr, "root"}};
+	parseDocument(root, tokenizer);
+	return root;
+}
+
+void CSSParser::parseDocument(Rooted<SelectorNode> root,
+                              CodeTokenizer &tokenizer)
+{
+	// DOC is right-recursive, so we loop instead of recursing: the stack depth
+	// no longer grows with the number of selector groups in the input.
+	Token t;
+	while (tokenizer.peek(t)) {
+		tokenizer.resetPeek();
+		std::vector<Rooted<SelectorNode>> leafList;
+		parseSelectors(root, tokenizer, leafList);
+		// TODO: Parse Ruleset
+	}
+}
+
+void CSSParser::parseSelectors(Rooted<SelectorNode> root,
+                               CodeTokenizer &tokenizer,
+                               std::vector<Rooted<SelectorNode>> &leafList)
+{
+	auto tuple = parseSelector(tokenizer);
+	// append the SelectorPath to the root node.
+	std::vector<Rooted<SelectorNode>> unmergedLeafs =
+	    root->append(std::get<0>(tuple));
+	// append the leaf to the leafList.
+	switch (unmergedLeafs.size()) {
+		case 0:
+			// if the leaf could be merged we take the leaf reference from the
+			// parseSelector method.
+			leafList.push_back(std::get<1>(tuple));
+			break;
+		case 1:
+			// if the leaf could not be merged we take the existing leaf.
+			leafList.push_back(unmergedLeafs[0]);
+			break;
+		default:
+			// parseSelector parses a single SelectorPath, so two or more
+			// unmerged leafs (not only exactly two) are an internal error.
+			throw LoggableException{
+			    "Internal Error: More than one leaf in SelectorPath!", "",
+			    // TODO: Line handling?
+			    //			    tokenizer.getInput().getLine(),
+			    //			    tokenizer.getInput().getColumn()
+			};
+	}
+	// if we find a comma, we can proceed parsing selectors.
+	Token t;
+	if (expect(COMMA, tokenizer, t, false)) {
+		parseSelectors(root, tokenizer, leafList);
+	}
+}
+
+std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> CSSParser::parseSelector(
+    CodeTokenizer &tokenizer)
+{
+	Rooted<SelectorNode> s = parsePrimitiveSelector(tokenizer);
+	Token t;
+	if (!tokenizer.peek(t)) {
+		// if we are at the end the found selector is the immediate child as
+		// well as the leaf.
+		return std::make_tuple(s, s);
+	}
+	switch (t.tokenId) {
+		case TOKEN_TEXT: {
+			// if we find text there is a next token in a DESCENDANT
+			// relationship (A B)
+			tokenizer.resetPeek();
+			// so we parse the rest of the subsequent SelectorPath
+			auto tuple = parseSelector(tokenizer);
+			// then we establish the DESCENDANT relationship
+			// TODO: Is this the correct way to retrieve the Manager?
+			Manager mgr;
+			s->getEdges().push_back(
+			    new SelectorNode::SelectorEdge(mgr, std::get<0>(tuple)));
+			// and we return this node as well as the leaf.
+			return std::make_tuple(s, std::get<1>(tuple));
+		}
+		case ARROW: {
+			tokenizer.consumePeek();
+			// if we find an arrow there is a next token in a CHILD
+			// relationship (A > B)
+			// so we parse the rest of the subsequent SelectorPath
+			auto tuple = parseSelector(tokenizer);
+			// then we establish the DIRECT_DESCENDANT (child) relationship
+			// TODO: Is this the correct way to retrieve the Manager?
+			Manager mgr;
+			s->getEdges().push_back(new SelectorNode::SelectorEdge(
+			    mgr, std::get<0>(tuple), SelectionOperator::DIRECT_DESCENDANT));
+			// and we return this node as well as the leaf.
+			return std::make_tuple(s, std::get<1>(tuple));
+		}
+		default:
+			// everything else is not part of the SelectorPath anymore.
+			tokenizer.resetPeek();
+			return std::make_tuple(s, s);
+	}
+}
+
+Rooted<SelectorNode> CSSParser::parsePrimitiveSelector(CodeTokenizer &tokenizer)
+{
+	// first and foremost we expect a class name.
+	Token t;
+	expect(TOKEN_TEXT, tokenizer, t, true);
+	const std::string name = t.content;
+	// TODO: Is this the correct way to retrieve the Manager?
+	Manager mgr;
+	if (!tokenizer.peek(t)) {
+		// if we are at the end, we just return this selector with its name.
+		Rooted<SelectorNode> n{new SelectorNode(mgr, name)};
+		return n;
+	}
+
+	bool isGenerative = false;
+
+	switch (t.tokenId) {
+		case DOUBLE_COLON:
+			// if we find a double colon we have a generative PseudoSelector.
+			isGenerative = true;  // intentional fall-through into COLON
+		case COLON: {
+			// reached directly for ':' (restrictive) or via fall-through for '::'.
+			tokenizer.consumePeek();
+			// get the PseudoSelector name.
+			expect(TOKEN_TEXT, tokenizer, t, true);
+			const std::string pseudo_select_name = t.content;
+			// look for additional arguments.
+			if (!expect(PAREN_OPEN, tokenizer, t, false)) {
+				// if we don't have any, we return here.
+				Rooted<SelectorNode> n{new SelectorNode(
+				    mgr, name, {pseudo_select_name, isGenerative})};
+				return n;
+			}
+			// parse the argument list.
+			std::vector<std::string> args;
+			// we require at least one argument, if parentheses are used
+			expect(TOKEN_TEXT, tokenizer, t, true);
+			args.push_back(t.content);
+			while (expect(COMMA, tokenizer, t, false)) {
+				// as long as we find commas we expect new arguments.
+				expect(TOKEN_TEXT, tokenizer, t, true);
+				args.push_back(t.content);
+			}
+			expect(PAREN_CLOSE, tokenizer, t, true);
+			// and we return with the finished Selector.
+			Rooted<SelectorNode> n{new SelectorNode(
+			    mgr, name, {pseudo_select_name, args, isGenerative})};
+			return n;
+		}
+		case HASH: {
+			// a hash symbol is syntactic sugar for the PseudoSelector
+			// :has_id(id)
+			// so we explicitly consume the '#' (as for ':') and expect an ID.
+			tokenizer.consumePeek();
+			expect(TOKEN_TEXT, tokenizer, t, true);
+			std::vector<std::string> args{t.content};
+			// and we return the finished Selector
+			Rooted<SelectorNode> n{
+			    new SelectorNode(mgr, name, {"has_id", args, false})};
+			return n;
+		}
+		case BRACKET_OPEN: {
+			// in case of brackets we have one of two restrictive
+			// PseudoSelectors
+			// has_attribute ([attribute_name])
+			// or
+			// has_value [attribute_name="value"]
+			// we consume the '['; in both cases the attribute name comes first.
+			tokenizer.consumePeek();
+			expect(TOKEN_TEXT, tokenizer, t, true);
+			std::vector<std::string> args{t.content};
+			if (!expect(EQUALS, tokenizer, t, false)) {
+				// if no equals sign follows we have a has_attribute
+				// PseudoSelector
+				// we expect a closing bracket.
+				expect(BRACKET_CLOSE, tokenizer, t, true);
+				// and then we can return the result.
+				Rooted<SelectorNode> n{new SelectorNode(
+				    mgr, name, {"has_attribute", args, false})};
+				return n;
+			} else {
+				// with an equals sign we have a has_value PseudoSelector and
+				// expect the value next.
+				expect(STRING, tokenizer, t, true);
+				args.push_back(t.content);
+				// then we expect a closing bracket.
+				expect(BRACKET_CLOSE, tokenizer, t, true);
+				// and then we can return the result.
+				Rooted<SelectorNode> n{
+				    new SelectorNode(mgr, name, {"has_value", args, false})};
+				return n;
+			}
+		}
+		default:
+			// everything else is not part of the Selector anymore.
+			tokenizer.resetPeek();
+			Rooted<SelectorNode> n{new SelectorNode(mgr, name)};
+			return n;
+	}
+}
+
+// TODO: Add RuleSet parsing methods.
+
+bool CSSParser::expect(int expectedType, CodeTokenizer &tokenizer, Token &t,
+                       bool force)
+{
+	bool end = !tokenizer.peek(t);  // a failed peek is reported as end of file
+	if (end || t.tokenId != expectedType) {
+		if (force) {
+			if (end) {
+				throw LoggableException{
+				    "Unexpected end of file!", "",
+				    // TODO: Line handling?
+				    //			    tokenizer.getInput().getLine(),
+				    //			    tokenizer.getInput().getColumn()
+				};
+			} else {
+				throw LoggableException{
+				    "Unexpected token!", "",
+				    // TODO: Line handling?
+				    //			    tokenizer.getInput().getLine(),
+				    //			    tokenizer.getInput().getColumn()
+				};
+			}
+		} else {
+			tokenizer.resetPeek();
+			return false;
+		}
+	}
+	tokenizer.consumePeek();  // clears the whole peek buffer, not only this token
+	return true;
 }
 }
diff --git a/src/core/CSSParser.hpp b/src/core/CSSParser.hpp
index 4c99a5a..c1a4c0d 100644
--- a/src/core/CSSParser.hpp
+++ b/src/core/CSSParser.hpp
@@ -19,24 +19,98 @@
 #ifndef _OUSIA_CSS_PARSER_HPP_
 #define _OUSIA_CSS_PARSER_HPP_
 
-#include <istream>
-#include <map>
 #include <vector>
 #include <tuple>
 
 #include "BufferedCharReader.hpp"
-#include "Managed.hpp"
-#include "Node.hpp"
 #include "CSS.hpp"
+#include "Exceptions.hpp"
 
 namespace ousia {
 
+/**
+ * This is a context free, recursive parser for a subset of the CSS3 language
+ * as defined by W3C. We allow the following grammar:
+ *
+ * DOC              := SELECTORS RULESET DOC | epsilon
+ * SELECTORS        := SELECT , SELECTORS | SELECT
+ * SELECT           := SELECT' OPERATOR SELECT | SELECT'
+ * SELECT'          := TYPE | TYPE:PSEUDO | TYPE::GEN_PSEUDO |
+ *                     TYPE:PSEUDO(ARGUMENTS) |
+ *                     TYPE::GEN_PSEUDO(ARGUMENTS) | TYPE#ID |
+ *                     TYPE[ATTRIBUTE] | TYPE[ATTRIBUTE=VALUE]
+ * TYPE             := string
+ * PSEUDO           := string
+ * GEN_PSEUDO       := string
+ * ARGUMENTS        := string , ARGUMENTS | string
+ * ID               := string
+ * ATTRIBUTE        := string
+ * VALUE            := string
+ * OPERATOR         := epsilon | &gt;
+ * RULESET          := epsilon | { RULES }
+ * RULES            := RULE RULES | epsilon
+ * RULE             := KEY : VALUE ;
+ * KEY              := string
+ * VALUE            := type-specific parser
+ *
+ *
+ * @author Benjamin Paassen - bpaassen@techfak.uni-bielefeld.de
+ */
 class CSSParser {
-
 private:
+	/**
+	 * Implements the DOC Nonterminal
+	 */
+	void parseDocument(Rooted<SelectorNode> root, CodeTokenizer &tokenizer);
+	/**
+	 * Implements the SELECTORS Nonterminal and adds all leaf nodes of the
+	 * resulting SelectorTree to the input leafList so that a parsed RuleSet can
+	 * be inserted there.
+	 */
+	void parseSelectors(Rooted<SelectorNode> root, CodeTokenizer &tokenizer,
+	                    std::vector<Rooted<SelectorNode>> &leafList);
+	/**
+	 * Implements the SELECT Nonterminal, which in effect parses a SelectorPath
+	 * of the SelectorTree and returns the beginning node of the path as first
+	 * element as well as the leaf of the path as second tuple element.
+	 */
+	std::tuple<Rooted<SelectorNode>, Rooted<SelectorNode>> parseSelector(
+	    CodeTokenizer &tokenizer);
+
+	/**
+	 * Implements the SELECT' Nonterminal, which parses a single Selector with
+	 * its PseudoSelector and returns it.
+	 */
+	Rooted<SelectorNode> parsePrimitiveSelector(CodeTokenizer &tokenizer);
+
+	// TODO: Add RuleSet parsing methods.
+
+	/**
+	 * A convenience function to wrap around the tokenizer peek() function that
+	 * only returns true if an instance of the expected type occurs.
+	 *
+	 * @param expectedType the ID of the expected type according to the
+	 *                     CodeTokenizer specification.
+	 * @param tokenizer    the tokenizer for the input.
+	 * @param t            an empty token that gets the parsed token content
+	 *                     if it has the expected type.
+	 * @param force        a flag to be set if it would be fatal for the
+	 *                     parsing process to get the wrong type. In that case
+	 *                     an exception is thrown.
+	 * @return             true iff a token of the expected type was found.
+	 */
+	bool expect(int expectedType, CodeTokenizer &tokenizer, Token &t,
+	            bool force);
 
 public:
-	StyleNode parse(BufferedCharReader &input);
+	/**
+	 * This parses the given input as CSS content as specified by the grammar
+	 * seen above. The return value is a Rooted reference to the root of the
+	 * SelectorTree.
+	 * TODO: The RuleSet at the respective node at the tree lists all CSS Style
+	 * rules that apply.
+	 */
+	Rooted<SelectorNode> parse(BufferedCharReader &input);
 };
 }
 
diff --git a/src/core/Tokenizer.hpp b/src/core/Tokenizer.hpp
index f962ead..4aebf56 100644
--- a/src/core/Tokenizer.hpp
+++ b/src/core/Tokenizer.hpp
@@ -223,6 +223,8 @@ public:
 	 * Clears the peek buffer, such that all peeked Tokens are consumed.
 	 */
 	void consumePeek();
+
+	const BufferedCharReader &getInput() const { return input; }
 };
 }
author	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-12-04 13:17:20 +0100
committer	Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>	2014-12-04 13:17:20 +0100
commit	ae7053775ba1c013d53143d2b860fcc88d214458 (patch)
tree	fd7ee19b1a86c7c513b5c8e35c3243255da8b0ab /src
parent	51b8d39841ea1e803b07cae65020f1b8df6811aa (diff)