From efe60ac3c3a8725ac71329c0bb19fa9d9c58f399 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:42:05 +0100 Subject: Moved specific file format parsers to formats/ folder, moved old tokenizer to css code (this is the only place where it is actually used) --- test/core/CodeTokenizerTest.cpp | 100 ----------- test/core/TokenizerTest.cpp | 118 ------------- test/formats/osdmx/OsdmxParserTest.cpp | 314 +++++++++++++++++++++++++++++++++ test/plugins/css/CodeTokenizerTest.cpp | 100 +++++++++++ test/plugins/css/TokenizerTest.cpp | 118 +++++++++++++ test/plugins/xml/XmlParserTest.cpp | 314 --------------------------------- 6 files changed, 532 insertions(+), 532 deletions(-) delete mode 100644 test/core/CodeTokenizerTest.cpp delete mode 100644 test/core/TokenizerTest.cpp create mode 100644 test/formats/osdmx/OsdmxParserTest.cpp create mode 100644 test/plugins/css/CodeTokenizerTest.cpp create mode 100644 test/plugins/css/TokenizerTest.cpp delete mode 100644 test/plugins/xml/XmlParserTest.cpp (limited to 'test') diff --git a/test/core/CodeTokenizerTest.cpp b/test/core/CodeTokenizerTest.cpp deleted file mode 100644 index 2d4d5a7..0000000 --- a/test/core/CodeTokenizerTest.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -static const int BLOCK_COMMENT = 30; -static const int LINE_COMMENT = 31; -static const int STRING = 20; -static const int ESCAPE = 21; -static const int LINEBREAK = 21; -static const int CURLY_OPEN = 40; -static const int CURLY_CLOSE = 41; - -TEST(CodeTokenizer, testTokenizer) -{ - CharReader reader{ - "/**\n" // 1 - " * Some Block Comment\n" // 2 - " */\n" // 3 - "var my_string = 'My \\'String\\'';\n" // 4 - "// and a line comment\n" // 5 - "var my_obj = { a = 4;}", 0}; // 6 - // 123456789012345678901234567890123456789 - // 0 1 2 3 - TokenTreeNode root{{{"/*", 1}, - {"*/", 2}, - {"//", 3}, - {"'", 4}, - {"\\", 5}, - {"{", CURLY_OPEN}, - {"}", CURLY_CLOSE}, - {"\n", 6}}}; - std::map descriptors{ - // the block comment start Token has the id 1 and if the Tokenizer - // returns a Block Comment Token that should have the id 10. - {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, - {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, - {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, - {4, {CodeTokenMode::STRING_START_END, STRING}}, - {5, {CodeTokenMode::ESCAPE, ESCAPE}}, - {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; - - std::vector expected = { - {BLOCK_COMMENT, "*\n * Some Block Comment\n ", SourceLocation{0, 0, 29}}, - {LINEBREAK, "\n", SourceLocation{0, 29, 30}}, - {TOKEN_TEXT, "var", SourceLocation{0, 30, 33}}, - {TOKEN_TEXT, "my_string", SourceLocation{0, 34, 43}}, - {TOKEN_TEXT, "=", SourceLocation{0, 44, 45}}, - {STRING, "My 'String'", SourceLocation{0, 46, 61}}, - {TOKEN_TEXT, ";", SourceLocation{0, 61, 62}}, - {LINEBREAK, "\n", SourceLocation{0, 62, 63}}, - // this is slightly counter-intuitive but makes sense if you think about - // it: As a line comment is ended by a line break the line break is - // technically still a part of the line comment and thus the ending - // is in the next line. - {LINE_COMMENT, " and a line comment", SourceLocation{0, 63, 85}}, - {TOKEN_TEXT, "var", SourceLocation{0, 85, 88}}, - {TOKEN_TEXT, "my_obj", SourceLocation{0, 89, 95}}, - {TOKEN_TEXT, "=", SourceLocation{0, 96, 97}}, - {CURLY_OPEN, "{", SourceLocation{0, 98, 99}}, - {TOKEN_TEXT, "a", SourceLocation{0, 100, 101}}, - {TOKEN_TEXT, "=", SourceLocation{0, 102, 103}}, - {TOKEN_TEXT, "4;", SourceLocation{0, 104, 106}}, - {CURLY_CLOSE, "}", SourceLocation{0, 106, 107}}, - }; - - CodeTokenizer tokenizer{reader, root, descriptors}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - diff --git a/test/core/TokenizerTest.cpp b/test/core/TokenizerTest.cpp deleted file mode 100644 index c53f93d..0000000 --- a/test/core/TokenizerTest.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -#include - -namespace ousia { -TEST(TokenTreeNode, testConstructor) -{ - TokenTreeNode root{{{"a", 1}, {"aab", 2}, {"aac", 3}, {"abd", 4}}}; - - ASSERT_EQ(-1, root.tokenId); - ASSERT_EQ(1U, root.children.size()); - ASSERT_TRUE(root.children.find('a') != root.children.end()); - - const TokenTreeNode &a = root.children.at('a'); - ASSERT_EQ(1, a.tokenId); - ASSERT_EQ(2U, a.children.size()); - ASSERT_TRUE(a.children.find('a') != a.children.end()); - ASSERT_TRUE(a.children.find('b') != a.children.end()); - - const TokenTreeNode &aa = a.children.at('a'); - ASSERT_EQ(-1, aa.tokenId); - ASSERT_EQ(2U, aa.children.size()); - ASSERT_TRUE(aa.children.find('b') != aa.children.end()); - ASSERT_TRUE(aa.children.find('c') != aa.children.end()); - - const TokenTreeNode &aab = aa.children.at('b'); - ASSERT_EQ(2, aab.tokenId); - ASSERT_EQ(0U, aab.children.size()); - - const TokenTreeNode &aac = aa.children.at('c'); - ASSERT_EQ(3, aac.tokenId); - ASSERT_EQ(0U, aac.children.size()); - - const TokenTreeNode &ab = a.children.at('b'); - ASSERT_EQ(-1, ab.tokenId); - ASSERT_EQ(1U, ab.children.size()); - ASSERT_TRUE(ab.children.find('d') != ab.children.end()); - - const TokenTreeNode &abd = ab.children.at('d'); - ASSERT_EQ(4, abd.tokenId); - ASSERT_EQ(0U, abd.children.size()); -} - -TEST(Tokenizer, testTokenization) -{ - TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; - - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - - std::vector expected = { - {TOKEN_TEXT, "Test", SourceLocation{0, 0, 4}}, - {1, "/", SourceLocation{0, 4, 5}}, - {TOKEN_TEXT, "Test ", SourceLocation{0, 5, 10}}, - {2, "/*", SourceLocation{0, 10, 12}}, - {TOKEN_TEXT, " Block Comment ", SourceLocation{0, 12, 27}}, - {3, "*/", SourceLocation{0, 27, 29}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} - -TEST(Tokenizer, testIncompleteTokens) -{ - TokenTreeNode root{{{"ab", 1}, {"c", 2}}}; - - CharReader reader{"ac", 0}; - - std::vector expected = { - {TOKEN_TEXT, "a", SourceLocation{0, 0, 1}}, - {2, "c", SourceLocation{0, 1, 2}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - diff --git a/test/formats/osdmx/OsdmxParserTest.cpp b/test/formats/osdmx/OsdmxParserTest.cpp new file mode 100644 index 0000000..af1ef56 --- /dev/null +++ b/test/formats/osdmx/OsdmxParserTest.cpp @@ -0,0 +1,314 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace ousia { + +namespace RttiTypes { +extern const Rtti Document; +extern const Rtti Domain; +extern const Rtti Typesystem; +} + +struct XmlStandaloneEnvironment : public StandaloneEnvironment { + XmlParser xmlParser; + FileLocator fileLocator; + + XmlStandaloneEnvironment(ConcreteLogger &logger) + : StandaloneEnvironment(logger) + { + fileLocator.addDefaultSearchPaths(); + fileLocator.addUnittestSearchPath("xmlparser"); + + registry.registerDefaultExtensions(); + registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, + {&RttiTypes::Node}, &xmlParser); + registry.registerResourceLocator(&fileLocator); + } +}; + +static TerminalLogger logger(std::cerr, true); + +TEST(XmlParser, mismatchedTag) +{ + XmlStandaloneEnvironment env(logger); + env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); + ASSERT_TRUE(logger.hasError()); +} + +TEST(XmlParser, generic) +{ + XmlStandaloneEnvironment env(logger); + env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); +#ifdef MANAGER_GRAPHVIZ_EXPORT + env.manager.exportGraphviz("xmlDocument.dot"); +#endif +} + +static void checkAttributes(Handle expected, + Handle desc) +{ + if (expected == nullptr) { + ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); + } else { + ASSERT_EQ(expected->getName(), + desc->getAttributesDescriptor()->getName()); + auto &attrs_exp = expected->getAttributes(); + auto &attrs = desc->getAttributesDescriptor()->getAttributes(); + ASSERT_EQ(attrs_exp.size(), attrs.size()); + for (size_t i = 0; i < attrs_exp.size(); i++) { + ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); + ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); + ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); + ASSERT_EQ(attrs_exp[i]->getDefaultValue(), + attrs[i]->getDefaultValue()); + } + } +} + +static void checkStructuredClass( + Handle n, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + ASSERT_FALSE(n == nullptr); + Handle sc = n.cast(); + ASSERT_FALSE(sc == nullptr); + ASSERT_EQ(name, sc->getName()); + ASSERT_EQ(domain, sc->getParent()); + ASSERT_EQ(cardinality, sc->getCardinality()); + ASSERT_EQ(transparent, sc->isTransparent()); + ASSERT_EQ(root, sc->hasRootPermission()); + checkAttributes(attributesDescriptor, sc); +} + +static Rooted checkStructuredClass( + const std::string &resolve, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle sc = res[0].node.cast(); + checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, + superclass, transparent, root); + return sc; +} + +static void checkAnnotationClass( + Handle n, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + ASSERT_FALSE(n == nullptr); + Handle ac = n.cast(); + ASSERT_FALSE(ac == nullptr); + ASSERT_EQ(name, ac->getName()); + ASSERT_EQ(domain, ac->getParent()); + checkAttributes(attributesDescriptor, ac); +} + +static Rooted checkAnnotationClass( + const std::string &resolve, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle ac = res[0].node.cast(); + checkAnnotationClass(ac, name, domain, attributesDescriptor); + return ac; +} + +static void checkFieldDescriptor( + Handle n, const std::string &name, Handle parent, + NodeVector children, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + ASSERT_FALSE(n == nullptr); + Handle field = n.cast(); + ASSERT_FALSE(field.isNull()); + ASSERT_EQ(name, field->getName()); + ASSERT_EQ(parent, field->getParent()); + ASSERT_EQ(type, field->getFieldType()); + ASSERT_EQ(primitiveType, field->getPrimitiveType()); + ASSERT_EQ(optional, field->isOptional()); + // check the children. + ASSERT_EQ(children.size(), field->getChildren().size()); + for (unsigned int c = 0; c < children.size(); c++) { + ASSERT_EQ(children[c], field->getChildren()[c]); + } +} + +static void checkFieldDescriptor( + Handle desc, Handle parent, + NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); + ASSERT_EQ(1, res.size()); + checkFieldDescriptor(res[0].node, name, parent, children, type, + primitiveType, optional); +} + +static void checkFieldDescriptor( + Handle desc, NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + checkFieldDescriptor(desc, desc, children, name, type, primitiveType, + optional); +} + +TEST(XmlParser, domainParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(book_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + // check the domain node. + Rooted book_domain = book_domain_node.cast(); + ASSERT_EQ("book", book_domain->getName()); + // get the book struct node. + Cardinality single; + single.merge({1}); + Rooted bookAuthor{ + new StructType(book_domain->getManager(), "", nullptr)}; + bookAuthor->addAttribute( + {new Attribute(book_domain->getManager(), "author", + env.project->getSystemTypesystem()->getStringType(), + "")}, + logger); + Rooted book = checkStructuredClass( + "book", "book", book_domain, single, bookAuthor, nullptr, false, true); + // get the chapter struct node. + Rooted chapter = + checkStructuredClass("chapter", "chapter", book_domain); + Rooted section = + checkStructuredClass("section", "section", book_domain); + Rooted subsection = + checkStructuredClass("subsection", "subsection", book_domain); + Rooted paragraph = + checkStructuredClass("paragraph", "paragraph", book_domain, + Cardinality::any(), nullptr, nullptr, true, false); + Rooted text = + checkStructuredClass("text", "text", book_domain, Cardinality::any(), + nullptr, nullptr, true, false); + + // check the FieldDescriptors. + checkFieldDescriptor(book, {chapter, paragraph}); + checkFieldDescriptor(chapter, {section, paragraph}); + checkFieldDescriptor(section, {subsection, paragraph}); + checkFieldDescriptor(subsection, {paragraph}); + checkFieldDescriptor(paragraph, {text}); + checkFieldDescriptor( + text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, + env.project->getSystemTypesystem()->getStringType(), false); + + // check parent handling using the headings domain. + Rooted headings_domain_node = + env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(headings_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted headings_domain = headings_domain_node.cast(); + // now there should be a heading struct. + Rooted heading = + checkStructuredClass("heading", "heading", headings_domain, single, + nullptr, nullptr, true, false); + // which should be a reference to the paragraph descriptor. + checkFieldDescriptor(heading, paragraph, {text}); + // and each struct in the book domain (except for text) should have a + // heading field now. + checkFieldDescriptor(book, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(chapter, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(section, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(subsection, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(paragraph, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + + // check annotation handling using the comments domain. + Rooted comments_domain_node = + env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(comments_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted comments_domain = comments_domain_node.cast(); + // now we should be able to find a comment annotation. + Rooted comment_anno = + checkAnnotationClass("comment", "comment", comments_domain); + // as well as a comment struct + Rooted comment = + checkStructuredClass("comment", "comment", comments_domain); + // and a reply struct + Rooted reply = + checkStructuredClass("reply", "reply", comments_domain); + // check the fields for each of them. + { + std::vector> descs{comment_anno, comment, reply}; + for (auto &d : descs) { + checkFieldDescriptor(d, {paragraph}, "content", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + checkFieldDescriptor(d, {reply}, "replies", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + } + } + // paragraph should have comment as child now as well. + checkFieldDescriptor(paragraph, {text, comment}); + // as should heading, because it references the paragraph default field. + checkFieldDescriptor(heading, paragraph, {text, comment}); +} + +TEST(XmlParser, documentParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); + //TODO: Check result +} +} + diff --git a/test/plugins/css/CodeTokenizerTest.cpp b/test/plugins/css/CodeTokenizerTest.cpp new file mode 100644 index 0000000..2d4d5a7 --- /dev/null +++ b/test/plugins/css/CodeTokenizerTest.cpp @@ -0,0 +1,100 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +namespace ousia { + +static const int BLOCK_COMMENT = 30; +static const int LINE_COMMENT = 31; +static const int STRING = 20; +static const int ESCAPE = 21; +static const int LINEBREAK = 21; +static const int CURLY_OPEN = 40; +static const int CURLY_CLOSE = 41; + +TEST(CodeTokenizer, testTokenizer) +{ + CharReader reader{ + "/**\n" // 1 + " * Some Block Comment\n" // 2 + " */\n" // 3 + "var my_string = 'My \\'String\\'';\n" // 4 + "// and a line comment\n" // 5 + "var my_obj = { a = 4;}", 0}; // 6 + // 123456789012345678901234567890123456789 + // 0 1 2 3 + TokenTreeNode root{{{"/*", 1}, + {"*/", 2}, + {"//", 3}, + {"'", 4}, + {"\\", 5}, + {"{", CURLY_OPEN}, + {"}", CURLY_CLOSE}, + {"\n", 6}}}; + std::map descriptors{ + // the block comment start Token has the id 1 and if the Tokenizer + // returns a Block Comment Token that should have the id 10. + {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, + {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, + {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, + {4, {CodeTokenMode::STRING_START_END, STRING}}, + {5, {CodeTokenMode::ESCAPE, ESCAPE}}, + {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; + + std::vector expected = { + {BLOCK_COMMENT, "*\n * Some Block Comment\n ", SourceLocation{0, 0, 29}}, + {LINEBREAK, "\n", SourceLocation{0, 29, 30}}, + {TOKEN_TEXT, "var", SourceLocation{0, 30, 33}}, + {TOKEN_TEXT, "my_string", SourceLocation{0, 34, 43}}, + {TOKEN_TEXT, "=", SourceLocation{0, 44, 45}}, + {STRING, "My 'String'", SourceLocation{0, 46, 61}}, + {TOKEN_TEXT, ";", SourceLocation{0, 61, 62}}, + {LINEBREAK, "\n", SourceLocation{0, 62, 63}}, + // this is slightly counter-intuitive but makes sense if you think about + // it: As a line comment is ended by a line break the line break is + // technically still a part of the line comment and thus the ending + // is in the next line. + {LINE_COMMENT, " and a line comment", SourceLocation{0, 63, 85}}, + {TOKEN_TEXT, "var", SourceLocation{0, 85, 88}}, + {TOKEN_TEXT, "my_obj", SourceLocation{0, 89, 95}}, + {TOKEN_TEXT, "=", SourceLocation{0, 96, 97}}, + {CURLY_OPEN, "{", SourceLocation{0, 98, 99}}, + {TOKEN_TEXT, "a", SourceLocation{0, 100, 101}}, + {TOKEN_TEXT, "=", SourceLocation{0, 102, 103}}, + {TOKEN_TEXT, "4;", SourceLocation{0, 104, 106}}, + {CURLY_CLOSE, "}", SourceLocation{0, 106, 107}}, + }; + + CodeTokenizer tokenizer{reader, root, descriptors}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} +} + diff --git a/test/plugins/css/TokenizerTest.cpp b/test/plugins/css/TokenizerTest.cpp new file mode 100644 index 0000000..c53f93d --- /dev/null +++ b/test/plugins/css/TokenizerTest.cpp @@ -0,0 +1,118 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +#include + +namespace ousia { +TEST(TokenTreeNode, testConstructor) +{ + TokenTreeNode root{{{"a", 1}, {"aab", 2}, {"aac", 3}, {"abd", 4}}}; + + ASSERT_EQ(-1, root.tokenId); + ASSERT_EQ(1U, root.children.size()); + ASSERT_TRUE(root.children.find('a') != root.children.end()); + + const TokenTreeNode &a = root.children.at('a'); + ASSERT_EQ(1, a.tokenId); + ASSERT_EQ(2U, a.children.size()); + ASSERT_TRUE(a.children.find('a') != a.children.end()); + ASSERT_TRUE(a.children.find('b') != a.children.end()); + + const TokenTreeNode &aa = a.children.at('a'); + ASSERT_EQ(-1, aa.tokenId); + ASSERT_EQ(2U, aa.children.size()); + ASSERT_TRUE(aa.children.find('b') != aa.children.end()); + ASSERT_TRUE(aa.children.find('c') != aa.children.end()); + + const TokenTreeNode &aab = aa.children.at('b'); + ASSERT_EQ(2, aab.tokenId); + ASSERT_EQ(0U, aab.children.size()); + + const TokenTreeNode &aac = aa.children.at('c'); + ASSERT_EQ(3, aac.tokenId); + ASSERT_EQ(0U, aac.children.size()); + + const TokenTreeNode &ab = a.children.at('b'); + ASSERT_EQ(-1, ab.tokenId); + ASSERT_EQ(1U, ab.children.size()); + ASSERT_TRUE(ab.children.find('d') != ab.children.end()); + + const TokenTreeNode &abd = ab.children.at('d'); + ASSERT_EQ(4, abd.tokenId); + ASSERT_EQ(0U, abd.children.size()); +} + +TEST(Tokenizer, testTokenization) +{ + TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; + + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + + std::vector expected = { + {TOKEN_TEXT, "Test", SourceLocation{0, 0, 4}}, + {1, "/", SourceLocation{0, 4, 5}}, + {TOKEN_TEXT, "Test ", SourceLocation{0, 5, 10}}, + {2, "/*", SourceLocation{0, 10, 12}}, + {TOKEN_TEXT, " Block Comment ", SourceLocation{0, 12, 27}}, + {3, "*/", SourceLocation{0, 27, 29}}}; + + Tokenizer tokenizer{reader, root}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} + +TEST(Tokenizer, testIncompleteTokens) +{ + TokenTreeNode root{{{"ab", 1}, {"c", 2}}}; + + CharReader reader{"ac", 0}; + + std::vector expected = { + {TOKEN_TEXT, "a", SourceLocation{0, 0, 1}}, + {2, "c", SourceLocation{0, 1, 2}}}; + + Tokenizer tokenizer{reader, root}; + + Token t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.next(t)); + EXPECT_EQ(te.tokenId, t.tokenId); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.next(t)); +} +} + diff --git a/test/plugins/xml/XmlParserTest.cpp b/test/plugins/xml/XmlParserTest.cpp deleted file mode 100644 index af1ef56..0000000 --- a/test/plugins/xml/XmlParserTest.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace ousia { - -namespace RttiTypes { -extern const Rtti Document; -extern const Rtti Domain; -extern const Rtti Typesystem; -} - -struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; - FileLocator fileLocator; - - XmlStandaloneEnvironment(ConcreteLogger &logger) - : StandaloneEnvironment(logger) - { - fileLocator.addDefaultSearchPaths(); - fileLocator.addUnittestSearchPath("xmlparser"); - - registry.registerDefaultExtensions(); - registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); - registry.registerResourceLocator(&fileLocator); - } -}; - -static TerminalLogger logger(std::cerr, true); - -TEST(XmlParser, mismatchedTag) -{ - XmlStandaloneEnvironment env(logger); - env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); - ASSERT_TRUE(logger.hasError()); -} - -TEST(XmlParser, generic) -{ - XmlStandaloneEnvironment env(logger); - env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); -#ifdef MANAGER_GRAPHVIZ_EXPORT - env.manager.exportGraphviz("xmlDocument.dot"); -#endif -} - -static void checkAttributes(Handle expected, - Handle desc) -{ - if (expected == nullptr) { - ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); - } else { - ASSERT_EQ(expected->getName(), - desc->getAttributesDescriptor()->getName()); - auto &attrs_exp = expected->getAttributes(); - auto &attrs = desc->getAttributesDescriptor()->getAttributes(); - ASSERT_EQ(attrs_exp.size(), attrs.size()); - for (size_t i = 0; i < attrs_exp.size(); i++) { - ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); - ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); - ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); - ASSERT_EQ(attrs_exp[i]->getDefaultValue(), - attrs[i]->getDefaultValue()); - } - } -} - -static void checkStructuredClass( - Handle n, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - ASSERT_FALSE(n == nullptr); - Handle sc = n.cast(); - ASSERT_FALSE(sc == nullptr); - ASSERT_EQ(name, sc->getName()); - ASSERT_EQ(domain, sc->getParent()); - ASSERT_EQ(cardinality, sc->getCardinality()); - ASSERT_EQ(transparent, sc->isTransparent()); - ASSERT_EQ(root, sc->hasRootPermission()); - checkAttributes(attributesDescriptor, sc); -} - -static Rooted checkStructuredClass( - const std::string &resolve, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle sc = res[0].node.cast(); - checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, - superclass, transparent, root); - return sc; -} - -static void checkAnnotationClass( - Handle n, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - ASSERT_FALSE(n == nullptr); - Handle ac = n.cast(); - ASSERT_FALSE(ac == nullptr); - ASSERT_EQ(name, ac->getName()); - ASSERT_EQ(domain, ac->getParent()); - checkAttributes(attributesDescriptor, ac); -} - -static Rooted checkAnnotationClass( - const std::string &resolve, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle ac = res[0].node.cast(); - checkAnnotationClass(ac, name, domain, attributesDescriptor); - return ac; -} - -static void checkFieldDescriptor( - Handle n, const std::string &name, Handle parent, - NodeVector children, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - ASSERT_FALSE(n == nullptr); - Handle field = n.cast(); - ASSERT_FALSE(field.isNull()); - ASSERT_EQ(name, field->getName()); - ASSERT_EQ(parent, field->getParent()); - ASSERT_EQ(type, field->getFieldType()); - ASSERT_EQ(primitiveType, field->getPrimitiveType()); - ASSERT_EQ(optional, field->isOptional()); - // check the children. - ASSERT_EQ(children.size(), field->getChildren().size()); - for (unsigned int c = 0; c < children.size(); c++) { - ASSERT_EQ(children[c], field->getChildren()[c]); - } -} - -static void checkFieldDescriptor( - Handle desc, Handle parent, - NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); - checkFieldDescriptor(res[0].node, name, parent, children, type, - primitiveType, optional); -} - -static void checkFieldDescriptor( - Handle desc, NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - checkFieldDescriptor(desc, desc, children, name, type, primitiveType, - optional); -} - -TEST(XmlParser, domainParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(book_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - // check the domain node. - Rooted book_domain = book_domain_node.cast(); - ASSERT_EQ("book", book_domain->getName()); - // get the book struct node. - Cardinality single; - single.merge({1}); - Rooted bookAuthor{ - new StructType(book_domain->getManager(), "", nullptr)}; - bookAuthor->addAttribute( - {new Attribute(book_domain->getManager(), "author", - env.project->getSystemTypesystem()->getStringType(), - "")}, - logger); - Rooted book = checkStructuredClass( - "book", "book", book_domain, single, bookAuthor, nullptr, false, true); - // get the chapter struct node. - Rooted chapter = - checkStructuredClass("chapter", "chapter", book_domain); - Rooted section = - checkStructuredClass("section", "section", book_domain); - Rooted subsection = - checkStructuredClass("subsection", "subsection", book_domain); - Rooted paragraph = - checkStructuredClass("paragraph", "paragraph", book_domain, - Cardinality::any(), nullptr, nullptr, true, false); - Rooted text = - checkStructuredClass("text", "text", book_domain, Cardinality::any(), - nullptr, nullptr, true, false); - - // check the FieldDescriptors. - checkFieldDescriptor(book, {chapter, paragraph}); - checkFieldDescriptor(chapter, {section, paragraph}); - checkFieldDescriptor(section, {subsection, paragraph}); - checkFieldDescriptor(subsection, {paragraph}); - checkFieldDescriptor(paragraph, {text}); - checkFieldDescriptor( - text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, - env.project->getSystemTypesystem()->getStringType(), false); - - // check parent handling using the headings domain. - Rooted headings_domain_node = - env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(headings_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted headings_domain = headings_domain_node.cast(); - // now there should be a heading struct. - Rooted heading = - checkStructuredClass("heading", "heading", headings_domain, single, - nullptr, nullptr, true, false); - // which should be a reference to the paragraph descriptor. - checkFieldDescriptor(heading, paragraph, {text}); - // and each struct in the book domain (except for text) should have a - // heading field now. - checkFieldDescriptor(book, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(chapter, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(section, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(subsection, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(paragraph, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - - // check annotation handling using the comments domain. - Rooted comments_domain_node = - env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(comments_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted comments_domain = comments_domain_node.cast(); - // now we should be able to find a comment annotation. - Rooted comment_anno = - checkAnnotationClass("comment", "comment", comments_domain); - // as well as a comment struct - Rooted comment = - checkStructuredClass("comment", "comment", comments_domain); - // and a reply struct - Rooted reply = - checkStructuredClass("reply", "reply", comments_domain); - // check the fields for each of them. - { - std::vector> descs{comment_anno, comment, reply}; - for (auto &d : descs) { - checkFieldDescriptor(d, {paragraph}, "content", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - checkFieldDescriptor(d, {reply}, "replies", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - } - } - // paragraph should have comment as child now as well. - checkFieldDescriptor(paragraph, {text, comment}); - // as should heading, because it references the paragraph default field. - checkFieldDescriptor(heading, paragraph, {text, comment}); -} - -TEST(XmlParser, documentParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); - //TODO: Check result -} -} - -- cgit v1.2.3 From ce4fd84a714d80859aa01bbca32a81302b93c4d7 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:43:32 +0100 Subject: Moved code for handling whitespaces to own header, including the "WhitespaceMode" enum --- src/core/common/Utils.cpp | 7 -- src/core/common/Utils.hpp | 57 +-------- src/core/common/Whitespace.cpp | 38 ++++++ src/core/common/Whitespace.hpp | 120 ++++++++++++++++++ src/core/common/WhitespaceHandler.hpp | 223 ++++++++++++++++++++++++++++++++++ test/core/common/UtilsTest.cpp | 8 -- test/core/common/Whitespace.cpp | 41 +++++++ 7 files changed, 428 insertions(+), 66 deletions(-) create mode 100644 src/core/common/Whitespace.cpp create mode 100644 src/core/common/Whitespace.hpp create mode 100644 src/core/common/WhitespaceHandler.hpp create mode 100644 test/core/common/Whitespace.cpp (limited to 'test') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 563fe2a..4005143 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -18,19 +18,12 @@ #include #include -#include #include #include "Utils.hpp" namespace ousia { -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::isIdentifier(const std::string &name) { bool first = true; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 2c8a5b3..af7a773 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -78,12 +78,17 @@ public: */ static bool isIdentifier(const std::string &name); + /** + * Returns true if the given character is a linebreak character. + */ + static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } + /** * Returns true if the given character is a whitespace character. */ static bool isWhitespace(const char c) { - return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); + return (c == ' ') || (c == '\t') || isLinebreak(c); } /** @@ -94,56 +99,6 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); - /** - * Returns true if the given character is a whitespace character. - */ - static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } - - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** - * Trims the given string or vector of chars by returning the start and end - * index. - * - * @param s is the container that should be trimmed. - * @param f is a function that returns true for values that should be - * removed. - * @return start and end index. Note that "end" points at the character - * beyond the end, thus "end" minus "start" - */ - template - static std::pair trim(const T &s, Filter f) - { - size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { - if (!f(s[i])) { - start = i; - break; - } - } - - size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { - end = i + 1; - break; - } - } - - if (end < start) { - start = 0; - end = 0; - } - - return std::pair{start, end}; - } - /** * Turns the elements of a collection into a string separated by the * given delimiter. diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp new file mode 100644 index 0000000..4d7c01a --- /dev/null +++ b/src/core/common/Whitespace.cpp @@ -0,0 +1,38 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Whitespace.hpp" +#include "WhitespaceHandler.hpp" + +namespace ousia { + +std::string Utils::trim(const std::string &s) +{ + std::pair bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +} + diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp new file mode 100644 index 0000000..1e9f36a --- /dev/null +++ b/src/core/common/Whitespace.hpp @@ -0,0 +1,120 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Whitespace.hpp + * + * Contains the WhitespaceMode enum used in various places, as well es functions + * for trimming and collapsing whitespaces. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HPP_ +#define _OUSIA_WHITESPACE_HPP_ + +#include +#include + +namespace ousia { + +/** + * Enum specifying the whitespace handling mode of the tokenizer and the + * parsers. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. + */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * Collection of functions for trimming or collapsing whitespace. + */ +class Whitespace { + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s); + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template + static std::pair trim(const T &s, Filter f) + { + size_t start = 0; + for (size_t i = 0; i < s.size(); i++) { + if (!f(s[i])) { + start = i; + break; + } + } + + size_t end = 0; + for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { + if (!f(s[i])) { + end = i + 1; + break; + } + } + + if (end < start) { + start = 0; + end = 0; + } + + return std::pair{start, end}; + } + + /** + * Collapses the whitespaces in the given string (trims the string and + * replaces all whitespace characters by a single one). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. + */ + static std::string collapse(const std::string &s); +}; + +} + +#endif /* _OUSIA_WHITESPACE_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp new file mode 100644 index 0000000..1935c24 --- /dev/null +++ b/src/core/common/WhitespaceHandler.hpp @@ -0,0 +1,223 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file WhitespaceHandler.hpp + * + * Contains the WhitespaceHandler classes which are used in multiple places to + * trim, compact or preserve whitespaces while at the same time maintaining the + * position information associated with the input strings. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ +#define _OUSIA_WHITESPACE_HANDLER_HPP_ + +#include +#include + +#include "WhitespaceHandler.hpp" + +namespace ousia { + +/** + * WhitespaceHandler is a based class that can be used to collect text on a + * character-by-character basis. Note that this class and its descendants are + * hoped to be inlined by the compiler (and used in conjunction with templates), + * thus they are fully defined inside this header. + */ +class WhitespaceHandler { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. + */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector textBuf; + + /** + * Constructor of the TextHandlerBase base class. Initializes the start and + * end position with zeros. + */ + WhitespaceHandler() : textStart(0), textEnd(0) {} + + /** + * Returns true if this whitespace handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() { return !textBuf.empty(); } + + /** + * Returns the content of the WhitespaceHandler as string. + */ + std::string toString() + { + return std::string(textBuf.data(), textBuf.size()); + } +}; + +/** + * The PreservingWhitespaceHandler class preserves all characters unmodified, + * including whitepace characters. + */ +class PreservingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/** + * The TrimmingTextHandler class trims all whitespace characters at the begin + * and the end of a text section but leaves all other characters unmodified, + * including whitepace characters. + */ +class TrimmingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. + */ + std::vector whitespaceBuf; + + /** + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the begin and end of the text. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/** + * The CollapsingTextHandler trims characters at the beginning and end of the + * text and reduced multiple whitespace characters to a single blank. + */ +class CollapsingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Flag set to true if a whitespace character was reached. + */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; + +/** + * Function that can be used to append the given buffer (e.g. a string or a + * vector) to the whitespace handler. + * + * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. + * @tparam Buffer is an iterable type. + * @param handler is the handler to which the characters of the Buffer should be + * appended. + * @param buf is the buffer from which the characters should be read. + * @param start is the start byte offset. Each character is counted as one byte. + */ +template +inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, + size_t start) +{ + for (auto elem : buf) { + handler.append(elem, start++); + } +} +} + +#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ + diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 917f45c..6b8a916 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -32,14 +32,6 @@ TEST(Utils, isIdentifier) ASSERT_FALSE(Utils::isIdentifier("invalid key")); } -TEST(Utils, trim) -{ - ASSERT_EQ("hello world", Utils::trim("\t hello world \n\r\t")); - ASSERT_EQ("hello world", Utils::trim("hello world \n\r\t")); - ASSERT_EQ("hello world", Utils::trim(" hello world")); - ASSERT_EQ("hello world", Utils::trim("hello world")); -} - TEST(Utils, split) { ASSERT_EQ(std::vector({"ab"}), Utils::split("ab", '.')); diff --git a/test/core/common/Whitespace.cpp b/test/core/common/Whitespace.cpp new file mode 100644 index 0000000..d6df8b7 --- /dev/null +++ b/test/core/common/Whitespace.cpp @@ -0,0 +1,41 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +namespace ousia { + +TEST(Whitespace, trim) +{ + ASSERT_EQ("hello world", Whitespace::trim("\t hello world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::trim("hello world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::trim(" hello world")); + ASSERT_EQ("hello world", Whitespace::trim("hello world")); +} + +TEST(Whitespace, collapse) +{ + ASSERT("hello world", Whitespace::collapse(" hello \n\t\r world \n\r\t")); + ASSERT("hello world", Whitespace::collapse("hello \n\t\r world \n\r\t")); + ASSERT("hello world", Whitespace::collapse("hello \n\t\r world")); + ASSERT("hello world", Whitespace::collapse("hello world")); +} +} + -- cgit v1.2.3 From fec6ac1d65aee3e4e5c948b0f7cbdec7ceb6cb45 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:46:46 +0100 Subject: Added testcase for Osdmx Parser --- test/formats/osdmx/OsdmxParserTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test') diff --git a/test/formats/osdmx/OsdmxParserTest.cpp b/test/formats/osdmx/OsdmxParserTest.cpp index af1ef56..c0fb50d 100644 --- a/test/formats/osdmx/OsdmxParserTest.cpp +++ b/test/formats/osdmx/OsdmxParserTest.cpp @@ -29,7 +29,7 @@ #include #include -#include +#include namespace ousia { -- cgit v1.2.3 From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:11 +0100 Subject: Moved DynamicTokenizer and TokenTrie to parser/utils --- src/core/parser/utils/TokenTrie.cpp | 119 +++++++++ src/core/parser/utils/TokenTrie.hpp | 150 +++++++++++ src/core/parser/utils/Tokenizer.cpp | 381 ++++++++++++++++++++++++++ src/core/parser/utils/Tokenizer.hpp | 231 ++++++++++++++++ src/formats/osdm/DynamicTokenizer.cpp | 381 -------------------------- src/formats/osdm/DynamicTokenizer.hpp | 231 ---------------- src/formats/osdm/TokenTrie.cpp | 119 --------- src/formats/osdm/TokenTrie.hpp | 150 ----------- test/core/parser/utils/TokenTrieTest.cpp | 92 +++++++ test/core/parser/utils/TokenizerTest.cpp | 415 +++++++++++++++++++++++++++++ test/formats/osdm/DynamicTokenizerTest.cpp | 415 ----------------------------- test/formats/osdm/TokenTrieTest.cpp | 92 ------- 12 files changed, 1388 insertions(+), 1388 deletions(-) create mode 100644 src/core/parser/utils/TokenTrie.cpp create mode 100644 src/core/parser/utils/TokenTrie.hpp create mode 100644 src/core/parser/utils/Tokenizer.cpp create mode 100644 src/core/parser/utils/Tokenizer.hpp delete mode 100644 src/formats/osdm/DynamicTokenizer.cpp delete mode 100644 src/formats/osdm/DynamicTokenizer.hpp delete mode 100644 src/formats/osdm/TokenTrie.cpp delete mode 100644 src/formats/osdm/TokenTrie.hpp create mode 100644 test/core/parser/utils/TokenTrieTest.cpp create mode 100644 test/core/parser/utils/TokenizerTest.cpp delete mode 100644 test/formats/osdm/DynamicTokenizerTest.cpp delete mode 100644 test/formats/osdm/TokenTrieTest.cpp (limited to 'test') diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/core/parser/utils/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class DynamicTokenTree */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/core/parser/utils/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include +#include +#include +#include + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) + * \endcode + * + * Where the number indicates the corresponding token descriptor identifier. + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp new file mode 100644 index 0000000..1fac25a --- /dev/null +++ b/src/core/parser/utils/Tokenizer.cpp @@ -0,0 +1,381 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include +#include +#include +#include + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the DynamicToken instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector &lookups, TokenMatch &match, + const std::vector &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/** + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class DynamicTokenizer */ + +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector lookups; + std::vector nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +} + diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp new file mode 100644 index 0000000..3e5aeb3 --- /dev/null +++ b/src/core/parser/utils/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include +#include +#include + +#include +#include + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer. + */ +class DynamicTokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template + bool next(CharReader &reader, DynamicToken &token); + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, DynamicToken &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, DynamicToken &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp deleted file mode 100644 index 1fac25a..0000000 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include -#include - -#include -#include -#include -#include - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { - /** - * Token that was matched. - */ - DynamicToken token; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - - /** - * Constructor of the TokenMatch class. - */ - TokenMatch() : textLength(0), textEnd(0) {} - - /** - * Returns true if this TokenMatch instance actually represents a match. - */ - bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: - /** - * Current node within the token trie. - */ - TokenTrie::Node const *node; - - /** - * Start offset within the source file. - */ - size_t start; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - -public: - /** - * Constructor of the TokenLookup class. - * - * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. - */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) - { - } - - /** - * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). - * - * @param c is the character that should be appended to the current prefix. - * @param lookups is a list to which new TokeLookup instances are added -- - * which could potentially be expanded in the next iteration. - * @param match is the DynamicToken instance to which the matching token - * should be written. - * @param tokens is a reference at the internal token list of the - * DynamicTokenizer. - * @param end is the end byte offset of the current character. - * @param sourceId is the source if of this file. - */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) - { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node - auto it = node->children.find(c); - if (it == node->children.end()) { - return; - } - - // Check whether the new node represents a complete token a whether it - // is longer than the current token. If yes, replace the current token. - node = it->second.get(); - if (node->type != EmptyToken) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - DynamicToken{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } - } - - // If this state can possibly be advanced, store it in the states list. - if (!node->children.empty()) { - lookups.emplace_back(*this); - } - } -}; - -/** - * Transforms the given token into a text token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.type = TextToken; -} -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ - // If we're in the read mode, reset the char reader peek position to the - // current read position - if (read) { - reader.resetPeek(); - } - - // Prepare the lookups in the token trie - const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; - std::vector lookups; - std::vector nextLookups; - - // Instantiate the text handler - TextHandler textHandler; - - // Peek characters from the reader and try to advance the current token tree - // cursor - char c; - size_t charStart = reader.getPeekOffset(); - const SourceId sourceId = reader.getSourceId(); - while (reader.peek(c)) { - const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; - - // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); - } - - // Try to advance all other lookups with the new character - for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); - } - - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { - break; - } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); - } - - // Swap the lookups and the nextLookups list - lookups = std::move(nextLookups); - nextLookups.clear(); - - // Advance the offset - charStart = charEnd; - } - - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildTextToken(textHandler, match, sourceId); - } - - // Move the read/peek cursor to the end of the token, abort if an error - // happens while doing so - if (match.hasMatch()) { - // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { - throw OusiaException{"Token end position offset out of range"}; - } - - // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); - if (read) { - reader.seek(end); - } else { - reader.seekPeekCursor(end); - } - token = match.token; - } else { - token = DynamicToken{}; - } - return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -TokenTypeId DynamicTokenizer::registerToken(const std::string &token) -{ - // Abort if an empty token should be registered - if (token.empty()) { - return EmptyToken; - } - - // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; - type = i; - break; - } - } - - // No existing slot was found, add a new one -- make sure we do not - // override the special token type handles - if (type == EmptyToken) { - type = tokens.size(); - if (type == TextToken || type == EmptyToken) { - throw OusiaException{"Token type ids depleted!"}; - } - tokens.emplace_back(token); - } - nextTokenTypeId = type + 1; - - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list - if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return EmptyToken; - } - return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ - // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return true; - } - return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ - whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member - function */ -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -} - diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp deleted file mode 100644 index 3e5aeb3..0000000 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ /dev/null @@ -1,231 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include -#include -#include - -#include -#include - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - DynamicToken() : type(EmptyToken) {} - - /** - * Constructor of the DynamicToken struct. - * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - DynamicToken(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the DynamicToken struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - DynamicToken(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: - /** - * Internally used token trie. This object holds all registered tokens. - */ - TokenTrie trie; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - WhitespaceMode whitespaceMode; - - /** - * Vector containing all registered token types. - */ - std::vector tokens; - - /** - * Next index in the tokens list where to search for a new token id. - */ - size_t nextTokenTypeId; - - /** - * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. - * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is the token structure into which the token information - * should be written. - * @return false if the end of the stream has been reached, true otherwise. - */ - template - bool next(CharReader &reader, DynamicToken &token); - -public: - /** - * Constructor of the DynamicTokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. - */ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - - /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. - * - * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if - * an error occured. - */ - TokenTypeId registerToken(const std::string &token); - - /** - * Unregisters the token belonging to the given TokenTypeId. - * - * @param type is the token type that should be unregistered. The - *TokenTypeId - * must have been returned by registerToken. - * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). - */ - bool unregisterToken(TokenTypeId type); - - /** - * Returns the token that was registered under the given TokenTypeId id or - *an - * empty string if an invalid TokenTypeId id is given. - * - * @param type is the TokenTypeId id for which the corresponding token - *string - * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenTypeId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. - */ - WhitespaceMode getWhitespaceMode(); - - /** - * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool read(CharReader &reader, DynamicToken &token); - - /** - * The peek method does not advance the read position of the char reader, - * but reads the next token from the current char reader peek position. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/formats/osdm/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::make_shared()).first; - } - node = it->second.get(); - } - - // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { - return false; - } - - // Otherwise just set the type to the given type. - node->type = type; - return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another type - node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { - return false; - } - - // If the target node has children, we cannot delete the subtree. Set the - // type to EmptyToken instead - if (!node->children.empty()) { - node->type = EmptyToken; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return EmptyToken; - } - node = it->second.get(); - } - return node->type; -} -} - diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/formats/osdm/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include -#include -#include -#include - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. - */ -constexpr TokenTypeId EmptyToken = std::numeric_limits::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - * ~ (0) - * / \ - * a (2) b (0) - * | | - * a (0) a (0) - * | | - * b (1) c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenTypeId type; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param type is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, TokenTypeId type) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. - * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - TokenTypeId hasToken(const std::string &token) const noexcept; - - /** - * Returns a reference at the root node to be used for traversing the token - * tree. - * - * @return a reference at the root node. - */ - const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ - diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp new file mode 100644 index 0000000..aacd6c0 --- /dev/null +++ b/test/core/parser/utils/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); +} +} + diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp new file mode 100644 index 0000000..c1f8785 --- /dev/null +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -0,0 +1,415 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { + +TEST(DynamicTokenizer, tokenRegistration) +{ + DynamicTokenizer tokenizer; + + ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); + + ASSERT_EQ(0U, tokenizer.registerToken("a")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); + ASSERT_EQ("a", tokenizer.getTokenString(0U)); + + ASSERT_EQ(1U, tokenizer.registerToken("b")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); + ASSERT_EQ("b", tokenizer.getTokenString(1U)); + + ASSERT_EQ(2U, tokenizer.registerToken("c")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); + ASSERT_EQ("c", tokenizer.getTokenString(2U)); + + ASSERT_TRUE(tokenizer.unregisterToken(1U)); + ASSERT_FALSE(tokenizer.unregisterToken(1U)); + ASSERT_EQ("", tokenizer.getTokenString(1U)); + + ASSERT_EQ(1U, tokenizer.registerToken("d")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); + ASSERT_EQ("d", tokenizer.getTokenString(1U)); +} + +TEST(DynamicTokenizer, textTokenPreserveWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(36U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, textTokenTrimWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, textTokenCollapseWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, simpleReadToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ(':', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ('t', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + + char c; + ASSERT_FALSE(reader.peek(c)); + } +} + +TEST(DynamicTokenizer, simplePeekToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(5U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(6U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(11U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } +} + +TEST(DynamicTokenizer, ambiguousTokens) +{ + CharReader reader{"abc"}; + DynamicTokenizer tokenizer; + + TokenTypeId t1 = tokenizer.registerToken("abd"); + TokenTypeId t2 = tokenizer.registerToken("bc"); + + ASSERT_EQ(0U, t1); + ASSERT_EQ(1U, t2); + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("a", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(t2, token.type); + ASSERT_EQ("bc", token.content); + + loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(3U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); +} + +TEST(DynamicTokenizer, commentTestWhitespacePreserve) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test ", SourceLocation{0, 5, 10}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test", SourceLocation{0, 5, 9}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +} + diff --git a/test/formats/osdm/DynamicTokenizerTest.cpp b/test/formats/osdm/DynamicTokenizerTest.cpp deleted file mode 100644 index c1f8785..0000000 --- a/test/formats/osdm/DynamicTokenizerTest.cpp +++ /dev/null @@ -1,415 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include - -namespace ousia { - -TEST(DynamicTokenizer, tokenRegistration) -{ - DynamicTokenizer tokenizer; - - ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); - - ASSERT_EQ(0U, tokenizer.registerToken("a")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); - - ASSERT_EQ(1U, tokenizer.registerToken("b")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); - - ASSERT_EQ(2U, tokenizer.registerToken("c")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); - - ASSERT_TRUE(tokenizer.unregisterToken(1U)); - ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); - - ASSERT_EQ(1U, tokenizer.registerToken("d")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); -} - -TEST(DynamicTokenizer, textTokenPreserveWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenTrimWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenCollapseWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, simpleReadToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ(':', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ('t', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - - char c; - ASSERT_FALSE(reader.peek(c)); - } -} - -TEST(DynamicTokenizer, simplePeekToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(5U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(6U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(11U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } -} - -TEST(DynamicTokenizer, ambiguousTokens) -{ - CharReader reader{"abc"}; - DynamicTokenizer tokenizer; - - TokenTypeId t1 = tokenizer.registerToken("abd"); - TokenTypeId t2 = tokenizer.registerToken("bc"); - - ASSERT_EQ(0U, t1); - ASSERT_EQ(1U, t2); - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("a", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(t2, token.type); - ASSERT_EQ("bc", token.content); - - loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(3U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); -} - -TEST(DynamicTokenizer, commentTestWhitespacePreserve) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test ", SourceLocation{0, 5, 10}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -} - diff --git a/test/formats/osdm/TokenTrieTest.cpp b/test/formats/osdm/TokenTrieTest.cpp deleted file mode 100644 index aacd6c0..0000000 --- a/test/formats/osdm/TokenTrieTest.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -static const TokenTypeId t1 = 0; -static const TokenTypeId t2 = 1; -static const TokenTypeId t3 = 2; -static const TokenTypeId t4 = 3; - -TEST(TokenTrie, registerToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_TRUE(tree.registerToken("hello", t4)); - - ASSERT_FALSE(tree.registerToken("", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - ASSERT_FALSE(tree.registerToken("b", t4)); - ASSERT_FALSE(tree.registerToken("hello", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - ASSERT_EQ(t4, tree.hasToken("hello")); - ASSERT_EQ(EmptyToken, tree.hasToken("")); - ASSERT_EQ(EmptyToken, tree.hasToken("abc")); -} - -TEST(TokenTrie, unregisterToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_FALSE(tree.registerToken("b", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(EmptyToken, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); -} -} - -- cgit v1.2.3 From 919552bad0f3f4db20419d3d3771c724c2ab997f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:25 +0100 Subject: Removed Whitespace file again --- src/core/common/Whitespace.cpp | 38 -------------------------------------- test/core/common/Whitespace.cpp | 41 ----------------------------------------- 2 files changed, 79 deletions(-) delete mode 100644 src/core/common/Whitespace.cpp delete mode 100644 test/core/common/Whitespace.cpp (limited to 'test') diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp deleted file mode 100644 index 4d7c01a..0000000 --- a/src/core/common/Whitespace.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "Whitespace.hpp" -#include "WhitespaceHandler.hpp" - -namespace ousia { - -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - -std::string Utils::collapse(const std::string &s) -{ - CollapsingWhitespaceHandler h; - appendToWhitespaceHandler(h, s, 0); - return h.toString(); -} - -} - diff --git a/test/core/common/Whitespace.cpp b/test/core/common/Whitespace.cpp deleted file mode 100644 index d6df8b7..0000000 --- a/test/core/common/Whitespace.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -namespace ousia { - -TEST(Whitespace, trim) -{ - ASSERT_EQ("hello world", Whitespace::trim("\t hello world \n\r\t")); - ASSERT_EQ("hello world", Whitespace::trim("hello world \n\r\t")); - ASSERT_EQ("hello world", Whitespace::trim(" hello world")); - ASSERT_EQ("hello world", Whitespace::trim("hello world")); -} - -TEST(Whitespace, collapse) -{ - ASSERT("hello world", Whitespace::collapse(" hello \n\t\r world \n\r\t")); - ASSERT("hello world", Whitespace::collapse("hello \n\t\r world \n\r\t")); - ASSERT("hello world", Whitespace::collapse("hello \n\t\r world")); - ASSERT("hello world", Whitespace::collapse("hello world")); -} -} - -- cgit v1.2.3 From 295783320ea3855a14123f9cea163f8f5f689e07 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:50:11 +0100 Subject: Moved some of the whitespace functionality back to Utils --- src/core/common/Utils.cpp | 25 ++++++++++++ src/core/common/Utils.hpp | 72 +++++++++++++++++++++++++++++++++++ src/core/common/Whitespace.hpp | 62 +----------------------------- src/core/common/WhitespaceHandler.hpp | 7 ++-- test/core/common/UtilsTest.cpp | 17 +++++++++ 5 files changed, 119 insertions(+), 64 deletions(-) (limited to 'test') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 4005143..3739c61 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -21,6 +21,7 @@ #include #include "Utils.hpp" +#include "WhitespaceHandler.hpp" namespace ousia { @@ -87,5 +88,29 @@ std::string Utils::extractFileExtension(const std::string &filename) } return std::string{}; } + +std::string Utils::trim(const std::string &s) +{ + std::pair bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +bool Utils::startsWith(const std::string &s, const std::string &prefix) +{ + return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; +} + +bool Utils::endsWith(const std::string &s, const std::string &suffix) +{ + return suffix.size() <= s.size() && + s.substr(s.size() - suffix.size(), suffix.size()) == suffix; +} } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index af7a773..16a9136 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -99,6 +99,60 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s); + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template + static std::pair trim(const T &s, Filter f) + { + size_t start = 0; + for (size_t i = 0; i < s.size(); i++) { + if (!f(s[i])) { + start = i; + break; + } + } + + size_t end = 0; + for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { + if (!f(s[i])) { + end = i + 1; + break; + } + } + + if (end < start) { + start = 0; + end = 0; + } + + return std::pair{start, end}; + } + + /** + * Collapses the whitespaces in the given string (trims the string and + * replaces all whitespace characters by a single one). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. + */ + static std::string collapse(const std::string &s); + /** * Turns the elements of a collection into a string separated by the * given delimiter. @@ -159,6 +213,24 @@ public: */ static std::string extractFileExtension(const std::string &filename); + /** + * Checks whether the given string starts with the given prefix. + * + * @param s is the string. + * @param prefix is the string which should be checked for being a prefix of + * s. + */ + static bool startsWith(const std::string &s, const std::string &prefix); + + /** + * Checks whether the given string ends with the given suffix. + * + * @param s is the string. + * @param suffix is the string which should be checked for being a suffix of + * s. + */ + static bool endsWith(const std::string &s, const std::string &suffix); + /** * Hash functional to be used for enum classes. * See http://stackoverflow.com/a/24847480/2188211 diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp index 1e9f36a..72a2291 100644 --- a/src/core/common/Whitespace.hpp +++ b/src/core/common/Whitespace.hpp @@ -19,8 +19,7 @@ /** * @file Whitespace.hpp * - * Contains the WhitespaceMode enum used in various places, as well es functions - * for trimming and collapsing whitespaces. + * Contains the WhitespaceMode enum used in various places. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -55,65 +54,6 @@ enum class WhitespaceMode { COLLAPSE }; -/** - * Collection of functions for trimming or collapsing whitespace. - */ -class Whitespace { - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** - * Trims the given string or vector of chars by returning the start and end - * index. - * - * @param s is the container that should be trimmed. - * @param f is a function that returns true for values that should be - * removed. - * @return start and end index. Note that "end" points at the character - * beyond the end, thus "end" minus "start" - */ - template - static std::pair trim(const T &s, Filter f) - { - size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { - if (!f(s[i])) { - start = i; - break; - } - } - - size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { - end = i + 1; - break; - } - } - - if (end < start) { - start = 0; - end = 0; - } - - return std::pair{start, end}; - } - - /** - * Collapses the whitespaces in the given string (trims the string and - * replaces all whitespace characters by a single one). - * - * @param s is the string in which the whitespace should be collapsed. - * @return a copy of s with collapsed whitespace. - */ - static std::string collapse(const std::string &s); -}; - } #endif /* _OUSIA_WHITESPACE_HPP_ */ diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp index 1935c24..79e0518 100644 --- a/src/core/common/WhitespaceHandler.hpp +++ b/src/core/common/WhitespaceHandler.hpp @@ -32,7 +32,7 @@ #include #include -#include "WhitespaceHandler.hpp" +#include "Utils.hpp" namespace ousia { @@ -76,7 +76,7 @@ public: /** * Returns the content of the WhitespaceHandler as string. */ - std::string toString() + std::string toString() const { return std::string(textBuf.data(), textBuf.size()); } @@ -214,7 +214,8 @@ inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, size_t start) { for (auto elem : buf) { - handler.append(elem, start++); + handler.append(elem, start, start + 1); + start++; } } } diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 6b8a916..a4bf4b2 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -65,5 +65,22 @@ TEST(Utils, extractFileExtension) ASSERT_EQ("ext", Utils::extractFileExtension("foo.bar/test.EXT")); } +TEST(Utils, startsWith) +{ + ASSERT_TRUE(Utils::startsWith("foobar", "foo")); + ASSERT_TRUE(Utils::startsWith("foo", "foo")); + ASSERT_FALSE(Utils::startsWith("foo", "foobar")); + ASSERT_FALSE(Utils::startsWith("foobar", "bar")); + ASSERT_TRUE(Utils::startsWith("foo", "")); +} + +TEST(Utils, endsWith) +{ + ASSERT_FALSE(Utils::endsWith("foobar", "foo")); + ASSERT_TRUE(Utils::endsWith("foo", "foo")); + ASSERT_FALSE(Utils::endsWith("foo", "foobar")); + ASSERT_TRUE(Utils::endsWith("foobar", "bar")); + ASSERT_TRUE(Utils::endsWith("foo", "")); +} } -- cgit v1.2.3 From 98f43328e566b3a77b75808892246a295adb0eb0 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:59:43 +0100 Subject: Renamed osdm to osml and osdmx to osxml --- src/formats/osml/OsmlStreamParser.cpp | 32 +- src/formats/osml/OsmlStreamParser.hpp | 22 +- src/formats/osxml/OsxmlEventParser.cpp | 524 ++++++++++++++++ src/formats/osxml/OsxmlEventParser.hpp | 205 ++++++ src/formats/osxml/OsxmlParser.cpp | 337 ---------- test/formats/osdm/OsdmStreamParserTest.cpp | 973 ----------------------------- test/formats/osdmx/OsdmxParserTest.cpp | 314 ---------- 7 files changed, 756 insertions(+), 1651 deletions(-) create mode 100644 src/formats/osxml/OsxmlEventParser.cpp create mode 100644 src/formats/osxml/OsxmlEventParser.hpp delete mode 100644 test/formats/osdm/OsdmStreamParserTest.cpp delete mode 100644 test/formats/osdmx/OsdmxParserTest.cpp (limited to 'test') diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6a55f12..6b00eef 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -21,7 +21,7 @@ #include #include -#include "OsdmStreamParser.hpp" +#include "OsmlStreamParser.hpp" namespace ousia { @@ -160,14 +160,14 @@ public: } }; -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true}); } -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; bool hasCharSiceNSSep = false; @@ -210,7 +210,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) return res; } -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); @@ -251,7 +251,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() return State::COMMAND; } -static bool checkStillInField(const OsdmStreamParser::Command &cmd, +static bool checkStillInField(const OsmlStreamParser::Command &cmd, const Variant &endName, Logger &logger) { if (cmd.inField && !cmd.inRangeField) { @@ -264,7 +264,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd, return false; } -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +OsmlStreamParser::State OsmlStreamParser::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { @@ -327,7 +327,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand() return cmd.inRangeField ? State::FIELD_END : State::NONE; } -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; @@ -353,7 +353,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) return commandArguments; } -void OsdmStreamParser::pushCommand(Variant commandName, +void OsmlStreamParser::pushCommand(Variant commandName, Variant commandArguments, bool hasRange) { // Store the location on the stack @@ -368,7 +368,7 @@ void OsdmStreamParser::pushCommand(Variant commandName, hasRange, false, false}); } -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); @@ -416,7 +416,7 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) return State::COMMAND; } -void OsdmStreamParser::parseBlockComment() +void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; @@ -436,7 +436,7 @@ void OsdmStreamParser::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void OsdmStreamParser::parseLineComment() +void OsmlStreamParser::parseLineComment() { char c; while (reader.read(c)) { @@ -446,7 +446,7 @@ void OsdmStreamParser::parseLineComment() } } -bool OsdmStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData(DataHandler &handler) { if (!handler.isEmpty()) { data = handler.toVariant(reader.getSourceId()); @@ -457,7 +457,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler) return false; } -bool OsdmStreamParser::checkIssueFieldStart() +bool OsmlStreamParser::checkIssueFieldStart() { // Fetch the current command, and check whether we're currently inside a // field of this command @@ -482,7 +482,7 @@ bool OsdmStreamParser::checkIssueFieldStart() return false; } -OsdmStreamParser::State OsdmStreamParser::parse() +OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data DataHandler handler; @@ -627,12 +627,12 @@ OsdmStreamParser::State OsdmStreamParser::parse() return State::END; } -const Variant &OsdmStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() { return commands.top().name; } -const Variant &OsdmStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() { return commands.top().arguments; } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 84674c0..1508012 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -17,17 +17,17 @@ */ /** - * @file OsdmStreamParser.hpp + * @file OsmlStreamParser.hpp * - * Provides classes for low-level classes for reading the TeX-esque osdm + * Provides classes for low-level classes for reading the TeX-esque osml * format. The class provided here does not build any model objects and does not * implement the Parser interface. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ +#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ +#define _OUSIA_OSML_STREAM_PARSER_HPP_ #include @@ -42,7 +42,7 @@ class Logger; class DataHandler; /** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding @@ -52,10 +52,10 @@ class DataHandler; * fields, as this would lead to too many consecutive errors) a * LoggableException is thrown. */ -class OsdmStreamParser { +class OsmlStreamParser { public: /** - * Enum used to indicate which state the OsdmStreamParser class is in + * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ enum class State { @@ -291,14 +291,14 @@ private: public: /** - * Constructor of the OsdmStreamParser class. Attaches the new - * OsdmStreamParser to the given CharReader and Logger instances. + * Constructor of the OsmlStreamParser class. Attaches the new + * OsmlStreamParser to the given CharReader and Logger instances. * * @param reader is the reader instance from which incomming characters * should be read. * @param logger is the logger instance to which errors should be written. */ - OsdmStreamParser(CharReader &reader, Logger &logger); + OsmlStreamParser(CharReader &reader, Logger &logger); /** * Continues parsing. Returns one of the states defined in the State enum. @@ -346,5 +346,5 @@ public: }; } -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ +#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */ diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp new file mode 100644 index 0000000..2ef170e --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -0,0 +1,524 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "OsxmlEventParser.hpp" + +namespace ousia { + +/** + * Class containing data used by the internal functions. + */ +class OsxmlEventParserData { +public: + /** + * Contains the current depth of the parsing process. + */ + ssize_t depth; + + /** + * Set to a value larger or equal to zero if the parser is currently inside + * an annotation end tag -- the value represents the depth in which the + * tag was opened. + */ + ssize_t annotationEndTagDepth; + + /** + * Default constructor. + */ + OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {} + + /** + * Increments the depth. + */ + void incrDepth() { depth++; } + + /** + * Decrement the depth and reset the annotationEndTagDepth flag. + */ + void decrDepth() + { + if (depth > 0) { + depth--; + } + if (depth < annotationEndTagDepth) { + annotationEndTagDepth = -1; + } + } + + /** + * Returns true if we're currently inside an end tag. + */ + bool inAnnotationEndTag() { depth >= annotationEndTagDepth; } +}; + +namespace { +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class ScopedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS + * from the expat library. Throws a parser exception if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. + */ + ~ScopedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/** + * Enum used internally in the statemachine of the micro-xml argument parser. + */ +enum class XmlAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +/** + * Function used to reconstruct the location of the attributes of a XML tag in + * the source code. This is necessary, as the xml parser only returns an offset + * to the begining of a tag and not to the position of the individual arguments. + * + * @param reader is the char reader from which the character data should be + * read. + * @param offs is a byte offset in the xml file pointing at the "<" character of + * the tag. + * @return a map from attribute keys to the corresponding location (including + * range) of the atribute. Also contains the location of the tagname in the + * form of the virtual attribute "$tag". + */ +static std::map xmlReconstructAttributeOffsets( + CharReader &reader, size_t offs) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + if (!location.isValid() || offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... (and we + // can assume the XML is valid as it was already read by expat) + XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XmlAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + res.emplace("$tag", + SourceLocation{reader.getSourceId(), offs + 1, + readerFork.getOffset() - 1}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + case XmlAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XmlAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XmlAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XmlAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XmlAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XmlAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + state = XmlAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XmlAttributeState::IN_ATTR_NAME; + } + } + break; + case XmlAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, set the end + // location + auto it = res.find(attrName.str()); + if (it != res.end()) { + it->second.setEnd(readerFork.getOffset() - 1); + } + + // Reset the attribute name and restart the search + attrName.str(std::string{}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +/** + * Synchronizes the position of the xml parser with the default location of the + * logger instance. + * + * @param p is a pointer at the xml parser instance. + * @param len is the length of the string that should be refered to. + * @return the SourceLocation that has been set in the logger. + */ +static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the OsxmlEventParser instance + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // Fetch the current location in the XML file and set the default location + // in the logger + size_t offs = XML_GetCurrentByteIndex(p); + SourceLocation loc = + SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; + parser->getLogger().setDefaultLocation(location); + + // Return the fetched location + return loc; +} + +/** + * Prefix used to indicate the start of an annoation, + */ +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; + +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; + +/** + * Callback called by eXpat whenever a start handler is reached. + */ +static void xmlStartElementHandler(void *ref, const XML_Char *name, + const XML_Char **attrs) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map attributeOffsets = + xmlReconstructXMLAttributeOffsets(*userData->reader, + XML_GetCurrentByteIndex(p)); + + // Update the logger position + SourceLocation loc = xmlSyncLoggerPosition(p); + + // Fetch the location of the name + SourceLocation nameLoc = loc; + auto it = attributeOffsets.find("$tag"); + if (it != attributeOffsets.end()) { + nameLoc = it->second; + } + // Increment the current depth + parser->getData().incrDepth(); + + // Make sure we're currently not inside an annotation end tag -- this would + // be highly illegal! + if (parser->getData().inAnnotationEndTag()) { + logger.error("No tags allowed inside an annotation end tag", nameLoc); + return; + } + + // Assemble the arguments + Variant::mapType args; + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = attributeOffsets.find(key); + if (it != attributeOffsets.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + + // Set the overall location of the parsed element to the attribute + // location + value.second->setLocation(keyLoc); + + // Store the + if (!args.emplace(key, value.second).second) { + parser->getLogger().warning( + std::string("Attribute \"") + key + + "\" defined multiple times, only using first definition", + keyLoc); + } + } + + // Fetch the name of the tag, check for special tags + std::string nameStr(name); + if (nameStr == "ousia" && parser->getData().depth == 1) { + // We're in the top-level and the magic "ousia" tag is reached -- just + // ignore it and issue a warning for each argument that has been given + for (const auto &arg : args) { + parser->getLogger().warning( + std::string("Ignoring attribute \"") + arg.first + + std::string("\" for magic tag \"ousia\""), + arg.second); + } + } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size()); + nameVar.setLocation(nameLoc); + + // Issue the "annotationStart" event + parser->getEvents().annotationStart(nameVar, args); + } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + nameStr = nameStr.substr(ANNOTATION_END_PREFIX.size()); + + // Discard a potentially leading colon + if (!nameStr.empty() && nameStr[0] == ':') { + nameStr = nameStr.substr(1); + } + + // Assemble the variant containing the name and its location + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + + // Check whether a "name" attribute was given + Variant elementName; + for (const auto &arg : args) { + if (arg.first == "name") { + elementName = arg.second; + } else { + parser->getLogger().warning( + std::string("Ignoring attribute \"") + arg.first + + "\" in annotation end tag", + arg.second); + } + } + + // Set the annotationEndTagDepth to disallow any further tags to be + // opened inside the annotation end tag. + parser->getData().annotationEndTagDepth = parser->getData().depth; + + // Issue the "annotationEnd" event + parser->getEvents().annotationEnd(nameVar, args); + } else { + // Just issue a "commandStart" event in any other case + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + parser->getEvents().commandStart(nameVar, args); + } +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // Synchronize the position of the logger with teh position + xmlSyncLoggerPosition(parser); + + // Decrement the current depth + parser->getData().decrDepth(); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + return; + } + + // Abort if the special ousia tag ends here + if (nameStr == "ousia" && parser->getData().depth == 0) { + return; + } + + // Issue the "fieldEnd" event + parser->getEvents().fieldEnd(); +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // TODO +/* size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + }*/ +} +} + +/* Class OsxmlEventParser */ + +OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, + Logger &logger) + : reader(reader), + events(events), + logger(logger), + whitespaceMode(WhitespaceMode::COLLAPSE), + data(new OsxmlEventParserData()) +{ +} + +void OsxmlEventParser::parse(CharReader &reader) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Reset the depth + depth = 0; + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, this); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw OusiaException{"Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error as exception + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + throw LoggableException{ + "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, + xmlSyncLoggerPosition(p)}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} + +void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + this->whitespaceMode = whitespaceMode; +} + +CharReader &OsxmlEventParser::getCharReader() { return charReader; } + +Logger &OsxmlEventParser::getLogger() { return logger; } + +OsxmlEvents &OsxmlEventParser::getEvents() { return events; } + +OsxmlEventParserData &OsxmlEventParser::getData() { return *data; } +} + diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp new file mode 100644 index 0000000..5319ca6 --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -0,0 +1,205 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsxmlEventParser.hpp + * + * The OsxmlEventParser class is responsible for parsing an XML file and calling + * the corresponding event handler functions if an XML item is found. Event + * handling is performed using a listener interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OSXML_EVENT_PARSER_HPP_ +#define _OSXML_EVENT_PARSER_HPP_ + +#include +#include + +#include + +namespace ousia { + +// Forward declarations +class Logger; +class Variant; +class OsxmlEventParserData; + +/** + * Interface which defines the callback functions which are called by the + * OsxmlEventParser whenever an event occurs. + */ +class OsxmlEvents { +public: + /** + * Virtual destructor. + */ + virtual ~OsxmlEvents() {} + + /** + * Called whenever a command starts. Note that this implicitly always starts + * the default field of the command. + * + * @param name is a string variant containing name and location of the + * command. + * @param args is a map variant containing the arguments that were given + * to the command. + */ + virtual void commandStart(Variant name, Variant args) = 0; + + /** + * Called whenever an annotation starts. Note that this implicitly always + * starts the default field of the annotation. + * + * @param name is a string variant containing the name of the annotation + * class and the location of the annotation definition. + * @param args is a map variant containing the arguments that were given + * to the annotation definition. + */ + virtual void annotationStart(Variant name, Variant args); + + /** + * Called whenever the range of an annotation ends. The callee must + * disambiguate the actual annotation that is finished here. + * + * @param name is a string variant containing the name of the annotation + * class that should end here. May be empty (or nullptr), if no elementName + * has been specified at the end of the annotation. + * @param elementName is the name of the annotation element that should be + * ended here. May be empty (or nullptr), if no elementName has been + * specified at the end of the annotation. + */ + virtual void annotationEnd(Variant name, Variant elementName); + + /** + * Called whenever the default field which was implicitly started by + * commandStart or annotationStart ends. Note that this does not end the + * range of an annotation, but the default field of the annotation. To + * signal the end of the annotation this, the annotationEnd method will be + * invoked. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever data is found. Whitespace data is handled as specified + * and the data has been parsed to the specified variant type. This function + * is not called if the parsing failed, the parser prints an error message + * instead. + * + * @param data is the already parsed data that should be passed to the + * handler. + */ + virtual void data(Variant data) = 0; + +}; + +/** + * The OsxmlEventParser class is a wrapper around eXpat which implements the + * specialities of the osxml formats class (like annotation ranges). It notifies + * a specified event handler whenever a command, annotation or data has been + * reached. + */ +class OsxmlEventParser { +private: + /** + * Reference at the internal CharReader instance. + */ + CharReader &reader; + + /** + * Set of callback functions to be called whenever an event is triggered. + */ + OsxmlEvents &events; + + /** + * Reference at the Logger object to which error messages or warnings should + * be logged. + */ + Logger &logger; + + /** + * Current whitespace mode. + */ + WhitespaceMode whitespaceMode; + + /** + * Data to be used by the internal functions. + */ + std::unique_ptr data; + +public: + /** + * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents + * of which the callback functions are called. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param events is a refence at an instance of the OsxmlEvents class. All + * events are forwarded to this class. + * @param logger is the Logger instance to which log messages should be + * written. + */ + OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + + /** + * Performs the actual parsing. Reads the XML using eXpat and calles the + * callbacks in the event listener instance whenever something interesting + * happens. + */ + void parse(); + + /** + * Sets the whitespace handling mode. + * + * @param whitespaceMode defines how whitespace in the data should be + * handled. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Returns the internal CharReader reference. + * + * @return the CharReader reference. + */ + CharReader &getCharReader(); + + /** + * Returns the internal Logger reference. + * + * @return the internal Logger reference. + */ + Logger &getLogger(); + + /** + * Returns the internal OsxmlEvents reference. + * + * @return the internal OsxmlEvents reference. + */ + OsxmlEvents &getEvents(); + + /** + * Returns a reference at the internal data. + */ + OsxmlEventParserData &getData(); +}; + +} + +#endif /* _OSXML_EVENT_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c46d9de..4f6503c 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -1093,343 +1093,6 @@ static const std::multimap XmlStates{ {"include", &Include}}; } -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { - /** - * Containing the depth of the current XML file - */ - size_t depth; - - /** - * Reference at the ParserStack instance. - */ - ParserStack *stack; - - /** - * Reference at the CharReader instance. - */ - CharReader *reader; - - /** - * Constructor of the XMLUserData struct. - * - * @param stack is a pointer at the ParserStack instance. - * @param reader is a pointer at the CharReader instance. - */ - XMLUserData(ParserStack *stack, CharReader *reader) - : depth(0), stack(stack), reader(reader) - { - } -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: - /** - * Internal pointer to the XML_Parser instance. - */ - XML_Parser parser; - -public: - /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS - * from the expat library. Throws a parser exception if the XML parser - * cannot be initialized. - * - * @param encoding is the protocol-defined encoding passed to expat (or - * nullptr if expat should determine the encoding by itself). - */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) - { - parser = XML_ParserCreate(encoding); - if (!parser) { - throw LoggableException{ - "Internal error: Could not create expat XML parser!"}; - } - } - - /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. - */ - ~ScopedExpatXmlParser() - { - if (parser) { - XML_ParserFree(parser); - parser = nullptr; - } - } - - /** - * Returns the XML_Parser pointer. - */ - XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ - // Fetch the parser stack and the associated user data - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - // Fetch the current location in the XML file - size_t offs = XML_GetCurrentByteIndex(p); - - // Build the source location and update the default location of the - // current - // logger instance - SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; - stack->getContext().getLogger().setDefaultLocation(loc); - return loc; -} - -enum class XMLAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; - -static std::map reconstructXMLAttributeOffsets( - CharReader &reader, SourceLocation location) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - size_t offs = location.getStart(); - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... (and we - // can assume the XML is valid as it was already read by expat) - XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { - return res; - } - - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XMLAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::SEARCH_ATTR; - } - break; - case XMLAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XMLAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XMLAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XMLAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XMLAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - attrName.str(std::string{}); - state = XMLAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XMLAttributeState::IN_ATTR_NAME; - } - } - break; - case XMLAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, start anew - state = XMLAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, - const XML_Char **attrs) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - SourceLocation loc = syncLoggerPosition(parser); - - // Read the argument locations -- this is only a stupid and slow hack, - // but it is necessary, as expat doesn't give use the byte offset of the - // arguments. - std::map offs = - reconstructXMLAttributeOffsets(*userData->reader, loc); - - // Assemble the arguments - Variant::mapType args; - - const XML_Char **attr = attrs; - while (*attr) { - // Convert the C string to a std::string - const std::string key{*(attr++)}; - - // Search the location of the key - SourceLocation keyLoc; - auto it = offs.find(key); - if (it != offs.end()) { - keyLoc = it->second; - } - - // Parse the string, pass the location of the key - std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), - keyLoc.getStart()); - args.emplace(key, value.second); - } - - // Call the start function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->start(std::string(name), args, loc); - } - - // Increment the current depth - userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - syncLoggerPosition(parser); - - // Decrement the current depth - userData->depth--; - - // Call the end function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->end(); - } -} -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - } -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ - // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; - - // Create the parser stack instance, if we're starting on a non-empty scope, - // try to deduce the parser state - ParserStack stack(ctx, ParserStates::XmlStates); - if (!ctx.getScope().isEmpty()) { - if (!stack.deduceState()) { - return; - } - } - - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); - XML_SetUserData(&p, &data); - XML_UseParserAsHandlerArg(&p); - - // Set the callback functions - XML_SetStartElementHandler(&p, xmlStartElementHandler); - XML_SetEndElementHandler(&p, xmlEndElementHandler); - XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - - // Feed data into expat while there is data to process - constexpr size_t BUFFER_SIZE = 64 * 1024; - while (true) { - // Fetch a buffer from expat for the input data - char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); - if (!buf) { - throw LoggableException{ - "Internal error: XML parser out of memory!"}; - } - - // Read into the buffer - size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - - // Parse the data and handle any XML error - if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { - // Fetch the xml parser byte offset - size_t offs = XML_GetCurrentByteIndex(&p); - - // Throw a corresponding exception - XML_Error code = XML_GetErrorCode(&p); - std::string msg = std::string{XML_ErrorString(code)}; - throw LoggableException{"XML: " + msg, - SourceLocation{ctx.getSourceId(), offs}}; - } - - // Abort once there are no more bytes in the stream - if (bytesRead == 0) { - break; - } - } -} } diff --git a/test/formats/osdm/OsdmStreamParserTest.cpp b/test/formats/osdm/OsdmStreamParserTest.cpp deleted file mode 100644 index 46f4cf6..0000000 --- a/test/formats/osdm/OsdmStreamParserTest.cpp +++ /dev/null @@ -1,973 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -#include -#include - -#include - -namespace ousia { - -static TerminalLogger logger(std::cerr, true); - -TEST(OsdmStreamParser, empty) -{ - const char *testString = ""; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, oneCharacter) -{ - const char *testString = "a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); -} - -TEST(OsdmStreamParser, whitespaceElimination) -{ - const char *testString = " hello \t world "; - // 0123456 78901234 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); -} - -TEST(OsdmStreamParser, whitespaceEliminationWithLinebreak) -{ - const char *testString = " hello \n world "; - // 0123456 78901234 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, escapeWhitespace) -{ - const char *testString = " hello\\ \\ world "; - // 012345 67 89012345 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(15U, loc.getEnd()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -static void testEscapeSpecialCharacter(const std::string &c) -{ - CharReader charReader(std::string("\\") + c); - OsdmStreamParser reader(charReader, logger); - EXPECT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - EXPECT_EQ(0U, loc.getStart()); - EXPECT_EQ(1U + c.size(), loc.getEnd()); -} - -TEST(OsdmStreamParser, escapeSpecialCharacters) -{ - testEscapeSpecialCharacter("\\"); - testEscapeSpecialCharacter("{"); - testEscapeSpecialCharacter("}"); - testEscapeSpecialCharacter("<"); - testEscapeSpecialCharacter(">"); -} - -TEST(OsdmStreamParser, simpleSingleLineComment) -{ - const char *testString = "% This is a single line comment"; - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, singleLineComment) -{ - const char *testString = "a% This is a single line comment\nb"; - // 01234567890123456789012345678901 23 - // 0 1 2 3 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(33U, loc.getStart()); - ASSERT_EQ(34U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, multilineComment) -{ - const char *testString = "a%{ This is a\n\n multiline line comment}%b"; - // 0123456789012 3 456789012345678901234567890 - // 0 1 2 3 4 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, nestedMultilineComment) -{ - const char *testString = "a%{%{Another\n\n}%multiline line comment}%b"; - // 0123456789012 3 456789012345678901234567890 - // 0 1 2 3 4 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommand) -{ - const char *testString = "\\test"; - // 0 12345 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - ASSERT_EQ(0U, reader.getCommandArguments().asMap().size()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithName) -{ - const char *testString = "\\test#bla"; - // 0 12345678 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(1U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); - - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithArguments) -{ - const char *testString = "\\test[a=1,b=2,c=\"test\"]"; - // 0 123456789012345 678901 2 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(3U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); - - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(8U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(22U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithArgumentsAndName) -{ - const char *testString = "\\test#bla[a=1,b=2,c=\"test\"]"; - // 0 1234567890123456789 01234 56 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(4U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); - - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(17U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(20U, loc.getStart()); - ASSERT_EQ(26U, loc.getEnd()); - - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -static void assertCommand(OsdmStreamParser &reader, const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - EXPECT_EQ(name, reader.getCommandName().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertCommand(OsdmStreamParser &reader, const std::string &name, - const Variant::mapType &args, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - assertCommand(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); -} - -static void assertData(OsdmStreamParser &reader, const std::string &data, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldStart(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::FIELD_START, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldEnd(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::FIELD_END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertEnd(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -TEST(OsdmStreamParser, fields) -{ - const char *testString = "\\test{a}{b}{c}"; - // 01234567890123 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); - - assertFieldStart(reader, 11, 12); - assertData(reader, "c", 12, 13); - assertFieldEnd(reader, 13, 14); - assertEnd(reader, 14, 14); -} - -TEST(OsdmStreamParser, dataOutsideField) -{ - const char *testString = "\\test{a}{b} c"; - // 0123456789012 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); - - assertData(reader, "c", 12, 13); - assertEnd(reader, 13, 13); -} - -TEST(OsdmStreamParser, nestedCommand) -{ - const char *testString = "\\test{a}{\\test2{b} c} d"; - // 012345678 90123456789012 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - { - assertCommand(reader, "test2", 9, 15); - assertFieldStart(reader, 15, 16); - assertData(reader, "b", 16, 17); - assertFieldEnd(reader, 17, 18); - } - assertData(reader, "c", 19, 20); - assertFieldEnd(reader, 20, 21); - assertData(reader, "d", 22, 23); - assertEnd(reader, 23, 23); -} - -TEST(OsdmStreamParser, nestedCommandImmediateEnd) -{ - const char *testString = "\\test{\\test2{b}} d"; - // 012345 678901234567 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - { - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertData(reader, "b", 13, 14); - assertFieldEnd(reader, 14, 15); - } - assertFieldEnd(reader, 15, 16); - assertData(reader, "d", 17, 18); - assertEnd(reader, 18, 18); -} - -TEST(OsdmStreamParser, nestedCommandNoData) -{ - const char *testString = "\\test{\\test2}"; - // 012345 6789012 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldEnd(reader, 12, 13); - assertEnd(reader, 13, 13); -} - -TEST(OsdmStreamParser, multipleCommands) -{ - const char *testString = "\\a \\b \\c \\d"; - // 012 345 678 90 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 0, 2); - assertCommand(reader, "b", 3, 5); - assertCommand(reader, "c", 6, 8); - assertCommand(reader, "d", 9, 11); - assertEnd(reader, 11, 11); -} - -TEST(OsdmStreamParser, fieldsWithSpaces) -{ - const char *testString = "\\a {\\b \\c} \n\n {\\d}"; - // 0123 456 789012 3 456 789 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, 3, 4); - assertCommand(reader, "b", 4, 6); - assertCommand(reader, "c", 7, 9); - assertFieldEnd(reader, 9, 10); - assertFieldStart(reader, 16, 17); - assertCommand(reader, "d", 17, 19); - assertFieldEnd(reader, 19, 20); - assertEnd(reader, 20, 20); -} - -TEST(OsdmStreamParser, errorNoFieldToStart) -{ - const char *testString = "\\a b {"; - // 012345 - // 0 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldToEnd) -{ - const char *testString = "\\a b }"; - // 012345 - // 0 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldEndNested) -{ - const char *testString = "\\test{\\test2{}}}"; - // 012345 6789012345 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 16, 16); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldEndNestedData) -{ - const char *testString = "\\test{\\test2{}}a}"; - // 012345 67890123456 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - assertData(reader, "a", 15, 16); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 17, 17); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, beginEnd) -{ - const char *testString = "\\begin{book}\\end{book}"; - // 012345678901 2345678901 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 17, 21); - assertEnd(reader, 22, 22); -} - -TEST(OsdmStreamParser, beginEndWithName) -{ - const char *testString = "\\begin{book#a}\\end{book}"; - // 01234567890123 4567890123 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", {{"name", "a"}}, 7, 11); - assertFieldStart(reader, 14, 15); - assertFieldEnd(reader, 19, 23); - assertEnd(reader, 24, 24); -} - -TEST(OsdmStreamParser, beginEndWithNameAndArgs) -{ - const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; - // 0123456789012345678901234 56789 01 2345678901 - // 0 1 2 3 4 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); - assertFieldEnd(reader, 37, 41); - assertEnd(reader, 42, 42); -} - -TEST(OsdmStreamParser, beginEndWithNameAndArgsMultipleFields) -{ - const char *testString = - "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; - // 0123456789012345678901234 56789 01234 567890123 45678901 2345678901 - // 0 1 2 3 4 5 6 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); - assertData(reader, "a", 33, 34); - assertCommand(reader, "test", Variant::mapType{}, 35, 40); - assertFieldEnd(reader, 40, 41); - assertFieldStart(reader, 41, 42); - assertData(reader, "b", 42, 43); - assertCommand(reader, "test", Variant::mapType{}, 44, 49); - assertFieldStart(reader, 49, 50); - assertFieldEnd(reader, 50, 51); - assertFieldEnd(reader, 51, 52); - assertFieldStart(reader, 52, 53); - assertFieldEnd(reader, 57, 61); - assertEnd(reader, 62, 62); -} - -TEST(OsdmStreamParser, beginEndWithData) -{ - const char *testString = "\\begin{book}a\\end{book}"; - // 0123456789012 3456789012 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertData(reader, "a", 12, 13); - assertFieldEnd(reader, 18, 22); - assertEnd(reader, 23, 23); -} - -TEST(OsdmStreamParser, beginEndWithCommand) -{ - const char *testString = "\\begin{book}\\a{test}\\end{book}"; - // 012345678901 23456789 0123456789 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, 14, 15); - assertData(reader, "test", 15, 19); - assertFieldEnd(reader, 19, 20); - assertFieldEnd(reader, 25, 29); - assertEnd(reader, 30, 30); -} - -TEST(OsdmStreamParser, errorBeginNoBraceOpen) -{ - const char *testString = "\\begin a"; - // 01234567 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 7, 8); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoIdentifier) -{ - const char *testString = "\\begin{!"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoBraceClose) -{ - const char *testString = "\\begin{a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoName) -{ - const char *testString = "\\begin{a#}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a"); - ASSERT_TRUE(logger.hasError()); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBraceOpen) -{ - const char *testString = "\\end a"; - // 012345 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 5, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoIdentifier) -{ - const char *testString = "\\end{!"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBraceClose) -{ - const char *testString = "\\end{a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBegin) -{ - const char *testString = "\\end{a}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginEndMismatch) -{ - const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; - // 0123456789 012345678901234 5678901 - // 0 1 2 3 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, 10, 11); - assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, 20, 24); - assertData(reader, "test", 20, 24); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, commandWithNSSep) -{ - const char *testString = "\\test1:test2"; - // 012345678901 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test1:test2", 0, 12); - assertEnd(reader, 12, 12); -} - -TEST(OsdmStreamParser, beginEndWithNSSep) -{ - const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; - // 0123456789012345678 90123456789012345 - // 0 1 2 3 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, 19, 20); - assertFieldEnd(reader, 24, 35); - assertEnd(reader, 36, 36); -} - -TEST(OsdmStreamParser, errorBeginNSSep) -{ - const char *testString = "\\begin:test{blub}\\end{blub}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "blub"); - ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader); - assertFieldEnd(reader); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorEndNSSep) -{ - const char *testString = "\\begin{blub}\\end:test{blub}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "blub"); - assertFieldStart(reader); - ASSERT_FALSE(logger.hasError()); - assertFieldEnd(reader); - ASSERT_TRUE(logger.hasError()); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorEmptyNs) -{ - const char *testString = "\\test:"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, ":"); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorRepeatedNs) -{ - const char *testString = "\\test::"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, "::"); - assertEnd(reader); -} -} - diff --git a/test/formats/osdmx/OsdmxParserTest.cpp b/test/formats/osdmx/OsdmxParserTest.cpp deleted file mode 100644 index c0fb50d..0000000 --- a/test/formats/osdmx/OsdmxParserTest.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace ousia { - -namespace RttiTypes { -extern const Rtti Document; -extern const Rtti Domain; -extern const Rtti Typesystem; -} - -struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; - FileLocator fileLocator; - - XmlStandaloneEnvironment(ConcreteLogger &logger) - : StandaloneEnvironment(logger) - { - fileLocator.addDefaultSearchPaths(); - fileLocator.addUnittestSearchPath("xmlparser"); - - registry.registerDefaultExtensions(); - registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); - registry.registerResourceLocator(&fileLocator); - } -}; - -static TerminalLogger logger(std::cerr, true); - -TEST(XmlParser, mismatchedTag) -{ - XmlStandaloneEnvironment env(logger); - env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); - ASSERT_TRUE(logger.hasError()); -} - -TEST(XmlParser, generic) -{ - XmlStandaloneEnvironment env(logger); - env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); -#ifdef MANAGER_GRAPHVIZ_EXPORT - env.manager.exportGraphviz("xmlDocument.dot"); -#endif -} - -static void checkAttributes(Handle expected, - Handle desc) -{ - if (expected == nullptr) { - ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); - } else { - ASSERT_EQ(expected->getName(), - desc->getAttributesDescriptor()->getName()); - auto &attrs_exp = expected->getAttributes(); - auto &attrs = desc->getAttributesDescriptor()->getAttributes(); - ASSERT_EQ(attrs_exp.size(), attrs.size()); - for (size_t i = 0; i < attrs_exp.size(); i++) { - ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); - ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); - ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); - ASSERT_EQ(attrs_exp[i]->getDefaultValue(), - attrs[i]->getDefaultValue()); - } - } -} - -static void checkStructuredClass( - Handle n, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - ASSERT_FALSE(n == nullptr); - Handle sc = n.cast(); - ASSERT_FALSE(sc == nullptr); - ASSERT_EQ(name, sc->getName()); - ASSERT_EQ(domain, sc->getParent()); - ASSERT_EQ(cardinality, sc->getCardinality()); - ASSERT_EQ(transparent, sc->isTransparent()); - ASSERT_EQ(root, sc->hasRootPermission()); - checkAttributes(attributesDescriptor, sc); -} - -static Rooted checkStructuredClass( - const std::string &resolve, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle sc = res[0].node.cast(); - checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, - superclass, transparent, root); - return sc; -} - -static void checkAnnotationClass( - Handle n, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - ASSERT_FALSE(n == nullptr); - Handle ac = n.cast(); - ASSERT_FALSE(ac == nullptr); - ASSERT_EQ(name, ac->getName()); - ASSERT_EQ(domain, ac->getParent()); - checkAttributes(attributesDescriptor, ac); -} - -static Rooted checkAnnotationClass( - const std::string &resolve, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle ac = res[0].node.cast(); - checkAnnotationClass(ac, name, domain, attributesDescriptor); - return ac; -} - -static void checkFieldDescriptor( - Handle n, const std::string &name, Handle parent, - NodeVector children, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - ASSERT_FALSE(n == nullptr); - Handle field = n.cast(); - ASSERT_FALSE(field.isNull()); - ASSERT_EQ(name, field->getName()); - ASSERT_EQ(parent, field->getParent()); - ASSERT_EQ(type, field->getFieldType()); - ASSERT_EQ(primitiveType, field->getPrimitiveType()); - ASSERT_EQ(optional, field->isOptional()); - // check the children. - ASSERT_EQ(children.size(), field->getChildren().size()); - for (unsigned int c = 0; c < children.size(); c++) { - ASSERT_EQ(children[c], field->getChildren()[c]); - } -} - -static void checkFieldDescriptor( - Handle desc, Handle parent, - NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); - checkFieldDescriptor(res[0].node, name, parent, children, type, - primitiveType, optional); -} - -static void checkFieldDescriptor( - Handle desc, NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - checkFieldDescriptor(desc, desc, children, name, type, primitiveType, - optional); -} - -TEST(XmlParser, domainParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(book_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - // check the domain node. - Rooted book_domain = book_domain_node.cast(); - ASSERT_EQ("book", book_domain->getName()); - // get the book struct node. - Cardinality single; - single.merge({1}); - Rooted bookAuthor{ - new StructType(book_domain->getManager(), "", nullptr)}; - bookAuthor->addAttribute( - {new Attribute(book_domain->getManager(), "author", - env.project->getSystemTypesystem()->getStringType(), - "")}, - logger); - Rooted book = checkStructuredClass( - "book", "book", book_domain, single, bookAuthor, nullptr, false, true); - // get the chapter struct node. - Rooted chapter = - checkStructuredClass("chapter", "chapter", book_domain); - Rooted section = - checkStructuredClass("section", "section", book_domain); - Rooted subsection = - checkStructuredClass("subsection", "subsection", book_domain); - Rooted paragraph = - checkStructuredClass("paragraph", "paragraph", book_domain, - Cardinality::any(), nullptr, nullptr, true, false); - Rooted text = - checkStructuredClass("text", "text", book_domain, Cardinality::any(), - nullptr, nullptr, true, false); - - // check the FieldDescriptors. - checkFieldDescriptor(book, {chapter, paragraph}); - checkFieldDescriptor(chapter, {section, paragraph}); - checkFieldDescriptor(section, {subsection, paragraph}); - checkFieldDescriptor(subsection, {paragraph}); - checkFieldDescriptor(paragraph, {text}); - checkFieldDescriptor( - text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, - env.project->getSystemTypesystem()->getStringType(), false); - - // check parent handling using the headings domain. - Rooted headings_domain_node = - env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(headings_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted headings_domain = headings_domain_node.cast(); - // now there should be a heading struct. - Rooted heading = - checkStructuredClass("heading", "heading", headings_domain, single, - nullptr, nullptr, true, false); - // which should be a reference to the paragraph descriptor. - checkFieldDescriptor(heading, paragraph, {text}); - // and each struct in the book domain (except for text) should have a - // heading field now. - checkFieldDescriptor(book, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(chapter, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(section, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(subsection, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(paragraph, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - - // check annotation handling using the comments domain. - Rooted comments_domain_node = - env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(comments_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted comments_domain = comments_domain_node.cast(); - // now we should be able to find a comment annotation. - Rooted comment_anno = - checkAnnotationClass("comment", "comment", comments_domain); - // as well as a comment struct - Rooted comment = - checkStructuredClass("comment", "comment", comments_domain); - // and a reply struct - Rooted reply = - checkStructuredClass("reply", "reply", comments_domain); - // check the fields for each of them. - { - std::vector> descs{comment_anno, comment, reply}; - for (auto &d : descs) { - checkFieldDescriptor(d, {paragraph}, "content", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - checkFieldDescriptor(d, {reply}, "replies", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - } - } - // paragraph should have comment as child now as well. - checkFieldDescriptor(paragraph, {text, comment}); - // as should heading, because it references the paragraph default field. - checkFieldDescriptor(heading, paragraph, {text, comment}); -} - -TEST(XmlParser, documentParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); - //TODO: Check result -} -} - -- cgit v1.2.3 From fde9997a9d321823ba6a2685e20769f5a10982cd Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:00:06 +0100 Subject: Moved TokenTrieTest to new directory --- test/core/parser/utils/TokenTrieTest.cpp | 2 +- test/core/parser/utils/TokenizerTest.cpp | 85 ++++++++++++++++---------------- 2 files changed, 43 insertions(+), 44 deletions(-) (limited to 'test') diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp index aacd6c0..087e6e6 100644 --- a/test/core/parser/utils/TokenTrieTest.cpp +++ b/test/core/parser/utils/TokenTrieTest.cpp @@ -18,7 +18,7 @@ #include -#include +#include namespace ousia { diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index c1f8785..8565057 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -19,13 +19,13 @@ #include #include -#include +#include namespace ousia { -TEST(DynamicTokenizer, tokenRegistration) +TEST(Tokenizer, tokenRegistration) { - DynamicTokenizer tokenizer; + Tokenizer tokenizer; ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); @@ -50,15 +50,15 @@ TEST(DynamicTokenizer, tokenRegistration) ASSERT_EQ("d", tokenizer.getTokenString(1U)); } -TEST(DynamicTokenizer, textTokenPreserveWhitespace) +TEST(Tokenizer, textTokenPreserveWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer{WhitespaceMode::PRESERVE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ(" this \t is only a \n\n test text ", token.content); @@ -74,9 +74,9 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer{WhitespaceMode::PRESERVE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -89,15 +89,15 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) } } -TEST(DynamicTokenizer, textTokenTrimWhitespace) +TEST(Tokenizer, textTokenTrimWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer{WhitespaceMode::TRIM}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -113,9 +113,9 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer{WhitespaceMode::TRIM}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -128,15 +128,15 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) } } -TEST(DynamicTokenizer, textTokenCollapseWhitespace) +TEST(Tokenizer, textTokenCollapseWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -152,9 +152,9 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -167,16 +167,16 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) } } -TEST(DynamicTokenizer, simpleReadToken) +TEST(Tokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -192,7 +192,7 @@ TEST(DynamicTokenizer, simpleReadToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); @@ -208,7 +208,7 @@ TEST(DynamicTokenizer, simpleReadToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -223,16 +223,16 @@ TEST(DynamicTokenizer, simpleReadToken) } } -TEST(DynamicTokenizer, simplePeekToken) +TEST(Tokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -246,7 +246,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); @@ -260,7 +260,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -274,7 +274,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -288,7 +288,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); @@ -302,7 +302,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -316,10 +316,10 @@ TEST(DynamicTokenizer, simplePeekToken) } } -TEST(DynamicTokenizer, ambiguousTokens) +TEST(Tokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -327,7 +327,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, t1); ASSERT_EQ(1U, t2); - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -349,18 +349,18 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_FALSE(tokenizer.read(reader, token)); } -TEST(DynamicTokenizer, commentTestWhitespacePreserve) +TEST(Tokenizer, commentTestWhitespacePreserve) { CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); + Tokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); const TokenTypeId t3 = tokenizer.registerToken("*/"); - std::vector expected = { + std::vector expected = { {TextToken, "Test", SourceLocation{0, 0, 4}}, {t1, "/", SourceLocation{0, 4, 5}}, {TextToken, "Test ", SourceLocation{0, 5, 10}}, @@ -368,7 +368,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, {t3, "*/", SourceLocation{0, 27, 29}}}; - DynamicToken t; + Token t; for (auto &te : expected) { EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); @@ -380,18 +380,18 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) ASSERT_FALSE(tokenizer.read(reader, t)); } -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +TEST(Tokenizer, commentTestWhitespaceCollapse) { CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); + Tokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); const TokenTypeId t3 = tokenizer.registerToken("*/"); - std::vector expected = { + std::vector expected = { {TextToken, "Test", SourceLocation{0, 0, 4}}, {t1, "/", SourceLocation{0, 4, 5}}, {TextToken, "Test", SourceLocation{0, 5, 9}}, @@ -399,7 +399,7 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, {t3, "*/", SourceLocation{0, 27, 29}}}; - DynamicToken t; + Token t; for (auto &te : expected) { EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); @@ -410,6 +410,5 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) } ASSERT_FALSE(tokenizer.read(reader, t)); } - } -- cgit v1.2.3 From 974afd3fdc54380a43445a180263fb162e1ff2c0 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:00:23 +0100 Subject: Applied renaming to unit tests and added forgotten CMakeLists.txt --- CMakeLists.txt | 177 +++--- test/formats/osml/OsmlStreamParserTest.cpp | 973 +++++++++++++++++++++++++++++ test/formats/osxml/OsxmlParserTest.cpp | 314 ++++++++++ 3 files changed, 1377 insertions(+), 87 deletions(-) create mode 100644 test/formats/osml/OsmlStreamParserTest.cpp create mode 100644 test/formats/osxml/OsxmlParserTest.cpp (limited to 'test') diff --git a/CMakeLists.txt b/CMakeLists.txt index 4458d1b..6e3b90f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -134,7 +134,6 @@ ADD_LIBRARY(ousia_core src/core/common/VariantConverter src/core/common/VariantReader src/core/common/VariantWriter - src/core/common/Whitespace src/core/frontend/Terminal src/core/frontend/TerminalLogger src/core/managed/Events @@ -148,11 +147,15 @@ ADD_LIBRARY(ousia_core src/core/model/RootNode src/core/model/Style src/core/model/Typesystem - src/core/parser/Parser - src/core/parser/ParserContext - src/core/parser/ParserScope - src/core/parser/generic/ParserStateStack - src/core/parser/generic/ParserState +# src/core/parser/Parser +# src/core/parser/ParserContext +# src/core/parser/ParserScope +# src/core/parser/generic/ParserState +# src/core/parser/generic/ParserStateCallbacks +# src/core/parser/generic/ParserStateHandler +# src/core/parser/generic/ParserStateStack + src/core/parser/utils/Tokenizer + src/core/parser/utils/TokenTrie src/core/resource/Resource src/core/resource/ResourceLocator src/core/resource/ResourceManager @@ -160,6 +163,8 @@ ADD_LIBRARY(ousia_core # src/core/script/ScriptEngine ) +# Format libraries + #ADD_LIBRARY(ousia_css # src/plugins/css/CodeTokenizer # src/plugins/css/Tokenizer @@ -170,43 +175,44 @@ ADD_LIBRARY(ousia_core # ousia_core #) -ADD_LIBRARY(ousia_filesystem - src/plugins/filesystem/FileLocator - src/plugins/filesystem/SpecialPaths +ADD_LIBRARY(ousia_osml + src/formats/osml/OsmlStreamParser ) -TARGET_LINK_LIBRARIES(ousia_filesystem +TARGET_LINK_LIBRARIES(ousia_osml ousia_core - ${Boost_LIBRARIES} ) -ADD_LIBRARY(ousia_html - src/plugins/html/DemoOutput -) +#ADD_LIBRARY(ousia_osxml +# src/formats/osxml/osxmlParser +#) -TARGET_LINK_LIBRARIES(ousia_html - ousia_core -) +#TARGET_LINK_LIBRARIES(ousia_osxml +# ousia_core +# ${EXPAT_LIBRARIES} +#) -ADD_LIBRARY(ousia_xml - src/plugins/xml/XmlParser -) +# Resource locators -TARGET_LINK_LIBRARIES(ousia_xml - ousia_core - ${EXPAT_LIBRARIES} -) +#ADD_LIBRARY(ousia_filesystem +# src/plugins/filesystem/FileLocator +# src/plugins/filesystem/SpecialPaths +#) -ADD_LIBRARY(ousia_osdm - src/formats/osdm/DynamicTokenizer - src/formats/osdm/TokenTrie - src/formats/osdm/OsdmStreamParser -) +#TARGET_LINK_LIBRARIES(ousia_filesystem +# ousia_core +# ${Boost_LIBRARIES} +#) -TARGET_LINK_LIBRARIES(ousia_osdm - ousia_core -) +# Output libraries +#ADD_LIBRARY(ousia_html +# src/plugins/html/DemoOutput +#) + +#TARGET_LINK_LIBRARIES(ousia_html +# ousia_core +#) #ADD_LIBRARY(ousia_mozjs # src/plugins/mozjs/MozJsScriptEngine @@ -219,17 +225,17 @@ TARGET_LINK_LIBRARIES(ousia_osdm # Command line interface -ADD_EXECUTABLE(ousia - src/cli/Main -) +#ADD_EXECUTABLE(ousia +# src/cli/Main +#) -TARGET_LINK_LIBRARIES(ousia - ousia_core - ousia_filesystem - ousia_html - ousia_xml - ${Boost_LIBRARIES} -) +#TARGET_LINK_LIBRARIES(ousia +# ousia_core +# ousia_filesystem +# ousia_html +# ousia_xml +# ${Boost_LIBRARIES} +#) # If testing is enabled, build the unit tests IF(TEST) @@ -240,10 +246,8 @@ IF(TEST) ) ADD_EXECUTABLE(ousia_test_core - test/core/CodeTokenizerTest test/core/RangeSetTest - test/core/RegistryTest - test/core/TokenizerTest +# test/core/RegistryTest test/core/XMLTest test/core/common/ArgumentTest test/core/common/CharReaderTest @@ -257,7 +261,6 @@ IF(TEST) test/core/common/VariantWriterTest test/core/common/VariantTest test/core/common/UtilsTest - test/core/common/WhitespaceTest test/core/frontend/TerminalLoggerTest test/core/managed/ManagedContainerTest test/core/managed/ManagedTest @@ -269,9 +272,11 @@ IF(TEST) test/core/model/NodeTest test/core/model/StyleTest test/core/model/TypesystemTest - test/core/parser/ParserScopeTest - test/core/parser/ParserStackTest - test/core/parser/ParserStateTest +# test/core/parser/ParserScopeTest +# test/core/parser/ParserStackTest +# test/core/parser/ParserStateTest + test/core/parser/utils/TokenizerTest + test/core/parser/utils/TokenTrieTest test/core/resource/ResourceLocatorTest test/core/resource/ResourceRequestTest # test/core/script/FunctionTest @@ -284,15 +289,15 @@ IF(TEST) ousia_core ) - ADD_EXECUTABLE(ousia_test_filesystem - test/plugins/filesystem/FileLocatorTest - ) +# ADD_EXECUTABLE(ousia_test_filesystem +# test/plugins/filesystem/FileLocatorTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_filesystem - ${GTEST_LIBRARIES} - ousia_core - ousia_filesystem - ) +# TARGET_LINK_LIBRARIES(ousia_test_filesystem +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_filesystem +# ) # ADD_EXECUTABLE(ousia_test_css # test/plugins/css/Tokenizer @@ -306,38 +311,36 @@ IF(TEST) # ousia_css # ) - ADD_EXECUTABLE(ousia_test_html - test/plugins/html/DemoOutputTest - ) +# ADD_EXECUTABLE(ousia_test_html +# test/plugins/html/DemoOutputTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_html - ${GTEST_LIBRARIES} - ousia_core - ousia_html - ) +# TARGET_LINK_LIBRARIES(ousia_test_html +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_html +# ) - ADD_EXECUTABLE(ousia_test_xml - test/plugins/xml/XmlParserTest + ADD_EXECUTABLE(ousia_test_osml + test/formats/osml/OsmlStreamParserTest ) - TARGET_LINK_LIBRARIES(ousia_test_xml + TARGET_LINK_LIBRARIES(ousia_test_osml ${GTEST_LIBRARIES} ousia_core - ousia_xml - ousia_filesystem + ousia_osml ) - ADD_EXECUTABLE(ousia_test_osdm - test/formats/osdm/TokenTrieTest - test/formats/osdm/DynamicTokenizerTest - test/formats/osdm/OsdmStreamParserTest - ) +# ADD_EXECUTABLE(ousia_test_osxml +# test/plugins/xml/XmlParserTest +# ) - TARGET_LINK_LIBRARIES(ousia_test_osdm - ${GTEST_LIBRARIES} - ousia_core - ousia_osdm - ) +# TARGET_LINK_LIBRARIES(ousia_test_osxml +# ${GTEST_LIBRARIES} +# ousia_core +# ousia_osml +# ousia_filesystem +# ) # ADD_EXECUTABLE(ousia_test_mozjs # test/plugins/mozjs/MozJsScriptEngineTest @@ -351,11 +354,11 @@ IF(TEST) # Register the unit tests ADD_TEST(ousia_test_core ousia_test_core) - ADD_TEST(ousia_test_filesystem ousia_test_filesystem) +# ADD_TEST(ousia_test_filesystem ousia_test_filesystem) # ADD_TEST(ousia_test_css ousia_test_css) - ADD_TEST(ousia_test_html ousia_test_html) - ADD_TEST(ousia_test_xml ousia_test_xml) - ADD_TEST(ousia_test_osdm ousia_test_osdm) +# ADD_TEST(ousia_test_html ousia_test_html) + ADD_TEST(ousia_test_osml ousia_test_osml) +# ADD_TEST(ousia_test_osxml ousia_test_osxml) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) ENDIF() @@ -373,6 +376,6 @@ INSTALL(DIRECTORY data/ DESTINATION share/ousia OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE ) -INSTALL(TARGETS ousia - RUNTIME DESTINATION bin -) +#INSTALL(TARGETS ousia +# RUNTIME DESTINATION bin +#) diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp new file mode 100644 index 0000000..e5eff05 --- /dev/null +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -0,0 +1,973 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +#include +#include + +#include + +namespace ousia { + +static TerminalLogger logger(std::cerr, true); + +TEST(OsmlStreamParser, empty) +{ + const char *testString = ""; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, oneCharacter) +{ + const char *testString = "a"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); +} + +TEST(OsmlStreamParser, whitespaceElimination) +{ + const char *testString = " hello \t world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("hello world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(14U, loc.getEnd()); +} + +TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) +{ + const char *testString = " hello \n world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("hello world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(14U, loc.getEnd()); + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, escapeWhitespace) +{ + const char *testString = " hello\\ \\ world "; + // 012345 67 89012345 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("hello world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(15U, loc.getEnd()); + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +static void testEscapeSpecialCharacter(const std::string &c) +{ + CharReader charReader(std::string("\\") + c); + OsmlStreamParser reader(charReader, logger); + EXPECT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + EXPECT_EQ(c, reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + EXPECT_EQ(0U, loc.getStart()); + EXPECT_EQ(1U + c.size(), loc.getEnd()); +} + +TEST(OsmlStreamParser, escapeSpecialCharacters) +{ + testEscapeSpecialCharacter("\\"); + testEscapeSpecialCharacter("{"); + testEscapeSpecialCharacter("}"); + testEscapeSpecialCharacter("<"); + testEscapeSpecialCharacter(">"); +} + +TEST(OsmlStreamParser, simpleSingleLineComment) +{ + const char *testString = "% This is a single line comment"; + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, singleLineComment) +{ + const char *testString = "a% This is a single line comment\nb"; + // 01234567890123456789012345678901 23 + // 0 1 2 3 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(33U, loc.getStart()); + ASSERT_EQ(34U, loc.getEnd()); + } + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, multilineComment) +{ + const char *testString = "a%{ This is a\n\n multiline line comment}%b"; + // 0123456789012 3 456789012345678901234567890 + // 0 1 2 3 4 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(40U, loc.getStart()); + ASSERT_EQ(41U, loc.getEnd()); + } + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, nestedMultilineComment) +{ + const char *testString = "a%{%{Another\n\n}%multiline line comment}%b"; + // 0123456789012 3 456789012345678901234567890 + // 0 1 2 3 4 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(40U, loc.getStart()); + ASSERT_EQ(41U, loc.getEnd()); + } + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, simpleCommand) +{ + const char *testString = "\\test"; + // 0 12345 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + + Variant commandName = reader.getCommandName(); + ASSERT_EQ("test", commandName.asString()); + + SourceLocation loc = commandName.getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + ASSERT_EQ(0U, reader.getCommandArguments().asMap().size()); + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, simpleCommandWithName) +{ + const char *testString = "\\test#bla"; + // 0 12345678 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + + Variant commandName = reader.getCommandName(); + ASSERT_EQ("test", commandName.asString()); + SourceLocation loc = commandName.getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + Variant commandArguments = reader.getCommandArguments(); + ASSERT_TRUE(commandArguments.isMap()); + ASSERT_EQ(1U, commandArguments.asMap().size()); + ASSERT_EQ(1U, commandArguments.asMap().count("name")); + ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); + + loc = commandArguments.asMap()["name"].getLocation(); + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(9U, loc.getEnd()); + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, simpleCommandWithArguments) +{ + const char *testString = "\\test[a=1,b=2,c=\"test\"]"; + // 0 123456789012345 678901 2 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + + Variant commandName = reader.getCommandName(); + ASSERT_EQ("test", commandName.asString()); + SourceLocation loc = commandName.getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + Variant commandArguments = reader.getCommandArguments(); + ASSERT_TRUE(commandArguments.isMap()); + ASSERT_EQ(3U, commandArguments.asMap().size()); + ASSERT_EQ(1U, commandArguments.asMap().count("a")); + ASSERT_EQ(1U, commandArguments.asMap().count("b")); + ASSERT_EQ(1U, commandArguments.asMap().count("c")); + ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); + ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); + ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); + + loc = commandArguments.asMap()["a"].getLocation(); + ASSERT_EQ(8U, loc.getStart()); + ASSERT_EQ(9U, loc.getEnd()); + + loc = commandArguments.asMap()["b"].getLocation(); + ASSERT_EQ(12U, loc.getStart()); + ASSERT_EQ(13U, loc.getEnd()); + + loc = commandArguments.asMap()["c"].getLocation(); + ASSERT_EQ(16U, loc.getStart()); + ASSERT_EQ(22U, loc.getEnd()); + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +TEST(OsmlStreamParser, simpleCommandWithArgumentsAndName) +{ + const char *testString = "\\test#bla[a=1,b=2,c=\"test\"]"; + // 0 1234567890123456789 01234 56 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + + Variant commandName = reader.getCommandName(); + ASSERT_EQ("test", commandName.asString()); + SourceLocation loc = commandName.getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + Variant commandArguments = reader.getCommandArguments(); + ASSERT_TRUE(commandArguments.isMap()); + ASSERT_EQ(4U, commandArguments.asMap().size()); + ASSERT_EQ(1U, commandArguments.asMap().count("a")); + ASSERT_EQ(1U, commandArguments.asMap().count("b")); + ASSERT_EQ(1U, commandArguments.asMap().count("c")); + ASSERT_EQ(1U, commandArguments.asMap().count("name")); + ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); + ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); + ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); + ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); + + loc = commandArguments.asMap()["a"].getLocation(); + ASSERT_EQ(12U, loc.getStart()); + ASSERT_EQ(13U, loc.getEnd()); + + loc = commandArguments.asMap()["b"].getLocation(); + ASSERT_EQ(16U, loc.getStart()); + ASSERT_EQ(17U, loc.getEnd()); + + loc = commandArguments.asMap()["c"].getLocation(); + ASSERT_EQ(20U, loc.getStart()); + ASSERT_EQ(26U, loc.getEnd()); + + loc = commandArguments.asMap()["name"].getLocation(); + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(9U, loc.getEnd()); + + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); +} + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + EXPECT_EQ(name, reader.getCommandName().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + assertCommand(reader, name, start, end); + EXPECT_EQ(args, reader.getCommandArguments()); +} + +static void assertData(OsmlStreamParser &reader, const std::string &data, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + EXPECT_EQ(data, reader.getData().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertFieldStart(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertFieldEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +TEST(OsmlStreamParser, fields) +{ + const char *testString = "\\test{a}{b}{c}"; + // 01234567890123 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + assertData(reader, "a", 6, 7); + assertFieldEnd(reader, 7, 8); + + assertFieldStart(reader, 8, 9); + assertData(reader, "b", 9, 10); + assertFieldEnd(reader, 10, 11); + + assertFieldStart(reader, 11, 12); + assertData(reader, "c", 12, 13); + assertFieldEnd(reader, 13, 14); + assertEnd(reader, 14, 14); +} + +TEST(OsmlStreamParser, dataOutsideField) +{ + const char *testString = "\\test{a}{b} c"; + // 0123456789012 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + assertData(reader, "a", 6, 7); + assertFieldEnd(reader, 7, 8); + + assertFieldStart(reader, 8, 9); + assertData(reader, "b", 9, 10); + assertFieldEnd(reader, 10, 11); + + assertData(reader, "c", 12, 13); + assertEnd(reader, 13, 13); +} + +TEST(OsmlStreamParser, nestedCommand) +{ + const char *testString = "\\test{a}{\\test2{b} c} d"; + // 012345678 90123456789012 + // 0 1 2 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test", 0, 5); + + assertFieldStart(reader, 5, 6); + assertData(reader, "a", 6, 7); + assertFieldEnd(reader, 7, 8); + + assertFieldStart(reader, 8, 9); + { + assertCommand(reader, "test2", 9, 15); + assertFieldStart(reader, 15, 16); + assertData(reader, "b", 16, 17); + assertFieldEnd(reader, 17, 18); + } + assertData(reader, "c", 19, 20); + assertFieldEnd(reader, 20, 21); + assertData(reader, "d", 22, 23); + assertEnd(reader, 23, 23); +} + +TEST(OsmlStreamParser, nestedCommandImmediateEnd) +{ + const char *testString = "\\test{\\test2{b}} d"; + // 012345 678901234567 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + { + assertCommand(reader, "test2", 6, 12); + assertFieldStart(reader, 12, 13); + assertData(reader, "b", 13, 14); + assertFieldEnd(reader, 14, 15); + } + assertFieldEnd(reader, 15, 16); + assertData(reader, "d", 17, 18); + assertEnd(reader, 18, 18); +} + +TEST(OsmlStreamParser, nestedCommandNoData) +{ + const char *testString = "\\test{\\test2}"; + // 012345 6789012 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + assertCommand(reader, "test2", 6, 12); + assertFieldEnd(reader, 12, 13); + assertEnd(reader, 13, 13); +} + +TEST(OsmlStreamParser, multipleCommands) +{ + const char *testString = "\\a \\b \\c \\d"; + // 012 345 678 90 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertCommand(reader, "b", 3, 5); + assertCommand(reader, "c", 6, 8); + assertCommand(reader, "d", 9, 11); + assertEnd(reader, 11, 11); +} + +TEST(OsmlStreamParser, fieldsWithSpaces) +{ + const char *testString = "\\a {\\b \\c} \n\n {\\d}"; + // 0123 456 789012 3 456 789 + // 0 1 + CharReader charReader(testString); + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, 3, 4); + assertCommand(reader, "b", 4, 6); + assertCommand(reader, "c", 7, 9); + assertFieldEnd(reader, 9, 10); + assertFieldStart(reader, 16, 17); + assertCommand(reader, "d", 17, 19); + assertFieldEnd(reader, 19, 20); + assertEnd(reader, 20, 20); +} + +TEST(OsmlStreamParser, errorNoFieldToStart) +{ + const char *testString = "\\a b {"; + // 012345 + // 0 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "a", 0, 2); + assertData(reader, "b", 3, 4); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader, 6, 6); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorNoFieldToEnd) +{ + const char *testString = "\\a b }"; + // 012345 + // 0 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "a", 0, 2); + assertData(reader, "b", 3, 4); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader, 6, 6); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorNoFieldEndNested) +{ + const char *testString = "\\test{\\test2{}}}"; + // 012345 6789012345 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + assertCommand(reader, "test2", 6, 12); + assertFieldStart(reader, 12, 13); + assertFieldEnd(reader, 13, 14); + assertFieldEnd(reader, 14, 15); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader, 16, 16); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorNoFieldEndNestedData) +{ + const char *testString = "\\test{\\test2{}}a}"; + // 012345 67890123456 + // 0 1 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "test", 0, 5); + assertFieldStart(reader, 5, 6); + assertCommand(reader, "test2", 6, 12); + assertFieldStart(reader, 12, 13); + assertFieldEnd(reader, 13, 14); + assertFieldEnd(reader, 14, 15); + assertData(reader, "a", 15, 16); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader, 17, 17); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, beginEnd) +{ + const char *testString = "\\begin{book}\\end{book}"; + // 012345678901 2345678901 + // 0 1 2 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", 7, 11); + assertFieldStart(reader, 12, 13); + assertFieldEnd(reader, 17, 21); + assertEnd(reader, 22, 22); +} + +TEST(OsmlStreamParser, beginEndWithName) +{ + const char *testString = "\\begin{book#a}\\end{book}"; + // 01234567890123 4567890123 + // 0 1 2 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", {{"name", "a"}}, 7, 11); + assertFieldStart(reader, 14, 15); + assertFieldEnd(reader, 19, 23); + assertEnd(reader, 24, 24); +} + +TEST(OsmlStreamParser, beginEndWithNameAndArgs) +{ + const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; + // 0123456789012345678901234 56789 01 2345678901 + // 0 1 2 3 4 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); + assertFieldStart(reader, 32, 33); + assertFieldEnd(reader, 37, 41); + assertEnd(reader, 42, 42); +} + +TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) +{ + const char *testString = + "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; + // 0123456789012345678901234 56789 01234 567890123 45678901 2345678901 + // 0 1 2 3 4 5 6 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", + {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); + assertFieldStart(reader, 32, 33); + assertData(reader, "a", 33, 34); + assertCommand(reader, "test", Variant::mapType{}, 35, 40); + assertFieldEnd(reader, 40, 41); + assertFieldStart(reader, 41, 42); + assertData(reader, "b", 42, 43); + assertCommand(reader, "test", Variant::mapType{}, 44, 49); + assertFieldStart(reader, 49, 50); + assertFieldEnd(reader, 50, 51); + assertFieldEnd(reader, 51, 52); + assertFieldStart(reader, 52, 53); + assertFieldEnd(reader, 57, 61); + assertEnd(reader, 62, 62); +} + +TEST(OsmlStreamParser, beginEndWithData) +{ + const char *testString = "\\begin{book}a\\end{book}"; + // 0123456789012 3456789012 + // 0 1 2 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", 7, 11); + assertFieldStart(reader, 12, 13); + assertData(reader, "a", 12, 13); + assertFieldEnd(reader, 18, 22); + assertEnd(reader, 23, 23); +} + +TEST(OsmlStreamParser, beginEndWithCommand) +{ + const char *testString = "\\begin{book}\\a{test}\\end{book}"; + // 012345678901 23456789 0123456789 + // 0 1 2 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "book", 7, 11); + assertFieldStart(reader, 12, 13); + assertCommand(reader, "a", 12, 14); + assertFieldStart(reader, 14, 15); + assertData(reader, "test", 15, 19); + assertFieldEnd(reader, 19, 20); + assertFieldEnd(reader, 25, 29); + assertEnd(reader, 30, 30); +} + +TEST(OsmlStreamParser, errorBeginNoBraceOpen) +{ + const char *testString = "\\begin a"; + // 01234567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "a", 7, 8); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorBeginNoIdentifier) +{ + const char *testString = "\\begin{!"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorBeginNoBraceClose) +{ + const char *testString = "\\begin{a"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorBeginNoName) +{ + const char *testString = "\\begin{a#}"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "a"); + ASSERT_TRUE(logger.hasError()); + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertEnd(reader); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorEndNoBraceOpen) +{ + const char *testString = "\\end a"; + // 012345 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "a", 5, 6); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorEndNoIdentifier) +{ + const char *testString = "\\end{!"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorEndNoBraceClose) +{ + const char *testString = "\\end{a"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorEndNoBegin) +{ + const char *testString = "\\end{a}"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, errorBeginEndMismatch) +{ + const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; + // 0123456789 012345678901234 5678901 + // 0 1 2 3 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "a", 7, 8); + assertFieldStart(reader, 10, 11); + assertCommand(reader, "b", 17, 18); + assertFieldStart(reader, 20, 24); + assertData(reader, "test", 20, 24); + ASSERT_FALSE(logger.hasError()); + ASSERT_THROW(reader.parse(), LoggableException); + ASSERT_TRUE(logger.hasError()); +} + +TEST(OsmlStreamParser, commandWithNSSep) +{ + const char *testString = "\\test1:test2"; + // 012345678901 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test1:test2", 0, 12); + assertEnd(reader, 12, 12); +} + +TEST(OsmlStreamParser, beginEndWithNSSep) +{ + const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; + // 0123456789012345678 90123456789012345 + // 0 1 2 3 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "test1:test2", 7, 18); + assertFieldStart(reader, 19, 20); + assertFieldEnd(reader, 24, 35); + assertEnd(reader, 36, 36); +} + +TEST(OsmlStreamParser, errorBeginNSSep) +{ + const char *testString = "\\begin:test{blub}\\end{blub}"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "blub"); + ASSERT_TRUE(logger.hasError()); + assertFieldStart(reader); + assertFieldEnd(reader); + assertEnd(reader); +} + +TEST(OsmlStreamParser, errorEndNSSep) +{ + const char *testString = "\\begin{blub}\\end:test{blub}"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + assertCommand(reader, "blub"); + assertFieldStart(reader); + ASSERT_FALSE(logger.hasError()); + assertFieldEnd(reader); + ASSERT_TRUE(logger.hasError()); + assertEnd(reader); +} + +TEST(OsmlStreamParser, errorEmptyNs) +{ + const char *testString = "\\test:"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(reader, ":"); + assertEnd(reader); +} + +TEST(OsmlStreamParser, errorRepeatedNs) +{ + const char *testString = "\\test::"; + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "test"); + ASSERT_TRUE(logger.hasError()); + assertData(reader, "::"); + assertEnd(reader); +} +} + diff --git a/test/formats/osxml/OsxmlParserTest.cpp b/test/formats/osxml/OsxmlParserTest.cpp new file mode 100644 index 0000000..c0fb50d --- /dev/null +++ b/test/formats/osxml/OsxmlParserTest.cpp @@ -0,0 +1,314 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace ousia { + +namespace RttiTypes { +extern const Rtti Document; +extern const Rtti Domain; +extern const Rtti Typesystem; +} + +struct XmlStandaloneEnvironment : public StandaloneEnvironment { + XmlParser xmlParser; + FileLocator fileLocator; + + XmlStandaloneEnvironment(ConcreteLogger &logger) + : StandaloneEnvironment(logger) + { + fileLocator.addDefaultSearchPaths(); + fileLocator.addUnittestSearchPath("xmlparser"); + + registry.registerDefaultExtensions(); + registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, + {&RttiTypes::Node}, &xmlParser); + registry.registerResourceLocator(&fileLocator); + } +}; + +static TerminalLogger logger(std::cerr, true); + +TEST(XmlParser, mismatchedTag) +{ + XmlStandaloneEnvironment env(logger); + env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); + ASSERT_TRUE(logger.hasError()); +} + +TEST(XmlParser, generic) +{ + XmlStandaloneEnvironment env(logger); + env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); +#ifdef MANAGER_GRAPHVIZ_EXPORT + env.manager.exportGraphviz("xmlDocument.dot"); +#endif +} + +static void checkAttributes(Handle expected, + Handle desc) +{ + if (expected == nullptr) { + ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); + } else { + ASSERT_EQ(expected->getName(), + desc->getAttributesDescriptor()->getName()); + auto &attrs_exp = expected->getAttributes(); + auto &attrs = desc->getAttributesDescriptor()->getAttributes(); + ASSERT_EQ(attrs_exp.size(), attrs.size()); + for (size_t i = 0; i < attrs_exp.size(); i++) { + ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); + ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); + ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); + ASSERT_EQ(attrs_exp[i]->getDefaultValue(), + attrs[i]->getDefaultValue()); + } + } +} + +static void checkStructuredClass( + Handle n, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + ASSERT_FALSE(n == nullptr); + Handle sc = n.cast(); + ASSERT_FALSE(sc == nullptr); + ASSERT_EQ(name, sc->getName()); + ASSERT_EQ(domain, sc->getParent()); + ASSERT_EQ(cardinality, sc->getCardinality()); + ASSERT_EQ(transparent, sc->isTransparent()); + ASSERT_EQ(root, sc->hasRootPermission()); + checkAttributes(attributesDescriptor, sc); +} + +static Rooted checkStructuredClass( + const std::string &resolve, const std::string &name, Handle domain, + Variant cardinality = Cardinality::any(), + Handle attributesDescriptor = nullptr, + Handle superclass = nullptr, bool transparent = false, + bool root = false) +{ + auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle sc = res[0].node.cast(); + checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, + superclass, transparent, root); + return sc; +} + +static void checkAnnotationClass( + Handle n, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + ASSERT_FALSE(n == nullptr); + Handle ac = n.cast(); + ASSERT_FALSE(ac == nullptr); + ASSERT_EQ(name, ac->getName()); + ASSERT_EQ(domain, ac->getParent()); + checkAttributes(attributesDescriptor, ac); +} + +static Rooted checkAnnotationClass( + const std::string &resolve, const std::string &name, Handle domain, + Handle attributesDescriptor = nullptr) +{ + auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); + if (res.size() != 1) { + throw OusiaException("resolution error!"); + } + Handle ac = res[0].node.cast(); + checkAnnotationClass(ac, name, domain, attributesDescriptor); + return ac; +} + +static void checkFieldDescriptor( + Handle n, const std::string &name, Handle parent, + NodeVector children, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + ASSERT_FALSE(n == nullptr); + Handle field = n.cast(); + ASSERT_FALSE(field.isNull()); + ASSERT_EQ(name, field->getName()); + ASSERT_EQ(parent, field->getParent()); + ASSERT_EQ(type, field->getFieldType()); + ASSERT_EQ(primitiveType, field->getPrimitiveType()); + ASSERT_EQ(optional, field->isOptional()); + // check the children. + ASSERT_EQ(children.size(), field->getChildren().size()); + for (unsigned int c = 0; c < children.size(); c++) { + ASSERT_EQ(children[c], field->getChildren()[c]); + } +} + +static void checkFieldDescriptor( + Handle desc, Handle parent, + NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); + ASSERT_EQ(1, res.size()); + checkFieldDescriptor(res[0].node, name, parent, children, type, + primitiveType, optional); +} + +static void checkFieldDescriptor( + Handle desc, NodeVector children, + const std::string &name = DEFAULT_FIELD_NAME, + FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, + Handle primitiveType = nullptr, bool optional = false) +{ + checkFieldDescriptor(desc, desc, children, name, type, primitiveType, + optional); +} + +TEST(XmlParser, domainParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(book_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + // check the domain node. + Rooted book_domain = book_domain_node.cast(); + ASSERT_EQ("book", book_domain->getName()); + // get the book struct node. + Cardinality single; + single.merge({1}); + Rooted bookAuthor{ + new StructType(book_domain->getManager(), "", nullptr)}; + bookAuthor->addAttribute( + {new Attribute(book_domain->getManager(), "author", + env.project->getSystemTypesystem()->getStringType(), + "")}, + logger); + Rooted book = checkStructuredClass( + "book", "book", book_domain, single, bookAuthor, nullptr, false, true); + // get the chapter struct node. + Rooted chapter = + checkStructuredClass("chapter", "chapter", book_domain); + Rooted section = + checkStructuredClass("section", "section", book_domain); + Rooted subsection = + checkStructuredClass("subsection", "subsection", book_domain); + Rooted paragraph = + checkStructuredClass("paragraph", "paragraph", book_domain, + Cardinality::any(), nullptr, nullptr, true, false); + Rooted text = + checkStructuredClass("text", "text", book_domain, Cardinality::any(), + nullptr, nullptr, true, false); + + // check the FieldDescriptors. + checkFieldDescriptor(book, {chapter, paragraph}); + checkFieldDescriptor(chapter, {section, paragraph}); + checkFieldDescriptor(section, {subsection, paragraph}); + checkFieldDescriptor(subsection, {paragraph}); + checkFieldDescriptor(paragraph, {text}); + checkFieldDescriptor( + text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, + env.project->getSystemTypesystem()->getStringType(), false); + + // check parent handling using the headings domain. + Rooted headings_domain_node = + env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(headings_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted headings_domain = headings_domain_node.cast(); + // now there should be a heading struct. + Rooted heading = + checkStructuredClass("heading", "heading", headings_domain, single, + nullptr, nullptr, true, false); + // which should be a reference to the paragraph descriptor. + checkFieldDescriptor(heading, paragraph, {text}); + // and each struct in the book domain (except for text) should have a + // heading field now. + checkFieldDescriptor(book, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(chapter, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(section, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(subsection, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + checkFieldDescriptor(paragraph, {heading}, "heading", + FieldDescriptor::FieldType::SUBTREE, nullptr, true); + + // check annotation handling using the comments domain. + Rooted comments_domain_node = + env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); + ASSERT_FALSE(comments_domain_node == nullptr); + ASSERT_FALSE(logger.hasError()); + Rooted comments_domain = comments_domain_node.cast(); + // now we should be able to find a comment annotation. + Rooted comment_anno = + checkAnnotationClass("comment", "comment", comments_domain); + // as well as a comment struct + Rooted comment = + checkStructuredClass("comment", "comment", comments_domain); + // and a reply struct + Rooted reply = + checkStructuredClass("reply", "reply", comments_domain); + // check the fields for each of them. + { + std::vector> descs{comment_anno, comment, reply}; + for (auto &d : descs) { + checkFieldDescriptor(d, {paragraph}, "content", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + checkFieldDescriptor(d, {reply}, "replies", + FieldDescriptor::FieldType::SUBTREE, nullptr, + false); + } + } + // paragraph should have comment as child now as well. + checkFieldDescriptor(paragraph, {text, comment}); + // as should heading, because it references the paragraph default field. + checkFieldDescriptor(heading, paragraph, {text, comment}); +} + +TEST(XmlParser, documentParsing) +{ + XmlStandaloneEnvironment env(logger); + Rooted book_domain_node = + env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); + //TODO: Check result +} +} + -- cgit v1.2.3 From 2659b4595d809cbd69a77e5ff7e2fc08d225f065 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:02:54 +0100 Subject: Tidied OsxmlEventParser up, implemented correct whitespace handling, started to write unit tests for the osxml parser --- CMakeLists.txt | 93 +++--- src/core/common/Utils.hpp | 21 +- src/core/common/WhitespaceHandler.hpp | 60 ++++ src/formats/osxml/OsxmlAttributeLocator.cpp | 144 ++++++++++ src/formats/osxml/OsxmlAttributeLocator.hpp | 67 +++++ src/formats/osxml/OsxmlEventParser.cpp | 425 +++++++++++++++------------- src/formats/osxml/OsxmlEventParser.hpp | 44 +-- test/formats/osml/OsmlStreamParserTest.cpp | 1 + test/formats/osxml/OsxmlEventParserTest.cpp | 222 +++++++++++++++ 9 files changed, 811 insertions(+), 266 deletions(-) create mode 100644 src/formats/osxml/OsxmlAttributeLocator.cpp create mode 100644 src/formats/osxml/OsxmlAttributeLocator.hpp create mode 100644 test/formats/osxml/OsxmlEventParserTest.cpp (limited to 'test') diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e3b90f..bdc9541 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,9 +147,9 @@ ADD_LIBRARY(ousia_core src/core/model/RootNode src/core/model/Style src/core/model/Typesystem -# src/core/parser/Parser -# src/core/parser/ParserContext -# src/core/parser/ParserScope + src/core/parser/Parser + src/core/parser/ParserContext + src/core/parser/ParserScope # src/core/parser/generic/ParserState # src/core/parser/generic/ParserStateCallbacks # src/core/parser/generic/ParserStateHandler @@ -183,36 +183,37 @@ TARGET_LINK_LIBRARIES(ousia_osml ousia_core ) -#ADD_LIBRARY(ousia_osxml -# src/formats/osxml/osxmlParser -#) +ADD_LIBRARY(ousia_osxml + src/formats/osxml/OsxmlAttributeLocator + src/formats/osxml/OsxmlEventParser +) -#TARGET_LINK_LIBRARIES(ousia_osxml -# ousia_core -# ${EXPAT_LIBRARIES} -#) +TARGET_LINK_LIBRARIES(ousia_osxml + ousia_core + ${EXPAT_LIBRARIES} +) # Resource locators -#ADD_LIBRARY(ousia_filesystem -# src/plugins/filesystem/FileLocator -# src/plugins/filesystem/SpecialPaths -#) +ADD_LIBRARY(ousia_filesystem + src/plugins/filesystem/FileLocator + src/plugins/filesystem/SpecialPaths +) -#TARGET_LINK_LIBRARIES(ousia_filesystem -# ousia_core -# ${Boost_LIBRARIES} -#) +TARGET_LINK_LIBRARIES(ousia_filesystem + ousia_core + ${Boost_LIBRARIES} +) # Output libraries -#ADD_LIBRARY(ousia_html -# src/plugins/html/DemoOutput -#) +ADD_LIBRARY(ousia_html + src/plugins/html/DemoOutput +) -#TARGET_LINK_LIBRARIES(ousia_html -# ousia_core -#) +TARGET_LINK_LIBRARIES(ousia_html + ousia_core +) #ADD_LIBRARY(ousia_mozjs # src/plugins/mozjs/MozJsScriptEngine @@ -247,7 +248,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_core test/core/RangeSetTest -# test/core/RegistryTest + test/core/RegistryTest test/core/XMLTest test/core/common/ArgumentTest test/core/common/CharReaderTest @@ -272,7 +273,7 @@ IF(TEST) test/core/model/NodeTest test/core/model/StyleTest test/core/model/TypesystemTest -# test/core/parser/ParserScopeTest + test/core/parser/ParserScopeTest # test/core/parser/ParserStackTest # test/core/parser/ParserStateTest test/core/parser/utils/TokenizerTest @@ -311,15 +312,15 @@ IF(TEST) # ousia_css # ) -# ADD_EXECUTABLE(ousia_test_html -# test/plugins/html/DemoOutputTest -# ) + ADD_EXECUTABLE(ousia_test_html + test/plugins/html/DemoOutputTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_html -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_html -# ) + TARGET_LINK_LIBRARIES(ousia_test_html + ${GTEST_LIBRARIES} + ousia_core + ousia_html + ) ADD_EXECUTABLE(ousia_test_osml test/formats/osml/OsmlStreamParserTest @@ -331,16 +332,16 @@ IF(TEST) ousia_osml ) -# ADD_EXECUTABLE(ousia_test_osxml -# test/plugins/xml/XmlParserTest -# ) + ADD_EXECUTABLE(ousia_test_osxml + test/formats/osxml/OsxmlEventParserTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_osxml -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_osml -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_osxml + ${GTEST_LIBRARIES} + ousia_core + ousia_osxml + ousia_filesystem + ) # ADD_EXECUTABLE(ousia_test_mozjs # test/plugins/mozjs/MozJsScriptEngineTest @@ -354,11 +355,11 @@ IF(TEST) # Register the unit tests ADD_TEST(ousia_test_core ousia_test_core) -# ADD_TEST(ousia_test_filesystem ousia_test_filesystem) + ADD_TEST(ousia_test_filesystem ousia_test_filesystem) # ADD_TEST(ousia_test_css ousia_test_css) -# ADD_TEST(ousia_test_html ousia_test_html) + ADD_TEST(ousia_test_html ousia_test_html) ADD_TEST(ousia_test_osml ousia_test_osml) -# ADD_TEST(ousia_test_osxml ousia_test_osxml) + ADD_TEST(ousia_test_osxml ousia_test_osxml) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) ENDIF() diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 16a9136..8361973 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -119,9 +119,26 @@ public: */ template static std::pair trim(const T &s, Filter f) + { + return trim(s, s.size(), f); + } + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param len is the number of elements in the container. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template + static std::pair trim(const T &s, size_t len, Filter f) { size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { + for (size_t i = 0; i < len; i++) { if (!f(s[i])) { start = i; break; @@ -129,7 +146,7 @@ public: } size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { + for (ssize_t i = len - 1; i >= static_cast(start); i--) { if (!f(s[i])) { end = i + 1; break; diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp index 79e0518..ed52ea3 100644 --- a/src/core/common/WhitespaceHandler.hpp +++ b/src/core/common/WhitespaceHandler.hpp @@ -97,6 +97,25 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd); + } + + /** + * Static version of PreservingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd) { if (textBuf.empty()) { textStart = start; @@ -129,6 +148,27 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); + } + + /** + * Static version of TrimmingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param whitespaceBuf is a reference at the buffer for storing whitespace + * characters. + */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd, std::vector &whitespaceBuf) { // Handle whitespace characters if (Utils::isWhitespace(c)) { @@ -174,6 +214,26 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); + } + + /** + * Static version of CollapsingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param hasWhitespace is a reference at the "hasWhitespace" flag. + */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd, bool &hasWhitespace) { // Handle whitespace characters if (Utils::isWhitespace(c)) { diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp new file mode 100644 index 0000000..e37446a --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.cpp @@ -0,0 +1,144 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include + +#include "OsxmlAttributeLocator.hpp" + +namespace ousia { + +/** + * Enum used internally in the statemachine of the xml argument parser. + */ +enum class XmlAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +std::map OsxmlAttributeLocator::locate( + CharReader &reader, size_t offs) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + if (offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... (and we + // can assume the XML is valid as it was already read by expat) + XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XmlAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + res.emplace("$tag", + SourceLocation{reader.getSourceId(), offs + 1, + readerFork.getOffset() - 1}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + case XmlAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XmlAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XmlAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XmlAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XmlAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XmlAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + state = XmlAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XmlAttributeState::IN_ATTR_NAME; + } + } + break; + case XmlAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, set the end + // location + auto it = res.find(attrName.str()); + if (it != res.end()) { + it->second.setEnd(readerFork.getOffset() - 1); + } + + // Reset the attribute name and restart the search + attrName.str(std::string{}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} +} + diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp new file mode 100644 index 0000000..f9a3437 --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.hpp @@ -0,0 +1,67 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsxmlAttributeLocator.hpp + * + * Contains a class used for locating the byte offsets of the attributes given + * in a XML tag. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ +#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ + +#include + +namespace ousia { + +// Forward declarations +class CharReader; +class SourceLocation; + +/** + * Class containing one static function for locating the byte offsets of the + * attributes in a XML tag. This are not retrieved by our xml parser, so we have + * to do this manually. + */ +class OsxmlAttributeLocator { +public: + /** + * Function used to reconstruct the location of the attributes of a XML tag + * in the source code. This is necessary, as the xml parser only returns an + * offset to the begining of a tag and not to the position of the individual + * arguments. + * + * @param reader is the char reader from which the character data should be + * read. + * @param offs is a byte offset in the xml file pointing at the "<" + * character of the tag. + * @return a map from attribute keys to the corresponding location + * (including range) of the atribute. Also contains the location of the + * tagname in the form of the virtual attribute "$tag". + */ + static std::map locate(CharReader &reader, + size_t offs); +}; + +} + +#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */ + diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index 2ef170e..b4aff77 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -18,14 +18,22 @@ #include +#include + +#include #include #include +#include #include +#include +#include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" namespace ousia { +/* Class OsxmlEventParser */ + /** * Class containing data used by the internal functions. */ @@ -43,41 +51,75 @@ public: */ ssize_t annotationEndTagDepth; + /** + * Current character data buffer. + */ + std::vector textBuf; + + /** + * Current whitespace buffer (for the trimming whitspace mode) + */ + std::vector whitespaceBuf; + + /** + * Flag indicating whether a whitespace character was present (for the + * collapsing whitespace mode). + */ + bool hasWhitespace; + + /** + * Current character data start. + */ + size_t textStart; + + /** + * Current character data end. + */ + size_t textEnd; + /** * Default constructor. */ - OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {} + OsxmlEventParserData(); /** * Increments the depth. */ - void incrDepth() { depth++; } + void incrDepth(); /** * Decrement the depth and reset the annotationEndTagDepth flag. */ - void decrDepth() - { - if (depth > 0) { - depth--; - } - if (depth < annotationEndTagDepth) { - annotationEndTagDepth = -1; - } - } + void decrDepth(); /** * Returns true if we're currently inside an end tag. */ - bool inAnnotationEndTag() { depth >= annotationEndTagDepth; } + bool inAnnotationEndTag(); + + /** + * Returns true if character data is available. + * + * @return true if character data is available. + */ + bool hasText(); + + /** + * Returns a Variant containing the character data and its location. + * + * @return a string variant containing the text data and the character + * location. + */ + Variant getText(SourceId sourceId); }; -namespace { +/* Class GuardedExpatXmlParser */ + /** * Wrapper class around the XML_Parser pointer which safely frees it whenever * the scope is left (e.g. because an exception was thrown). */ -class ScopedExpatXmlParser { +class GuardedExpatXmlParser { private: /** * Internal pointer to the XML_Parser instance. @@ -86,14 +128,14 @@ private: public: /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS + * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS * from the expat library. Throws a parser exception if the XML parser * cannot be initialized. * * @param encoding is the protocol-defined encoding passed to expat (or * nullptr if expat should determine the encoding by itself). */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) { parser = XML_ParserCreate(encoding); if (!parser) { @@ -103,9 +145,9 @@ public: } /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. + * Destuctor of the GuardedExpatXmlParser, frees the XML parser instance. */ - ~ScopedExpatXmlParser() + ~GuardedExpatXmlParser() { if (parser) { XML_ParserFree(parser); @@ -120,134 +162,20 @@ public: }; /** - * Enum used internally in the statemachine of the micro-xml argument parser. + * Name of the special outer tag used for allowing multiple top-level elements + * in an xml file. */ -enum class XmlAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; +static const std::string TOP_LEVEL_TAG{"ousia"}; /** - * Function used to reconstruct the location of the attributes of a XML tag in - * the source code. This is necessary, as the xml parser only returns an offset - * to the begining of a tag and not to the position of the individual arguments. - * - * @param reader is the char reader from which the character data should be - * read. - * @param offs is a byte offset in the xml file pointing at the "<" character of - * the tag. - * @return a map from attribute keys to the corresponding location (including - * range) of the atribute. Also contains the location of the tagname in the - * form of the virtual attribute "$tag". + * Prefix used to indicate the start of an annoation (note the trailing colon) */ -static std::map xmlReconstructAttributeOffsets( - CharReader &reader, size_t offs) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... (and we - // can assume the XML is valid as it was already read by expat) - XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { - return res; - } +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XmlAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - res.emplace("$tag", - SourceLocation{reader.getSourceId(), offs + 1, - readerFork.getOffset() - 1}); - state = XmlAttributeState::SEARCH_ATTR; - } - break; - case XmlAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XmlAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XmlAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XmlAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XmlAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XmlAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XmlAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XmlAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - state = XmlAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XmlAttributeState::IN_ATTR_NAME; - } - } - break; - case XmlAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, set the end - // location - auto it = res.find(attrName.str()); - if (it != res.end()) { - it->second.setEnd(readerFork.getOffset() - 1); - } - - // Reset the attribute name and restart the search - attrName.str(std::string{}); - state = XmlAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; /** * Synchronizes the position of the xml parser with the default location of the @@ -268,22 +196,12 @@ static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) size_t offs = XML_GetCurrentByteIndex(p); SourceLocation loc = SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; - parser->getLogger().setDefaultLocation(location); + parser->getLogger().setDefaultLocation(loc); // Return the fetched location return loc; } -/** - * Prefix used to indicate the start of an annoation, - */ -static const std::string ANNOTATION_START_PREFIX{"a:start:"}; - -/** - * Prefix used to indicate the end of an annotation. - */ -static const std::string ANNOTATION_END_PREFIX{"a:end"}; - /** * Callback called by eXpat whenever a start handler is reached. */ @@ -292,14 +210,21 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } // Read the argument locations -- this is only a stupid and slow hack, // but it is necessary, as expat doesn't give use the byte offset of the // arguments. std::map attributeOffsets = - xmlReconstructXMLAttributeOffsets(*userData->reader, - XML_GetCurrentByteIndex(p)); + OsxmlAttributeLocator::locate(parser->getReader(), + XML_GetCurrentByteIndex(p)); // Update the logger position SourceLocation loc = xmlSyncLoggerPosition(p); @@ -316,7 +241,8 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Make sure we're currently not inside an annotation end tag -- this would // be highly illegal! if (parser->getData().inAnnotationEndTag()) { - logger.error("No tags allowed inside an annotation end tag", nameLoc); + parser->getLogger().error( + "No tags allowed inside an annotation end tag", nameLoc); return; } @@ -336,36 +262,33 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Parse the string, pass the location of the key std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + *(attr++), parser->getLogger(), keyLoc.getSourceId(), keyLoc.getStart()); // Set the overall location of the parsed element to the attribute // location - value.second->setLocation(keyLoc); - - // Store the - if (!args.emplace(key, value.second).second) { - parser->getLogger().warning( - std::string("Attribute \"") + key + - "\" defined multiple times, only using first definition", - keyLoc); - } + value.second.setLocation(keyLoc); + + // Store the keys in the map + args.emplace(key, value.second).second; } // Fetch the name of the tag, check for special tags std::string nameStr(name); - if (nameStr == "ousia" && parser->getData().depth == 1) { - // We're in the top-level and the magic "ousia" tag is reached -- just + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { + // We're in the top-level and the magic tag is reached -- just // ignore it and issue a warning for each argument that has been given for (const auto &arg : args) { - parser->getLogger().warning( - std::string("Ignoring attribute \"") + arg.first + - std::string("\" for magic tag \"ousia\""), - arg.second); + parser->getLogger().warning(std::string("Ignoring attribute \"") + + arg.first + + std::string("\" for magic tag \"") + + TOP_LEVEL_TAG + std::string("\""), + arg.second); } } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { // Assemble a name variant containing the name minus the prefix - Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size()); + Variant nameVar = + Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); nameVar.setLocation(nameLoc); // Issue the "annotationStart" event @@ -410,25 +333,34 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, } } -static void xmlEndElementHandler(void *p, const XML_Char *name) +static void xmlEndElementHandler(void *ref, const XML_Char *name) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); // Synchronize the position of the logger with teh position - xmlSyncLoggerPosition(parser); - - // Decrement the current depth - parser->getData().decrDepth(); + xmlSyncLoggerPosition(p); // Abort as long as we're in an annotation end tag if (parser->getData().inAnnotationEndTag()) { + parser->getData().decrDepth(); return; } + // Decrement the current depth + parser->getData().decrDepth(); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } + // Abort if the special ousia tag ends here - if (nameStr == "ousia" && parser->getData().depth == 0) { + std::string nameStr{name}; + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { return; } @@ -436,20 +368,105 @@ static void xmlEndElementHandler(void *p, const XML_Char *name) parser->getEvents().fieldEnd(); } -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); - - // TODO -/* size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - }*/ + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + return; + } + + // Convert the signed (smell the 90's C library here?) length to an usigned + // value + size_t ulen = len > 0 ? static_cast(len) : 0; + + // Synchronize the logger position + SourceLocation loc = xmlSyncLoggerPosition(p, ulen); + + // Fetch some variables for convenience + const WhitespaceMode mode = parser->getWhitespaceMode(); + OsxmlEventParserData &data = parser->getData(); + std::vector &textBuf = data.textBuf; + std::vector &whitespaceBuf = data.whitespaceBuf; + bool &hasWhitespace = data.hasWhitespace; + size_t &textStart = data.textStart; + size_t &textEnd = data.textEnd; + + size_t pos = loc.getStart(); + for (size_t i = 0; i < ulen; i++, pos++) { + switch (mode) { + case WhitespaceMode::PRESERVE: + PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd); + break; + case WhitespaceMode::TRIM: + TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + whitespaceBuf); + break; + case WhitespaceMode::COLLAPSE: + CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + hasWhitespace); + break; + } + } +} + +/* Class OsxmlEvents */ + +OsxmlEvents::~OsxmlEvents() {} + +/* Class OsxmlEventParser */ + +OsxmlEventParserData::OsxmlEventParserData() + : depth(0), + annotationEndTagDepth(-1), + hasWhitespace(false), + textStart(0), + textEnd(0) +{ +} + +void OsxmlEventParserData::incrDepth() { depth++; } + +void OsxmlEventParserData::decrDepth() +{ + if (depth > 0) { + depth--; + } + if (depth < annotationEndTagDepth) { + annotationEndTagDepth = -1; + } +} + +bool OsxmlEventParserData::inAnnotationEndTag() +{ + return (annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } + +bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } + +Variant OsxmlEventParserData::getText(SourceId sourceId) +{ + // Create a variant containing the string data and the location + Variant var = + Variant::fromString(std::string{textBuf.data(), textBuf.size()}); + var.setLocation({sourceId, textStart, textEnd}); + + // Reset the text buffers + textBuf.clear(); + whitespaceBuf.clear(); + hasWhitespace = false; + textStart = 0; + textEnd = 0; + + // Return the variant + return var; } /* Class OsxmlEventParser */ @@ -459,21 +476,22 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), + whitespaceMode(WhitespaceMode::TRIM), data(new OsxmlEventParserData()) { } -void OsxmlEventParser::parse(CharReader &reader) +OsxmlEventParser::~OsxmlEventParser() {} + +void OsxmlEventParser::parse() { // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; + GuardedExpatXmlParser p{"UTF-8"}; // Reset the depth - depth = 0; + data->depth = 0; - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); + // Pass the reference to this parser instance to the XML handler XML_SetUserData(&p, this); XML_UseParserAsHandlerArg(&p); @@ -498,7 +516,7 @@ void OsxmlEventParser::parse(CharReader &reader) if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { throw LoggableException{ "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, - xmlSyncLoggerPosition(p)}; + xmlSyncLoggerPosition(&p)}; } // Abort once there are no more bytes in the stream @@ -513,12 +531,17 @@ void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) this->whitespaceMode = whitespaceMode; } -CharReader &OsxmlEventParser::getCharReader() { return charReader; } +WhitespaceMode OsxmlEventParser::getWhitespaceMode() const +{ + return whitespaceMode; +} + +CharReader &OsxmlEventParser::getReader() const { return reader; } -Logger &OsxmlEventParser::getLogger() { return logger; } +Logger &OsxmlEventParser::getLogger() const { return logger; } -OsxmlEvents &OsxmlEventParser::getEvents() { return events; } +OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } -OsxmlEventParserData &OsxmlEventParser::getData() { return *data; } +OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index 5319ca6..aa20ea9 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -42,7 +42,7 @@ class Variant; class OsxmlEventParserData; /** - * Interface which defines the callback functions which are called by the + * Interface which defines the callback functions which are called by the * OsxmlEventParser whenever an event occurs. */ class OsxmlEvents { @@ -50,13 +50,13 @@ public: /** * Virtual destructor. */ - virtual ~OsxmlEvents() {} + virtual ~OsxmlEvents(); /** * Called whenever a command starts. Note that this implicitly always starts * the default field of the command. * - * @param name is a string variant containing name and location of the + * @param name is a string variant containing name and location of the * command. * @param args is a map variant containing the arguments that were given * to the command. @@ -67,12 +67,12 @@ public: * Called whenever an annotation starts. Note that this implicitly always * starts the default field of the annotation. * - * @param name is a string variant containing the name of the annotation + * @param name is a string variant containing the name of the annotation * class and the location of the annotation definition. * @param args is a map variant containing the arguments that were given * to the annotation definition. */ - virtual void annotationStart(Variant name, Variant args); + virtual void annotationStart(Variant name, Variant args) = 0; /** * Called whenever the range of an annotation ends. The callee must @@ -85,12 +85,12 @@ public: * ended here. May be empty (or nullptr), if no elementName has been * specified at the end of the annotation. */ - virtual void annotationEnd(Variant name, Variant elementName); + virtual void annotationEnd(Variant name, Variant elementName) = 0; /** - * Called whenever the default field which was implicitly started by + * Called whenever the default field which was implicitly started by * commandStart or annotationStart ends. Note that this does not end the - * range of an annotation, but the default field of the annotation. To + * range of an annotation, but the default field of the annotation. To * signal the end of the annotation this, the annotationEnd method will be * invoked. */ @@ -102,11 +102,10 @@ public: * is not called if the parsing failed, the parser prints an error message * instead. * - * @param data is the already parsed data that should be passed to the + * @param data is the already parsed data that should be passed to the * handler. */ virtual void data(Variant data) = 0; - }; /** @@ -148,7 +147,7 @@ public: * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents * of which the callback functions are called. * - * @param reader is a reference to the CharReader instance from which the + * @param reader is a reference to the CharReader instance from which the * XML should be read. * @param events is a refence at an instance of the OsxmlEvents class. All * events are forwarded to this class. @@ -157,6 +156,11 @@ public: */ OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + /** + * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type) + */ + ~OsxmlEventParser(); + /** * Performs the actual parsing. Reads the XML using eXpat and calles the * callbacks in the event listener instance whenever something interesting @@ -167,38 +171,44 @@ public: /** * Sets the whitespace handling mode. * - * @param whitespaceMode defines how whitespace in the data should be + * @param whitespaceMode defines how whitespace in the data should be * handled. */ void setWhitespaceMode(WhitespaceMode whitespaceMode); + /** + * Returns the current whitespace handling mode. + * + * @return the currently set whitespace handling mode. + */ + WhitespaceMode getWhitespaceMode() const; + /** * Returns the internal CharReader reference. * * @return the CharReader reference. */ - CharReader &getCharReader(); + CharReader &getReader() const; /** * Returns the internal Logger reference. * * @return the internal Logger reference. */ - Logger &getLogger(); + Logger &getLogger() const; /** * Returns the internal OsxmlEvents reference. * * @return the internal OsxmlEvents reference. */ - OsxmlEvents &getEvents(); + OsxmlEvents &getEvents() const; /** * Returns a reference at the internal data. */ - OsxmlEventParserData &getData(); + OsxmlEventParserData &getData() const; }; - } #endif /* _OSXML_EVENT_PARSER_HPP_ */ diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index e5eff05..b944af8 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -28,6 +28,7 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); +//static ConcreteLogger logger; TEST(OsmlStreamParser, empty) { diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp new file mode 100644 index 0000000..06c800f --- /dev/null +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -0,0 +1,222 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include + +namespace ousia { + +static TerminalLogger logger(std::cerr, true); +// static ConcreteLogger logger; + +namespace { +enum class OsxmlEvent { + COMMAND_START, + ANNOTATION_START, + ANNOTATION_END, + FIELD_END, + DATA +}; + +class TestOsxmlEventListener : public OsxmlEvents { +public: + std::vector> events; + + void commandStart(Variant name, Variant args) override + { + events.emplace_back(OsxmlEvent::COMMAND_START, + Variant::arrayType{name, args}); + } + + void annotationStart(Variant name, Variant args) override + { + events.emplace_back(OsxmlEvent::ANNOTATION_START, + Variant::arrayType{name, args}); + } + + void annotationEnd(Variant name, Variant elementName) override + { + events.emplace_back(OsxmlEvent::ANNOTATION_END, + Variant::arrayType{name, elementName}); + } + + void fieldEnd() override + { + events.emplace_back(OsxmlEvent::FIELD_END, Variant::arrayType{}); + } + + void data(Variant data) override + { + events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{data}); + } +}; + +static std::vector> parseXml( + const char *testString, + WhitespaceMode whitespaceMode = WhitespaceMode::TRIM) +{ + TestOsxmlEventListener listener; + CharReader reader(testString); + OsxmlEventParser parser(reader, listener, logger); + parser.setWhitespaceMode(whitespaceMode); + parser.parse(); + return listener.events; +} +} + +TEST(OsxmlEventParser, simpleCommandWithArgs) +{ + const char *testString = ""; + // 01234567 89012 3456 78 9012 34 5678 90123 456 + // 0 1 2 3 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{ + "a", Variant::mapType{ + {"name", "test"}, {"a", 1}, {"b", 2}, {"c", "blub"}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); + + // Check the locations (I'll do this one time and then just assume it works) + ASSERT_EQ(1U, events[0].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(2U, events[0].second.asArray()[0].getLocation().getEnd()); + ASSERT_EQ( + 9U, + events[0].second.asArray()[1].asMap()["name"].getLocation().getStart()); + ASSERT_EQ( + 13U, + events[0].second.asArray()[1].asMap()["name"].getLocation().getEnd()); + ASSERT_EQ( + 18U, + events[0].second.asArray()[1].asMap()["a"].getLocation().getStart()); + ASSERT_EQ( + 19U, events[0].second.asArray()[1].asMap()["a"].getLocation().getEnd()); + ASSERT_EQ( + 24U, + events[0].second.asArray()[1].asMap()["b"].getLocation().getStart()); + ASSERT_EQ( + 25U, events[0].second.asArray()[1].asMap()["b"].getLocation().getEnd()); + ASSERT_EQ( + 30U, + events[0].second.asArray()[1].asMap()["c"].getLocation().getStart()); + ASSERT_EQ( + 34U, events[0].second.asArray()[1].asMap()["c"].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, magicTopLevelTag) +{ + const char *testString = ""; + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"b", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); +} + +TEST(OsxmlEventParser, magicTopLevelTagInside) +{ + const char *testString = ""; + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"ousia", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); +} + +TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::PRESERVE); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, commandWithDataTrimWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::TRIM); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::COLLAPSE); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +} + -- cgit v1.2.3 From 9b4cdfabf6527440d6ffa499cc6b57a44daaeadb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:05:42 +0100 Subject: Added code for the handling of explicit default fields and improved unit tests --- CMakeLists.txt | 16 +- src/formats/osml/OsmlStreamParser.cpp | 78 +++++-- src/formats/osml/OsmlStreamParser.hpp | 45 +++- test/formats/osml/OsmlStreamParserTest.cpp | 340 +++++++++++++++++------------ 4 files changed, 302 insertions(+), 177 deletions(-) (limited to 'test') diff --git a/CMakeLists.txt b/CMakeLists.txt index bdc9541..d311f7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,15 +290,15 @@ IF(TEST) ousia_core ) -# ADD_EXECUTABLE(ousia_test_filesystem -# test/plugins/filesystem/FileLocatorTest -# ) + ADD_EXECUTABLE(ousia_test_filesystem + test/plugins/filesystem/FileLocatorTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_filesystem -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_filesystem + ${GTEST_LIBRARIES} + ousia_core + ousia_filesystem + ) # ADD_EXECUTABLE(ousia_test_css # test/plugins/css/Tokenizer diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6b00eef..6606120 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -60,6 +60,11 @@ public: */ TokenTypeId FieldEnd; + /** + * Id of the default field start token. + */ + TokenTypeId DefaultFieldStart; + /** * Registers the plain format tokens in the internal tokenizer. */ @@ -71,6 +76,7 @@ public: BlockCommentEnd = registerToken("}%"); FieldStart = registerToken("{"); FieldEnd = registerToken("}"); + DefaultFieldStart = registerToken("{!"); } }; @@ -164,7 +170,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); + commands.push(Command{"", Variant::mapType{}, true, true, true, false}); } Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) @@ -365,7 +371,7 @@ void OsmlStreamParser::pushCommand(Variant commandName, commands.pop(); } commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); + hasRange, false, false, false}); } OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) @@ -482,6 +488,29 @@ bool OsmlStreamParser::checkIssueFieldStart() return false; } +bool OsmlStreamParser::closeField() +{ + // Try to end an open field of the current command -- if the current command + // is not inside an open field, end this command and try to close the next + // one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + if (cmd.inDefaultField) { + commands.pop(); + } + return true; + } + commands.pop(); + } else { + return false; + } + } + return false; +} + OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data @@ -579,27 +608,29 @@ OsmlStreamParser::State OsmlStreamParser::parse() } logger.error( "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", + "start the field. Write \"\\{\" to insert this sequence as " + "text.", token); } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } + if (closeField()) { + return State::FIELD_END; + } + logger.error( + "Got field end token \"}\", but there is no field to end. " + "Write \"\\}\" to insert this sequence as text.", + token); + } else if (token.type == Tokens.DefaultFieldStart) { + // Try to start a default field the first time the token is reached + Command &topCmd = commands.top(); + if (!topCmd.inField) { + topCmd.inField = true; + topCmd.inDefaultField = true; + return State::FIELD_START; } logger.error( - "Got field end token \"}\", but there is no field to end. Did " - "you mean \"\\}\"?", + "Got default field start token \"{!\", but no command for " + "which to start the field. Write \"\\{!\" to insert this " + "sequence as text", token); } else { logger.error("Unexpected token \"" + token.content + "\"", token); @@ -627,14 +658,19 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } -const Variant &OsmlStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; } -const Variant &OsmlStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() const { return commands.top().arguments; } + +bool OsmlStreamParser::inDefaultField() const +{ + return commands.top().inRangeField || commands.top().inDefaultField; +} } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 1508012..bb5db65 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -152,10 +152,16 @@ public: */ bool inRangeField; + /** + * Set to true if we are currently in a field that has been especially + * marked as default field (using the "|") syntax. + */ + bool inDefaultField; + /** * Default constructor. */ - Command() : hasRange(false), inField(false), inRangeField(false) {} + Command() : hasRange(false), inField(false), inRangeField(false), inDefaultField() {} /** * Constructor of the Command class. @@ -168,16 +174,19 @@ public: * explicit range. * @param inField is set to true if we currently are inside a field * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. + * @param inRangeField is set to true if we currently are inside the + * outer field of a ranged command. + * @param inDefaultField is set to true if we currently are in a + * specially marked default field. */ Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField) + bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), inField(inField), - inRangeField(inRangeField) + inRangeField(inRangeField), + inDefaultField(inDefaultField) { } }; @@ -289,6 +298,16 @@ private: */ bool checkIssueFieldStart(); + /** + * Closes a currently open field. Note that the command will be removed from + * the internal command stack if the field that is being closed is a + * field marked as default field. + * + * @return true if the field could be closed, false if there was no field + * to close. + */ + bool closeField(); + public: /** * Constructor of the OsmlStreamParser class. Attaches the new @@ -317,7 +336,7 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() { return data; } + const Variant &getData() const { return data; } /** * Returns a reference at the internally stored command name. Only valid if @@ -326,7 +345,7 @@ public: * @return a reference at a variant containing name and location of the * parsed command. */ - const Variant &getCommandName(); + const Variant &getCommandName() const; /** * Returns a reference at the internally stored command name. Only valid if @@ -335,14 +354,22 @@ public: * @return a reference at a variant containing arguments given to the * command. */ - const Variant &getCommandArguments(); + const Variant &getCommandArguments() const; + + /** + * Returns true if the current field is the "default" field. This is true if + * the parser either is in the outer range of a range command or inside a + * field that has been especially marked as "default" field (using the "|" + * syntax). + */ + bool inDefaultField() const; /** * Returns a reference at the char reader. * * @return the last internal token location. */ - SourceLocation &getLocation() { return location; } + const SourceLocation &getLocation() const { return location; } }; } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index b944af8..da9fe8a 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -28,7 +28,88 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); -//static ConcreteLogger logger; +// static ConcreteLogger logger; + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + EXPECT_EQ(name, reader.getCommandName().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + assertCommand(reader, name, start, end); + EXPECT_EQ(args, reader.getCommandArguments()); +} + +static void assertData(OsmlStreamParser &reader, const std::string &data, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + EXPECT_EQ(data, reader.getData().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + EXPECT_EQ(defaultField, reader.inDefaultField()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertFieldEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} TEST(OsmlStreamParser, empty) { @@ -47,12 +128,7 @@ TEST(OsmlStreamParser, oneCharacter) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); + assertData(reader, "a", 0, 1); } TEST(OsmlStreamParser, whitespaceElimination) @@ -64,12 +140,7 @@ TEST(OsmlStreamParser, whitespaceElimination) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); + assertData(reader, "hello world", 1, 14); } TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) @@ -81,13 +152,7 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertData(reader, "hello world", 1, 14); } TEST(OsmlStreamParser, escapeWhitespace) @@ -99,13 +164,7 @@ TEST(OsmlStreamParser, escapeWhitespace) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(15U, loc.getEnd()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertData(reader, "hello world", 1, 15); } static void testEscapeSpecialCharacter(const std::string &c) @@ -127,6 +186,7 @@ TEST(OsmlStreamParser, escapeSpecialCharacters) testEscapeSpecialCharacter("}"); testEscapeSpecialCharacter("<"); testEscapeSpecialCharacter(">"); + testEscapeSpecialCharacter("|"); } TEST(OsmlStreamParser, simpleSingleLineComment) @@ -347,86 +407,6 @@ TEST(OsmlStreamParser, simpleCommandWithArgumentsAndName) ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); } -static void assertCommand(OsmlStreamParser &reader, const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); - EXPECT_EQ(name, reader.getCommandName().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertCommand(OsmlStreamParser &reader, const std::string &name, - const Variant::mapType &args, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - assertCommand(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); -} - -static void assertData(OsmlStreamParser &reader, const std::string &data, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldStart(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldEnd(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertEnd(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - TEST(OsmlStreamParser, fields) { const char *testString = "\\test{a}{b}{c}"; @@ -436,15 +416,15 @@ TEST(OsmlStreamParser, fields) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); assertData(reader, "b", 9, 10); assertFieldEnd(reader, 10, 11); - assertFieldStart(reader, 11, 12); + assertFieldStart(reader, false, 11, 12); assertData(reader, "c", 12, 13); assertFieldEnd(reader, 13, 14); assertEnd(reader, 14, 14); @@ -459,11 +439,11 @@ TEST(OsmlStreamParser, dataOutsideField) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); assertData(reader, "b", 9, 10); assertFieldEnd(reader, 10, 11); @@ -481,14 +461,14 @@ TEST(OsmlStreamParser, nestedCommand) assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); { assertCommand(reader, "test2", 9, 15); - assertFieldStart(reader, 15, 16); + assertFieldStart(reader, false, 15, 16); assertData(reader, "b", 16, 17); assertFieldEnd(reader, 17, 18); } @@ -507,10 +487,10 @@ TEST(OsmlStreamParser, nestedCommandImmediateEnd) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); { assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertData(reader, "b", 13, 14); assertFieldEnd(reader, 14, 15); } @@ -527,7 +507,7 @@ TEST(OsmlStreamParser, nestedCommandNoData) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); assertFieldEnd(reader, 12, 13); assertEnd(reader, 13, 13); @@ -557,11 +537,11 @@ TEST(OsmlStreamParser, fieldsWithSpaces) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, 3, 4); + assertFieldStart(reader, false, 3, 4); assertCommand(reader, "b", 4, 6); assertCommand(reader, "c", 7, 9); assertFieldEnd(reader, 9, 10); - assertFieldStart(reader, 16, 17); + assertFieldStart(reader, false, 16, 17); assertCommand(reader, "d", 17, 19); assertFieldEnd(reader, 19, 20); assertEnd(reader, 20, 20); @@ -612,9 +592,9 @@ TEST(OsmlStreamParser, errorNoFieldEndNested) logger.reset(); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertFieldEnd(reader, 13, 14); assertFieldEnd(reader, 14, 15); ASSERT_FALSE(logger.hasError()); @@ -633,9 +613,9 @@ TEST(OsmlStreamParser, errorNoFieldEndNestedData) logger.reset(); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertFieldEnd(reader, 13, 14); assertFieldEnd(reader, 14, 15); assertData(reader, "a", 15, 16); @@ -654,7 +634,7 @@ TEST(OsmlStreamParser, beginEnd) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertFieldEnd(reader, 17, 21); assertEnd(reader, 22, 22); } @@ -669,7 +649,7 @@ TEST(OsmlStreamParser, beginEndWithName) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", {{"name", "a"}}, 7, 11); - assertFieldStart(reader, 14, 15); + assertFieldStart(reader, true, 14, 15); assertFieldEnd(reader, 19, 23); assertEnd(reader, 24, 24); } @@ -685,7 +665,7 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgs) assertCommand(reader, "book", {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); + assertFieldStart(reader, true, 32, 33); assertFieldEnd(reader, 37, 41); assertEnd(reader, 42, 42); } @@ -702,17 +682,17 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) assertCommand(reader, "book", {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); + assertFieldStart(reader, false, 32, 33); assertData(reader, "a", 33, 34); assertCommand(reader, "test", Variant::mapType{}, 35, 40); assertFieldEnd(reader, 40, 41); - assertFieldStart(reader, 41, 42); + assertFieldStart(reader, false, 41, 42); assertData(reader, "b", 42, 43); assertCommand(reader, "test", Variant::mapType{}, 44, 49); - assertFieldStart(reader, 49, 50); + assertFieldStart(reader, false, 49, 50); assertFieldEnd(reader, 50, 51); assertFieldEnd(reader, 51, 52); - assertFieldStart(reader, 52, 53); + assertFieldStart(reader, true, 52, 53); assertFieldEnd(reader, 57, 61); assertEnd(reader, 62, 62); } @@ -727,12 +707,45 @@ TEST(OsmlStreamParser, beginEndWithData) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertData(reader, "a", 12, 13); assertFieldEnd(reader, 18, 22); assertEnd(reader, 23, 23); } +TEST(OsmlStreamParser, beginEndNested) +{ + const char *testString = + "\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; + // 012345678901234 5678901234567890 123456 7890123 4567890 + // 0 1 2 3 4 5 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 7, 8); + assertFieldStart(reader, false, 9, 10); + assertData(reader, "b", 10, 11); + assertFieldEnd(reader, 11, 12); + assertFieldStart(reader, true, 13, 14); + assertData(reader, "c", 13, 14); + assertCommand(reader, "d", 22, 23); + assertFieldStart(reader, false, 24, 25); + assertData(reader, "e", 25, 26); + assertFieldEnd(reader, 26, 27); + assertFieldStart(reader, false, 27, 28); + assertData(reader, "f", 28, 29); + assertFieldEnd(reader, 29, 30); + assertFieldStart(reader, true, 31, 32); + assertCommand(reader, "g", 31, 33); + assertFieldStart(reader, false, 33, 34); + assertData(reader, "h", 34, 35); + assertFieldEnd(reader, 35, 36); + assertFieldEnd(reader, 42, 43); + assertFieldEnd(reader, 49, 50); + assertEnd(reader, 51, 51); +} + TEST(OsmlStreamParser, beginEndWithCommand) { const char *testString = "\\begin{book}\\a{test}\\end{book}"; @@ -743,9 +756,9 @@ TEST(OsmlStreamParser, beginEndWithCommand) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, 14, 15); + assertFieldStart(reader, false, 14, 15); assertData(reader, "test", 15, 19); assertFieldEnd(reader, 19, 20); assertFieldEnd(reader, 25, 29); @@ -873,9 +886,9 @@ TEST(OsmlStreamParser, errorBeginEndMismatch) logger.reset(); assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, 10, 11); + assertFieldStart(reader, true, 10, 11); assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, 20, 24); + assertFieldStart(reader, true, 20, 24); assertData(reader, "test", 20, 24); ASSERT_FALSE(logger.hasError()); ASSERT_THROW(reader.parse(), LoggableException); @@ -904,7 +917,7 @@ TEST(OsmlStreamParser, beginEndWithNSSep) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, 19, 20); + assertFieldStart(reader, true, 19, 20); assertFieldEnd(reader, 24, 35); assertEnd(reader, 36, 36); } @@ -920,7 +933,7 @@ TEST(OsmlStreamParser, errorBeginNSSep) ASSERT_FALSE(logger.hasError()); assertCommand(reader, "blub"); ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader); + assertFieldStart(reader, true); assertFieldEnd(reader); assertEnd(reader); } @@ -934,7 +947,7 @@ TEST(OsmlStreamParser, errorEndNSSep) logger.reset(); assertCommand(reader, "blub"); - assertFieldStart(reader); + assertFieldStart(reader, true); ASSERT_FALSE(logger.hasError()); assertFieldEnd(reader); ASSERT_TRUE(logger.hasError()); @@ -970,5 +983,54 @@ TEST(OsmlStreamParser, errorRepeatedNs) assertData(reader, "::"); assertEnd(reader); } + +TEST(OsmlStreamParser, explicitDefaultField) +{ + const char *testString = "\\a{!b}c"; + // 01234567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertData(reader, "b", 4, 5); + assertFieldEnd(reader, 5, 6); + assertData(reader, "c", 6, 7); + assertEnd(reader, 7, 7); +} + +TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) +{ + const char *testString = "\\a{!\\b}c"; + // 0123 4567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + assertData(reader, "c", 7, 8); + assertEnd(reader, 8, 8); +} + +TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) +{ + const char *testString = "\\a{!\\b}{c}"; + // 0123 4567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + assertData(reader, "c", 7, 8); + assertEnd(reader, 8, 8); +} + } -- cgit v1.2.3 From 856fa8298d55c07313d9638d9f8b8c0913202b2c Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:06:05 +0100 Subject: Fixed forgotten unit test --- test/formats/osml/OsmlStreamParserTest.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'test') diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index da9fe8a..5f23822 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -1019,17 +1019,20 @@ TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) { const char *testString = "\\a{!\\b}{c}"; - // 0123 4567 + // 0123 456789 CharReader charReader(testString); OsmlStreamParser reader(charReader, logger); + logger.reset(); assertCommand(reader, "a", 0, 2); assertFieldStart(reader, true, 2, 4); assertCommand(reader, "b", 4, 6); assertFieldEnd(reader, 6, 7); - assertData(reader, "c", 7, 8); - assertEnd(reader, 8, 8); + ASSERT_FALSE(logger.hasError()); + assertData(reader, "c", 8, 9); + ASSERT_TRUE(logger.hasError()); + assertEnd(reader, 10, 10); } } -- cgit v1.2.3 From 205810b44c980998958dcd857c2cb34a914dc760 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Thu, 12 Feb 2015 16:21:36 +0100 Subject: Implemented annotation start and end field --- contrib/test.osdm | 29 ---- contrib/test.osml | 29 ++++ src/formats/osml/OsmlStreamParser.cpp | 116 ++++++++++++--- src/formats/osml/OsmlStreamParser.hpp | 16 +- test/formats/osml/OsmlStreamParserTest.cpp | 228 ++++++++++++++++++++++++++++- 5 files changed, 363 insertions(+), 55 deletions(-) delete mode 100644 contrib/test.osdm create mode 100644 contrib/test.osml (limited to 'test') diff --git a/contrib/test.osdm b/contrib/test.osdm deleted file mode 100644 index 100bc77..0000000 --- a/contrib/test.osdm +++ /dev/null @@ -1,29 +0,0 @@ -%{ - We're currently inside a block comment. - %{ - Note that block comments can be nested, easily allowing you to comment - out blocks which already contain comments. - }% -}% - -% Well, line comments, as we know them from TeX also work - -\import{meta} -\import{book} - -\domain#special_words{ - \struct#latex - \struct#ousia -} - -\book{ - \include{chapters/chapter1} - \include{chapters/chapter2} - - \begin{note}{Behaviour of "Include"} - Analogous to the `include` command in \latex, \ousia forces the included - file to be *complete* in a sense, that it must not have dangling open - commands. - \end{note} -} - diff --git a/contrib/test.osml b/contrib/test.osml new file mode 100644 index 0000000..100bc77 --- /dev/null +++ b/contrib/test.osml @@ -0,0 +1,29 @@ +%{ + We're currently inside a block comment. + %{ + Note that block comments can be nested, easily allowing you to comment + out blocks which already contain comments. + }% +}% + +% Well, line comments, as we know them from TeX also work + +\import{meta} +\import{book} + +\domain#special_words{ + \struct#latex + \struct#ousia +} + +\book{ + \include{chapters/chapter1} + \include{chapters/chapter2} + + \begin{note}{Behaviour of "Include"} + Analogous to the `include` command in \latex, \ousia forces the included + file to be *complete* in a sense, that it must not have dangling open + commands. + \end{note} +} + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6606120..0174fa4 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -65,6 +65,16 @@ public: */ TokenTypeId DefaultFieldStart; + /** + * Id of the annotation start token. + */ + TokenTypeId AnnotationStart; + + /** + * Id of the annotation end token. + */ + TokenTypeId AnnotationEnd; + /** * Registers the plain format tokens in the internal tokenizer. */ @@ -77,6 +87,8 @@ public: FieldStart = registerToken("{"); FieldEnd = registerToken("}"); DefaultFieldStart = registerToken("{!"); + AnnotationStart = registerToken("<\\"); + AnnotationEnd = registerToken("\\>"); } }; @@ -374,7 +386,8 @@ void OsmlStreamParser::pushCommand(Variant commandName, hasRange, false, false, false}); } -OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, + bool isAnnotation) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); @@ -388,6 +401,9 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) Utils::split(commandName.asString(), ':'); const bool isBegin = commandNameComponents[0] == "begin"; const bool isEnd = commandNameComponents[0] == "end"; + + // Parse the begin or end command + State res = State::COMMAND; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( @@ -396,30 +412,76 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) commandName); } if (isBegin) { - return parseBeginCommand(); + res = parseBeginCommand(); } else if (isEnd) { - return parseEndCommand(); + res = parseEndCommand(); + } + } else { + // Check whether the next character is a '#', indicating the start of + // the command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } } + + // Parse the arugments + Variant commandArguments = + parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); } - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); + // Check whether a ">" character is the next character that is to be read. + // In that case the current command could be an annotation end command! + char c; + if (reader.fetch(c) && c == '>') { + // Ignore the character after a begin or end command + if (isBegin || isEnd) { + logger.warning( + "Ignoring annotation end character \">\" after special " + "commands \"begin\" or \"end\". Write \"\\>\" to end a " + "\"begin\"/\"end\" enclosed annotation.", + reader); + return res; } - } - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + // If this should be an annoation, ignore the character + if (isAnnotation) { + logger.warning( + "Ignoring annotation end character \">\" after annotation " + "start command. Write \"\\>\" to end the annotation.", + reader); + } else { + // Make sure no arguments apart from the "name" argument are given + // to an annotation end + Variant::mapType &map = commands.top().arguments.asMap(); + if (!map.empty()) { + if (map.count("name") == 0 || map.size() > 1U) { + logger.error( + "An annotation end command may not have any arguments " + "other than \"name\""); + return res; + } + } - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); + // If we got here, this is a valid ANNOTATION_END command, issue it + reader.peek(c); + reader.consumePeek(); + return State::ANNOTATION_END; + } + } - return State::COMMAND; + // If we're starting an annotation, return the command as annotation start + // instead of command + if (isAnnotation && res == State::COMMAND) { + return State::ANNOTATION_START; + } + return res; } void OsmlStreamParser::parseBlockComment() @@ -522,7 +584,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() const TokenTypeId type = token.type; // Special handling for Backslash and Text - if (type == Tokens.Backslash) { + if (type == Tokens.Backslash || type == Tokens.AnnotationStart) { // Before appending anything to the output data or starting a new // command, check whether FIELD_START has to be issued, as the // current command is a command with range @@ -548,7 +610,8 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Parse the actual command - State res = parseCommand(token.location.getStart()); + State res = parseCommand(token.location.getStart(), + type == Tokens.AnnotationStart); switch (res) { case State::ERROR: throw LoggableException( @@ -565,6 +628,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() // to the data buffer, use the escape character start as start // location and the peek offset as end location reader.peek(c); // Peek the previously fetched character + + // If this was an annotation start token, add the parsed < to the + // output + if (type == Tokens.AnnotationStart) { + handler.append('<', token.location.getStart(), + token.location.getStart() + 1); + } + handler.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); @@ -632,6 +703,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() "which to start the field. Write \"\\{!\" to insert this " "sequence as text", token); + } else if (token.type == Tokens.AnnotationEnd) { + // We got a single annotation end token "\>" -- simply issue the + // ANNOTATION_END event + Variant annotationName = Variant::fromString(""); + annotationName.setLocation(token.location); + pushCommand(annotationName, Variant::mapType{}, false); + return State::ANNOTATION_END; } else { logger.error("Unexpected token \"" + token.content + "\"", token); } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index bb5db65..3827118 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -161,7 +161,13 @@ public: /** * Default constructor. */ - Command() : hasRange(false), inField(false), inRangeField(false), inDefaultField() {} + Command() + : hasRange(false), + inField(false), + inRangeField(false), + inDefaultField() + { + } /** * Constructor of the Command class. @@ -179,8 +185,8 @@ public: * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField, bool inDefaultField) + Command(Variant name, Variant arguments, bool hasRange, + bool inField, bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), @@ -266,9 +272,11 @@ private: * * @param start is the start byte offset of the command (including the * backslash) + * @param isAnnotation if true, the command is not returned as command, but + * as annotation start. * @return true if a command was actuall parsed, false otherwise. */ - State parseCommand(size_t start); + State parseCommand(size_t start, bool isAnnotation); /** * Function used internally to parse a block comment. diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 5f23822..d52fa5b 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -98,6 +98,56 @@ static void assertFieldEnd(OsmlStreamParser &reader, } } +static void assertAnnotationStart(OsmlStreamParser &reader, + const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse()); + EXPECT_EQ(name, reader.getCommandName().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertAnnotationStart(OsmlStreamParser &reader, + const std::string &name, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + assertAnnotationStart(reader, name, start, end); + EXPECT_EQ(args, reader.getCommandArguments()); +} + +static void assertAnnotationEnd(OsmlStreamParser &reader, + const std::string &name, + const std::string &elementName, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse()); + ASSERT_EQ(name, reader.getCommandName().asString()); + if (!elementName.empty()) { + ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); + ASSERT_EQ(1U, reader.getCommandArguments().asMap().count("name")); + + auto it = reader.getCommandArguments().asMap().find("name"); + ASSERT_EQ(elementName, it->second.asString()); + } + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + static void assertEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) @@ -184,9 +234,6 @@ TEST(OsmlStreamParser, escapeSpecialCharacters) testEscapeSpecialCharacter("\\"); testEscapeSpecialCharacter("{"); testEscapeSpecialCharacter("}"); - testEscapeSpecialCharacter("<"); - testEscapeSpecialCharacter(">"); - testEscapeSpecialCharacter("|"); } TEST(OsmlStreamParser, simpleSingleLineComment) @@ -1035,5 +1082,180 @@ TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) assertEnd(reader, 10, 10); } +TEST(OsmlStreamParser, annotationStart) +{ + const char *testString = "<\\a"; + // 0 12 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertEnd(reader, 3, 3); +} + +TEST(OsmlStreamParser, annotationStartWithName) +{ + const char *testString = "<\\annotationWithName#aName"; + // 0 1234567890123456789012345 + // 0 1 2 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart(reader, "annotationWithName", + Variant::mapType{{"name", "aName"}}, 0, 20); + assertEnd(reader, 26, 26); +} + +TEST(OsmlStreamParser, annotationStartWithArguments) +{ + const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; + // 0 1234567890123456789012345678901234 + // 0 1 2 3 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart( + reader, "annotationWithName", + Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); + assertEnd(reader, 35, 35); +} + +TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) +{ + const char *testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; + // 0 123456789012345678901234567 89012345 67 + // 0 1 2 3 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart( + reader, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, + 10); + assertFieldStart(reader, true, 26, 27); + assertData(reader, "a", 26, 27); + assertFieldEnd(reader, 33, 35); + assertAnnotationEnd(reader, "", "", 36, 38); + assertEnd(reader, 38, 38); +} + +TEST(OsmlStreamParser, annotationEnd) +{ + const char *testString = "\\a>"; + // 012 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "", 0, 2); + assertEnd(reader, 3, 3); +} + +TEST(OsmlStreamParser, annotationEndWithName) +{ + const char *testString = "\\a#name>"; + // 01234567 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 8, 8); +} + +TEST(OsmlStreamParser, annotationEndWithNameAsArgs) +{ + const char *testString = "\\a[name=name]>"; + // 01234567890123 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 14, 14); +} + +TEST(OsmlStreamParser, errorAnnotationEndWithArguments) +{ + const char *testString = "\\a[foo=bar]>"; + // 012345678901 + // 0 1 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "a", Variant::mapType{{"foo", "bar"}}, 0, 2); + ASSERT_TRUE(logger.hasError()); + assertData(reader, ">", 11, 12); + assertEnd(reader, 12, 12); +} + +TEST(OsmlStreamParser, closingAnnotation) +{ + const char *testString = "<\\a>"; + // 0 123 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertData(reader, ">", 3, 4); + assertEnd(reader, 4, 4); +} + +TEST(OsmlStreamParser, annotationWithFields) +{ + const char *testString = "a <\\b{c}{d}{!e} f \\> g"; + // 012 345678901234567 8901 + // 0 1 2 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertData(reader, "a", 0, 1); + assertAnnotationStart(reader, "b", Variant::mapType{}, 2, 5); + assertFieldStart(reader, false, 5, 6); + assertData(reader, "c", 6, 7); + assertFieldEnd(reader, 7, 8); + assertFieldStart(reader, false, 8, 9); + assertData(reader, "d", 9, 10); + assertFieldEnd(reader, 10, 11); + assertFieldStart(reader, true, 11, 13); + assertData(reader, "e", 13, 14); + assertFieldEnd(reader, 14, 15); + assertData(reader, "f", 16, 17); + assertAnnotationEnd(reader, "", "", 18, 20); + assertData(reader, "g", 21, 22); + assertEnd(reader, 22, 22); +} + +TEST(OsmlStreamParser, annotationStartEscape) +{ + const char *testString = "<\\%test"; + // 0 123456 + // 0 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertData(reader, "<%test", 0, 7); + assertEnd(reader, 7, 7); +} } -- cgit v1.2.3 From cc281d91def921b7bbf5d3d4a0fce53afc5a317b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:07:58 +0100 Subject: Renamed parser/generic to parser/stack and made filenames much shorter --- src/core/parser/generic/ParserState.cpp | 161 ------------ src/core/parser/generic/ParserState.hpp | 284 --------------------- src/core/parser/generic/ParserStateCallbacks.cpp | 26 -- src/core/parser/generic/ParserStateCallbacks.hpp | 106 -------- src/core/parser/generic/ParserStateHandler.cpp | 104 -------- src/core/parser/generic/ParserStateHandler.hpp | 281 --------------------- src/core/parser/generic/ParserStateStack.cpp | 187 -------------- src/core/parser/generic/ParserStateStack.hpp | 191 -------------- src/core/parser/stack/Callbacks.cpp | 23 ++ src/core/parser/stack/Callbacks.hpp | 99 ++++++++ src/core/parser/stack/Handler.cpp | 90 +++++++ src/core/parser/stack/Handler.hpp | 302 ++++++++++++++++++++++ src/core/parser/stack/Stack.cpp | 188 ++++++++++++++ src/core/parser/stack/Stack.hpp | 191 ++++++++++++++ src/core/parser/stack/State.cpp | 171 +++++++++++++ src/core/parser/stack/State.hpp | 307 +++++++++++++++++++++++ test/core/parser/ParserStateTest.cpp | 77 ------ test/core/parser/stack/StateTest.cpp | 79 ++++++ 18 files changed, 1450 insertions(+), 1417 deletions(-) delete mode 100644 src/core/parser/generic/ParserState.cpp delete mode 100644 src/core/parser/generic/ParserState.hpp delete mode 100644 src/core/parser/generic/ParserStateCallbacks.cpp delete mode 100644 src/core/parser/generic/ParserStateCallbacks.hpp delete mode 100644 src/core/parser/generic/ParserStateHandler.cpp delete mode 100644 src/core/parser/generic/ParserStateHandler.hpp delete mode 100644 src/core/parser/generic/ParserStateStack.cpp delete mode 100644 src/core/parser/generic/ParserStateStack.hpp create mode 100644 src/core/parser/stack/Callbacks.cpp create mode 100644 src/core/parser/stack/Callbacks.hpp create mode 100644 src/core/parser/stack/Handler.cpp create mode 100644 src/core/parser/stack/Handler.hpp create mode 100644 src/core/parser/stack/Stack.cpp create mode 100644 src/core/parser/stack/Stack.hpp create mode 100644 src/core/parser/stack/State.cpp create mode 100644 src/core/parser/stack/State.hpp delete mode 100644 test/core/parser/ParserStateTest.cpp create mode 100644 test/core/parser/stack/StateTest.cpp (limited to 'test') diff --git a/src/core/parser/generic/ParserState.cpp b/src/core/parser/generic/ParserState.cpp deleted file mode 100644 index f635d86..0000000 --- a/src/core/parser/generic/ParserState.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "ParserState.hpp" - -namespace ousia { - -/* Class ParserState */ - -ParserState::ParserState() : elementHandler(nullptr) {} - -ParserState::ParserState(ParserStateSet parents, Arguments arguments, - RttiSet createdNodeTypes, - HandlerConstructor elementHandler) - : parents(parents), - arguments(arguments), - createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) -{ -} - -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) -{ -} - -/* Class ParserStateBuilder */ - -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) -{ - this->state = state; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) -{ - state.parents = ParserStateSet{parent}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) -{ - state.parents = parents; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) -{ - state.arguments = arguments; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) -{ - state.createdNodeTypes = RttiSet{type}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) -{ - state.createdNodeTypes = types; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::elementHandler( - HandlerConstructor elementHandler) -{ - state.elementHandler = elementHandler; - return *this; -} - -const ParserState &ParserStateBuilder::build() const { return state; } - -/* Class ParserStateDeductor */ - -ParserStateDeductor::ParserStateDeductor( - std::vector signature, - std::vector states) - : tbl(signature.size()), - signature(std::move(signature)), - states(std::move(states)) -{ -} - -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) -{ - // Lookup the "active" state of (d, s), if it was not already set - // (e.second is true) we'll have to calculate it - auto e = tbl[d].emplace(s, false); - bool &res = e.first->second; - if (!e.second) { - return res; - } - - // Check whether this node is generative (may have produced the Node - // described by the current Signature element) - bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); - - if (isGenerative && d == 0) { - // End of recursion -- the last signature element is reached and the - // node was generative - res = true; - } else { - // Try repetition of this node - if (isGenerative && isActive(d - 1, s)) { - res = true; - } else { - // Check whether any of the parent nodes were active -- either for - // the previous element (if this one is generative) or for the - // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { - if ((isGenerative && isActive(d - 1, parent)) || - isActive(d, parent)) { - res = true; - break; - } - } - } - } - - return res; -} - -std::vector ParserStateDeductor::deduce() -{ - std::vector res; - if (!signature.empty()) { - const size_t D = signature.size(); - for (auto s : states) { - if (signature[D - 1]->isOneOf(s->createdNodeTypes) && - isActive(D - 1, s)) { - res.push_back(s); - } - } - } - return res; -} - -/* Constant initializations */ - -namespace ParserStates { -const ParserState All; -const ParserState None; -} -} - diff --git a/src/core/parser/generic/ParserState.hpp b/src/core/parser/generic/ParserState.hpp deleted file mode 100644 index 6487fdd..0000000 --- a/src/core/parser/generic/ParserState.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserState.hpp - * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of - * such classes. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_HPP_ -#define _OUSIA_PARSER_STATE_HPP_ - -#include - -#include -#include - -namespace ousia { - -// Forward declarations -class ParserStateBuilder; -class ParserState; -class HandlerData; -class Handler; -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * Set of pointers of parser states -- used for specifying a set of parent - * states. - */ -using ParserStateSet = std::unordered_set; - -/** - * Class used for the complete specification of a ParserState. Stores possible - * parent states, state handlers and arguments to be passed to that state. - */ -struct ParserState { - /** - * Vector containing all possible parent states. - */ - ParserStateSet parents; - - /** - * Descriptor of the arguments that should be passed to the handler. - */ - Arguments arguments; - - /** - * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. - */ - RttiSet createdNodeTypes; - - /** - * Pointer at a function which creates a new concrete Handler instance for - * the elements described by this state. May be nullptr in which case no - * handler instance is created. - */ - HandlerConstructor elementHandler; - - /** - * Default constructor, initializes the handlers with nullptr. - */ - ParserState(); - - /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. - * - * @param parents is a vector containing all possible parent states. - * @param arguments is a descriptor of arguments that should be passed to - * the handler. - * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope - * when a file is included. - * @param elementHandler is a pointer at a function which creates a new - * concrete Handler instance for the elements described by this state. May - * be nullptr in which case no handler instance is created. - */ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); - - /** - * Creates this ParserState from the given ParserStateBuilder instance. - */ - ParserState(const ParserStateBuilder &builder); -}; - -/** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. - */ -class ParserStateBuilder { -private: - /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. - */ - ParserState state; - -public: - /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. - * - * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder ©(const ParserState &state); - - /** - * Sets the possible parent states to the single given parent element. - * - * @param parent is a pointer at the parent ParserState instance that should - * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parent(const ParserState *parent); - - /** - * Sets the ParserState instances in the given ParserStateSet as the list of - * supported parent states. - * - * @param parents is a set of pointers at ParserState instances that should - * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parents(const ParserStateSet &parents); - - /** - * Sets the arguments that should be passed to the parser state handler to - * those given as argument. - * - * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &arguments(const Arguments &arguments); - - /** - * Sets the Node types this state may produce to the given Rtti descriptor. - * - * @param type is the Rtti descriptor of the Type that may be produced by - * this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeType(const Rtti *type); - - /** - * Sets the Node types this state may produce to the given Rtti descriptors. - * - * @param types is a set of Rtti descriptors of the Types that may be - * produced by this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); - - /** - * Sets the constructor for the element handler. The constructor creates a - * new concrete Handler instance for the elements described by this state. - * May be nullptr in which case no handler instance is created (this is - * the default value). - * - * @param elementHandler is the HandlerConstructor that should create a - * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); - - /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. - * - * @return the built ParserState. - */ - const ParserState &build() const; -}; - -/** - * Class used to deduce the ParserState a Parser is currently in based on the - * types of the Nodes that currently are on the ParserStack. Uses dynamic - * programming in order to solve this problem. - */ -class ParserStateDeductor { -public: - /** - * Type containing the dynamic programming table. - */ - using Table = std::vector>; - -private: - /** - * Dynamic programming table. - */ - Table tbl; - - /** - * Signature given in the constructor. - */ - const std::vector signature; - - /** - * List of states that should be checked for being active. - */ - const std::vector states; - - /** - * Used internally to check whether the given parser stack s may have been - * active for signature element d. - * - * @param d is the signature element. - * @param s is the parser state. - * @return true if the the given ParserState may have been active. - */ - bool isActive(size_t d, const ParserState *s); - -public: - /** - * Constructor of the ParserStateDeductor class. - * - * @param signature a Node type signature describing the types of the nodes - * which currently reside on e.g. the ParserScope stack. - * @param states is a list of states that should be checked. - */ - ParserStateDeductor(std::vector signature, - std::vector states); - - /** - * Selects all active states from the given states. Only considers those - * states that may have produced the last signature element. - * - * @return a list of states that may actually have been active. - */ - std::vector deduce(); -}; - -/** - * The ParserStates namespace contains all the global state constants used - * in the ParserStack class. - */ -namespace ParserStates { -/** - * State representing all states. - */ -extern const ParserState All; - -/** - * State representing the initial state. - */ -extern const ParserState None; -} -} - -#endif /* _OUSIA_PARSER_STATE_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateCallbacks.cpp b/src/core/parser/generic/ParserStateCallbacks.cpp deleted file mode 100644 index 50bac57..0000000 --- a/src/core/parser/generic/ParserStateCallbacks.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -namespace ousia { - -/* Class ParserStateCallbacks */ - -} - diff --git a/src/core/parser/generic/ParserStateCallbacks.hpp b/src/core/parser/generic/ParserStateCallbacks.hpp deleted file mode 100644 index 7ec5264..0000000 --- a/src/core/parser/generic/ParserStateCallbacks.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStateCallbacks.hpp - * - * Contains an interface defining the callbacks that can be directed from a - * ParserStateHandler to the ParserStateStack, and from the ParserStateStack to - * the actual parser. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_CALLBACKS_HPP_ -#define _OUSIA_PARSER_STATE_CALLBACKS_HPP_ - -#include - -#include - -namespace ousia { - -/** - * Interface defining a set of callback functions that act as a basis for the - * ParserStateStackCallbacks and the ParserCallbacks. - */ -class ParserStateCallbacks { -public: - /** - * Virtual descructor. - */ - virtual ~ParserStateCallbacks() {}; - - /** - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; - - /** - * Sets the type as which the variant data should be parsed. - * - * @param type is one of the VariantType constants, specifying with which - * type the data that is passed to the ParserStateHandler in the "data" - * function should be handled. - */ - virtual void setDataType(VariantType type) = 0; - - /** - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. - */ - virtual void registerToken(const std::string &token) = 0; - - /** - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be unregistered. - */ - virtual void unregisterToken(const std::string &token) = 0; -}; - -/** - * Interface defining the callback functions that can be passed from a - * ParserStateStack to the underlying parser. - */ -class ParserCallbacks : public ParserStateCallbacks { - /** - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". - * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. - */ - virtual bool supportsToken(const std::string &token) = 0; -} - -} - -#endif /* _OUSIA_PARSER_STATE_CALLBACKS_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateHandler.cpp b/src/core/parser/generic/ParserStateHandler.cpp deleted file mode 100644 index 64e2bfa..0000000 --- a/src/core/parser/generic/ParserStateHandler.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include "ParserStateHandler.hpp" - -namespace ousia { - -/* Class ParserStatedata */ - -ParserStatedata::ParserStatedata(ParserContext &ctx, - ParserStateCallbacks &callbacks, - std::string name, const ParserState &state, - const ParserState &parentState, - const SourceLocation location) - : ctx(ctx), - callbacks(callbacks), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; - -/* Class ParserStateHandler */ - -ParserStateHandler::ParserStateHandler(const ParserStatedata &data) : data(data) -{ -} - -ParserContext &ParserStateHandler::context() { return data.ctx; } - -const std::string &ParserStateHandler::name() { return data.name; } - -ParserScope &ParserStateHandler::scope() { return data.ctx.getScope(); } - -Manager &ParserStateHandler::manager() { return data.ctx.getManager(); } - -Logger &ParserStateHandler::logger() { return data.ctx.getLogger(); } - -Rooted ParserStateHandler::project() { return data.ctx.getProject(); } - -const ParserState &ParserStateHandler::state() { return data.state; } - -SourceLocation ParserStateHandler::location() { return data.location; } - -void ParserStateHandler::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - data.callbacks.setWhitespaceMode(whitespaceMode); -} - -void ParserStateHandler::setDataType(VariantType type) -{ - data.callbacks.setDataType(type); -} - -bool ParserStateHandler::supportsToken(const std::string &token) -{ - return data.callbacks.supportsToken(token); -} - -void ParserStateHandler::registerToken(const std::string &token) -{ - data.callbacks.registerToken(token); -} - -void ParserStateHandler::unregisterToken(const std::string &token) -{ - data.callbacks.unregisterToken(token); -} - -void ParserStateHandler::data(const std::string &data, int field) -{ - if (Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class DefaultParserStateHandler */ - -void DefaultParserStateHandler::start(Variant::mapType &args) {} - -void DefaultParserStateHandler::end() {} - -ParserStateHandler *DefaultParserStateHandler::create(const data &data) -{ - return new DefaultHandler{data}; -} -} - diff --git a/src/core/parser/generic/ParserStateHandler.hpp b/src/core/parser/generic/ParserStateHandler.hpp deleted file mode 100644 index f3c836e..0000000 --- a/src/core/parser/generic/ParserStateHandler.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_PARSER_STATE_HANDLER_HPP_ -#define _OUSIA_PARSER_STATE_HANDLER_HPP_ - -#include -#include - -#include - -namespace ousia { - -// Forward declarations -class ParserContext; -class ParserState; -class ParserStateCallbacks; - -/** - * Class collecting all the data that is being passed to a ParserStateHandler - * instance. - */ -class ParserStateHandlerData { -public: - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. - */ - ParserContext &ctx; - - /** - * Reference at an instance of the ParserStateCallbacks class, used for - * modifying the behaviour of the parser (like registering tokens, setting - * the data type or changing the whitespace handling mode). - */ - ParserStateCallbacks &callbacks; - - /** - * Contains the name of the command that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param callbacks is an instance of ParserStateCallbacks used to notify - * the parser about certain state changes. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. - */ - ParserStateHandlerData(ParserContext &ctx, ParserStateCallbacks &callbacks, - std::string name, const ParserState &state, - const ParserState &parentState, - const SourceLocation &location); -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class ParserStateHandler { -private: - /** - * Structure containing the internal handler data. - */ - const ParserStateHandlerData data; - -protected: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - ParserStateHandler(const ParserStateHandlerData &data){}; - -public: - /** - * Virtual destructor. - */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context(); - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name(); - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope(); - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager(); - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger(); - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. - */ - Rooted project(); - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state(); - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location(); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Sets the type as which the variant data should be parsed. - * - * @param type is one of the VariantType constants, specifying with which - * type the data that is passed to the ParserStateHandler in the "data" - * function should be handled. - */ - void setDataType(VariantType type); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". - * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. - */ - bool supportsToken(const std::string &token); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. - */ - void registerToken(const std::string &token); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be unregistered. - */ - void unregisterToken(const std::string &token); - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. - */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. - */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. - */ -class DefaultParserStateHandler : public ParserStateHandler { -public: - using ParserStateHandler::ParserStateHandler; - - void start(Variant::mapType &args) override; - - void end() override; - - static Handler *create(const HandlerData &handlerData); -}; -} - -#endif /* _OUSIA_PARSER_STATE_HANDLER_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateStack.cpp b/src/core/parser/generic/ParserStateStack.cpp deleted file mode 100644 index 8c32f17..0000000 --- a/src/core/parser/generic/ParserStateStack.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include -#include - -#include "ParserScope.hpp" -#include "ParserStateStack.hpp" - -namespace ousia { - -/* Class ParserStateStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStateStack::ParserStateStack( - ParserContext &ctx, - const std::multimap &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStateStack::deduceState() -{ - // Assemble all states - std::vector states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); - stack.emplace(handler); - return true; -} - -std::set ParserStateStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStateStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStateStack::currentCommandName() -{ - return stack.empty() ? std::string{} : stack.top()->name(); -} - -const ParserState *ParserStateStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStateStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStateStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStateStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStateStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/generic/ParserStateStack.hpp b/src/core/parser/generic/ParserStateStack.hpp deleted file mode 100644 index b106475..0000000 --- a/src/core/parser/generic/ParserStateStack.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStateStack.hpp - * - * Helper classes for document or description parsers. Contains the - * ParserStateStack class, which is an pushdown automaton responsible for - * accepting commands in the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_STACK_HPP_ -#define _OUSIA_PARSER_STATE_STACK_HPP_ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * The ParserStateStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStateStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStateStack class. - * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStateStack( - ParserContext &ctx, - const std::multimap &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. - * @param logger is the logger instance to which error messages should be - * written. - * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStateStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState ¤tState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command is reached. - * - * @param name is the name of the command (including the namespace - * separator ':') and its corresponding location. Must be a string variant. - * @param args is a map variant containing the arguments that were passed to - * the command. - */ - void command(Variant name, Variant args); - - /** - * Function that should be called whenever a new field starts. Fields of the - * same command may not be separated by calls to - */ - void fieldStart(); - - /** - * Function that should be called whenever a field ends. - */ - void fieldEnd(); - - /** - * Function that shuold be called whenever character data is found in the - * input stream. - * - * @param data is a variant of any type containing the data that was parsed - * as data. - */ - void data(Variant data); - - /** - * Function that should be called whenever an annotation starts. - * - * @param name is the name of the annotation class. - * @param args is a map variant containing the arguments that were passed - * to the annotation. - */ - void annotationStart(Variant name, Variant args); - - /** - * Function that should be called whenever an annotation ends. - * - * @param name is the name of the annotation class that was ended. - * @param annotationName is the name of the annotation that was ended. - */ - void annotationEnd(Variant name, Variant annotationName); - - /** - * Function that should be called whenever a previously registered token - * is found in the input stream. - * - * @param token is string variant containing the token that was encountered. - */ - void token(Variant token); -}; -} - -#endif /* _OUSIA_PARSER_STATE_STACK_HPP_ */ - diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp new file mode 100644 index 0000000..6ebc549 --- /dev/null +++ b/src/core/parser/stack/Callbacks.cpp @@ -0,0 +1,23 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Callbacks.hpp" + +namespace ousia { +} + diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp new file mode 100644 index 0000000..bb56e44 --- /dev/null +++ b/src/core/parser/stack/Callbacks.hpp @@ -0,0 +1,99 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Callbacks.hpp + * + * Contains an interface defining the callbacks that can be directed from a + * StateHandler to the StateStack, and from the StateStack to + * the actual parser. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_CALLBACKS_HPP_ +#define _OUSIA_PARSER_STATE_CALLBACKS_HPP_ + +#include + +#include + +namespace ousia { +namespace parser_stack { + +/** + * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. + */ +class Callbacks { +public: + /** + * Virtual descructor. + */ + virtual ~Callbacks() {}; + + /** + * Sets the whitespace mode that specifies how string data should be + * processed. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + + /** + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + virtual void registerToken(const std::string &token) = 0; + + /** + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + virtual void unregisterToken(const std::string &token) = 0; +}; + +/** + * Interface defining the callback functions that can be passed from a + * StateStack to the underlying parser. + */ +class ParserCallbacks : public Callbacks { + /** + * Checks whether the given token is supported by the parser. The parser + * returns true, if the token is supported, false if this token cannot be + * registered. Note that parsers that do not support the registration of + * tokens at all should always return "true". + * + * @param token is the token that should be checked for support. + * @return true if the token is generally supported (or the parser does not + * support registering tokens at all), false if the token is not supported, + * because e.g. it is a reserved token or it interferes with other tokens. + */ + virtual bool supportsToken(const std::string &token) = 0; +}; + +} +} + +#endif /* _OUSIA_PARSER_STATE_CALLBACKS_HPP_ */ + diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp new file mode 100644 index 0000000..66af2a4 --- /dev/null +++ b/src/core/parser/stack/Handler.cpp @@ -0,0 +1,90 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include "Callbacks.hpp" +#include "Handler.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerData */ + +HandlerData::HandlerData(ParserContext &ctx, Callbacks &callbacks, + std::string name, const State &state, + const SourceLocation &location) + : ctx(ctx), + callbacks(callbacks), + name(std::move(name)), + state(state), + location(location) +{ +} + +/* Class Handler */ + +Handler::Handler(const HandlerData &internalData) : internalData(internalData) +{ +} + +Handler::~Handler() {} + +ParserContext &Handler::context() { return internalData.ctx; } + +const std::string &Handler::name() { return internalData.name; } + +ParserScope &Handler::scope() { return internalData.ctx.getScope(); } + +Manager &Handler::manager() { return internalData.ctx.getManager(); } + +Logger &Handler::logger() { return internalData.ctx.getLogger(); } + +const State &Handler::state() { return internalData.state; } + +SourceLocation Handler::location() { return internalData.location; } + +void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + internalData.callbacks.setWhitespaceMode(whitespaceMode); +} + +void Handler::registerToken(const std::string &token) +{ + internalData.callbacks.registerToken(token); +} + +void Handler::unregisterToken(const std::string &token) +{ + internalData.callbacks.unregisterToken(token); +} + +/* Class DefaultHandler */ + +/*void DefaultHandler::start(Variant::mapType &args) {} + +void DefaultHandler::end() {} + +Handler *DefaultHandler::create(const data &data) +{ + return new DefaultHandler{data}; +}*/ +} +} + diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp new file mode 100644 index 0000000..0701343 --- /dev/null +++ b/src/core/parser/stack/Handler.hpp @@ -0,0 +1,302 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_PARSER_STATE_HANDLER_HPP_ +#define _OUSIA_PARSER_STATE_HANDLER_HPP_ + +#include +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class ParserContext; +class Callbacks; +class Logger; +class Project; + +namespace parser_stack { + +// More forward declarations +class State; + +/** + * Class collecting all the data that is being passed to a Handler + * instance. + */ +class HandlerData { +public: + /** + * Reference to the ParserContext instance that should be used to resolve + * references to nodes in the Graph. + */ + ParserContext &ctx; + + /** + * Reference at an instance of the Callbacks class, used for + * modifying the behaviour of the parser (like registering tokens, setting + * the data type or changing the whitespace handling mode). + */ + Callbacks &callbacks; + + /** + * Contains the name of the command that is being handled. + */ + std::string name; + + /** + * Contains the current state of the state machine. + */ + const State &state; + + /** + * Current source code location. + */ + SourceLocation location; + + /** + * Constructor of the HandlerData class. + * + * @param ctx is the parser context the handler should be executed in. + * @param callbacks is an instance of Callbacks used to notify + * the parser about certain state changes. + * @param name is the name of the string. + * @param state is the state this handler was called for. + * @param location is the location at which the handler is created. + */ + HandlerData(ParserContext &ctx, Callbacks &callbacks, std::string name, + const State &state, const SourceLocation &location); +}; + +/** + * The Handler class provides a context for handling a generic stack element. + * It has to beoverridden and registered in the StateStack class to form + * handlers for concrete XML tags. + */ +class Handler { +private: + /** + * Structure containing the internal handler data. + */ + const HandlerData internalData; + +protected: + /** + * Constructor of the Handler class. + * + * @param data is a structure containing all data being passed to the + * handler. + */ + Handler(const HandlerData &internalData); + + /** + * Returns a reference at the ParserContext. + * + * @return a reference at the ParserContext. + */ + ParserContext &context(); + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name(); + + /** + * Returns a reference at the ParserScope instance. + * + * @return a reference at the ParserScope instance. + */ + ParserScope &scope(); + + /** + * Returns a reference at the Manager instance which manages all nodes. + * + * @return a referance at the Manager instance. + */ + Manager &manager(); + + /** + * Returns a reference at the Logger instance used for logging error + * messages. + * + * @return a reference at the Logger instance. + */ + Logger &logger(); + + /** + * Reference at the State descriptor for which this Handler was created. + * + * @return a const reference at the constructing State descriptor. + */ + const State &state(); + + /** + * Returns the current location in the source file. + * + * @return the current location in the source file. + */ + SourceLocation location(); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); + + /** + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Calls the corresponding function in the Callbacks instance. + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + void registerToken(const std::string &token); + + /** + * Calls the corresponding function in the Callbacks instance. + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + void unregisterToken(const std::string &token); + + /** + * Called when the command that was specified in the constructor is + * instanciated. + * + * @param args is a map from strings to variants (argument name and value). + * @return true if the handler was successful in starting the element it + * represents, false otherwise. + */ + virtual bool start(Variant::mapType &args) = 0; + + /** + * Called before the command for which this handler is defined ends (is + * forever removed from the stack). + */ + virtual void end() = 0; + + /** + * Called when a new field starts, while the handler is active. This + * function should return true if the field is supported, false otherwise. + * No error should be logged if the field cannot be started, the caller will + * take care of that (since it is always valid to start a default field, + * even though the corresponding structure does not have a field, as long as + * no data is fed into the field). + * + * @param isDefaultField is set to true if the field that is being started + * is the default/tree field. The handler should set the value of this + * variable to true if the referenced field is indeed the default field. + * @param isImplicit is set to true if the field is implicitly being started + * by the stack (this field always implies isDefaultField being set to + * true). + * @param fieldIndex is the numerical index of the field. + */ + virtual bool fieldStart(bool &isDefaultField, bool isImplicit, + size_t fieldIndex) = 0; + + /** + * Called when a previously opened field ends, while the handler is active. + * Note that a "fieldStart" and "fieldEnd" are always called alternately. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever an annotation starts while this handler is active. The + * function should return true if starting the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the name in the source code. + * @param args is a map from strings to variants (argument name and value). + * @return true if the mentioned annotation could be started here, false + * if an error occurred. + */ + virtual bool annotationStart(Variant className, Variant::mapType &args) = 0; + + /** + * Called whenever an annotation ends while this handler is active. The + * function should return true if ending the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the class name in the source code. + * @param elementName is a string variant containing the name of the + * annotation class and the location of the element name in the source code. + * @return true if the mentioned annotation could be started here, false if + * an error occurred. + */ + virtual bool annotationEnd(Variant className, Variant elementName) = 0; + + /** + * Called whenever raw data (int the form of a string) is available for the + * Handler instance. + * + * @param data is a string variant containing the character data and its + * location. + */ + virtual void data(Variant data) = 0; +}; + +/** + * HandlerConstructor is a function pointer type used to create concrete + * instances of the Handler class. + * + * @param handlerData is the data that should be passed to the new handler + * instance. + * @return a newly created handler instance. + */ +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * The DefaultHandler class is used in case no element handler is specified in + * the State descriptor. + */ +/*class EmptyHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override; + + void end() override; + + static Handler *create(const HandlerData &handlerData); +};*/ + +} +} + +#endif /* _OUSIA_PARSER_STATE_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp new file mode 100644 index 0000000..1d83a68 --- /dev/null +++ b/src/core/parser/stack/Stack.cpp @@ -0,0 +1,188 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "Stack.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class StateStack */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + */ +static LoggableException InvalidCommand(const std::string &name, + const std::set &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +StateStack::StateStack( + ParserContext &ctx, + const std::multimap &states) + : ctx(ctx), states(states) +{ +} + +bool StateStack::deduceState() +{ + // Assemble all states + std::vector states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector possibleStates = + StateDeductor(ctx.getScope().getStackTypeSignature(), states) + .deduce(); + if (possibleStates.size() != 1) { + ctx.getLogger().error( + "Error while including file: Cannot deduce parser state."); + return false; + } + + // Switch to this state by creating a dummy handler + const State *state = possibleStates[0]; + Handler *handler = + DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); + stack.emplace(handler); + return true; +} + +std::set StateStack::expectedCommands() +{ + const State *currentState = &(this->currentState()); + std::set res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const State &StateStack::currentState() +{ + return stack.empty() ? States::None : stack.top()->state(); +} + +std::string StateStack::currentCommandName() +{ + return stack.empty() ? std::string{} : stack.top()->name(); +} + +const State *StateStack::findTargetState(const std::string &name) +{ + const State *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const StateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&States::All)) { + return it->second; + } + } + + return nullptr; +} + +void StateStack::start(const std::string &name, Variant::mapType &args, + const SourceLocation &location) +{ + State const *targetState = findTargetState(name); +// TODO: Andreas, please improve this. +// if (!Utils::isIdentifier(name)) { +// throw LoggableException(std::string("Invalid identifier \"") + name + +// std::string("\"")); +// } + + if (targetState == nullptr) { + targetState = findTargetState("*"); + } + if (targetState == nullptr) { + throw InvalidCommand(name, expectedCommands()); + } + + // Fetch the associated constructor + HandlerConstructor ctor = targetState->elementHandler + ? targetState->elementHandler + : DefaultHandler::create; + + // Canonicalize the arguments, allow additional arguments + targetState->arguments.validateMap(args, ctx.getLogger(), true); + + // Instantiate the handler and call its start function + Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); + handler->start(args); + stack.emplace(handler); +} + +void StateStack::start(std::string name, const Variant::mapType &args, + const SourceLocation &location) +{ + Variant::mapType argsCopy(args); + start(name, argsCopy); +} + +void StateStack::end() +{ + // Check whether the current command could be ended + if (stack.empty()) { + throw LoggableException{"No command to end."}; + } + + // Remove the current HandlerInstance from the stack + std::shared_ptr inst{stack.top()}; + stack.pop(); + + // Call the end function of the last Handler + inst->end(); +} + +void StateStack::data(const std::string &data, int field) +{ + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException{"No command to receive data."}; + } + + // Pass the data to the current Handler instance + stack.top()->data(data, field); +} +} +} + diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp new file mode 100644 index 0000000..b106475 --- /dev/null +++ b/src/core/parser/stack/Stack.hpp @@ -0,0 +1,191 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file ParserStateStack.hpp + * + * Helper classes for document or description parsers. Contains the + * ParserStateStack class, which is an pushdown automaton responsible for + * accepting commands in the correct order and calling specified handlers. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_STACK_HPP_ +#define _OUSIA_PARSER_STATE_STACK_HPP_ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "Parser.hpp" +#include "ParserContext.hpp" +#include "ParserState.hpp" + +namespace ousia { + +/** + * The ParserStateStack class is a pushdown automaton responsible for turning a + * command stream into a tree of Node instances. + */ +class ParserStateStack { +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap &states; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::stack> stack; + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from for the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + *state + * otherwise. + */ + const ParserState *findTargetState(const std::string &name); + +public: + /** + * Creates a new instance of the ParserStateStack class. + * + * @param ctx is the parser context the parser stack is working on. + * @param states is a map containing the command names and pointers at the + * corresponding ParserState instances. + */ + ParserStateStack( + ParserContext &ctx, + const std::multimap &states); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + + an equivalent state as the one in the including file. + * + * @param scope is the ParserScope instance from which the ParserState + * should be reconstructed. + * @param logger is the logger instance to which error messages should be + * written. + * @return true if the operation was sucessful, false otherwise. + */ + bool deduceState(); + + /** + * Returns the state the ParserStateStack instance currently is in. + * + * @return the state of the currently active Handler instance or STATE_NONE + * if no handler is on the stack. + */ + const ParserState ¤tState(); + + /** + * Returns the command name that is currently being handled. + * + * @return the name of the command currently being handled by the active + * Handler instance or an empty string if no handler is currently active. + */ + std::string currentCommandName(); + + /** + * Function that should be called whenever a new command is reached. + * + * @param name is the name of the command (including the namespace + * separator ':') and its corresponding location. Must be a string variant. + * @param args is a map variant containing the arguments that were passed to + * the command. + */ + void command(Variant name, Variant args); + + /** + * Function that should be called whenever a new field starts. Fields of the + * same command may not be separated by calls to + */ + void fieldStart(); + + /** + * Function that should be called whenever a field ends. + */ + void fieldEnd(); + + /** + * Function that shuold be called whenever character data is found in the + * input stream. + * + * @param data is a variant of any type containing the data that was parsed + * as data. + */ + void data(Variant data); + + /** + * Function that should be called whenever an annotation starts. + * + * @param name is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + */ + void annotationStart(Variant name, Variant args); + + /** + * Function that should be called whenever an annotation ends. + * + * @param name is the name of the annotation class that was ended. + * @param annotationName is the name of the annotation that was ended. + */ + void annotationEnd(Variant name, Variant annotationName); + + /** + * Function that should be called whenever a previously registered token + * is found in the input stream. + * + * @param token is string variant containing the token that was encountered. + */ + void token(Variant token); +}; +} + +#endif /* _OUSIA_PARSER_STATE_STACK_HPP_ */ + diff --git a/src/core/parser/stack/State.cpp b/src/core/parser/stack/State.cpp new file mode 100644 index 0000000..d72f533 --- /dev/null +++ b/src/core/parser/stack/State.cpp @@ -0,0 +1,171 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class State */ + +State::State() : elementHandler(nullptr) {} + +State::State(StateSet parents, Arguments arguments, + RttiSet createdNodeTypes, + HandlerConstructor elementHandler, + bool supportsAnnotations) + : parents(parents), + arguments(arguments), + createdNodeTypes(createdNodeTypes), + elementHandler(elementHandler), + supportsAnnotations(supportsAnnotations) +{ +} + +State::State(const StateBuilder &builder) + : State(builder.build()) +{ +} + +/* Class StateBuilder */ + +StateBuilder &StateBuilder::copy(const State &state) +{ + this->state = state; + return *this; +} + +StateBuilder &StateBuilder::parent(const State *parent) +{ + state.parents = StateSet{parent}; + return *this; +} + +StateBuilder &StateBuilder::parents(const StateSet &parents) +{ + state.parents = parents; + return *this; +} + +StateBuilder &StateBuilder::arguments(const Arguments &arguments) +{ + state.arguments = arguments; + return *this; +} + +StateBuilder &StateBuilder::createdNodeType(const Rtti *type) +{ + state.createdNodeTypes = RttiSet{type}; + return *this; +} + +StateBuilder &StateBuilder::createdNodeTypes(const RttiSet &types) +{ + state.createdNodeTypes = types; + return *this; +} + +StateBuilder &StateBuilder::elementHandler( + HandlerConstructor elementHandler) +{ + state.elementHandler = elementHandler; + return *this; +} + +StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations) +{ + state.supportsAnnotations = supportsAnnotations; + return *this; +} + +const State &StateBuilder::build() const { return state; } + +/* Class StateDeductor */ + +StateDeductor::StateDeductor( + std::vector signature, + std::vector states) + : tbl(signature.size()), + signature(std::move(signature)), + states(std::move(states)) +{ +} + +bool StateDeductor::isActive(size_t d, const State *s) +{ + // Lookup the "active" state of (d, s), if it was not already set + // (e.second is true) we'll have to calculate it + auto e = tbl[d].emplace(s, false); + bool &res = e.first->second; + if (!e.second) { + return res; + } + + // Check whether this node is generative (may have produced the Node + // described by the current Signature element) + bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); + + if (isGenerative && d == 0) { + // End of recursion -- the last signature element is reached and the + // node was generative + res = true; + } else { + // Try repetition of this node + if (isGenerative && isActive(d - 1, s)) { + res = true; + } else { + // Check whether any of the parent nodes were active -- either for + // the previous element (if this one is generative) or for the + // current element (assuming this node was not generative) + for (const State *parent : s->parents) { + if ((isGenerative && isActive(d - 1, parent)) || + isActive(d, parent)) { + res = true; + break; + } + } + } + } + + return res; +} + +std::vector StateDeductor::deduce() +{ + std::vector res; + if (!signature.empty()) { + const size_t D = signature.size(); + for (auto s : states) { + if (signature[D - 1]->isOneOf(s->createdNodeTypes) && + isActive(D - 1, s)) { + res.push_back(s); + } + } + } + return res; +} + +/* Constant initializations */ + +namespace States { +const State All; +const State None; +} +} +} + diff --git a/src/core/parser/stack/State.hpp b/src/core/parser/stack/State.hpp new file mode 100644 index 0000000..ea326ec --- /dev/null +++ b/src/core/parser/stack/State.hpp @@ -0,0 +1,307 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file State.hpp + * + * Defines the State class used within the ParserStack pushdown + * automaton and the StateBuilder class for convenient construction of + * such classes. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_HPP_ +#define _OUSIA_PARSER_STATE_HPP_ + +#include + +#include +#include + +namespace ousia { +namespace parser_stack { + +// Forward declarations +class StateBuilder; +class State; +class HandlerData; +class Handler; +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * Set of pointers of parser states -- used for specifying a set of parent + * states. + */ +using StateSet = std::unordered_set; + +/** + * Class used for the complete specification of a State. Stores possible + * parent states, state handlers and arguments to be passed to that state. + */ +struct State { + /** + * Vector containing all possible parent states. + */ + StateSet parents; + + /** + * Descriptor of the arguments that should be passed to the handler. + */ + Arguments arguments; + + /** + * Set containing the types of the nodes that may be created in this + * State. This information is needed for Parsers to reconstruct the + * current State from a given ParserScope when a file is included. + */ + RttiSet createdNodeTypes; + + /** + * Pointer at a function which creates a new concrete Handler instance for + * the elements described by this state. May be nullptr in which case no + * handler instance is created. + */ + HandlerConstructor elementHandler; + + /** + * Set to true if this handler does support annotations. This is almost + * always false (e.g. all description handlers), except for document + * element handlers. + */ + bool supportsAnnotations; + + /** + * Default constructor, initializes the handlers with nullptr. + */ + State(); + + /** + * Constructor taking values for all fields. Use the StateBuilder + * class for a more convenient construction of State instances. + * + * @param parents is a vector containing all possible parent states. + * @param arguments is a descriptor of arguments that should be passed to + * the handler. + * @param createdNodeTypes is a set containing the types of the nodes tha + * may be created in this State. This information is needed for + * Parsers to reconstruct the current State from a given ParserScope + * when a file is included. + * @param elementHandler is a pointer at a function which creates a new + * concrete Handler instance for the elements described by this state. May + * be nullptr in which case no handler instance is created. + * @param supportsAnnotations specifies whether annotations are supported + * here at all. + */ + State(StateSet parents, Arguments arguments = Arguments{}, + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false); + + /** + * Creates this State from the given StateBuilder instance. + */ + State(const StateBuilder &builder); +}; + +/** + * The StateBuilder class is a class used for conveniently building new + * State instances. + */ +class StateBuilder { +private: + /** + * State instance that is currently being built by the + * StateBuilder. + */ + State state; + +public: + /** + * Copies the State instance and uses it as internal state. Overrides + * all changes made by the StateBuilder. + * + * @param state is the state that should be copied. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder ©(const State &state); + + /** + * Sets the possible parent states to the single given parent element. + * + * @param parent is a pointer at the parent State instance that should + * be the possible parent state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &parent(const State *parent); + + /** + * Sets the State instances in the given StateSet as the list of + * supported parent states. + * + * @param parents is a set of pointers at State instances that should + * be the possible parent states. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &parents(const StateSet &parents); + + /** + * Sets the arguments that should be passed to the parser state handler to + * those given as argument. + * + * @param arguments is the Arguments instance describing the Arguments that + * should be parsed to a Handler for this State. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &arguments(const Arguments &arguments); + + /** + * Sets the Node types this state may produce to the given Rtti descriptor. + * + * @param type is the Rtti descriptor of the Type that may be produced by + * this state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &createdNodeType(const Rtti *type); + + /** + * Sets the Node types this state may produce to the given Rtti descriptors. + * + * @param types is a set of Rtti descriptors of the Types that may be + * produced by this state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &createdNodeTypes(const RttiSet &types); + + /** + * Sets the constructor for the element handler. The constructor creates a + * new concrete Handler instance for the elements described by this state. + * May be nullptr in which case no handler instance is created (this is + * the default value). + * + * @param elementHandler is the HandlerConstructor that should create a + * new Handler instance. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &elementHandler(HandlerConstructor elementHandler); + + /** + * Sets the state of the "supportsAnnotations" flags (default value is + * false) + * + * @param supportsAnnotations should be set to true, if annotations are + * supported for the handlers associated with this document. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &supportsAnnotations(bool supportsAnnotations); + + /** + * Returns a reference at the internal State instance that was built + * using the StateBuilder. + * + * @return the built State. + */ + const State &build() const; +}; + +/** + * Class used to deduce the State a Parser is currently in based on the + * types of the Nodes that currently are on the ParserStack. Uses dynamic + * programming in order to solve this problem. + */ +class StateDeductor { +public: + /** + * Type containing the dynamic programming table. + */ + using Table = std::vector>; + +private: + /** + * Dynamic programming table. + */ + Table tbl; + + /** + * Signature given in the constructor. + */ + const std::vector signature; + + /** + * List of states that should be checked for being active. + */ + const std::vector states; + + /** + * Used internally to check whether the given parser stack s may have been + * active for signature element d. + * + * @param d is the signature element. + * @param s is the parser state. + * @return true if the the given State may have been active. + */ + bool isActive(size_t d, const State *s); + +public: + /** + * Constructor of the StateDeductor class. + * + * @param signature a Node type signature describing the types of the nodes + * which currently reside on e.g. the ParserScope stack. + * @param states is a list of states that should be checked. + */ + StateDeductor(std::vector signature, + std::vector states); + + /** + * Selects all active states from the given states. Only considers those + * states that may have produced the last signature element. + * + * @return a list of states that may actually have been active. + */ + std::vector deduce(); +}; + +/** + * The States namespace contains all the global state constants used + * in the ParserStack class. + */ +namespace States { +/** + * State representing all states. + */ +extern const State All; + +/** + * State representing the initial state. + */ +extern const State None; +} +} +} + +#endif /* _OUSIA_PARSER_STATE_HPP_ */ + diff --git a/test/core/parser/ParserStateTest.cpp b/test/core/parser/ParserStateTest.cpp deleted file mode 100644 index 91d8dcd..0000000 --- a/test/core/parser/ParserStateTest.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include - -namespace ousia { - -static const Rtti t1; -static const Rtti t2; -static const Rtti t3; -static const Rtti t4; -static const Rtti t5; - -static const ParserState s1 = ParserStateBuilder().createdNodeType(&t1); -static const ParserState s2a = - ParserStateBuilder().parent(&s1).createdNodeType(&t2); -static const ParserState s2b = - ParserStateBuilder().parent(&s1).createdNodeType(&t2); -static const ParserState s3 = - ParserStateBuilder().parents({&s2a, &s1}).createdNodeType(&t3); -static const ParserState s4 = - ParserStateBuilder().parent(&s3).createdNodeType(&t4); -static const ParserState s5 = - ParserStateBuilder().parent(&s2b).createdNodeType(&t5); - -TEST(ParserStateDeductor, deduce) -{ - using Result = std::vector; - using Signature = std::vector; - std::vector states{&s1, &s2a, &s2b, &s3, &s4, &s5}; - - // Should not crash on empty signature - ASSERT_EQ(Result{}, ParserStateDeductor(Signature{}, states).deduce()); - - // Try repeating signature elements - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1}), states).deduce()); - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1, &t1}), states).deduce()); - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1, &t1, &t1}), states).deduce()); - - // Go to another state - ASSERT_EQ(Result({&s2a, &s2b}), - ParserStateDeductor(Signature({&t1, &t1, &t2}), states).deduce()); - ASSERT_EQ(Result({&s4}), - ParserStateDeductor(Signature({&t1, &t3, &t4}), states).deduce()); - - // Skip one state - ASSERT_EQ(Result({&s4}), - ParserStateDeductor(Signature({&t2, &t4}), states).deduce()); - - // Impossible signature - ASSERT_EQ(Result({}), - ParserStateDeductor(Signature({&t4, &t5}), states).deduce()); - -} -} - diff --git a/test/core/parser/stack/StateTest.cpp b/test/core/parser/stack/StateTest.cpp new file mode 100644 index 0000000..e503d30 --- /dev/null +++ b/test/core/parser/stack/StateTest.cpp @@ -0,0 +1,79 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { +namespace parser_stack { + +static const Rtti t1; +static const Rtti t2; +static const Rtti t3; +static const Rtti t4; +static const Rtti t5; + +static const State s1 = StateBuilder().createdNodeType(&t1); +static const State s2a = + StateBuilder().parent(&s1).createdNodeType(&t2); +static const State s2b = + StateBuilder().parent(&s1).createdNodeType(&t2); +static const State s3 = + StateBuilder().parents({&s2a, &s1}).createdNodeType(&t3); +static const State s4 = + StateBuilder().parent(&s3).createdNodeType(&t4); +static const State s5 = + StateBuilder().parent(&s2b).createdNodeType(&t5); + +TEST(StateDeductor, deduce) +{ + using Result = std::vector; + using Signature = std::vector; + std::vector states{&s1, &s2a, &s2b, &s3, &s4, &s5}; + + // Should not crash on empty signature + ASSERT_EQ(Result{}, StateDeductor(Signature{}, states).deduce()); + + // Try repeating signature elements + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1}), states).deduce()); + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1, &t1}), states).deduce()); + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1, &t1, &t1}), states).deduce()); + + // Go to another state + ASSERT_EQ(Result({&s2a, &s2b}), + StateDeductor(Signature({&t1, &t1, &t2}), states).deduce()); + ASSERT_EQ(Result({&s4}), + StateDeductor(Signature({&t1, &t3, &t4}), states).deduce()); + + // Skip one state + ASSERT_EQ(Result({&s4}), + StateDeductor(Signature({&t2, &t4}), states).deduce()); + + // Impossible signature + ASSERT_EQ(Result({}), + StateDeductor(Signature({&t4, &t5}), states).deduce()); + +} +} +} + -- cgit v1.2.3 From 9acab70815a0f62bdaf2c7f01e588066b818d330 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 22:45:19 +0100 Subject: Fixed isIdentifier and isNamespacedIdentifier, added and used isIdentifierOrEmpty for use in Node --- src/core/common/Utils.cpp | 13 +++++++++---- src/core/common/Utils.hpp | 5 +++++ src/core/model/Node.cpp | 2 +- test/core/common/UtilsTest.cpp | 39 +++++++++++++++++++++++++++++++++------ 4 files changed, 48 insertions(+), 11 deletions(-) (limited to 'test') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index fc8ee00..f8b53c6 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -37,22 +37,27 @@ bool Utils::isIdentifier(const std::string &name) } first = false; } - return true; + return !first; } -bool Utils::isNamespaceIdentifier(const std::string &name) +bool Utils::isIdentifierOrEmpty(const std::string &name) +{ + return name.empty() || isIdentifier(name); +} + +bool Utils::isNamespacedIdentifier(const std::string &name) { bool first = true; for (char c : name) { if (first && !isIdentifierStartCharacter(c)) { return false; } - if (!first && (!isIdentifierCharacter(c) || c == ':')) { + if (!first && (!isIdentifierCharacter(c) && c != ':')) { return false; } first = (c == ':'); } - return true; + return !first; } bool Utils::hasNonWhitepaceChar(const std::string &s) diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index b5cd178..b5a54fc 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -85,6 +85,11 @@ public: */ static bool isIdentifier(const std::string &name); + /** + * Returns true if the given string is an identifier or an empty string. + */ + static bool isIdentifierOrEmpty(const std::string &name); + /** * Returns true if the given string is in * \code{.txt} diff --git a/src/core/model/Node.cpp b/src/core/model/Node.cpp index 39ee2e4..ce15cad 100644 --- a/src/core/model/Node.cpp +++ b/src/core/model/Node.cpp @@ -448,7 +448,7 @@ bool Node::doValidate(Logger &logger) const { return true; } bool Node::validateName(Logger &logger) const { - if (!Utils::isIdentifier(name)) { + if (!Utils::isIdentifierOrEmpty(name)) { logger.error(type()->name + std::string(" name \"") + name + std::string("\" is not a valid identifier"), this); diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index a4bf4b2..7801296 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -24,14 +24,40 @@ namespace ousia { TEST(Utils, isIdentifier) { - ASSERT_TRUE(Utils::isIdentifier("test")); - ASSERT_TRUE(Utils::isIdentifier("t0-_est")); - ASSERT_FALSE(Utils::isIdentifier("_t0-_EST")); - ASSERT_FALSE(Utils::isIdentifier("-t0-_EST")); - ASSERT_FALSE(Utils::isIdentifier("0t-_EST")); - ASSERT_FALSE(Utils::isIdentifier("invalid key")); + EXPECT_TRUE(Utils::isIdentifier("test")); + EXPECT_TRUE(Utils::isIdentifier("t0-_est")); + EXPECT_FALSE(Utils::isIdentifier("_t0-_EST")); + EXPECT_FALSE(Utils::isIdentifier("-t0-_EST")); + EXPECT_FALSE(Utils::isIdentifier("0t-_EST")); + EXPECT_FALSE(Utils::isIdentifier("_A")); + EXPECT_FALSE(Utils::isIdentifier("invalid key")); + EXPECT_FALSE(Utils::isIdentifier("")); } + +TEST(Utils, isNamespacedIdentifier) +{ + EXPECT_TRUE(Utils::isNamespacedIdentifier("test")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("_t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("-t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("0t-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("invalid key")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("_A")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("")); + EXPECT_FALSE(Utils::isNamespacedIdentifier(":")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("test:a")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est:b")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("test:test")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est:t0-_est")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("test:_A")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("test::a")); + EXPECT_FALSE(Utils::isNamespacedIdentifier(":test")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("t0-_est:_t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("t0-_est: b")); +} + + TEST(Utils, split) { ASSERT_EQ(std::vector({"ab"}), Utils::split("ab", '.')); @@ -82,5 +108,6 @@ TEST(Utils, endsWith) ASSERT_TRUE(Utils::endsWith("foobar", "bar")); ASSERT_TRUE(Utils::endsWith("foo", "")); } + } -- cgit v1.2.3 From 02995f1f9b5a0905ed8f79a5149f4b6375a622bf Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 22:45:49 +0100 Subject: Fixed gcc 4.9 warnings --- test/core/RangeSetTest.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'test') diff --git a/test/core/RangeSetTest.cpp b/test/core/RangeSetTest.cpp index cbf8f59..446ee51 100644 --- a/test/core/RangeSetTest.cpp +++ b/test/core/RangeSetTest.cpp @@ -110,7 +110,7 @@ TEST(RangeSet, Merge) s.merge(Range(40, 50)); s.merge(Range(60, 70)); { - ASSERT_EQ(ranges.size(), 4); + ASSERT_EQ(ranges.size(), 4U); auto it = ranges.begin(); ASSERT_EQ((*it).start, 0); @@ -132,7 +132,7 @@ TEST(RangeSet, Merge) // Now insert an element which spans the second and third element s.merge(Range(15, 55)); { - ASSERT_EQ(ranges.size(), 3); + ASSERT_EQ(ranges.size(), 3U); auto it = ranges.begin(); ASSERT_EQ((*it).start, 0); @@ -150,7 +150,7 @@ TEST(RangeSet, Merge) // Now insert an element which expands the first element s.merge(Range(-10, 11)); { - ASSERT_EQ(ranges.size(), 3); + ASSERT_EQ(ranges.size(), 3U); auto it = ranges.begin(); ASSERT_EQ((*it).start, -10); @@ -168,7 +168,7 @@ TEST(RangeSet, Merge) // Now insert an element which merges the last two elements s.merge(Range(13, 70)); { - ASSERT_EQ(ranges.size(), 2); + ASSERT_EQ(ranges.size(), 2U); auto it = ranges.begin(); ASSERT_EQ((*it).start, -10); @@ -182,7 +182,7 @@ TEST(RangeSet, Merge) // Now insert an element which merges the remaining elements s.merge(Range(-9, 12)); { - ASSERT_EQ(ranges.size(), 1); + ASSERT_EQ(ranges.size(), 1U); auto it = ranges.begin(); ASSERT_EQ((*it).start, -10); -- cgit v1.2.3 From 0a8a012850bb7c730ccac4c91c7aca5c88cbedc9 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:14:58 +0100 Subject: Implemented most of the desired behaviour of the Stack class, added unit tests --- src/core/parser/stack/Stack.cpp | 437 ++++++++++++++++++++---- src/core/parser/stack/Stack.hpp | 85 ++++- test/core/parser/stack/StackTest.cpp | 639 +++++++++++++++++++++++++++++++++++ 3 files changed, 1075 insertions(+), 86 deletions(-) create mode 100644 test/core/parser/stack/StackTest.cpp (limited to 'test') diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index b0df39b..d84a19c 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -37,10 +38,28 @@ HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} HandlerInfo::HandlerInfo(std::shared_ptr handler) : handler(handler), fieldIdx(0), + valid(true), + implicit(false), inField(false), inDefaultField(false), inImplicitDefaultField(false), - hasDefaultField(false) + inValidField(false), + hadDefaultField(false) +{ +} + +HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, + bool inDefaultField, bool inImplicitDefaultField, + bool inValidField) + : handler(nullptr), + fieldIdx(0), + valid(valid), + implicit(implicit), + inField(inField), + inDefaultField(inDefaultField), + inImplicitDefaultField(inImplicitDefaultField), + inValidField(inValidField), + hadDefaultField(false) { } @@ -55,7 +74,7 @@ void HandlerInfo::fieldStart(bool isDefault, bool isImplicit, bool isValid) inDefaultField = isDefault || isImplicit; inImplicitDefaultField = isImplicit; inValidField = isValid; - hasDefaultField = hasDefaultField || inDefaultField; + hadDefaultField = hadDefaultField || inDefaultField; fieldIdx++; } @@ -65,11 +84,13 @@ void HandlerInfo::fieldEnd() inDefaultField = false; inImplicitDefaultField = false; inValidField = false; - if (fieldIdx > 0) { - fieldIdx--; - } } +/** + * Stub instance of HandlerInfo containing no handler information. + */ +static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; + /* Helper functions */ /** @@ -110,9 +131,31 @@ Stack::Stack(ParserContext &ctx, } } -Stack::~Stack() {} +Stack::~Stack() +{ + while (!stack.empty()) { + // Fetch the topmost stack element + HandlerInfo &info = currentInfo(); + + // It is an error if we're still in a field of an element while the + // Stack instance is destroyed. Log that + if (handlersValid()) { + if (info.inField && !info.implicit && + !info.inImplicitDefaultField) { + logger().error( + std::string("Reached end of stream, but command \"") + + info.handler->getName() + + "\" has not ended yet. Command was started here:", + info.handler->getLocation()); + } + } -bool Stack::deduceState() + // Remove the command from the stack + endCurrentHandler(); + } +} + +void Stack::deduceState() { // Assemble all states std::vector states; @@ -125,23 +168,24 @@ bool Stack::deduceState() std::vector possibleStates = StateDeductor(ctx.getScope().getStackTypeSignature(), states).deduce(); if (possibleStates.size() != 1U) { - throw LoggableException{ - "Error while including file: Cannot deduce parser state."}; + throw LoggableException( + "Error while including file: Cannot deduce parser state."); } - // Switch to this state by creating a dummy handler - const State *state = possibleStates[0]; - stack.emplace(std::shared_ptr{EmptyHandler::create({ctx, "", *state, *state, SourceLocation{}})}); -} + // Switch to this state by creating a handler, but do not call its start + // function + const State &state = *possibleStates[0]; + HandlerConstructor ctor = + state.elementHandler ? state.elementHandler : EmptyHandler::create; -bool Stack::handlersValid() -{ - for (auto it = stack.crbegin(); it != stack.crend(); it++) { - if (!it->valid) { - return false; - } - } - return true; + std::shared_ptr handler = + std::shared_ptr{ctor({ctx, "", state, SourceLocation{}})}; + stack.emplace_back(handler); + + // Set the correct flags for this implicit handler + HandlerInfo &info = currentInfo(); + info.implicit = true; + info.fieldStart(true, false, true); } std::set Stack::expectedCommands() @@ -158,12 +202,12 @@ std::set Stack::expectedCommands() const State &Stack::currentState() { - return stack.empty() ? States::None : stack.top()->state(); + return stack.empty() ? States::None : stack.back().handler->getState(); } std::string Stack::currentCommandName() { - return stack.empty() ? std::string{} : stack.top()->name(); + return stack.empty() ? std::string{} : stack.back().handler->getName(); } const State *Stack::findTargetState(const std::string &name) @@ -180,77 +224,330 @@ const State *Stack::findTargetState(const std::string &name) return nullptr; } +const State *Stack::findTargetStateOrWildcard(const std::string &name) +{ + // Try to find the target state with the given name, if none is found, try + // find a matching "*" state. + State const *targetState = findTargetState(name); + if (targetState == nullptr) { + return findTargetState("*"); + } + return targetState; +} + +HandlerInfo &Stack::currentInfo() +{ + return stack.empty() ? EmptyHandlerInfo : stack.back(); +} +HandlerInfo &Stack::lastInfo() +{ + return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; +} + +void Stack::endCurrentHandler() +{ + if (!stack.empty()) { + // Fetch the handler info for the current top-level element + HandlerInfo &info = stack.back(); + + // Do not call any callback functions while the stack is marked as + // invalid or this is an elment marked as "implicit" + if (!info.implicit && handlersValid()) { + // Make sure the fieldEnd handler is called if the element still + // is in a field + if (info.inField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // Call the "end" function of the corresponding Handler instance + info.handler->end(); + } + + // Remove the element from the stack + stack.pop_back(); + } +} + +bool Stack::ensureHandlerIsInField() +{ + // If the current handler is not in a field (and actually has a handler) + // try to start a default field + HandlerInfo &info = currentInfo(); + if (!info.inField && info.handler != nullptr) { + // Abort if the element already had a default field + if (info.hadDefaultField) { + return false; + } + + // Try to start a new default field, abort if this did not work + bool isDefault = true; + if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { + info.handler->fieldEnd(); + endCurrentHandler(); + return false; + } + + // Mark the field as started + info.fieldStart(true, true, true); + } + return true; +} + +bool Stack::handlersValid() +{ + for (auto it = stack.crbegin(); it != stack.crend(); it++) { + if (!it->valid) { + return false; + } + } + return true; +} + +Logger &Stack::logger() { return ctx.getLogger(); } + void Stack::command(const Variant &name, const Variant::mapType &args) { - // Make sure the given identifier is valid + // Make sure the given identifier is valid (preventing "*" from being + // malicously passed to this function) if (!Utils::isNamespacedIdentifier(name.asString())) { throw LoggableException(std::string("Invalid identifier \"") + - name.asString() + std::string("\""), name); + name.asString() + std::string("\""), + name); } - // Try to find a target state for the given command - State const *targetState = findTargetState(name.asString()); + State const *lastTargetState = nullptr; + Variant::mapType canonicalArgs; + while (true) { + // Try to find a target state for the given command, if none can be + // found and the current command does not have an open field, then try + // to create an empty default field, otherwise this is an exception + const State *targetState = findTargetStateOrWildcard(name.asString()); + if (targetState == nullptr) { + if (!currentInfo().inField) { + endCurrentHandler(); + continue; + } else { + throw buildInvalidCommandException(name.asString(), + expectedCommands()); + } + } + + // Make sure we're currently inside a field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } - // No target state is found, try to find a wildcard handler for the current - // state - if (targetState == nullptr) { - targetState = findTargetState("*"); - } + // Fork the logger. We do not want any validation errors to skip + LoggerFork loggerFork = logger().fork(); - // No handler has been found at all, - if (targetState == nullptr) { - throw buildInvalidCommandException(name.asString(), expectedCommands()); + // Canonicalize the arguments (if this has not already been done), allow + // additional arguments + if (lastTargetState != targetState) { + canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + lastTargetState = targetState; + } + + // Instantiate the handler and push it onto the stack + HandlerConstructor ctor = targetState->elementHandler + ? targetState->elementHandler + : EmptyHandler::create; + std::shared_ptr handler{ + ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + stack.emplace_back(handler); + + // Fetch the HandlerInfo for the parent element and the current element + HandlerInfo &parentInfo = lastInfo(); + HandlerInfo &info = currentInfo(); + + // Call the "start" method of the handler, store the result of the start + // method as the validity of the handler -- do not call the start method + // if the stack is currently invalid (as this may cause further, + // unwanted errors) + bool validStack = handlersValid(); + info.valid = false; + if (validStack) { + handler->setLogger(loggerFork); + try { + info.valid = handler->start(canonicalArgs); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + handler->resetLogger(); + } + + // We started the command within an implicit default field and it is not + // valid -- remove both the new handler and the parent field from the + // stack + if (!info.valid && parentInfo.inImplicitDefaultField) { + endCurrentHandler(); + endCurrentHandler(); + continue; + } + + // If we ended up here, starting the command may or may not have worked, + // but after all, we cannot unroll the stack any further. Update the + // "valid" flag, commit any potential error messages and return. + info.valid = parentInfo.valid && info.valid; + loggerFork.commit(); + return; } +} + +void Stack::data(const Variant &data) +{ + while (true) { + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException("No command here to receive data."); + } + + // Fetch the current command handler information + HandlerInfo &info = currentInfo(); + + // Make sure the current handler has an open field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // If this field should not get any data, log an error and do not call + // the "data" handler + if (!info.inValidField) { + logger().error("Did not expect any data here", data); + } + + if (handlersValid() && info.inValidField) { + // Fork the logger and set it as temporary logger for the "start" + // method. We only want to keep error messages if this was not a try + // to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + valid = info.handler->data(data); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid + // flag. + loggerFork.commit(); + } - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and push it onto the stack - Handler *handler = - ctor({ctx, name.asString(), *targetState, currentState(), name.getLocation()}); - stack.emplace_back(std::shared_ptr{handler}); - - // Call the "start" method of the handler, store the result of the start - // method as the validity of the handler -- do not call the start method - // if the stack is currently invalid (as this may cause further, unwanted - // errors) - try { - stack.back().valid = handlersValid() && handler->start(args); - } catch (LoggableException ex) { - stack.back().valid = false; - logger.log(ex, ) + // There was no reason to unroll the stack any further, so continue + return; } } -void Stack::end() +void Stack::fieldStart(bool isDefault) { - // Check whether the current command could be ended + // Make sure the current handler stack is not empty if (stack.empty()) { - throw LoggableException{"No command to end."}; + throw LoggableException( + "No command for which a field could be started"); } - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (info.inField) { + logger().error( + "Got field start, but there is no command for which to start the " + "field."); + return; + } + + // Copy the isDefault flag to a local variable, the fieldStart method will + // write into this variable + bool defaultField = isDefault; + + // Do not call the "fieldStart" function if we're in an invalid subtree + bool valid = false; + if (handlersValid()) { + try { + valid = info.handler->fieldStart(defaultField, info.fieldIdx); + } + catch (LoggableException ex) { + logger().log(ex); + } + if (!valid && !defaultField) { + logger().error( + std::string("Cannot start a new field here (index ") + + std::to_string(info.fieldIdx + 1) + + std::string("), field does not exist")); + } + } - // Call the end function of the last Handler - inst->end(); + // Mark the field as started + info.fieldStart(defaultField, false, valid); } -void Stack::data(const std::string &data, int field) +void Stack::fieldEnd() { - // Check whether there is any command the data can be sent to + // Make sure the current handler stack is not empty if (stack.empty()) { - throw LoggableException{"No command to receive data."}; + throw LoggableException("No command for which a field could be ended"); } - // Pass the data to the current Handler instance - stack.top()->data(data, field); + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (!info.inField) { + logger().error( + "Got field end, but there is no command for which to end the " + "field."); + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + try { + info.handler->fieldEnd(); + } + catch (LoggableException ex) { + logger().log(ex); + } + } + + // This command no longer is in a field + info.fieldEnd(); + + // As soon as this command had a default field, remove it from the stack + if (info.hadDefaultField) { + endCurrentHandler(); + } +} + +void Stack::annotationStart(const Variant &className, const Variant &args) +{ + // TODO +} + +void Stack::annotationEnd(const Variant &className, const Variant &elementName) +{ + // TODO +} + +void Stack::token(Variant token) +{ + // TODO } } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index 294f7ec..76eefd9 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -43,6 +43,7 @@ namespace ousia { // Forward declarations class ParserContext; +class Logger; namespace parser_stack { @@ -75,7 +76,13 @@ public: bool valid : 1; /** - * Set to true if the handler currently is in a filed. + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handler currently is in a field. */ bool inField : 1; @@ -99,12 +106,17 @@ public: /** * Set to true, if the default field was already started. */ - bool hasDefaultField : 1; + bool hadDefaultField : 1; /** * Default constructor of the HandlerInfo class. */ HandlerInfo(); + /** + * Constructor of the HandlerInfo class, allows to set all flags manually. + */ + HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField, bool inValidField); /** * Constructor of the HandlerInfo class, taking a shared_ptr to the handler @@ -129,7 +141,6 @@ public: void fieldEnd(); }; - /** * The Stack class is a pushdown automaton responsible for turning a command * stream into a tree of Node instances. It does so by following a state @@ -154,6 +165,11 @@ private: */ std::vector stack; + /** + * Return the reference in the Logger instance stored within the context. + */ + Logger &logger(); + /** * Used internally to get all expected command names for the current state. * This function is used to build error messages. @@ -164,7 +180,7 @@ private: /** * Returns the targetState for a command with the given name that can be - * reached from for the current state. + * reached from the current state. * * @param name is the name of the requested command. * @return nullptr if no target state was found, a pointer at the target @@ -172,6 +188,17 @@ private: */ const State *findTargetState(const std::string &name); + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + /** * Tries to reconstruct the parser state from the Scope instance of the * ParserContext given in the constructor. This functionality is needed for @@ -180,6 +207,33 @@ private: */ void deduceState(); + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo ¤tInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + */ + void endCurrentHandler(); + + /** + * Tries to start a default field for the current handler, if currently the + * handler is not inside a field and did not have a default field yet. + * + * @return true if the handler is inside a field, false if no field could + * be started. + */ + bool ensureHandlerIsInField(); + /** * Returns true if all handlers on the stack are currently valid, or false * if at least one handler is invalid. @@ -196,9 +250,8 @@ public: * @param states is a map containing the command names and pointers at the * corresponding State instances. */ - Stack( - ParserContext &ctx, - const std::multimap &states); + Stack(ParserContext &ctx, + const std::multimap &states); /** * Destructor of the Stack class. @@ -231,6 +284,15 @@ public: */ void command(const Variant &name, const Variant::mapType &args); + /** + * Function that shuold be called whenever character data is found in the + * input stream. May only be called if the currently is a command on the + * stack. + * + * @param data is a string variant containing the data that has been found. + */ + void data(const Variant &data); + /** * Function that should be called whenever a new field starts. Fields of the * same command may not be separated by calls to data or annotations. Doing @@ -247,15 +309,6 @@ public: */ void fieldEnd(); - /** - * Function that shuold be called whenever character data is found in the - * input stream. May only be called if the currently is a command on the - * stack. - * - * @param data is a string variant containing the data that has been found. - */ - void data(const Variant &data); - /** * Function that should be called whenever an annotation starts. * diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp new file mode 100644 index 0000000..7cc8bc5 --- /dev/null +++ b/test/core/parser/stack/StackTest.cpp @@ -0,0 +1,639 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include + +#include +#include +#include +#include + +#include + +namespace ousia { +namespace parser_stack { + +// Build an instance of the StandaloneEnvironment used for this unit test +static TerminalLogger logger(std::cerr, true); +// static ConcreteLogger logger; +static StandaloneEnvironment env(logger); + +namespace { + +struct Tracker { + int startCount; + int endCount; + int fieldStartCount; + int fieldEndCount; + int annotationStartCount; + int annotationEndCount; + int dataCount; + + Variant::mapType startArgs; + bool fieldStartIsDefault; + size_t fieldStartIdx; + Variant annotationStartClassName; + Variant::mapType annotationStartArgs; + Variant annotationEndClassName; + Variant annotationEndElementName; + Variant dataData; + + bool startResult; + bool fieldStartSetIsDefault; + bool fieldStartResult; + bool annotationStartResult; + bool annotationEndResult; + bool dataResult; + + Tracker() { reset(); } + + void reset() + { + startCount = 0; + endCount = 0; + fieldStartCount = 0; + fieldEndCount = 0; + annotationStartCount = 0; + annotationEndCount = 0; + dataCount = 0; + + startArgs = Variant::mapType{}; + fieldStartIsDefault = false; + fieldStartIdx = 0; + annotationStartClassName = Variant::fromString(std::string{}); + annotationStartArgs = Variant::mapType{}; + annotationEndClassName = Variant::fromString(std::string{}); + annotationEndElementName = Variant::fromString(std::string{}); + dataData = Variant::fromString(std::string{}); + + startResult = true; + fieldStartSetIsDefault = false; + fieldStartResult = true; + annotationStartResult = true; + annotationEndResult = true; + dataResult = true; + } + + void expect(int startCount, int endCount, int fieldStartCount, + int fieldEndCount, int annotationStartCount, + int annotationEndCount, int dataCount) + { + EXPECT_EQ(startCount, this->startCount); + EXPECT_EQ(endCount, this->endCount); + EXPECT_EQ(fieldStartCount, this->fieldStartCount); + EXPECT_EQ(fieldEndCount, this->fieldEndCount); + EXPECT_EQ(annotationStartCount, this->annotationStartCount); + EXPECT_EQ(annotationEndCount, this->annotationEndCount); + EXPECT_EQ(dataCount, this->dataCount); + } +}; + +static Tracker tracker; + +class TestHandler : public Handler { +private: + TestHandler(const HandlerData &handlerData) : Handler(handlerData) {} + +public: + bool start(const Variant::mapType &args) + { + tracker.startCount++; + tracker.startArgs = args; + return tracker.startResult; + } + + void end() { tracker.endCount++; } + + bool fieldStart(bool &isDefault, size_t fieldIdx) + { + tracker.fieldStartCount++; + tracker.fieldStartIsDefault = isDefault; + tracker.fieldStartIdx = fieldIdx; + if (tracker.fieldStartSetIsDefault) { + isDefault = true; + } + return tracker.fieldStartResult; + } + + void fieldEnd() { tracker.fieldEndCount++; } + + bool annotationStart(const Variant &className, const Variant::mapType &args) + { + tracker.annotationStartCount++; + tracker.annotationStartClassName = className; + tracker.annotationStartArgs = args; + return tracker.annotationStartResult; + } + + bool annotationEnd(const Variant &className, const Variant &elementName) + { + tracker.annotationEndCount++; + tracker.annotationEndClassName = className; + tracker.annotationEndElementName = elementName; + return tracker.annotationEndResult; + } + + bool data(const Variant &data) + { + tracker.dataCount++; + tracker.dataData = data; + return tracker.dataResult; + } + + static Handler *create(const HandlerData &handlerData) + { + return new TestHandler(handlerData); + } +}; +} + +namespace States { +static const State Document = + StateBuilder().parent(&None).elementHandler(TestHandler::create); +static const State Body = + StateBuilder().parent(&Document).elementHandler(TestHandler::create); +static const State Empty = + StateBuilder().parent(&Document).elementHandler(TestHandler::create); +static const State Special = + StateBuilder().parent(&All).elementHandler(TestHandler::create); +static const State Arguments = + StateBuilder().parent(&None).elementHandler(TestHandler::create).arguments( + {Argument::Int("a"), Argument::String("b")}); +static const State BodyChildren = + StateBuilder().parent(&Body).elementHandler(TestHandler::create); +static const State Any = + StateBuilder().parents({&None, &Any}).elementHandler(TestHandler::create); + +static const std::multimap TestHandlers{ + {"document", &Document}, + {"body", &Body}, + {"empty", &Empty}, + {"special", &Special}, + {"arguments", &Arguments}, + {"*", &BodyChildren}}; + +static const std::multimap AnyHandlers{{"*", &Any}}; +} + +TEST(Stack, basicTest) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::TestHandlers}; + + EXPECT_EQ("", s.currentCommandName()); + EXPECT_EQ(&States::None, &s.currentState()); + + s.command("document", {}); + s.fieldStart(true); + s.data("test1"); + + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + + s.command("body", {}); + s.fieldStart(true); + s.data("test2"); + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + tracker.expect(2, 0, 2, 0, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.command("inner", {}); + s.fieldStart(true); + EXPECT_EQ("inner", s.currentCommandName()); + EXPECT_EQ(&States::BodyChildren, &s.currentState()); + + s.fieldEnd(); + tracker.expect(3, 1, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + tracker.expect(3, 2, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.command("body", {}); + s.fieldStart(true); + s.data("test3"); + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + s.fieldEnd(); + tracker.expect(4, 3, 4, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + + s.fieldEnd(); + tracker.expect(4, 4, 4, 4, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + + EXPECT_EQ("", s.currentCommandName()); + EXPECT_EQ(&States::None, &s.currentState()); + } + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, errorInvalidCommands) +{ + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("document", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("document", {}), LoggableException); + s.command("empty", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("special", {}); + s.fieldStart(true); + s.fieldEnd(); + s.fieldEnd(); + s.fieldEnd(); + EXPECT_EQ(&States::None, &s.currentState()); + ASSERT_THROW(s.fieldEnd(), LoggableException); + ASSERT_THROW(s.data("test"), LoggableException); +} + +TEST(Stack, validation) +{ + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + logger.reset(); + + s.command("arguments", {}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}, {"b", "test"}}); + EXPECT_FALSE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); +} + +TEST(Stack, invalidCommandName) +{ + Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + + s.command("a", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_:b", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + ASSERT_THROW(s.command("_a", {}), LoggableException); + ASSERT_THROW(s.command("a:", {}), LoggableException); + ASSERT_THROW(s.command("a:_b", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, multipleFields) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {{"a", false}}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("a", s.currentCommandName()); + EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startArgs); + + s.fieldStart(false); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(0U, tracker.fieldStartIdx); + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + tracker.expect(1, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(1U, tracker.fieldStartIdx); + + s.data("test2"); + tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test2", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(true); + tracker.expect(1, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_TRUE(tracker.fieldStartIsDefault); + EXPECT_EQ(2U, tracker.fieldStartIdx); + + s.data("test3"); + tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test3", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 1, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + } + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnNewCommand) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnNewCommandWithExplicitDefaultField) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 1, 2, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + } + tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, noImplicitDefaultFieldOnIncompatibleCommand) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.fieldStartResult = false; + s.command("b", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, noImplicitDefaultFieldIfDefaultFieldGiven) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldStart(true); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnData) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, autoFieldEnd) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, autoImplicitFieldEnd) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + s.command("b", {}); + s.command("c", {}); + s.command("d", {}); + s.command("e", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(5, 1, 5, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(5, 5, 5, 5, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, invalidDefaultField) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, errorInvalidDefaultFieldData) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + ASSERT_FALSE(logger.hasError()); + s.data("test"); + ASSERT_TRUE(logger.hasError()); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorInvalidFieldData) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + s.data("test"); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorFieldStartNoCommand) +{ + tracker.reset(); + logger.reset(); + + Stack s{env.context, States::AnyHandlers}; + ASSERT_THROW(s.fieldStart(false), LoggableException); + ASSERT_THROW(s.fieldStart(true), LoggableException); + tracker.expect(0, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorMutlipleFieldStarts) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorMutlipleFieldEnds) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + s.fieldEnd(); + ASSERT_FALSE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorOpenField) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + } + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} +} +} + -- cgit v1.2.3 From 36b712c9f9af5c008fbd193392546fd472a35189 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:15:08 +0100 Subject: Added lonely comment to StandaloneEnvironment --- test/core/StandaloneEnvironment.hpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'test') diff --git a/test/core/StandaloneEnvironment.hpp b/test/core/StandaloneEnvironment.hpp index a9dcdce..790bad4 100644 --- a/test/core/StandaloneEnvironment.hpp +++ b/test/core/StandaloneEnvironment.hpp @@ -31,6 +31,10 @@ namespace ousia { +/** + * StandaloneEnvironment is a class used for quickly setting up an entire + * environment needed for running an Ousia instance. + */ struct StandaloneEnvironment { ConcreteLogger &logger; Manager manager; -- cgit v1.2.3 From b1aade072781b0eca9b4c2fd15c360ec7d3ed25f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:53:34 +0100 Subject: Removed legacy test file --- test/core/parser/ParserStackTest.cpp | 177 ----------------------------------- 1 file changed, 177 deletions(-) delete mode 100644 test/core/parser/ParserStackTest.cpp (limited to 'test') diff --git a/test/core/parser/ParserStackTest.cpp b/test/core/parser/ParserStackTest.cpp deleted file mode 100644 index 3a0decb..0000000 --- a/test/core/parser/ParserStackTest.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include - -#include -#include - -namespace ousia { - -ConcreteLogger logger; - -static int startCount = 0; -static int endCount = 0; -static int dataCount = 0; - -class TestHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override { startCount++; } - - void end() override { endCount++; } - - void data(const std::string &data, int field) override { dataCount++; } - - static Handler *create(const HandlerData &data) - { - return new TestHandler(data); - } -}; - -namespace ParserStates { -static const ParserState Document = - ParserStateBuilder().parent(&None).elementHandler(TestHandler::create); -static const ParserState Body = ParserStateBuilder() - .parent(&Document) - .elementHandler(TestHandler::create); -static const ParserState Empty = - ParserStateBuilder().parent(&Document).elementHandler(TestHandler::create); -static const ParserState Special = - ParserStateBuilder().parent(&All).elementHandler(TestHandler::create); -static const ParserState Arguments = - ParserStateBuilder() - .parent(&None) - .elementHandler(TestHandler::create) - .arguments({Argument::Int("a"), Argument::String("b")}); -static const ParserState BodyChildren = - ParserStateBuilder() - .parent(&Body) - .elementHandler(TestHandler::create); - -static const std::multimap TestHandlers{ - {"document", &Document}, - {"body", &Body}, - {"empty", &Empty}, - {"special", &Special}, - {"arguments", &Arguments}, - {"*", &BodyChildren}}; -} - -TEST(ParserStack, simpleTest) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - startCount = 0; - endCount = 0; - dataCount = 0; - - EXPECT_EQ("", s.currentCommandName()); - EXPECT_EQ(&ParserStates::None, &s.currentState()); - - s.start("document", {}); - s.data("test1"); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - EXPECT_EQ(1, startCount); - EXPECT_EQ(1, dataCount); - - s.start("body", {}); - s.data("test2"); - EXPECT_EQ("body", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Body, &s.currentState()); - EXPECT_EQ(2, startCount); - EXPECT_EQ(2, dataCount); - - s.start("inner", {}); - EXPECT_EQ("inner", s.currentCommandName()); - EXPECT_EQ(&ParserStates::BodyChildren, &s.currentState()); - s.end(); - EXPECT_EQ(3, startCount); - EXPECT_EQ(1, endCount); - - s.end(); - EXPECT_EQ(2, endCount); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - - s.start("body", {}); - s.data("test3"); - EXPECT_EQ("body", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Body, &s.currentState()); - s.end(); - EXPECT_EQ(4, startCount); - EXPECT_EQ(3, dataCount); - EXPECT_EQ(3, endCount); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - - s.end(); - EXPECT_EQ(4, endCount); - - EXPECT_EQ("", s.currentCommandName()); - EXPECT_EQ(&ParserStates::None, &s.currentState()); -} - -TEST(ParserStack, errorHandling) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - EXPECT_THROW(s.start("body", {}), OusiaException); - s.start("document", {}); - EXPECT_THROW(s.start("document", {}), OusiaException); - s.start("empty", {}); - EXPECT_THROW(s.start("body", {}), OusiaException); - s.start("special", {}); - s.end(); - s.end(); - s.end(); - EXPECT_EQ(&ParserStates::None, &s.currentState()); - ASSERT_THROW(s.end(), OusiaException); - ASSERT_THROW(s.data("test", 1), OusiaException); -} - -TEST(ParserStack, validation) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - logger.reset(); - s.start("arguments", {}); - EXPECT_TRUE(logger.hasError()); - s.end(); - - s.start("arguments", {{"a", 5}}); - EXPECT_TRUE(logger.hasError()); - s.end(); - - logger.reset(); - s.start("arguments", {{"a", 5}, {"b", "test"}}); - EXPECT_FALSE(logger.hasError()); - s.end(); -} -} - -- cgit v1.2.3 From 53c92aea125a439858d03245a914e20f55e5bcba Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:53:50 +0100 Subject: Fixed GCC 4.9 warnings --- test/core/model/DomainTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'test') diff --git a/test/core/model/DomainTest.cpp b/test/core/model/DomainTest.cpp index 8fcbdf2..4cb4331 100644 --- a/test/core/model/DomainTest.cpp +++ b/test/core/model/DomainTest.cpp @@ -242,7 +242,7 @@ TEST(Descriptor, getDefaultFields) A->createPrimitiveFieldDescriptor(sys->getStringType(), logger); // now we should find that. auto fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(A_prim_field, fields[0]); // remove that field from A and add it to another class. @@ -258,7 +258,7 @@ TEST(Descriptor, getDefaultFields) // but we should find it again if we set B as superclass of A. A->setSuperclass(B, logger); fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(A_prim_field, fields[0]); // and we should not be able to find it if we override the field. @@ -277,7 +277,7 @@ TEST(Descriptor, getDefaultFields) // now we should find that. fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(C_field, fields[0]); // add another transparent child class to A with a daughter class that has @@ -296,7 +296,7 @@ TEST(Descriptor, getDefaultFields) // now we should find both primitive fields, but the C field first. fields = A->getDefaultFields(); - ASSERT_EQ(2, fields.size()); + ASSERT_EQ(2U, fields.size()); ASSERT_EQ(C_field, fields[0]); ASSERT_EQ(F_field, fields[1]); } @@ -321,7 +321,7 @@ TEST(Descriptor, getPermittedChildren) * in between. */ NodeVector children = book->getPermittedChildren(); - ASSERT_EQ(3, children.size()); + ASSERT_EQ(3U, children.size()); ASSERT_EQ(section, children[0]); ASSERT_EQ(paragraph, children[1]); ASSERT_EQ(text, children[2]); @@ -331,7 +331,7 @@ TEST(Descriptor, getPermittedChildren) mgr, "Subclass", domain, Cardinality::any(), text, true, false)}; // And that should be in the result list as well now. children = book->getPermittedChildren(); - ASSERT_EQ(4, children.size()); + ASSERT_EQ(4U, children.size()); ASSERT_EQ(section, children[0]); ASSERT_EQ(paragraph, children[1]); ASSERT_EQ(text, children[2]); -- cgit v1.2.3 From 69ebaddbeaea1aa651a0f0babbf9283240d9c07b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:58:46 +0100 Subject: Slightly adapted Handler instances to new Handler, once again passing non-const references to data and start, using "parseGenericString" in DocumentHandler for resolving non-string values, added unit test for testing whether "end()" is not called if "start()" fails. --- src/core/parser/stack/DocumentHandler.cpp | 141 +++++++++++++++---------- src/core/parser/stack/DocumentHandler.hpp | 96 ++++++++++++++--- src/core/parser/stack/DomainHandler.cpp | 51 +++++---- src/core/parser/stack/DomainHandler.hpp | 28 +++-- src/core/parser/stack/Handler.cpp | 20 ++-- src/core/parser/stack/Handler.hpp | 31 +++--- src/core/parser/stack/ImportIncludeHandler.cpp | 54 ++-------- src/core/parser/stack/ImportIncludeHandler.hpp | 13 ++- src/core/parser/stack/Stack.cpp | 18 ++-- src/core/parser/stack/TypesystemHandler.cpp | 48 ++++----- src/core/parser/stack/TypesystemHandler.hpp | 131 +++++++++++++++++------ test/core/parser/stack/StackTest.cpp | 41 +++++-- 12 files changed, 422 insertions(+), 250 deletions(-) (limited to 'test') diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index ba7430d..b28f0fb 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -22,22 +22,28 @@ #include #include +#include #include #include +#include #include #include +#include namespace ousia { +namespace parser_stack { /* DocumentHandler */ -void DocumentHandler::start(Variant::mapType &args) +bool DocumentHandler::start(Variant::mapType &args) { Rooted document = - project()->createDocument(args["name"].asString()); + context().getProject()->createDocument(args["name"].asString()); document->setLocation(location()); scope().push(document); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void DocumentHandler::end() { scope().pop(); } @@ -48,7 +54,7 @@ void DocumentChildHandler::preamble(Handle parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField) { - // check if the parent in the structure tree was an explicit field + // Check if the parent in the structure tree was an explicit field // reference. inField = parentNode->isa(&RttiTypes::DocumentField); if (inField) { @@ -56,10 +62,11 @@ void DocumentChildHandler::preamble(Handle parentNode, parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); } else { - // if it wasn't an explicit reference, we use the default field. + // If it wasn't an explicit reference, we use the default field. fieldName = DEFAULT_FIELD_NAME; } - // reference the parent entity explicitly. + + // Reference the parent entity explicitly. parent = nullptr; if (parentNode->isa(&RttiTypes::StructuredEntity)) { parent = static_cast( @@ -73,6 +80,8 @@ void DocumentChildHandler::preamble(Handle parentNode, void DocumentChildHandler::createPath(const NodeVector &path, DocumentEntity *&parent) { + // TODO (@benjamin): These should be pushed onto the scope and poped once + // the scope is left. Otherwise stuff may not be correclty resolved. size_t S = path.size(); for (size_t p = 1; p < S; p = p + 2) { parent = static_cast( @@ -82,7 +91,7 @@ void DocumentChildHandler::createPath(const NodeVector &path, } } -void DocumentChildHandler::start(Variant::mapType &args) +bool DocumentChildHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); Rooted parentNode = scope().selectOrThrow( @@ -95,7 +104,7 @@ void DocumentChildHandler::start(Variant::mapType &args) preamble(parentNode, fieldName, parent, inField); - // try to find a FieldDescriptor for the given tag if we are not in a + // Try to find a FieldDescriptor for the given tag if we are not in a // field already. This does _not_ try to construct transparent paths // in between. if (!inField && parent != nullptr && @@ -104,7 +113,7 @@ void DocumentChildHandler::start(Variant::mapType &args) new DocumentField(parentNode->getManager(), fieldName, parentNode)}; field->setLocation(location()); scope().push(field); - return; + return true; } // Otherwise create a new StructuredEntity @@ -147,27 +156,39 @@ void DocumentChildHandler::start(Variant::mapType &args) } entity->setLocation(location()); scope().push(entity); + return true; } void DocumentChildHandler::end() { scope().pop(); } -std::pair DocumentChildHandler::convertData( - Handle field, Logger &logger, const std::string &data) +bool DocumentChildHandler::convertData(Handle field, + Variant &data, Logger &logger) { - // if the content is supposed to be of type string, we can finish - // directly. - auto vts = field->getPrimitiveType()->getVariantTypes(); - if (std::find(vts.begin(), vts.end(), VariantType::STRING) != vts.end()) { - return std::make_pair(true, Variant::fromString(data)); + bool valid = true; + Rooted type = field->getPrimitiveType(); + + // If the content is supposed to be of type string, we only need to check + // for "magic" values -- otherwise just call the "parseGenericString" + // function on the string data + if (type->isa(&RttiTypes::StringType)) { + const std::string &str = data.asString(); + // TODO: Referencing constants with "." separator should also work + if (Utils::isIdentifier(str)) { + data.markAsMagic(); + } + } else { + // Parse the string as generic string, assign the result + auto res = VariantReader::parseGenericString( + data.asString(), logger, data.getLocation().getSourceId(), + data.getLocation().getStart()); + data = res.second; } - // then try to parse the content using the type specification. - auto res = field->getPrimitiveType()->read( - data, logger, location().getSourceId(), location().getStart()); - return res; + // Now try to resolve the value for the primitive type + return valid && scope().resolveValue(data, type, logger); } -void DocumentChildHandler::data(const std::string &data, int fieldIdx) +bool DocumentChildHandler::data(Variant &data) { Rooted parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, @@ -180,11 +201,10 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) preamble(parentNode, fieldName, parent, inField); Rooted desc = parent->getDescriptor(); - /* - * We distinguish two cases here: One for fields that are given. - */ + + // We distinguish two cases here: One for fields that are given. if (fieldName != DEFAULT_FIELD_NAME) { - // retrieve the actual FieldDescriptor + // Retrieve the actual FieldDescriptor Rooted field = desc->getFieldDescriptor(fieldName); if (field == nullptr) { logger().error( @@ -192,49 +212,57 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) fieldName + "\" exists in descriptor\"" + desc->getName() + "\".", location()); - return; + return false; } - // if it is not primitive at all, we can't parse the content. + // If it is not primitive at all, we can't parse the content. if (!field->isPrimitive()) { logger().error(std::string("Can't handle data because field \"") + fieldName + "\" of descriptor \"" + desc->getName() + "\" is not primitive!", location()); - return; + return false; } - // then try to parse the content using the type specification. - auto res = convertData(field, logger(), data); - // add it as primitive content. - if (res.first) { - parent->createChildDocumentPrimitive(res.second, fieldName); + + // Try to convert the data variable to the correct format, abort if this + // does not work + if (!convertData(field, data, logger())) { + return false; } + + // Add it as primitive content + parent->createChildDocumentPrimitive(data, fieldName); + return true; } else { - /* - * The second case is for primitive fields. Here we search through - * all FieldDescriptors that allow primitive content at this point - * and could be constructed via transparent intermediate entities. - * We then try to parse the data using the type specified by the - * respective field. If that does not work we proceed to the next - * possible field. - */ - // retrieve all fields. + // The second case is for primitive fields. Here we search through + // all FieldDescriptors that allow primitive content at this point + // and could be constructed via transparent intermediate entities. + // We then try to parse the data using the type specified by the + // respective field. If that does not work we proceed to the next + // possible field. NodeVector fields = desc->getDefaultFields(); std::vector forks; for (auto field : fields) { - // then try to parse the content using the type specification. + // Then try to parse the content using the type specification forks.emplace_back(logger().fork()); - auto res = convertData(field, forks.back(), data); - if (res.first) { - forks.back().commit(); - // if that worked, construct the necessary path. - auto pathRes = desc->pathTo(field, logger()); - assert(pathRes.second); - NodeVector path = pathRes.first; - createPath(path, parent); - // then create the primitive element. - parent->createChildDocumentPrimitive(res.second, fieldName); - return; + + // Try to convert the data variable to the correct format, abort if + // this does not work + if (!convertData(field, data, forks.back())) { + return false; } + + // Show possible warnings that were emitted by this type conversion + forks.back().commit(); + + // If that worked, construct the necessary path + auto pathRes = desc->pathTo(field, logger()); + assert(pathRes.second); + NodeVector path = pathRes.first; + createPath(path, parent); + + // Then create the primitive element + parent->createChildDocumentPrimitive(data, fieldName); + return true; } logger().error("Could not read data with any of the possible fields:"); for (size_t f = 0; f < fields.size(); f++) { @@ -242,11 +270,14 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) SourceLocation{}, MessageMode::NO_CONTEXT); forks[f].commit(); } + return false; } + return true; +} } namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder("DocumentField").parent(&Node); +const Rtti DocumentField = RttiBuilder( + "DocumentField").parent(&Node); } } diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 475fe69..7dc4c86 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -19,13 +19,19 @@ /** * @file DocumentHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler instances used for parsing actual documents. This file + * declares to classes: The Document handler which parses the "document" command + * that introduces a new document and the "DocumentChildHandler" which parses + * the actual user defined tags. + * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_DOCUMENT_HANDLER_HPP_ -#define _OUSIA_DOCUMENT_HANDLER_HPP_ +#ifndef _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ +#define _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ #include +#include #include "Handler.hpp" @@ -36,53 +42,117 @@ class Rtti; class DocumentEntity; class FieldDescriptor; +namespace parser_stack { +/** + * The DocumentHandler class parses the "document" tag that is used to introduce + * a new document. Note that this tag is not mandatory in osml files -- if the + * first command is not a typesystem, domain or any other declarative command, + * the DocumentHandler will be implicitly called. + */ class DocumentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; + /** + * Creates a new instance of the ImportHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new DocumentHandler{handlerData}; } }; +/** + * Temporary Node that is being pushed onto the ParserScope in order to indicate + * the field the parser is currently in. The name of the Node is stored in the + * "name" field of the parent Node class. + */ class DocumentField : public Node { public: using Node::Node; }; +/** + * The DocumentChildHandler class performs the actual parsing of the user + * defined elements in an Ousía document. + */ class DocumentChildHandler : public StaticHandler { private: + /** + * Code shared by both the start() and the end() method. Checks whether the + * parser currently is in a field and returns the name of this field. + * + * @param parentNode is the next possible parent node (a document, + * a structured entity, an annotation entity or a field). + * @param fieldName is an output parameter to which the name of the current + * field is written (or unchanged if we're not in a field). + * @param parent is an output parameter to which the parent document entity + * will be written. + * @param inField is set to true if we actually are in a field. + */ void preamble(Handle parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField); + /** + * Constructs all structured entites along the given path and inserts them + * into the document graph. + * + * @param path is a path containing an alternating series of structured + * classes and fields. + * @pram parent is the root entity from which the process should be started. + */ void createPath(const NodeVector &path, DocumentEntity *&parent); - std::pair convertData(Handle field, - Logger &logger, - const std::string &data); + /** + * Tries to convert the given data to the type that is specified in the + * given primitive field. + * + * @param field is the primitive field for which the data is intended. + * @param data is the is the data that should be converted, the result is + * written into this argument as output variable. + * @param logger is the Logger instance to which error messages should be + * written. Needed to allow the convertData function to write to a forked + * Logger instance. + * @return true if the operation was successful, false otherwise. + */ + bool convertData(Handle field, Variant &data, + Logger &logger); public: - using Handler::Handler; + using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; - - bool data(const Variant &data) override; - + bool data(Variant &data) override; + + /** + * Creates a new instance of the DocumentChildHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new DocumentChildHandler{handlerData}; } }; +} namespace RttiTypes { +/** + * RttiType for the internally used DocumentField class. + */ extern const Rtti DocumentField; } } -#endif + +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/DomainHandler.cpp b/src/core/parser/stack/DomainHandler.cpp index 6571717..cb12543 100644 --- a/src/core/parser/stack/DomainHandler.cpp +++ b/src/core/parser/stack/DomainHandler.cpp @@ -20,25 +20,30 @@ #include #include +#include #include +#include namespace ousia { +namespace parser_stack { /* DomainHandler */ -void DomainHandler::start(Variant::mapType &args) +bool DomainHandler::start(Variant::mapType &args) { - Rooted domain = project()->createDomain(args["name"].asString()); + Rooted domain = + context().getProject()->createDomain(args["name"].asString()); domain->setLocation(location()); scope().push(domain); + return true; } void DomainHandler::end() { scope().pop(); } /* DomainStructHandler */ -void DomainStructHandler::start(Variant::mapType &args) +bool DomainStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -63,12 +68,13 @@ void DomainStructHandler::start(Variant::mapType &args) } scope().push(structuredClass); + return true; } void DomainStructHandler::end() { scope().pop(); } /* DomainAnnotationHandler */ -void DomainAnnotationHandler::start(Variant::mapType &args) +bool DomainAnnotationHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -79,13 +85,14 @@ void DomainAnnotationHandler::start(Variant::mapType &args) annotationClass->setLocation(location()); scope().push(annotationClass); + return true; } void DomainAnnotationHandler::end() { scope().pop(); } /* DomainAttributesHandler */ -void DomainAttributesHandler::start(Variant::mapType &args) +bool DomainAttributesHandler::start(Variant::mapType &args) { // Fetch the current typesystem and create the struct node Rooted parent = scope().selectOrThrow(); @@ -94,13 +101,14 @@ void DomainAttributesHandler::start(Variant::mapType &args) attrDesc->setLocation(location()); scope().push(attrDesc); + return true; } void DomainAttributesHandler::end() { scope().pop(); } /* DomainFieldHandler */ -void DomainFieldHandler::start(Variant::mapType &args) +bool DomainFieldHandler::start(Variant::mapType &args) { FieldDescriptor::FieldType type; if (args["isSubtree"].asBool()) { @@ -116,13 +124,14 @@ void DomainFieldHandler::start(Variant::mapType &args) field->setLocation(location()); scope().push(field); + return true; } void DomainFieldHandler::end() { scope().pop(); } /* DomainFieldRefHandler */ -void DomainFieldRefHandler::start(Variant::mapType &args) +bool DomainFieldRefHandler::start(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -135,13 +144,14 @@ void DomainFieldRefHandler::start(Variant::mapType &args) field.cast(), logger); } }); + return true; } void DomainFieldRefHandler::end() {} /* DomainPrimitiveHandler */ -void DomainPrimitiveHandler::start(Variant::mapType &args) +bool DomainPrimitiveHandler::start(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -167,13 +177,14 @@ void DomainPrimitiveHandler::start(Variant::mapType &args) }); scope().push(field); + return true; } void DomainPrimitiveHandler::end() { scope().pop(); } /* DomainChildHandler */ -void DomainChildHandler::start(Variant::mapType &args) +bool DomainChildHandler::start(Variant::mapType &args) { Rooted field = scope().selectOrThrow(); @@ -186,13 +197,12 @@ void DomainChildHandler::start(Variant::mapType &args) child.cast()); } }); + return true; } -void DomainChildHandler::end() {} - /* DomainParentHandler */ -void DomainParentHandler::start(Variant::mapType &args) +bool DomainParentHandler::start(Variant::mapType &args) { Rooted strct = scope().selectOrThrow(); @@ -200,12 +210,14 @@ void DomainParentHandler::start(Variant::mapType &args) new DomainParent(strct->getManager(), args["ref"].asString(), strct)}; parent->setLocation(location()); scope().push(parent); + return true; } void DomainParentHandler::end() { scope().pop(); } /* DomainParentFieldHandler */ -void DomainParentFieldHandler::start(Variant::mapType &args) + +bool DomainParentFieldHandler::start(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); FieldDescriptor::FieldType type; @@ -233,13 +245,12 @@ void DomainParentFieldHandler::start(Variant::mapType &args) field->addChild(strct.cast()); } }); + return true; } -void DomainParentFieldHandler::end() {} - /* DomainParentFieldRefHandler */ -void DomainParentFieldRefHandler::start(Variant::mapType &args) +bool DomainParentFieldRefHandler::start(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); @@ -265,12 +276,12 @@ void DomainParentFieldRefHandler::start(Variant::mapType &args) field->addChild(strct.cast()); } }); + return true; +} } - -void DomainParentFieldRefHandler::end() {} namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder("DomainParent").parent(&Node); +const Rtti DomainParent = RttiBuilder( + "DomainParent").parent(&Node); } } diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 5e8ea60..917d65d 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -19,17 +19,24 @@ /** * @file DomainHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler classes used for parsing Domain descriptors. This + * includes the "domain" tag and all describing tags below the "domain" tag. + * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_DOMAIN_HANDLER_HPP_ #define _OUSIA_DOMAIN_HANDLER_HPP_ #include +#include #include "Handler.hpp" namespace ousia { +namespace parser_stack { + +// TODO: Documentation // Forward declarations class Rtti; @@ -39,7 +46,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -53,7 +59,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -67,7 +72,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -81,7 +85,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -95,7 +98,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -109,7 +111,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -123,7 +124,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -138,8 +138,6 @@ public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainChildHandler{handlerData}; @@ -160,7 +158,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -175,8 +172,6 @@ public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainParentFieldHandler{handlerData}; @@ -189,12 +184,15 @@ public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainParentFieldRefHandler{handlerData}; } }; } + +namespace RttiTypes { +extern const Rtti DomainParent; +} +} #endif diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index a608f7f..86000c4 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -65,6 +65,8 @@ Logger &Handler::logger() const SourceLocation &Handler::location() const { return handlerData.location; } +const std::string &Handler::name() const { return handlerData.name; } + void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) { /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ @@ -80,7 +82,7 @@ void Handler::unregisterToken(const std::string &token) /*handlerData.callbacks.unregisterToken(token);*/ } -const std::string &Handler::getName() const { return handlerData.name; } +const std::string &Handler::getName() const { return name(); } const State &Handler::getState() const { return handlerData.state; } @@ -92,7 +94,7 @@ const SourceLocation &Handler::getLocation() const { return location(); } /* Class EmptyHandler */ -bool EmptyHandler::start(const Variant::mapType &args) +bool EmptyHandler::start(Variant::mapType &args) { // Just accept anything return true; @@ -115,7 +117,7 @@ void EmptyHandler::fieldEnd() } bool EmptyHandler::annotationStart(const Variant &className, - const Variant::mapType &args) + Variant::mapType &args) { // Accept any data return true; @@ -128,7 +130,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool EmptyHandler::data(const Variant &data) +bool EmptyHandler::data(Variant &data) { // Support any data return true; @@ -141,7 +143,7 @@ Handler *EmptyHandler::create(const HandlerData &handlerData) /* Class StaticHandler */ -bool StaticHandler::start(const Variant::mapType &args) +bool StaticHandler::start(Variant::mapType &args) { // Do nothing in the default implementation, accept anything return true; @@ -169,7 +171,7 @@ void StaticHandler::fieldEnd() } bool StaticHandler::annotationStart(const Variant &className, - const Variant::mapType &args) + Variant::mapType &args) { // No annotations supported return false; @@ -182,7 +184,7 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(const Variant &data) +bool StaticHandler::data(Variant &data) { logger().error("Did not expect any data here", data); return false; @@ -196,7 +198,7 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, { } -bool StaticFieldHandler::start(const Variant::mapType &args) +bool StaticFieldHandler::start(Variant::mapType &args) { if (!argName.empty()) { auto it = args.find(argName); @@ -225,7 +227,7 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(const Variant &data) +bool StaticFieldHandler::data(Variant &data) { // Call the doHandle function if this has not been done before if (!handled) { diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index eeaf555..7cda7a4 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -151,6 +151,13 @@ protected: */ const SourceLocation &location() const; + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name() const; + public: /** * Virtual destructor. @@ -229,7 +236,7 @@ public: * @return true if the handler was successful in starting the element it * represents, false otherwise. */ - virtual bool start(const Variant::mapType &args) = 0; + virtual bool start(Variant::mapType &args) = 0; /** * Called before the command for which this handler is defined ends (is @@ -270,7 +277,7 @@ public: * if an error occurred. */ virtual bool annotationStart(const Variant &className, - const Variant::mapType &args) = 0; + Variant::mapType &args) = 0; /** * Called whenever an annotation ends while this handler is active. The @@ -296,7 +303,7 @@ public: * location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(const Variant &data) = 0; + virtual bool data(Variant &data) = 0; }; /** @@ -318,15 +325,15 @@ protected: using Handler::Handler; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; bool annotationStart(const Variant &className, - const Variant::mapType &args) override; + Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(const Variant &data) override; + bool data(Variant &data) override; /** * Creates an instance of the EmptyHandler class. @@ -344,15 +351,15 @@ protected: using Handler::Handler; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; bool annotationStart(const Variant &className, - const Variant::mapType &args) override; + Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(const Variant &data) override; + bool data(Variant &data) override; }; /** @@ -400,12 +407,12 @@ protected: * @param args are the arguments that were given in the "start" function. */ virtual void doHandle(const Variant &fieldData, - const Variant::mapType &args) = 0; + Variant::mapType &args) = 0; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; - bool data(const Variant &data) override; + bool data(Variant &data) override; }; } } diff --git a/src/core/parser/stack/ImportIncludeHandler.cpp b/src/core/parser/stack/ImportIncludeHandler.cpp index 94ee82d..797dd8d 100644 --- a/src/core/parser/stack/ImportIncludeHandler.cpp +++ b/src/core/parser/stack/ImportIncludeHandler.cpp @@ -18,48 +18,16 @@ #include "ImportIncludeHandler.hpp" +#include #include +#include namespace ousia { - -/* ImportIncludeHandler */ - -void ImportIncludeHandler::start(Variant::mapType &args) -{ - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); -} - -void ImportIncludeHandler::data(const std::string &data, int field) -{ - if (srcInArgs) { - logger().error("\"src\" attribute has already been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); -} +namespace parser_stack { /* ImportHandler */ -void ImportHandler::start(Variant::mapType &args) -{ - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } -} - -void ImportHandler::end() +void ImportHandler::doHandle(const Variant &fieldData, Variant::mapType &args) { // Fetch the last node and check whether an import is valid at this // position @@ -75,8 +43,9 @@ void ImportHandler::end() // Perform the actual import, register the imported node within the leaf // node - Rooted imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); + Rooted imported = context().import( + fieldData.asString(), args["type"].asString(), args["rel"].asString(), + leafRootNode->getReferenceTypes()); if (imported != nullptr) { leafRootNode->reference(imported); } @@ -84,13 +53,10 @@ void ImportHandler::end() /* IncludeHandler */ -void IncludeHandler::start(Variant::mapType &args) +void IncludeHandler::doHandle(const Variant &fieldData, Variant::mapType &args) { - ImportIncludeHandler::start(args); + context().include(fieldData.asString(), args["type"].asString(), + args["rel"].asString(), {&RttiTypes::Node}); } - -void IncludeHandler::end() -{ - context().include(src, type, rel, {&RttiTypes::Node}); } } diff --git a/src/core/parser/stack/ImportIncludeHandler.hpp b/src/core/parser/stack/ImportIncludeHandler.hpp index f9abe55..8f3d3d0 100644 --- a/src/core/parser/stack/ImportIncludeHandler.hpp +++ b/src/core/parser/stack/ImportIncludeHandler.hpp @@ -29,9 +29,11 @@ #define _OUSIA_IMPORT_INCLUDE_HANDLER_HPP_ #include -#include + +#include "Handler.hpp" namespace ousia { +namespace parser_stack { /** * The ImportHandler is responsible for handling the "import" command. An import @@ -46,7 +48,7 @@ public: using StaticFieldHandler::StaticFieldHandler; void doHandle(const Variant &fieldData, - const Variant::mapType &args) override; + Variant::mapType &args) override; /** * Creates a new instance of the ImportHandler. @@ -57,7 +59,7 @@ public: */ static Handler *create(const HandlerData &handlerData) { - return new ImportHandler{handlerData}; + return new ImportHandler{handlerData, "src"}; } }; @@ -72,7 +74,7 @@ public: using StaticFieldHandler::StaticFieldHandler; void doHandle(const Variant &fieldData, - const Variant::mapType &args) override; + Variant::mapType &args) override; /** * Creates a new instance of the IncludeHandler. @@ -83,8 +85,9 @@ public: */ static Handler *create(const HandlerData &handlerData) { - return new IncludeHandler{handlerData}; + return new IncludeHandler{handlerData, "src"}; } }; } +} #endif diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index d84a19c..47f7d2c 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -316,8 +316,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args) name); } - State const *lastTargetState = nullptr; - Variant::mapType canonicalArgs; while (true) { // Try to find a target state for the given command, if none can be // found and the current command does not have an open field, then try @@ -342,14 +340,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args) // Fork the logger. We do not want any validation errors to skip LoggerFork loggerFork = logger().fork(); - // Canonicalize the arguments (if this has not already been done), allow - // additional arguments - if (lastTargetState != targetState) { - canonicalArgs = args; - targetState->arguments.validateMap(canonicalArgs, loggerFork, true); - lastTargetState = targetState; - } - // Instantiate the handler and push it onto the stack HandlerConstructor ctor = targetState->elementHandler ? targetState->elementHandler @@ -369,6 +359,11 @@ void Stack::command(const Variant &name, const Variant::mapType &args) bool validStack = handlersValid(); info.valid = false; if (validStack) { + // Canonicalize the arguments (if this has not already been done), + // allow additional arguments + Variant::mapType canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + handler->setLogger(loggerFork); try { info.valid = handler->start(canonicalArgs); @@ -430,7 +425,8 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - valid = info.handler->data(data); + Variant dataCopy = data; + valid = info.handler->data(dataCopy); } catch (LoggableException ex) { loggerFork.log(ex); diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index 2cc7dfb..34f64f9 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -20,28 +20,33 @@ #include #include +#include + namespace ousia { +namespace parser_stack { /* TypesystemHandler */ -void TypesystemHandler::start(Variant::mapType &args) +bool TypesystemHandler::start(Variant::mapType &args) { // Create the typesystem instance Rooted typesystem = - project()->createTypesystem(args["name"].asString()); + context().getProject()->createTypesystem(args["name"].asString()); typesystem->setLocation(location()); // Push the typesystem onto the scope, set the POST_HEAD flag to true scope().push(typesystem); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void TypesystemHandler::end() { scope().pop(); } /* TypesystemEnumHandler */ -void TypesystemEnumHandler::start(Variant::mapType &args) +bool TypesystemEnumHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -52,33 +57,24 @@ void TypesystemEnumHandler::start(Variant::mapType &args) enumType->setLocation(location()); scope().push(enumType); + + return true; } void TypesystemEnumHandler::end() { scope().pop(); } /* TypesystemEnumEntryHandler */ -void TypesystemEnumEntryHandler::start(Variant::mapType &args) {} - -void TypesystemEnumEntryHandler::end() +void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, + Variant::mapType &args) { Rooted enumType = scope().selectOrThrow(); - enumType->addEntry(entry, logger()); -} - -void TypesystemEnumEntryHandler::data(const std::string &data, int field) -{ - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - entry.append(data); + enumType->addEntry(fieldData.asString(), logger()); } /* TypesystemStructHandler */ -void TypesystemStructHandler::start(Variant::mapType &args) +bool TypesystemStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -103,13 +99,15 @@ void TypesystemStructHandler::start(Variant::mapType &args) }); } scope().push(structType); + + return true; } void TypesystemStructHandler::end() { scope().pop(); } /* TypesystemStructFieldHandler */ -void TypesystemStructFieldHandler::start(Variant::mapType &args) +bool TypesystemStructFieldHandler::start(Variant::mapType &args) { // Read the argument values const std::string &name = args["name"].asString(); @@ -142,13 +140,13 @@ void TypesystemStructFieldHandler::start(Variant::mapType &args) } }); } -} -void TypesystemStructFieldHandler::end() {} + return true; +} /* TypesystemConstantHandler */ -void TypesystemConstantHandler::start(Variant::mapType &args) +bool TypesystemConstantHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -169,7 +167,9 @@ void TypesystemConstantHandler::start(Variant::mapType &args) constant.cast()->setType(type.cast(), logger); } }); -} -void TypesystemConstantHandler::end() {} + return true; } +} +} + diff --git a/src/core/parser/stack/TypesystemHandler.hpp b/src/core/parser/stack/TypesystemHandler.hpp index 76a7bc9..55277a1 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -19,6 +19,9 @@ /** * @file TypesystemHandler.hpp * + * Contains the Handler classes used to parse Typesystem descriptions. The + * Handlers parse all the tags found below and including the "typesystem" tag. + * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -26,96 +29,154 @@ #define _OUSIA_TYPESYSTEM_HANDLER_HPP_ #include -#include + +#include "Handler.hpp" namespace ousia { +namespace parser_stack { -class TypesystemHandler : public Handler { +/** + * Handles the occurance of the "typesystem" tag. Creates a new Typesystem + * instance and places it on the ParserScope. + */ +class TypesystemHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemHandler{handlerData}; } }; -class TypesystemEnumHandler : public Handler { +/** + * Handles the occurance of the "enum" tag. Creates a new EnumType instance and + * places it on the ParserScope. + */ +class TypesystemEnumHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemEnumHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemEnumHandler{handlerData}; } }; -class TypesystemEnumEntryHandler : public Handler { +/** + * Handles the occurance of the "entry" tag within an "enum" tag. Creates a new + * EnumType instance and places it on the ParserScope. + */ +class TypesystemEnumEntryHandler : public StaticFieldHandler { public: - using Handler::Handler; - - std::string entry; - - void start(Variant::mapType &args) override; - - void end() override; - - void data(const std::string &data, int field) override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + Variant::mapType &args) override; + + /** + * Creates a new instance of the TypesystemEnumEntryHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { - return new TypesystemEnumEntryHandler{handlerData}; + return new TypesystemEnumEntryHandler{handlerData, "name"}; } }; -class TypesystemStructHandler : public Handler { +/** + * Handles the occurance of the "struct" tag within a typesystem description. + * Creates a new StructType instance and places it on the ParserScope. + */ +class TypesystemStructHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemStructHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructHandler{handlerData}; } }; -class TypesystemStructFieldHandler : public Handler { +/** + * Handles the occurance of the "field" tag within a typesystem structure + * description. Places a new Attribute instance in the StructType instance + * that is currently at the top of the scope. + */ +class TypesystemStructFieldHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemStructFieldHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructFieldHandler{handlerData}; } }; -class TypesystemConstantHandler : public Handler { +/** + * Handles the occurance of the "constant" tag within a typesystem structure + * description. Places a new Constant instance in the current typesystem. + */ +class TypesystemConstantHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemConstantHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemConstantHandler{handlerData}; } }; } +} #endif diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index 7cc8bc5..321d471 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -112,16 +112,21 @@ private: TestHandler(const HandlerData &handlerData) : Handler(handlerData) {} public: - bool start(const Variant::mapType &args) + bool start(Variant::mapType &args) override { tracker.startCount++; tracker.startArgs = args; + if (!tracker.startResult) { + logger().error( + "The TestHandler was told not to allow a field start. So it " + "doesn't. The TestHandler always obeys its master."); + } return tracker.startResult; } - void end() { tracker.endCount++; } + void end() override { tracker.endCount++; } - bool fieldStart(bool &isDefault, size_t fieldIdx) + bool fieldStart(bool &isDefault, size_t fieldIdx) override { tracker.fieldStartCount++; tracker.fieldStartIsDefault = isDefault; @@ -132,9 +137,10 @@ public: return tracker.fieldStartResult; } - void fieldEnd() { tracker.fieldEndCount++; } + void fieldEnd() override { tracker.fieldEndCount++; } - bool annotationStart(const Variant &className, const Variant::mapType &args) + bool annotationStart(const Variant &className, + Variant::mapType &args) override { tracker.annotationStartCount++; tracker.annotationStartClassName = className; @@ -142,7 +148,8 @@ public: return tracker.annotationStartResult; } - bool annotationEnd(const Variant &className, const Variant &elementName) + bool annotationEnd(const Variant &className, + const Variant &elementName) override { tracker.annotationEndCount++; tracker.annotationEndClassName = className; @@ -150,7 +157,7 @@ public: return tracker.annotationEndResult; } - bool data(const Variant &data) + bool data(Variant &data) override { tracker.dataCount++; tracker.dataData = data; @@ -458,6 +465,26 @@ TEST(Stack, noImplicitDefaultFieldIfDefaultFieldGiven) ASSERT_FALSE(logger.hasError()); } +TEST(Stack, noEndIfStartFails) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.startResult = false; + s.command("b", {}); + tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_TRUE(logger.hasError()); +} + TEST(Stack, implicitDefaultFieldOnData) { tracker.reset(); -- cgit v1.2.3 From b7ffeb3dca889aee1c878e2ef0f07644f910dba2 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 20:58:05 +0100 Subject: Made OsxmlEvents interface consistent with Stack commands --- src/formats/osxml/OsxmlEventParser.cpp | 2 +- src/formats/osxml/OsxmlEventParser.hpp | 24 +++++++++-------- test/formats/osxml/OsxmlEventParserTest.cpp | 41 +++++++++++++---------------- 3 files changed, 32 insertions(+), 35 deletions(-) (limited to 'test') diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index b4aff77..7404960 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -329,7 +329,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); - parser->getEvents().commandStart(nameVar, args); + parser->getEvents().command(nameVar, args); } } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index aa20ea9..e39245f 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -58,34 +58,36 @@ public: * * @param name is a string variant containing name and location of the * command. - * @param args is a map variant containing the arguments that were given - * to the command. + * @param args is a map containing the arguments that were given to the + * command. */ - virtual void commandStart(Variant name, Variant args) = 0; + virtual void command(const Variant &name, const Variant::mapType &args) = 0; /** * Called whenever an annotation starts. Note that this implicitly always * starts the default field of the annotation. * - * @param name is a string variant containing the name of the annotation - * class and the location of the annotation definition. + * @param className is a string variant containing the name of the + * annotation class and the location of the annotation definition. * @param args is a map variant containing the arguments that were given * to the annotation definition. */ - virtual void annotationStart(Variant name, Variant args) = 0; + virtual void annotationStart(const Variant &className, + const Variant::mapType &args) = 0; /** * Called whenever the range of an annotation ends. The callee must * disambiguate the actual annotation that is finished here. * - * @param name is a string variant containing the name of the annotation - * class that should end here. May be empty (or nullptr), if no elementName - * has been specified at the end of the annotation. + * @param className is a string variant containing the name of the + * annotation class that should end here. May be empty (or nullptr), if no + * elementName has been specified at the end of the annotation. * @param elementName is the name of the annotation element that should be * ended here. May be empty (or nullptr), if no elementName has been * specified at the end of the annotation. */ - virtual void annotationEnd(Variant name, Variant elementName) = 0; + virtual void annotationEnd(const Variant &className, + const Variant &elementName) = 0; /** * Called whenever the default field which was implicitly started by @@ -105,7 +107,7 @@ public: * @param data is the already parsed data that should be passed to the * handler. */ - virtual void data(Variant data) = 0; + virtual void data(const Variant &data) = 0; }; /** diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 06c800f..3293370 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -31,7 +31,7 @@ static TerminalLogger logger(std::cerr, true); namespace { enum class OsxmlEvent { - COMMAND_START, + COMMAND, ANNOTATION_START, ANNOTATION_END, FIELD_END, @@ -42,22 +42,24 @@ class TestOsxmlEventListener : public OsxmlEvents { public: std::vector> events; - void commandStart(Variant name, Variant args) override + void command(const Variant &name, const Variant::mapType &args) override { - events.emplace_back(OsxmlEvent::COMMAND_START, + events.emplace_back(OsxmlEvent::COMMAND, Variant::arrayType{name, args}); } - void annotationStart(Variant name, Variant args) override + void annotationStart(const Variant &className, + const Variant::mapType &args) override { events.emplace_back(OsxmlEvent::ANNOTATION_START, - Variant::arrayType{name, args}); + Variant::arrayType{className, args}); } - void annotationEnd(Variant name, Variant elementName) override + void annotationEnd(const Variant &className, + const Variant &elementName) override { events.emplace_back(OsxmlEvent::ANNOTATION_END, - Variant::arrayType{name, elementName}); + Variant::arrayType{className, elementName}); } void fieldEnd() override @@ -65,7 +67,7 @@ public: events.emplace_back(OsxmlEvent::FIELD_END, Variant::arrayType{}); } - void data(Variant data) override + void data(const Variant &data) override { events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{data}); } @@ -91,7 +93,7 @@ TEST(OsxmlEventParser, simpleCommandWithArgs) // 0 1 2 3 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, + {OsxmlEvent::COMMAND, Variant::arrayType{ "a", Variant::mapType{ {"name", "test"}, {"a", 1}, {"b", 2}, {"c", "blub"}}}}, @@ -131,11 +133,9 @@ TEST(OsxmlEventParser, magicTopLevelTag) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}, - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"b", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"b", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; auto events = parseXml(testString); @@ -147,9 +147,8 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"a", Variant::mapType{}}}}, - {OsxmlEvent::COMMAND_START, + {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"ousia", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -165,8 +164,7 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -185,8 +183,7 @@ TEST(OsxmlEventParser, commandWithDataTrimWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -205,8 +202,7 @@ TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -217,6 +213,5 @@ TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); } - } -- cgit v1.2.3 From c298f00ef1633a663775fe9a715a249b9f4d255d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 20:58:26 +0100 Subject: Implemented OsxmlParser --- CMakeLists.txt | 2 + src/formats/osxml/OsxmlParser.cpp | 288 +++++++++------------------------ src/formats/osxml/OsxmlParser.hpp | 2 +- test/formats/osxml/OsxmlParserTest.cpp | 28 ++-- 4 files changed, 91 insertions(+), 229 deletions(-) (limited to 'test') diff --git a/CMakeLists.txt b/CMakeLists.txt index 2106cf0..ec1bb4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,7 @@ TARGET_LINK_LIBRARIES(ousia_osml ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser + src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -351,6 +352,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_osxml test/formats/osxml/OsxmlEventParserTest + test/formats/osxml/OsxmlParserTest ) TARGET_LINK_LIBRARIES(ousia_test_osxml diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index 869c76a..c216855 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,223 +16,83 @@ along with this program. If not, see . */ -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "XmlParser.hpp" +#include +#include +#include + +#include "OsxmlEventParser.hpp" +#include "OsxmlParser.hpp" namespace ousia { -namespace ParserStates { -/* Document states */ -static const ParserState Document = - ParserStateBuilder() - .parent(&None) - .createdNodeType(&RttiTypes::Document) - .elementHandler(DocumentHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = - ParserStateBuilder() - .parents({&Document, &DocumentChild}) - .createdNodeTypes({&RttiTypes::StructureNode, - &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() - .parents({&None, &Document}) - .createdNodeType(&RttiTypes::Domain) - .elementHandler(DomainHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::StructuredClass) - .elementHandler(DomainStructHandler::create) - .arguments({Argument::String("name"), - Argument::Cardinality("cardinality", Cardinality::any()), - Argument::Bool("isRoot", false), - Argument::Bool("transparent", false), - Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::AnnotationClass) - .elementHandler(DomainAnnotationHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(DomainAttributesHandler::create) - .arguments({}); - -static const ParserState DomainAttribute = - ParserStateBuilder() - .parent(&DomainAttributes) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldHandler::create) - .arguments({Argument::String("name", ""), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldRefHandler::create) - .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainPrimitiveHandler::create) - .arguments( - {Argument::String("name", ""), Argument::Bool("isSubtree", false), - Argument::Bool("optional", false), Argument::String("type")}); - -static const ParserState DomainStructChild = - ParserStateBuilder() - .parent(&DomainField) - .elementHandler(DomainChildHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = - ParserStateBuilder() - .parent(&DomainStruct) - .createdNodeType(&RttiTypes::DomainParent) - .elementHandler(DomainParentHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParentField = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldHandler::create) - .arguments({Argument::String("name", ""), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldRefHandler::create) - .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = - ParserStateBuilder() - .parents({&None, &Domain}) - .createdNodeType(&RttiTypes::Typesystem) - .elementHandler(TypesystemHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState TypesystemEnum = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::EnumType) - .elementHandler(TypesystemEnumHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = - ParserStateBuilder() - .parent(&TypesystemEnum) - .elementHandler(TypesystemEnumEntryHandler::create) - .arguments({}); - -static const ParserState TypesystemStruct = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(TypesystemStructHandler::create) - .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = - ParserStateBuilder() - .parent(&TypesystemStruct) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::Constant) - .elementHandler(TypesystemConstantHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = - ParserStateBuilder() - .parents({&Document, &Typesystem, &Domain}) - .elementHandler(ImportHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const ParserState Include = - ParserStateBuilder() - .parent(&All) - .elementHandler(IncludeHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const std::multimap XmlStates{ - {"document", &Document}, - {"*", &DocumentChild}, - {"domain", &Domain}, - {"struct", &DomainStruct}, - {"annotation", &DomainAnnotation}, - {"attributes", &DomainAttributes}, - {"attribute", &DomainAttribute}, - {"field", &DomainField}, - {"fieldRef", &DomainFieldRef}, - {"primitive", &DomainStructPrimitive}, - {"childRef", &DomainStructChild}, - {"parentRef", &DomainStructParent}, - {"field", &DomainStructParentField}, - {"fieldRef", &DomainStructParentFieldRef}, - {"typesystem", &Typesystem}, - {"enum", &TypesystemEnum}, - {"entry", &TypesystemEnumEntry}, - {"struct", &TypesystemStruct}, - {"field", &TypesystemStructField}, - {"constant", &TypesystemConstant}, - {"import", &Import}, - {"include", &Include}}; +using namespace parser_stack; + +/** + * Class containing the actual OsxmlParser implementation. + */ +class OsxmlParserImplementation : public OsxmlEvents { +private: + /** + * Actual xml parser -- converts the xml stream into a set of events. + */ + OsxmlEventParser parser; + + /** + * Pushdown automaton responsible for converting the xml events into an + * actual Node tree. + */ + Stack stack; + +public: + /** + * Constructor of the OsxmlParserImplementation class. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. + */ + OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) + : parser(reader, *this, ctx.getLogger()), + stack(ctx, GenericParserStates) + { + } + + /** + * Starts the actual parsing process. + */ + void parse() { parser.parse(); } + + void command(const Variant &name, const Variant::mapType &args) override + { + stack.command(name, args); + stack.fieldStart(true); + } + + void annotationStart(const Variant &name, + const Variant::mapType &args) override + { + stack.annotationStart(name, args); + stack.fieldStart(true); + } + + void annotationEnd(const Variant &className, + const Variant &elementName) override + { + stack.annotationEnd(className, elementName); + } + + void fieldEnd() override { stack.fieldEnd(); } + + void data(const Variant &data) override { stack.data(data); } +}; + +/* Class OsxmlParser */ + +void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsxmlParserImplementation impl(reader, ctx); + impl.parse(); } - - } diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp index 281a49c..0fbf83c 100644 --- a/src/formats/osxml/OsxmlParser.hpp +++ b/src/formats/osxml/OsxmlParser.hpp @@ -17,7 +17,7 @@ */ /** - * @file XmlParser.hpp + * @file OsxmlParser.hpp * * Contains the parser responsible for reading Ousía XML Documents (extension * oxd) and Ousía XML Modules (extension oxm). diff --git a/test/formats/osxml/OsxmlParserTest.cpp b/test/formats/osxml/OsxmlParserTest.cpp index 269a3f6..a2bd8b1 100644 --- a/test/formats/osxml/OsxmlParserTest.cpp +++ b/test/formats/osxml/OsxmlParserTest.cpp @@ -30,7 +30,7 @@ #include #include -#include +#include namespace ousia { @@ -41,7 +41,7 @@ extern const Rtti Typesystem; } struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; + OsxmlParser parser; FileLocator fileLocator; XmlStandaloneEnvironment(ConcreteLogger &logger) @@ -52,21 +52,21 @@ struct XmlStandaloneEnvironment : public StandaloneEnvironment { registry.registerDefaultExtensions(); registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); + {&RttiTypes::Node}, &parser); registry.registerResourceLocator(&fileLocator); } }; static TerminalLogger logger(std::cerr, true); -TEST(XmlParser, mismatchedTag) +TEST(OsxmlParser, mismatchedTag) { XmlStandaloneEnvironment env(logger); env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); ASSERT_TRUE(logger.hasError()); } -TEST(XmlParser, generic) +TEST(OsxmlParser, generic) { XmlStandaloneEnvironment env(logger); env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); @@ -186,7 +186,7 @@ static void checkFieldDescriptor( Handle primitiveType = nullptr, bool optional = false) { auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); + ASSERT_EQ(1U, res.size()); checkFieldDescriptor(res[0].node, name, parent, children, type, primitiveType, optional); } @@ -201,7 +201,7 @@ static void checkFieldDescriptor( optional); } -TEST(XmlParser, domainParsing) +TEST(OsxmlParser, domainParsing) { XmlStandaloneEnvironment env(logger); Rooted book_domain_node = @@ -339,10 +339,10 @@ static void checkText(Handle p, Handle expectedParent, { checkStructuredEntity(p, expectedParent, doc, "paragraph"); Rooted par = p.cast(); - ASSERT_EQ(1, par->getField().size()); + ASSERT_EQ(1U, par->getField().size()); checkStructuredEntity(par->getField()[0], par, doc, "text"); Rooted text = par->getField()[0].cast(); - ASSERT_EQ(1, text->getField().size()); + ASSERT_EQ(1U, text->getField().size()); Handle d = text->getField()[0]; ASSERT_FALSE(d == nullptr); @@ -352,7 +352,7 @@ static void checkText(Handle p, Handle expectedParent, ASSERT_EQ(expected, prim->getContent()); } -TEST(XmlParser, documentParsing) +TEST(OsxmlParser, documentParsing) { XmlStandaloneEnvironment env(logger); Rooted book_document_node = @@ -364,7 +364,7 @@ TEST(XmlParser, documentParsing) checkStructuredEntity(doc->getRoot(), doc, doc, "book"); { Rooted book = doc->getRoot(); - ASSERT_EQ(2, book->getField().size()); + ASSERT_EQ(2U, book->getField().size()); checkText(book->getField()[0], book, doc, "This might be some introductory text or a dedication."); checkStructuredEntity(book->getField()[1], book, doc, "chapter", @@ -372,7 +372,7 @@ TEST(XmlParser, documentParsing) { Rooted chapter = book->getField()[1].cast(); - ASSERT_EQ(3, chapter->getField().size()); + ASSERT_EQ(3U, chapter->getField().size()); checkText(chapter->getField()[0], chapter, doc, "Here we might have an introduction to the chapter."); checkStructuredEntity(chapter->getField()[1], chapter, doc, @@ -381,7 +381,7 @@ TEST(XmlParser, documentParsing) { Rooted section = chapter->getField()[1].cast(); - ASSERT_EQ(1, section->getField().size()); + ASSERT_EQ(1U, section->getField().size()); checkText(section->getField()[0], section, doc, "Here we might find the actual section content."); } @@ -391,7 +391,7 @@ TEST(XmlParser, documentParsing) { Rooted section = chapter->getField()[2].cast(); - ASSERT_EQ(1, section->getField().size()); + ASSERT_EQ(1U, section->getField().size()); checkText(section->getField()[0], section, doc, "Here we might find the actual section content."); } -- cgit v1.2.3