From efe60ac3c3a8725ac71329c0bb19fa9d9c58f399 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:42:05 +0100 Subject: Moved specific file format parsers to formats/ folder, moved old tokenizer to css code (this is the only place where it is actually used) --- test/core/CodeTokenizerTest.cpp | 100 ---------------------------------- test/core/TokenizerTest.cpp | 118 ---------------------------------------- 2 files changed, 218 deletions(-) delete mode 100644 test/core/CodeTokenizerTest.cpp delete mode 100644 test/core/TokenizerTest.cpp (limited to 'test/core') diff --git a/test/core/CodeTokenizerTest.cpp b/test/core/CodeTokenizerTest.cpp deleted file mode 100644 index 2d4d5a7..0000000 --- a/test/core/CodeTokenizerTest.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -namespace ousia { - -static const int BLOCK_COMMENT = 30; -static const int LINE_COMMENT = 31; -static const int STRING = 20; -static const int ESCAPE = 21; -static const int LINEBREAK = 21; -static const int CURLY_OPEN = 40; -static const int CURLY_CLOSE = 41; - -TEST(CodeTokenizer, testTokenizer) -{ - CharReader reader{ - "/**\n" // 1 - " * Some Block Comment\n" // 2 - " */\n" // 3 - "var my_string = 'My \\'String\\'';\n" // 4 - "// and a line comment\n" // 5 - "var my_obj = { a = 4;}", 0}; // 6 - // 123456789012345678901234567890123456789 - // 0 1 2 3 - TokenTreeNode root{{{"/*", 1}, - {"*/", 2}, - {"//", 3}, - {"'", 4}, - {"\\", 5}, - {"{", CURLY_OPEN}, - {"}", CURLY_CLOSE}, - {"\n", 6}}}; - std::map descriptors{ - // the block comment start Token has the id 1 and if the Tokenizer - // returns a Block Comment Token that should have the id 10. - {1, {CodeTokenMode::BLOCK_COMMENT_START, BLOCK_COMMENT}}, - {2, {CodeTokenMode::BLOCK_COMMENT_END, BLOCK_COMMENT}}, - {3, {CodeTokenMode::LINE_COMMENT, LINE_COMMENT}}, - {4, {CodeTokenMode::STRING_START_END, STRING}}, - {5, {CodeTokenMode::ESCAPE, ESCAPE}}, - {6, {CodeTokenMode::LINEBREAK, LINEBREAK}}}; - - std::vector expected = { - {BLOCK_COMMENT, "*\n * Some Block Comment\n ", SourceLocation{0, 0, 29}}, - {LINEBREAK, "\n", SourceLocation{0, 29, 30}}, - {TOKEN_TEXT, "var", SourceLocation{0, 30, 33}}, - {TOKEN_TEXT, "my_string", SourceLocation{0, 34, 43}}, - {TOKEN_TEXT, "=", SourceLocation{0, 44, 45}}, - {STRING, "My 'String'", SourceLocation{0, 46, 61}}, - {TOKEN_TEXT, ";", SourceLocation{0, 61, 62}}, - {LINEBREAK, "\n", SourceLocation{0, 62, 63}}, - // this is slightly counter-intuitive but makes sense if you think about - // it: As a line comment is ended by a line break the line break is - // technically still a part of the line comment and thus the ending - // is in the next line. 
- {LINE_COMMENT, " and a line comment", SourceLocation{0, 63, 85}}, - {TOKEN_TEXT, "var", SourceLocation{0, 85, 88}}, - {TOKEN_TEXT, "my_obj", SourceLocation{0, 89, 95}}, - {TOKEN_TEXT, "=", SourceLocation{0, 96, 97}}, - {CURLY_OPEN, "{", SourceLocation{0, 98, 99}}, - {TOKEN_TEXT, "a", SourceLocation{0, 100, 101}}, - {TOKEN_TEXT, "=", SourceLocation{0, 102, 103}}, - {TOKEN_TEXT, "4;", SourceLocation{0, 104, 106}}, - {CURLY_CLOSE, "}", SourceLocation{0, 106, 107}}, - }; - - CodeTokenizer tokenizer{reader, root, descriptors}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - diff --git a/test/core/TokenizerTest.cpp b/test/core/TokenizerTest.cpp deleted file mode 100644 index c53f93d..0000000 --- a/test/core/TokenizerTest.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include - -namespace ousia { -TEST(TokenTreeNode, testConstructor) -{ - TokenTreeNode root{{{"a", 1}, {"aab", 2}, {"aac", 3}, {"abd", 4}}}; - - ASSERT_EQ(-1, root.tokenId); - ASSERT_EQ(1U, root.children.size()); - ASSERT_TRUE(root.children.find('a') != root.children.end()); - - const TokenTreeNode &a = root.children.at('a'); - ASSERT_EQ(1, a.tokenId); - ASSERT_EQ(2U, a.children.size()); - ASSERT_TRUE(a.children.find('a') != a.children.end()); - ASSERT_TRUE(a.children.find('b') != a.children.end()); - - const TokenTreeNode &aa = a.children.at('a'); - ASSERT_EQ(-1, aa.tokenId); - ASSERT_EQ(2U, aa.children.size()); - ASSERT_TRUE(aa.children.find('b') != aa.children.end()); - ASSERT_TRUE(aa.children.find('c') != aa.children.end()); - - const TokenTreeNode &aab = aa.children.at('b'); - ASSERT_EQ(2, aab.tokenId); - ASSERT_EQ(0U, aab.children.size()); - - const TokenTreeNode &aac = aa.children.at('c'); - ASSERT_EQ(3, aac.tokenId); - ASSERT_EQ(0U, aac.children.size()); - - const TokenTreeNode &ab = a.children.at('b'); - ASSERT_EQ(-1, ab.tokenId); - ASSERT_EQ(1U, ab.children.size()); - ASSERT_TRUE(ab.children.find('d') != ab.children.end()); - - const TokenTreeNode &abd = ab.children.at('d'); - ASSERT_EQ(4, abd.tokenId); - ASSERT_EQ(0U, abd.children.size()); -} - -TEST(Tokenizer, testTokenization) -{ - TokenTreeNode root{{{"/", 1}, {"/*", 2}, {"*/", 3}}}; - - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - - std::vector expected = { - {TOKEN_TEXT, "Test", SourceLocation{0, 0, 4}}, - {1, "/", SourceLocation{0, 4, 5}}, - {TOKEN_TEXT, "Test ", SourceLocation{0, 5, 10}}, - {2, "/*", SourceLocation{0, 10, 12}}, - {TOKEN_TEXT, " Block Comment ", SourceLocation{0, 12, 27}}, - {3, "*/", SourceLocation{0, 27, 29}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, 
t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} - -TEST(Tokenizer, testIncompleteTokens) -{ - TokenTreeNode root{{{"ab", 1}, {"c", 2}}}; - - CharReader reader{"ac", 0}; - - std::vector expected = { - {TOKEN_TEXT, "a", SourceLocation{0, 0, 1}}, - {2, "c", SourceLocation{0, 1, 2}}}; - - Tokenizer tokenizer{reader, root}; - - Token t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.next(t)); - EXPECT_EQ(te.tokenId, t.tokenId); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.next(t)); -} -} - -- cgit v1.2.3 From ce4fd84a714d80859aa01bbca32a81302b93c4d7 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:43:32 +0100 Subject: Moved code for handling whitespaces to own header, including the "WhitespaceMode" enum --- src/core/common/Utils.cpp | 7 -- src/core/common/Utils.hpp | 57 +-------- src/core/common/Whitespace.cpp | 38 ++++++ src/core/common/Whitespace.hpp | 120 ++++++++++++++++++ src/core/common/WhitespaceHandler.hpp | 223 ++++++++++++++++++++++++++++++++++ test/core/common/UtilsTest.cpp | 8 -- test/core/common/Whitespace.cpp | 41 +++++++ 7 files changed, 428 insertions(+), 66 deletions(-) create mode 100644 src/core/common/Whitespace.cpp create mode 100644 src/core/common/Whitespace.hpp create mode 100644 src/core/common/WhitespaceHandler.hpp create mode 100644 test/core/common/Whitespace.cpp (limited to 'test/core') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 563fe2a..4005143 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -18,19 +18,12 @@ #include #include -#include #include #include "Utils.hpp" 
namespace ousia { -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::isIdentifier(const std::string &name) { bool first = true; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 2c8a5b3..af7a773 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -78,12 +78,17 @@ public: */ static bool isIdentifier(const std::string &name); + /** + * Returns true if the given character is a linebreak character. + */ + static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } + /** * Returns true if the given character is a whitespace character. */ static bool isWhitespace(const char c) { - return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); + return (c == ' ') || (c == '\t') || isLinebreak(c); } /** @@ -94,56 +99,6 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); - /** - * Returns true if the given character is a whitespace character. - */ - static bool isLinebreak(const char c) { return (c == '\n') || (c == '\r'); } - - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** - * Trims the given string or vector of chars by returning the start and end - * index. - * - * @param s is the container that should be trimmed. - * @param f is a function that returns true for values that should be - * removed. - * @return start and end index. 
Note that "end" points at the character - * beyond the end, thus "end" minus "start" - */ - template - static std::pair trim(const T &s, Filter f) - { - size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { - if (!f(s[i])) { - start = i; - break; - } - } - - size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { - end = i + 1; - break; - } - } - - if (end < start) { - start = 0; - end = 0; - } - - return std::pair{start, end}; - } - /** * Turns the elements of a collection into a string separated by the * given delimiter. diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp new file mode 100644 index 0000000..4d7c01a --- /dev/null +++ b/src/core/common/Whitespace.cpp @@ -0,0 +1,38 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "Utils.hpp" +#include "Whitespace.hpp" +#include "WhitespaceHandler.hpp" + +namespace ousia { + +std::string Whitespace::trim(const std::string &s) +{ + std::pair<size_t, size_t> bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Whitespace::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +} + diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp new file mode 100644 index 0000000..1e9f36a --- /dev/null +++ b/src/core/common/Whitespace.hpp @@ -0,0 +1,120 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Whitespace.hpp + * + * Contains the WhitespaceMode enum used in various places, as well as functions + * for trimming and collapsing whitespaces. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HPP_ +#define _OUSIA_WHITESPACE_HPP_ + +#include <string> +#include <utility> + +namespace ousia { + +/** + * Enum specifying the whitespace handling mode of the tokenizer and the + * parsers. + */ +enum class WhitespaceMode { + /** + * Preserves all whitespaces as they are found in the source file. + */ + PRESERVE, + + /** + * Trims whitespace at the beginning and the end of the found text. 
+ */ + TRIM, + + /** + * Whitespaces are trimmed and collapsed, multiple whitespace characters + * are replaced by a single space character. + */ + COLLAPSE +}; + +/** + * Collection of functions for trimming or collapsing whitespace. + */ +class Whitespace { +public: + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s); + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" is the length of the trimmed + * range. Both indices are zero if every element is removed by f. + */ + template <class T, class Filter> + static std::pair<size_t, size_t> trim(const T &s, Filter f) + { + size_t start = 0; + for (size_t i = 0; i < s.size(); i++) { + if (!f(s[i])) { + start = i; + break; + } + } + + size_t end = 0; + for (ssize_t i = s.size() - 1; i >= static_cast<ssize_t>(start); i--) { + if (!f(s[i])) { + end = i + 1; + break; + } + } + + if (end < start) { + start = 0; + end = 0; + } + + return std::pair<size_t, size_t>{start, end}; + } + + /** + * Collapses the whitespaces in the given string (trims the string and + * replaces each run of whitespace characters by a single space). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. 
+ */ + static std::string collapse(const std::string &s); +}; + +} + +#endif /* _OUSIA_WHITESPACE_HPP_ */ + diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp new file mode 100644 index 0000000..1935c24 --- /dev/null +++ b/src/core/common/WhitespaceHandler.hpp @@ -0,0 +1,223 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file WhitespaceHandler.hpp + * + * Contains the WhitespaceHandler classes which are used in multiple places to + * trim, compact or preserve whitespaces while at the same time maintaining the + * position information associated with the input strings. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ +#define _OUSIA_WHITESPACE_HANDLER_HPP_ + +#include <string> +#include <vector> + +#include "Utils.hpp" + +namespace ousia { + +/** + * WhitespaceHandler is a base class that can be used to collect text on a + * character-by-character basis. Note that this class and its descendants are + * hoped to be inlined by the compiler (and used in conjunction with templates), + * thus they are fully defined inside this header. + */ +class WhitespaceHandler { +public: + /** + * Start position of the extracted text. + */ + size_t textStart; + + /** + * End position of the extracted text. 
+ */ + size_t textEnd; + + /** + * Buffer containing the extracted text. + */ + std::vector<char> textBuf; + + /** + * Constructor of the WhitespaceHandler base class. Initializes the start and + * end position with zeros. + */ + WhitespaceHandler() : textStart(0), textEnd(0) {} + + /** + * Returns true if this whitespace handler has found any text and a text + * token could be emitted. + * + * @return true if the internal data buffer is non-empty. + */ + bool hasText() const { return !textBuf.empty(); } + + /** + * Returns the content of the WhitespaceHandler as string. + */ + std::string toString() const + { + return std::string(textBuf.data(), textBuf.size()); + } +}; + +/** + * The PreservingWhitespaceHandler class preserves all characters unmodified, + * including whitespace characters. + */ +class PreservingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Appends the given character to the internal text buffer, does not + * eliminate whitespace. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + textBuf.push_back(c); + } +}; + +/** + * The TrimmingWhitespaceHandler class trims all whitespace characters at the + * begin and the end of a text section but leaves all other characters + * unmodified, including whitespace characters. + */ +class TrimmingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Buffer used internally to temporarily store all whitespace characters. + * They are only added to the output buffer if another non-whitespace + * character is reached. + */ + std::vector<char> whitespaceBuf; + + /** + * Appends the given character to the internal text buffer, eliminates + * whitespace characters at the begin and end of the text. 
+ * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + whitespaceBuf.push_back(c); + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (!whitespaceBuf.empty()) { + textBuf.insert(textBuf.end(), whitespaceBuf.begin(), + whitespaceBuf.end()); + whitespaceBuf.clear(); + } + textBuf.push_back(c); + } +}; + +/** + * The CollapsingWhitespaceHandler trims whitespace characters at the beginning + * and end of the text and reduces multiple whitespace characters to a single blank. + */ +class CollapsingWhitespaceHandler : public WhitespaceHandler { +public: + /** + * Flag set to true if a whitespace character was reached. + */ + bool hasWhitespace = false; + + /** + * Appends the given character to the internal text buffer, eliminates + * redundant whitespace characters. + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + */ + void append(char c, size_t start, size_t end) + { + // Handle whitespace characters + if (Utils::isWhitespace(c)) { + if (!textBuf.empty()) { + hasWhitespace = true; + } + return; + } + + // Set the start and end offset correctly + if (textBuf.empty()) { + textStart = start; + } + textEnd = end; + + // Store the character + if (hasWhitespace) { + textBuf.push_back(' '); + hasWhitespace = false; + } + textBuf.push_back(c); + } +}; + +/** + * Function that can be used to append the given buffer (e.g. a string or a + * vector) to the whitespace handler. 
+ * + * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. + * @tparam Buffer is an iterable type. + * @param handler is the handler to which the characters of the Buffer should be + * appended. + * @param buf is the buffer from which the characters should be read. + * @param start is the start byte offset. Each character is counted as one byte, + * so the character at offset "start" ends at offset "start + 1". + */ +template <typename WhitespaceHandler, typename Buffer> +inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, + size_t start) +{ + for (auto elem : buf) { + handler.append(elem, start, start + 1); + start++; + } +} +} + +#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ + diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 917f45c..6b8a916 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -32,14 +32,6 @@ TEST(Utils, isIdentifier) ASSERT_FALSE(Utils::isIdentifier("invalid key")); } -TEST(Utils, trim) -{ - ASSERT_EQ("hello world", Utils::trim("\t hello world \n\r\t")); - ASSERT_EQ("hello world", Utils::trim("hello world \n\r\t")); - ASSERT_EQ("hello world", Utils::trim(" hello world")); - ASSERT_EQ("hello world", Utils::trim("hello world")); -} - TEST(Utils, split) { ASSERT_EQ(std::vector({"ab"}), Utils::split("ab", '.')); diff --git a/test/core/common/Whitespace.cpp b/test/core/common/Whitespace.cpp new file mode 100644 index 0000000..d6df8b7 --- /dev/null +++ b/test/core/common/Whitespace.cpp @@ -0,0 +1,41 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <gtest/gtest.h> + +#include <core/common/Whitespace.hpp> + +namespace ousia { + +TEST(Whitespace, trim) +{ + ASSERT_EQ("hello world", Whitespace::trim("\t hello world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::trim("hello world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::trim(" hello world")); + ASSERT_EQ("hello world", Whitespace::trim("hello world")); +} + +TEST(Whitespace, collapse) +{ + ASSERT_EQ("hello world", Whitespace::collapse(" hello \n\t\r world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::collapse("hello \n\t\r world \n\r\t")); + ASSERT_EQ("hello world", Whitespace::collapse("hello \n\t\r world")); + ASSERT_EQ("hello world", Whitespace::collapse("hello world")); +} +} + -- cgit v1.2.3 From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:11 +0100 Subject: Moved DynamicTokenizer and TokenTrie to parser/utils --- src/core/parser/utils/TokenTrie.cpp | 119 +++++++++ src/core/parser/utils/TokenTrie.hpp | 150 +++++++++++ src/core/parser/utils/Tokenizer.cpp | 381 ++++++++++++++++++++++++++ src/core/parser/utils/Tokenizer.hpp | 231 ++++++++++++++++ src/formats/osdm/DynamicTokenizer.cpp | 381 -------------------------- src/formats/osdm/DynamicTokenizer.hpp | 231 ---------------- src/formats/osdm/TokenTrie.cpp | 119 --------- src/formats/osdm/TokenTrie.hpp | 150 ----------- test/core/parser/utils/TokenTrieTest.cpp | 92 +++++++ test/core/parser/utils/TokenizerTest.cpp | 415 +++++++++++++++++++++++++++++ test/formats/osdm/DynamicTokenizerTest.cpp | 415 ----------------------------- test/formats/osdm/TokenTrieTest.cpp | 92 ------- 12 files changed, 1388 insertions(+), 1388 deletions(-) create mode 100644 src/core/parser/utils/TokenTrie.cpp create mode 100644 src/core/parser/utils/TokenTrie.hpp create mode 100644 src/core/parser/utils/Tokenizer.cpp create mode 100644 src/core/parser/utils/Tokenizer.hpp delete mode 
100644 src/formats/osdm/DynamicTokenizer.cpp delete mode 100644 src/formats/osdm/DynamicTokenizer.hpp delete mode 100644 src/formats/osdm/TokenTrie.cpp delete mode 100644 src/formats/osdm/TokenTrie.hpp create mode 100644 test/core/parser/utils/TokenTrieTest.cpp create mode 100644 test/core/parser/utils/TokenizerTest.cpp delete mode 100644 test/formats/osdm/DynamicTokenizerTest.cpp delete mode 100644 test/formats/osdm/TokenTrieTest.cpp (limited to 'test/core') diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/core/parser/utils/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class TokenTrie::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class TokenTrie */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, the token is already + // registered and must not be overwritten. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the first character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. 
Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/core/parser/utils/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include +#include +#include +#include + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. 
+ */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (3) + * \endcode + * + * Where the number indicates the corresponding token descriptor identifier. + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters to the corresponding child nodes. + */ + ChildMap children; + + /** + * Type id of the token that ends at this node. Set to EmptyToken if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the type with EmptyToken. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token is empty or already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the type id that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. 
+ * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp new file mode 100644 index 0000000..1fac25a --- /dev/null +++ b/src/core/parser/utils/Tokenizer.cpp @@ -0,0 +1,381 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include +#include +#include +#include + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. 
+ */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). 
+ * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the DynamicToken instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector &lookups, TokenMatch &match, + const std::vector &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/** + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. 
+ */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class DynamicTokenizer */ + +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector lookups; + std::vector nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to 
return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for 
(size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +} + diff --git a/src/core/parser/utils/Tokenizer.hpp 
b/src/core/parser/utils/Tokenizer.hpp new file mode 100644 index 0000000..3e5aeb3 --- /dev/null +++ b/src/core/parser/utils/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include +#include +#include + +#include +#include + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. 
+ */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer. + */ +class DynamicTokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. 
+ * @return false if the end of the stream has been reached, true otherwise. + */ + template + bool next(CharReader &reader, DynamicToken &token); + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. 
+ * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, DynamicToken &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, DynamicToken &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp deleted file mode 100644 index 1fac25a..0000000 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include -#include - -#include -#include -#include -#include - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { - /** - * Token that was matched. - */ - DynamicToken token; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - - /** - * Constructor of the TokenMatch class. - */ - TokenMatch() : textLength(0), textEnd(0) {} - - /** - * Returns true if this TokenMatch instance actually represents a match. - */ - bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: - /** - * Current node within the token trie. - */ - TokenTrie::Node const *node; - - /** - * Start offset within the source file. - */ - size_t start; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - -public: - /** - * Constructor of the TokenLookup class. - * - * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. 
- */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) - { - } - - /** - * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). - * - * @param c is the character that should be appended to the current prefix. - * @param lookups is a list to which new TokeLookup instances are added -- - * which could potentially be expanded in the next iteration. - * @param match is the DynamicToken instance to which the matching token - * should be written. - * @param tokens is a reference at the internal token list of the - * DynamicTokenizer. - * @param end is the end byte offset of the current character. - * @param sourceId is the source if of this file. - */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) - { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node - auto it = node->children.find(c); - if (it == node->children.end()) { - return; - } - - // Check whether the new node represents a complete token a whether it - // is longer than the current token. If yes, replace the current token. - node = it->second.get(); - if (node->type != EmptyToken) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - DynamicToken{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } - } - - // If this state can possibly be advanced, store it in the states list. - if (!node->children.empty()) { - lookups.emplace_back(*this); - } - } -}; - -/** - * Transforms the given token into a text token containing the extracted - * text. 
- * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.type = TextToken; -} -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ - // If we're in the read mode, reset the char reader peek position to the - // current read position - if (read) { - reader.resetPeek(); - } - - // Prepare the lookups in the token trie - const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; - std::vector lookups; - std::vector nextLookups; - - // Instantiate the text handler - TextHandler textHandler; - - // Peek characters from the reader and try to advance the current token tree - // cursor - char c; - size_t charStart = reader.getPeekOffset(); - const SourceId sourceId = reader.getSourceId(); - while (reader.peek(c)) { - const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; - - // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); - } - - // Try to advance all other lookups with the new character - for (TokenLookup 
&lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); - } - - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { - break; - } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); - } - - // Swap the lookups and the nextLookups list - lookups = std::move(nextLookups); - nextLookups.clear(); - - // Advance the offset - charStart = charEnd; - } - - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildTextToken(textHandler, match, sourceId); - } - - // Move the read/peek cursor to the end of the token, abort if an error - // happens while doing so - if (match.hasMatch()) { - // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { - throw OusiaException{"Token end position offset out of range"}; - } - - // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); - if (read) { - reader.seek(end); - } else { - reader.seekPeekCursor(end); - } - token = match.token; - } else { - token = DynamicToken{}; - } - return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -TokenTypeId 
DynamicTokenizer::registerToken(const std::string &token) -{ - // Abort if an empty token should be registered - if (token.empty()) { - return EmptyToken; - } - - // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; - type = i; - break; - } - } - - // No existing slot was found, add a new one -- make sure we do not - // override the special token type handles - if (type == EmptyToken) { - type = tokens.size(); - if (type == TextToken || type == EmptyToken) { - throw OusiaException{"Token type ids depleted!"}; - } - tokens.emplace_back(token); - } - nextTokenTypeId = type + 1; - - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list - if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return EmptyToken; - } - return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ - // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return true; - } - return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ - whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member - function */ -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, 
DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -} - diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp deleted file mode 100644 index 3e5aeb3..0000000 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ /dev/null @@ -1,231 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include -#include -#include - -#include -#include - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - DynamicToken() : type(EmptyToken) {} - - /** - * Constructor of the DynamicToken struct. 
- * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - DynamicToken(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the DynamicToken struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - DynamicToken(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: - /** - * Internally used token trie. This object holds all registered tokens. - */ - TokenTrie trie; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - WhitespaceMode whitespaceMode; - - /** - * Vector containing all registered token types. - */ - std::vector tokens; - - /** - * Next index in the tokens list where to search for a new token id. - */ - size_t nextTokenTypeId; - - /** - * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. - * - * @tparam TextHandler is the type to be used for the textHandler instance. 
- * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is the token structure into which the token information - * should be written. - * @return false if the end of the stream has been reached, true otherwise. - */ - template - bool next(CharReader &reader, DynamicToken &token); - -public: - /** - * Constructor of the DynamicTokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. - */ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - - /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. - * - * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if - * an error occured. - */ - TokenTypeId registerToken(const std::string &token); - - /** - * Unregisters the token belonging to the given TokenTypeId. - * - * @param type is the token type that should be unregistered. The - *TokenTypeId - * must have been returned by registerToken. - * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). - */ - bool unregisterToken(TokenTypeId type); - - /** - * Returns the token that was registered under the given TokenTypeId id or - *an - * empty string if an invalid TokenTypeId id is given. - * - * @param type is the TokenTypeId id for which the corresponding token - *string - * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenTypeId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. 
- */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. - */ - WhitespaceMode getWhitespaceMode(); - - /** - * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool read(CharReader &reader, DynamicToken &token); - - /** - * The peek method does not advance the read position of the char reader, - * but reads the next token from the current char reader peek position. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/formats/osdm/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::make_shared()).first; - } - node = it->second.get(); - } - - // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { - return false; - } - - // Otherwise just set the type to the given type. 
- node->type = type; - return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another type - node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { - return false; - } - - // If the target node has children, we cannot delete the subtree. 
Set the - // type to EmptyToken instead - if (!node->children.empty()) { - node->type = EmptyToken; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return EmptyToken; - } - node = it->second.get(); - } - return node->type; -} -} - diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/formats/osdm/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include -#include -#include -#include - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. 
- */ -constexpr TokenTypeId EmptyToken = std::numeric_limits::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - * ~ (0) - * / \ - * a (2) b (0) - * | | - * a (0) a (0) - * | | - * b (1) c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenTypeId type; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param type is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, TokenTypeId type) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. 
- * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - TokenTypeId hasToken(const std::string &token) const noexcept; - - /** - * Returns a reference at the root node to be used for traversing the token - * tree. - * - * @return a reference at the root node. - */ - const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ - diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp new file mode 100644 index 0000000..aacd6c0 --- /dev/null +++ b/test/core/parser/utils/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, 
tree.hasToken("b")); +} +} + diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp new file mode 100644 index 0000000..c1f8785 --- /dev/null +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -0,0 +1,415 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { + +TEST(DynamicTokenizer, tokenRegistration) +{ + DynamicTokenizer tokenizer; + + ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); + + ASSERT_EQ(0U, tokenizer.registerToken("a")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); + ASSERT_EQ("a", tokenizer.getTokenString(0U)); + + ASSERT_EQ(1U, tokenizer.registerToken("b")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); + ASSERT_EQ("b", tokenizer.getTokenString(1U)); + + ASSERT_EQ(2U, tokenizer.registerToken("c")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); + ASSERT_EQ("c", tokenizer.getTokenString(2U)); + + ASSERT_TRUE(tokenizer.unregisterToken(1U)); + ASSERT_FALSE(tokenizer.unregisterToken(1U)); + ASSERT_EQ("", tokenizer.getTokenString(1U)); + + ASSERT_EQ(1U, tokenizer.registerToken("d")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); + ASSERT_EQ("d", tokenizer.getTokenString(1U)); +} + +TEST(DynamicTokenizer, textTokenPreserveWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 
6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(36U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, textTokenTrimWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + 
+TEST(DynamicTokenizer, textTokenCollapseWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, simpleReadToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ(':', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ('t', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + 
ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + + char c; + ASSERT_FALSE(reader.peek(c)); + } +} + +TEST(DynamicTokenizer, simplePeekToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(5U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(6U, 
reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(11U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } +} + +TEST(DynamicTokenizer, ambiguousTokens) +{ + CharReader reader{"abc"}; + DynamicTokenizer tokenizer; + + TokenTypeId t1 = tokenizer.registerToken("abd"); + TokenTypeId t2 = tokenizer.registerToken("bc"); + + ASSERT_EQ(0U, t1); + ASSERT_EQ(1U, t2); + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("a", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(t2, token.type); + ASSERT_EQ("bc", token.content); + + loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(3U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); +} + +TEST(DynamicTokenizer, commentTestWhitespacePreserve) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test ", SourceLocation{0, 5, 10}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + 
EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test", SourceLocation{0, 5, 9}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +} + diff --git a/test/formats/osdm/DynamicTokenizerTest.cpp b/test/formats/osdm/DynamicTokenizerTest.cpp deleted file mode 100644 index c1f8785..0000000 --- a/test/formats/osdm/DynamicTokenizerTest.cpp +++ /dev/null @@ -1,415 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include - -namespace ousia { - -TEST(DynamicTokenizer, tokenRegistration) -{ - DynamicTokenizer tokenizer; - - ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); - - ASSERT_EQ(0U, tokenizer.registerToken("a")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); - - ASSERT_EQ(1U, tokenizer.registerToken("b")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); - - ASSERT_EQ(2U, tokenizer.registerToken("c")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); - - ASSERT_TRUE(tokenizer.unregisterToken(1U)); - ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); - - ASSERT_EQ(1U, tokenizer.registerToken("d")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); -} - -TEST(DynamicTokenizer, textTokenPreserveWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - 
DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenTrimWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenCollapseWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } 
- - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, simpleReadToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ(':', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ('t', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - - char c; - ASSERT_FALSE(reader.peek(c)); - } -} - -TEST(DynamicTokenizer, simplePeekToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = 
token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(5U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(6U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(11U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } -} - -TEST(DynamicTokenizer, ambiguousTokens) -{ - CharReader reader{"abc"}; - DynamicTokenizer tokenizer; - - TokenTypeId t1 = tokenizer.registerToken("abd"); - 
TokenTypeId t2 = tokenizer.registerToken("bc"); - - ASSERT_EQ(0U, t1); - ASSERT_EQ(1U, t2); - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("a", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(t2, token.type); - ASSERT_EQ("bc", token.content); - - loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(3U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); -} - -TEST(DynamicTokenizer, commentTestWhitespacePreserve) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test ", SourceLocation{0, 5, 10}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = 
tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -} - diff --git a/test/formats/osdm/TokenTrieTest.cpp b/test/formats/osdm/TokenTrieTest.cpp deleted file mode 100644 index aacd6c0..0000000 --- a/test/formats/osdm/TokenTrieTest.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -namespace ousia { - -static const TokenTypeId t1 = 0; -static const TokenTypeId t2 = 1; -static const TokenTypeId t3 = 2; -static const TokenTypeId t4 = 3; - -TEST(TokenTrie, registerToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_TRUE(tree.registerToken("hello", t4)); - - ASSERT_FALSE(tree.registerToken("", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - ASSERT_FALSE(tree.registerToken("b", t4)); - ASSERT_FALSE(tree.registerToken("hello", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - ASSERT_EQ(t4, tree.hasToken("hello")); - ASSERT_EQ(EmptyToken, tree.hasToken("")); - ASSERT_EQ(EmptyToken, tree.hasToken("abc")); -} - -TEST(TokenTrie, unregisterToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_FALSE(tree.registerToken("b", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(EmptyToken, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, 
tree.hasToken("b")); -} -} - -- cgit v1.2.3 From 919552bad0f3f4db20419d3d3771c724c2ab997f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:25 +0100 Subject: Removed Whitespace file again --- src/core/common/Whitespace.cpp | 38 -------------------------------------- test/core/common/Whitespace.cpp | 41 ----------------------------------------- 2 files changed, 79 deletions(-) delete mode 100644 src/core/common/Whitespace.cpp delete mode 100644 test/core/common/Whitespace.cpp (limited to 'test/core') diff --git a/src/core/common/Whitespace.cpp b/src/core/common/Whitespace.cpp deleted file mode 100644 index 4d7c01a..0000000 --- a/src/core/common/Whitespace.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "Whitespace.hpp" -#include "WhitespaceHandler.hpp" - -namespace ousia { - -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - -std::string Utils::collapse(const std::string &s) -{ - CollapsingWhitespaceHandler h; - appendToWhitespaceHandler(h, s, 0); - return h.toString(); -} - -} - diff --git a/test/core/common/Whitespace.cpp b/test/core/common/Whitespace.cpp deleted file mode 100644 index d6df8b7..0000000 --- a/test/core/common/Whitespace.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -namespace ousia { - -TEST(Whitespace, trim) -{ - ASSERT_EQ("hello world", Whitespace::trim("\t hello world \n\r\t")); - ASSERT_EQ("hello world", Whitespace::trim("hello world \n\r\t")); - ASSERT_EQ("hello world", Whitespace::trim(" hello world")); - ASSERT_EQ("hello world", Whitespace::trim("hello world")); -} - -TEST(Whitespace, collapse) -{ - ASSERT("hello world", Whitespace::collapse(" hello \n\t\r world \n\r\t")); - ASSERT("hello world", Whitespace::collapse("hello \n\t\r world \n\r\t")); - ASSERT("hello world", Whitespace::collapse("hello \n\t\r world")); - ASSERT("hello world", Whitespace::collapse("hello world")); -} -} - -- cgit v1.2.3 From 295783320ea3855a14123f9cea163f8f5f689e07 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:50:11 +0100 Subject: Moved some of the whitespace functionality back to Utils --- src/core/common/Utils.cpp | 25 ++++++++++++ src/core/common/Utils.hpp | 72 +++++++++++++++++++++++++++++++++++ src/core/common/Whitespace.hpp | 62 +----------------------------- src/core/common/WhitespaceHandler.hpp | 7 ++-- test/core/common/UtilsTest.cpp | 17 +++++++++ 5 files changed, 119 insertions(+), 64 deletions(-) (limited to 'test/core') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 4005143..3739c61 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -21,6 +21,7 @@ #include #include "Utils.hpp" +#include "WhitespaceHandler.hpp" namespace ousia { @@ -87,5 +88,29 @@ std::string Utils::extractFileExtension(const std::string &filename) } return std::string{}; } + +std::string Utils::trim(const std::string &s) +{ + std::pair bounds = trim(s, Utils::isWhitespace); + return s.substr(bounds.first, bounds.second - bounds.first); +} + +std::string Utils::collapse(const std::string &s) +{ + CollapsingWhitespaceHandler h; + appendToWhitespaceHandler(h, s, 0); + return h.toString(); +} + +bool Utils::startsWith(const std::string &s, const std::string 
&prefix) +{ + return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; +} + +bool Utils::endsWith(const std::string &s, const std::string &suffix) +{ + return suffix.size() <= s.size() && + s.substr(s.size() - suffix.size(), suffix.size()) == suffix; +} } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index af7a773..16a9136 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -99,6 +99,60 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s); + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template + static std::pair trim(const T &s, Filter f) + { + size_t start = 0; + for (size_t i = 0; i < s.size(); i++) { + if (!f(s[i])) { + start = i; + break; + } + } + + size_t end = 0; + for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { + if (!f(s[i])) { + end = i + 1; + break; + } + } + + if (end < start) { + start = 0; + end = 0; + } + + return std::pair{start, end}; + } + + /** + * Collapses the whitespaces in the given string (trims the string and + * replaces all whitespace characters by a single one). + * + * @param s is the string in which the whitespace should be collapsed. + * @return a copy of s with collapsed whitespace. + */ + static std::string collapse(const std::string &s); + /** * Turns the elements of a collection into a string separated by the * given delimiter. 
@@ -159,6 +213,24 @@ public: */ static std::string extractFileExtension(const std::string &filename); + /** + * Checks whether the given string starts with the given prefix. + * + * @param s is the string. + * @param prefix is the string which should be checked for being a prefix of + * s. + */ + static bool startsWith(const std::string &s, const std::string &prefix); + + /** + * Checks whether the given string ends with the given suffix. + * + * @param s is the string. + * @param suffix is the string which should be checked for being a suffix of + * s. + */ + static bool endsWith(const std::string &s, const std::string &suffix); + /** * Hash functional to be used for enum classes. * See http://stackoverflow.com/a/24847480/2188211 diff --git a/src/core/common/Whitespace.hpp b/src/core/common/Whitespace.hpp index 1e9f36a..72a2291 100644 --- a/src/core/common/Whitespace.hpp +++ b/src/core/common/Whitespace.hpp @@ -19,8 +19,7 @@ /** * @file Whitespace.hpp * - * Contains the WhitespaceMode enum used in various places, as well es functions - * for trimming and collapsing whitespaces. + * Contains the WhitespaceMode enum used in various places. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -55,65 +54,6 @@ enum class WhitespaceMode { COLLAPSE }; -/** - * Collection of functions for trimming or collapsing whitespace. - */ -class Whitespace { - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - - /** - * Trims the given string or vector of chars by returning the start and end - * index. - * - * @param s is the container that should be trimmed. - * @param f is a function that returns true for values that should be - * removed. - * @return start and end index. 
Note that "end" points at the character - * beyond the end, thus "end" minus "start" - */ - template - static std::pair trim(const T &s, Filter f) - { - size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { - if (!f(s[i])) { - start = i; - break; - } - } - - size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { - end = i + 1; - break; - } - } - - if (end < start) { - start = 0; - end = 0; - } - - return std::pair{start, end}; - } - - /** - * Collapses the whitespaces in the given string (trims the string and - * replaces all whitespace characters by a single one). - * - * @param s is the string in which the whitespace should be collapsed. - * @return a copy of s with collapsed whitespace. - */ - static std::string collapse(const std::string &s); -}; - } #endif /* _OUSIA_WHITESPACE_HPP_ */ diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp index 1935c24..79e0518 100644 --- a/src/core/common/WhitespaceHandler.hpp +++ b/src/core/common/WhitespaceHandler.hpp @@ -32,7 +32,7 @@ #include #include -#include "WhitespaceHandler.hpp" +#include "Utils.hpp" namespace ousia { @@ -76,7 +76,7 @@ public: /** * Returns the content of the WhitespaceHandler as string. 
*/ - std::string toString() + std::string toString() const { return std::string(textBuf.data(), textBuf.size()); } @@ -214,7 +214,8 @@ inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, size_t start) { for (auto elem : buf) { - handler.append(elem, start++); + handler.append(elem, start, start + 1); + start++; } } } diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 6b8a916..a4bf4b2 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -65,5 +65,22 @@ TEST(Utils, extractFileExtension) ASSERT_EQ("ext", Utils::extractFileExtension("foo.bar/test.EXT")); } +TEST(Utils, startsWith) +{ + ASSERT_TRUE(Utils::startsWith("foobar", "foo")); + ASSERT_TRUE(Utils::startsWith("foo", "foo")); + ASSERT_FALSE(Utils::startsWith("foo", "foobar")); + ASSERT_FALSE(Utils::startsWith("foobar", "bar")); + ASSERT_TRUE(Utils::startsWith("foo", "")); +} + +TEST(Utils, endsWith) +{ + ASSERT_FALSE(Utils::endsWith("foobar", "foo")); + ASSERT_TRUE(Utils::endsWith("foo", "foo")); + ASSERT_FALSE(Utils::endsWith("foo", "foobar")); + ASSERT_TRUE(Utils::endsWith("foobar", "bar")); + ASSERT_TRUE(Utils::endsWith("foo", "")); +} } -- cgit v1.2.3 From fde9997a9d321823ba6a2685e20769f5a10982cd Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:00:06 +0100 Subject: Moved TokenTrieTest to new directory --- test/core/parser/utils/TokenTrieTest.cpp | 2 +- test/core/parser/utils/TokenizerTest.cpp | 85 ++++++++++++++++---------------- 2 files changed, 43 insertions(+), 44 deletions(-) (limited to 'test/core') diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp index aacd6c0..087e6e6 100644 --- a/test/core/parser/utils/TokenTrieTest.cpp +++ b/test/core/parser/utils/TokenTrieTest.cpp @@ -18,7 +18,7 @@ #include -#include +#include namespace ousia { diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp index 
c1f8785..8565057 100644 --- a/test/core/parser/utils/TokenizerTest.cpp +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -19,13 +19,13 @@ #include #include -#include +#include namespace ousia { -TEST(DynamicTokenizer, tokenRegistration) +TEST(Tokenizer, tokenRegistration) { - DynamicTokenizer tokenizer; + Tokenizer tokenizer; ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); @@ -50,15 +50,15 @@ TEST(DynamicTokenizer, tokenRegistration) ASSERT_EQ("d", tokenizer.getTokenString(1U)); } -TEST(DynamicTokenizer, textTokenPreserveWhitespace) +TEST(Tokenizer, textTokenPreserveWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer{WhitespaceMode::PRESERVE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ(" this \t is only a \n\n test text ", token.content); @@ -74,9 +74,9 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + Tokenizer tokenizer{WhitespaceMode::PRESERVE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -89,15 +89,15 @@ TEST(DynamicTokenizer, textTokenPreserveWhitespace) } } -TEST(DynamicTokenizer, textTokenTrimWhitespace) +TEST(Tokenizer, textTokenTrimWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer{WhitespaceMode::TRIM}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", 
token.content); @@ -113,9 +113,9 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + Tokenizer tokenizer{WhitespaceMode::TRIM}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this \t is only a \n\n test text", token.content); @@ -128,15 +128,15 @@ TEST(DynamicTokenizer, textTokenTrimWhitespace) } } -TEST(DynamicTokenizer, textTokenCollapseWhitespace) +TEST(Tokenizer, textTokenCollapseWhitespace) { { CharReader reader{" this \t is only a \n\n test text "}; // 012345 6789012345678 9 0123456789012345 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -152,9 +152,9 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) CharReader reader{"this \t is only a \n\n test text"}; // 01234 5678901234567 8 9012345678901 // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + Tokenizer tokenizer{WhitespaceMode::COLLAPSE}; - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); ASSERT_EQ("this is only a test text", token.content); @@ -167,16 +167,16 @@ TEST(DynamicTokenizer, textTokenCollapseWhitespace) } } -TEST(DynamicTokenizer, simpleReadToken) +TEST(Tokenizer, simpleReadToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -192,7 +192,7 @@ TEST(DynamicTokenizer, simpleReadToken) } { - DynamicToken token; + 
Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); @@ -208,7 +208,7 @@ TEST(DynamicTokenizer, simpleReadToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -223,16 +223,16 @@ TEST(DynamicTokenizer, simpleReadToken) } } -TEST(DynamicTokenizer, simplePeekToken) +TEST(Tokenizer, simplePeekToken) { CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; const TokenTypeId tid = tokenizer.registerToken(":"); ASSERT_EQ(0U, tid); { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -246,7 +246,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(tid, token.type); @@ -260,7 +260,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.peek(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -274,7 +274,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -288,7 +288,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(tid, token.type); @@ -302,7 +302,7 @@ TEST(DynamicTokenizer, simplePeekToken) } { - DynamicToken token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -316,10 +316,10 @@ TEST(DynamicTokenizer, simplePeekToken) } } -TEST(DynamicTokenizer, ambiguousTokens) +TEST(Tokenizer, ambiguousTokens) { CharReader reader{"abc"}; - DynamicTokenizer tokenizer; + Tokenizer tokenizer; TokenTypeId t1 = tokenizer.registerToken("abd"); TokenTypeId t2 = tokenizer.registerToken("bc"); @@ -327,7 +327,7 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_EQ(0U, t1); ASSERT_EQ(1U, t2); - DynamicToken 
token; + Token token; ASSERT_TRUE(tokenizer.read(reader, token)); ASSERT_EQ(TextToken, token.type); @@ -349,18 +349,18 @@ TEST(DynamicTokenizer, ambiguousTokens) ASSERT_FALSE(tokenizer.read(reader, token)); } -TEST(DynamicTokenizer, commentTestWhitespacePreserve) +TEST(Tokenizer, commentTestWhitespacePreserve) { CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); + Tokenizer tokenizer(WhitespaceMode::PRESERVE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); const TokenTypeId t3 = tokenizer.registerToken("*/"); - std::vector expected = { + std::vector expected = { {TextToken, "Test", SourceLocation{0, 0, 4}}, {t1, "/", SourceLocation{0, 4, 5}}, {TextToken, "Test ", SourceLocation{0, 5, 10}}, @@ -368,7 +368,7 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, {t3, "*/", SourceLocation{0, 27, 29}}}; - DynamicToken t; + Token t; for (auto &te : expected) { EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); @@ -380,18 +380,18 @@ TEST(DynamicTokenizer, commentTestWhitespacePreserve) ASSERT_FALSE(tokenizer.read(reader, t)); } -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +TEST(Tokenizer, commentTestWhitespaceCollapse) { CharReader reader{"Test/Test /* Block Comment */", 0}; // 012345678901234567890123456789 // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); + Tokenizer tokenizer(WhitespaceMode::COLLAPSE); const TokenTypeId t1 = tokenizer.registerToken("/"); const TokenTypeId t2 = tokenizer.registerToken("/*"); const TokenTypeId t3 = tokenizer.registerToken("*/"); - std::vector expected = { + std::vector expected = { {TextToken, "Test", SourceLocation{0, 0, 4}}, {t1, "/", SourceLocation{0, 4, 5}}, {TextToken, "Test", SourceLocation{0, 5, 9}}, @@ -399,7 +399,7 @@ TEST(DynamicTokenizer, 
commentTestWhitespaceCollapse) {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, {t3, "*/", SourceLocation{0, 27, 29}}}; - DynamicToken t; + Token t; for (auto &te : expected) { EXPECT_TRUE(tokenizer.read(reader, t)); EXPECT_EQ(te.type, t.type); @@ -410,6 +410,5 @@ TEST(DynamicTokenizer, commentTestWhitespaceCollapse) } ASSERT_FALSE(tokenizer.read(reader, t)); } - } -- cgit v1.2.3 From cc281d91def921b7bbf5d3d4a0fce53afc5a317b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:07:58 +0100 Subject: Renamed parser/generic to parser/stack and made filenames much shorter --- src/core/parser/generic/ParserState.cpp | 161 ------------ src/core/parser/generic/ParserState.hpp | 284 --------------------- src/core/parser/generic/ParserStateCallbacks.cpp | 26 -- src/core/parser/generic/ParserStateCallbacks.hpp | 106 -------- src/core/parser/generic/ParserStateHandler.cpp | 104 -------- src/core/parser/generic/ParserStateHandler.hpp | 281 --------------------- src/core/parser/generic/ParserStateStack.cpp | 187 -------------- src/core/parser/generic/ParserStateStack.hpp | 191 -------------- src/core/parser/stack/Callbacks.cpp | 23 ++ src/core/parser/stack/Callbacks.hpp | 99 ++++++++ src/core/parser/stack/Handler.cpp | 90 +++++++ src/core/parser/stack/Handler.hpp | 302 ++++++++++++++++++++++ src/core/parser/stack/Stack.cpp | 188 ++++++++++++++ src/core/parser/stack/Stack.hpp | 191 ++++++++++++++ src/core/parser/stack/State.cpp | 171 +++++++++++++ src/core/parser/stack/State.hpp | 307 +++++++++++++++++++++++ test/core/parser/ParserStateTest.cpp | 77 ------ test/core/parser/stack/StateTest.cpp | 79 ++++++ 18 files changed, 1450 insertions(+), 1417 deletions(-) delete mode 100644 src/core/parser/generic/ParserState.cpp delete mode 100644 src/core/parser/generic/ParserState.hpp delete mode 100644 src/core/parser/generic/ParserStateCallbacks.cpp delete mode 100644 src/core/parser/generic/ParserStateCallbacks.hpp delete mode 100644 
src/core/parser/generic/ParserStateHandler.cpp delete mode 100644 src/core/parser/generic/ParserStateHandler.hpp delete mode 100644 src/core/parser/generic/ParserStateStack.cpp delete mode 100644 src/core/parser/generic/ParserStateStack.hpp create mode 100644 src/core/parser/stack/Callbacks.cpp create mode 100644 src/core/parser/stack/Callbacks.hpp create mode 100644 src/core/parser/stack/Handler.cpp create mode 100644 src/core/parser/stack/Handler.hpp create mode 100644 src/core/parser/stack/Stack.cpp create mode 100644 src/core/parser/stack/Stack.hpp create mode 100644 src/core/parser/stack/State.cpp create mode 100644 src/core/parser/stack/State.hpp delete mode 100644 test/core/parser/ParserStateTest.cpp create mode 100644 test/core/parser/stack/StateTest.cpp (limited to 'test/core') diff --git a/src/core/parser/generic/ParserState.cpp b/src/core/parser/generic/ParserState.cpp deleted file mode 100644 index f635d86..0000000 --- a/src/core/parser/generic/ParserState.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "ParserState.hpp" - -namespace ousia { - -/* Class ParserState */ - -ParserState::ParserState() : elementHandler(nullptr) {} - -ParserState::ParserState(ParserStateSet parents, Arguments arguments, - RttiSet createdNodeTypes, - HandlerConstructor elementHandler) - : parents(parents), - arguments(arguments), - createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) -{ -} - -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) -{ -} - -/* Class ParserStateBuilder */ - -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) -{ - this->state = state; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) -{ - state.parents = ParserStateSet{parent}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) -{ - state.parents = parents; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) -{ - state.arguments = arguments; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) -{ - state.createdNodeTypes = RttiSet{type}; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) -{ - state.createdNodeTypes = types; - return *this; -} - -ParserStateBuilder &ParserStateBuilder::elementHandler( - HandlerConstructor elementHandler) -{ - state.elementHandler = elementHandler; - return *this; -} - -const ParserState &ParserStateBuilder::build() const { return state; } - -/* Class ParserStateDeductor */ - -ParserStateDeductor::ParserStateDeductor( - std::vector signature, - std::vector states) - : tbl(signature.size()), - signature(std::move(signature)), - states(std::move(states)) -{ -} - -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) -{ - // Lookup the "active" state of (d, s), if it was not already set - // (e.second is true) we'll have to calculate it - auto e 
= tbl[d].emplace(s, false); - bool &res = e.first->second; - if (!e.second) { - return res; - } - - // Check whether this node is generative (may have produced the Node - // described by the current Signature element) - bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); - - if (isGenerative && d == 0) { - // End of recursion -- the last signature element is reached and the - // node was generative - res = true; - } else { - // Try repetition of this node - if (isGenerative && isActive(d - 1, s)) { - res = true; - } else { - // Check whether any of the parent nodes were active -- either for - // the previous element (if this one is generative) or for the - // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { - if ((isGenerative && isActive(d - 1, parent)) || - isActive(d, parent)) { - res = true; - break; - } - } - } - } - - return res; -} - -std::vector ParserStateDeductor::deduce() -{ - std::vector res; - if (!signature.empty()) { - const size_t D = signature.size(); - for (auto s : states) { - if (signature[D - 1]->isOneOf(s->createdNodeTypes) && - isActive(D - 1, s)) { - res.push_back(s); - } - } - } - return res; -} - -/* Constant initializations */ - -namespace ParserStates { -const ParserState All; -const ParserState None; -} -} - diff --git a/src/core/parser/generic/ParserState.hpp b/src/core/parser/generic/ParserState.hpp deleted file mode 100644 index 6487fdd..0000000 --- a/src/core/parser/generic/ParserState.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserState.hpp - * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of - * such classes. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_HPP_ -#define _OUSIA_PARSER_STATE_HPP_ - -#include - -#include -#include - -namespace ousia { - -// Forward declarations -class ParserStateBuilder; -class ParserState; -class HandlerData; -class Handler; -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * Set of pointers of parser states -- used for specifying a set of parent - * states. - */ -using ParserStateSet = std::unordered_set; - -/** - * Class used for the complete specification of a ParserState. Stores possible - * parent states, state handlers and arguments to be passed to that state. - */ -struct ParserState { - /** - * Vector containing all possible parent states. - */ - ParserStateSet parents; - - /** - * Descriptor of the arguments that should be passed to the handler. - */ - Arguments arguments; - - /** - * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. - */ - RttiSet createdNodeTypes; - - /** - * Pointer at a function which creates a new concrete Handler instance for - * the elements described by this state. May be nullptr in which case no - * handler instance is created. 
- */ - HandlerConstructor elementHandler; - - /** - * Default constructor, initializes the handlers with nullptr. - */ - ParserState(); - - /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. - * - * @param parents is a vector containing all possible parent states. - * @param arguments is a descriptor of arguments that should be passed to - * the handler. - * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope - * when a file is included. - * @param elementHandler is a pointer at a function which creates a new - * concrete Handler instance for the elements described by this state. May - * be nullptr in which case no handler instance is created. - */ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, - RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); - - /** - * Creates this ParserState from the given ParserStateBuilder instance. - */ - ParserState(const ParserStateBuilder &builder); -}; - -/** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. - */ -class ParserStateBuilder { -private: - /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. - */ - ParserState state; - -public: - /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. - * - * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder ©(const ParserState &state); - - /** - * Sets the possible parent states to the single given parent element. 
- * - * @param parent is a pointer at the parent ParserState instance that should - * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parent(const ParserState *parent); - - /** - * Sets the ParserState instances in the given ParserStateSet as the list of - * supported parent states. - * - * @param parents is a set of pointers at ParserState instances that should - * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &parents(const ParserStateSet &parents); - - /** - * Sets the arguments that should be passed to the parser state handler to - * those given as argument. - * - * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &arguments(const Arguments &arguments); - - /** - * Sets the Node types this state may produce to the given Rtti descriptor. - * - * @param type is the Rtti descriptor of the Type that may be produced by - * this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeType(const Rtti *type); - - /** - * Sets the Node types this state may produce to the given Rtti descriptors. - * - * @param types is a set of Rtti descriptors of the Types that may be - * produced by this state. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); - - /** - * Sets the constructor for the element handler. The constructor creates a - * new concrete Handler instance for the elements described by this state. - * May be nullptr in which case no handler instance is created (this is - * the default value). 
- * - * @param elementHandler is the HandlerConstructor that should create a - * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method - * chaining. - */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); - - /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. - * - * @return the built ParserState. - */ - const ParserState &build() const; -}; - -/** - * Class used to deduce the ParserState a Parser is currently in based on the - * types of the Nodes that currently are on the ParserStack. Uses dynamic - * programming in order to solve this problem. - */ -class ParserStateDeductor { -public: - /** - * Type containing the dynamic programming table. - */ - using Table = std::vector>; - -private: - /** - * Dynamic programming table. - */ - Table tbl; - - /** - * Signature given in the constructor. - */ - const std::vector signature; - - /** - * List of states that should be checked for being active. - */ - const std::vector states; - - /** - * Used internally to check whether the given parser stack s may have been - * active for signature element d. - * - * @param d is the signature element. - * @param s is the parser state. - * @return true if the the given ParserState may have been active. - */ - bool isActive(size_t d, const ParserState *s); - -public: - /** - * Constructor of the ParserStateDeductor class. - * - * @param signature a Node type signature describing the types of the nodes - * which currently reside on e.g. the ParserScope stack. - * @param states is a list of states that should be checked. - */ - ParserStateDeductor(std::vector signature, - std::vector states); - - /** - * Selects all active states from the given states. Only considers those - * states that may have produced the last signature element. - * - * @return a list of states that may actually have been active. 
- */ - std::vector deduce(); -}; - -/** - * The ParserStates namespace contains all the global state constants used - * in the ParserStack class. - */ -namespace ParserStates { -/** - * State representing all states. - */ -extern const ParserState All; - -/** - * State representing the initial state. - */ -extern const ParserState None; -} -} - -#endif /* _OUSIA_PARSER_STATE_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateCallbacks.cpp b/src/core/parser/generic/ParserStateCallbacks.cpp deleted file mode 100644 index 50bac57..0000000 --- a/src/core/parser/generic/ParserStateCallbacks.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -namespace ousia { - -/* Class ParserStateCallbacks */ - -} - diff --git a/src/core/parser/generic/ParserStateCallbacks.hpp b/src/core/parser/generic/ParserStateCallbacks.hpp deleted file mode 100644 index 7ec5264..0000000 --- a/src/core/parser/generic/ParserStateCallbacks.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStateCallbacks.hpp - * - * Contains an interface defining the callbacks that can be directed from a - * ParserStateHandler to the ParserStateStack, and from the ParserStateStack to - * the actual parser. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_CALLBACKS_HPP_ -#define _OUSIA_PARSER_STATE_CALLBACKS_HPP_ - -#include - -#include - -namespace ousia { - -/** - * Interface defining a set of callback functions that act as a basis for the - * ParserStateStackCallbacks and the ParserCallbacks. - */ -class ParserStateCallbacks { -public: - /** - * Virtual descructor. - */ - virtual ~ParserStateCallbacks() {}; - - /** - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; - - /** - * Sets the type as which the variant data should be parsed. - * - * @param type is one of the VariantType constants, specifying with which - * type the data that is passed to the ParserStateHandler in the "data" - * function should be handled. - */ - virtual void setDataType(VariantType type) = 0; - - /** - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. 
- */ - virtual void registerToken(const std::string &token) = 0; - - /** - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be unregistered. - */ - virtual void unregisterToken(const std::string &token) = 0; -}; - -/** - * Interface defining the callback functions that can be passed from a - * ParserStateStack to the underlying parser. - */ -class ParserCallbacks : public ParserStateCallbacks { - /** - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". - * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. - */ - virtual bool supportsToken(const std::string &token) = 0; -} - -} - -#endif /* _OUSIA_PARSER_STATE_CALLBACKS_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateHandler.cpp b/src/core/parser/generic/ParserStateHandler.cpp deleted file mode 100644 index 64e2bfa..0000000 --- a/src/core/parser/generic/ParserStateHandler.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include "ParserStateHandler.hpp" - -namespace ousia { - -/* Class ParserStatedata */ - -ParserStatedata::ParserStatedata(ParserContext &ctx, - ParserStateCallbacks &callbacks, - std::string name, const ParserState &state, - const ParserState &parentState, - const SourceLocation location) - : ctx(ctx), - callbacks(callbacks), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; - -/* Class ParserStateHandler */ - -ParserStateHandler::ParserStateHandler(const ParserStatedata &data) : data(data) -{ -} - -ParserContext &ParserStateHandler::context() { return data.ctx; } - -const std::string &ParserStateHandler::name() { return data.name; } - -ParserScope &ParserStateHandler::scope() { return data.ctx.getScope(); } - -Manager &ParserStateHandler::manager() { return data.ctx.getManager(); } - -Logger &ParserStateHandler::logger() { return data.ctx.getLogger(); } - -Rooted ParserStateHandler::project() { return data.ctx.getProject(); } - -const ParserState &ParserStateHandler::state() { return data.state; } - -SourceLocation ParserStateHandler::location() { return data.location; } - -void ParserStateHandler::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - data.callbacks.setWhitespaceMode(whitespaceMode); -} - -void ParserStateHandler::setDataType(VariantType type) -{ - data.callbacks.setDataType(type); -} - -bool ParserStateHandler::supportsToken(const std::string &token) -{ - return data.callbacks.supportsToken(token); -} - -void ParserStateHandler::registerToken(const std::string &token) -{ - data.callbacks.registerToken(token); -} - -void ParserStateHandler::unregisterToken(const std::string &token) -{ - data.callbacks.unregisterToken(token); -} - -void ParserStateHandler::data(const std::string &data, int field) -{ - if 
(Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class DefaultParserStateHandler */ - -void DefaultParserStateHandler::start(Variant::mapType &args) {} - -void DefaultParserStateHandler::end() {} - -ParserStateHandler *DefaultParserStateHandler::create(const data &data) -{ - return new DefaultHandler{data}; -} -} - diff --git a/src/core/parser/generic/ParserStateHandler.hpp b/src/core/parser/generic/ParserStateHandler.hpp deleted file mode 100644 index f3c836e..0000000 --- a/src/core/parser/generic/ParserStateHandler.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef _OUSIA_PARSER_STATE_HANDLER_HPP_ -#define _OUSIA_PARSER_STATE_HANDLER_HPP_ - -#include -#include - -#include - -namespace ousia { - -// Forward declarations -class ParserContext; -class ParserState; -class ParserStateCallbacks; - -/** - * Class collecting all the data that is being passed to a ParserStateHandler - * instance. - */ -class ParserStateHandlerData { -public: - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. 
- */ - ParserContext &ctx; - - /** - * Reference at an instance of the ParserStateCallbacks class, used for - * modifying the behaviour of the parser (like registering tokens, setting - * the data type or changing the whitespace handling mode). - */ - ParserStateCallbacks &callbacks; - - /** - * Contains the name of the command that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param callbacks is an instance of ParserStateCallbacks used to notify - * the parser about certain state changes. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. - */ - ParserStateHandlerData(ParserContext &ctx, ParserStateCallbacks &callbacks, - std::string name, const ParserState &state, - const ParserState &parentState, - const SourceLocation &location); -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class ParserStateHandler { -private: - /** - * Structure containing the internal handler data. - */ - const ParserStateHandlerData data; - -protected: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - ParserStateHandler(const ParserStateHandlerData &data){}; - -public: - /** - * Virtual destructor. 
- */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context(); - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name(); - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope(); - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager(); - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger(); - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. - */ - Rooted project(); - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state(); - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location(); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Sets the whitespace mode that specifies how string data should be - * processed. - * - * @param whitespaceMode specifies one of the three WhitespaceMode constants - * PRESERVE, TRIM or COLLAPSE. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Sets the type as which the variant data should be parsed. 
- * - * @param type is one of the VariantType constants, specifying with which - * type the data that is passed to the ParserStateHandler in the "data" - * function should be handled. - */ - void setDataType(VariantType type); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Checks whether the given token is supported by the parser. The parser - * returns true, if the token is supported, false if this token cannot be - * registered. Note that parsers that do not support the registration of - * tokens at all should always return "true". - * - * @param token is the token that should be checked for support. - * @return true if the token is generally supported (or the parser does not - * support registering tokens at all), false if the token is not supported, - * because e.g. it is a reserved token or it interferes with other tokens. - */ - bool supportsToken(const std::string &token); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. - */ - void registerToken(const std::string &token); - - /** - * Calls the corresponding function in the ParserStateCallbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be unregistered. - */ - void unregisterToken(const std::string &token); - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. 
- */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. - */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. - */ -class DefaultParserStateHandler : public ParserStateHandler { -public: - using ParserStateHandler::ParserStateHandler; - - void start(Variant::mapType &args) override; - - void end() override; - - static Handler *create(const HandlerData &handlerData); -}; -} - -#endif /* _OUSIA_PARSER_STATE_HANDLER_HPP_ */ - diff --git a/src/core/parser/generic/ParserStateStack.cpp b/src/core/parser/generic/ParserStateStack.cpp deleted file mode 100644 index 8c32f17..0000000 --- a/src/core/parser/generic/ParserStateStack.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include -#include - -#include "ParserScope.hpp" -#include "ParserStateStack.hpp" - -namespace ousia { - -/* Class ParserStateStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStateStack::ParserStateStack( - ParserContext &ctx, - const std::multimap &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStateStack::deduceState() -{ - // Assemble all states - std::vector states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); - 
stack.emplace(handler); - return true; -} - -std::set ParserStateStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStateStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStateStack::currentCommandName() -{ - return stack.empty() ? std::string{} : stack.top()->name(); -} - -const ParserState *ParserStateStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStateStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? 
targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStateStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStateStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStateStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/generic/ParserStateStack.hpp b/src/core/parser/generic/ParserStateStack.hpp deleted file mode 100644 index b106475..0000000 --- a/src/core/parser/generic/ParserStateStack.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file ParserStateStack.hpp - * - * Helper classes for document or description parsers. Contains the - * ParserStateStack class, which is an pushdown automaton responsible for - * accepting commands in the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STATE_STACK_HPP_ -#define _OUSIA_PARSER_STATE_STACK_HPP_ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * The ParserStateStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStateStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStateStack class. 
- * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStateStack( - ParserContext &ctx, - const std::multimap &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. - * @param logger is the logger instance to which error messages should be - * written. - * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStateStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState ¤tState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command is reached. - * - * @param name is the name of the command (including the namespace - * separator ':') and its corresponding location. Must be a string variant. - * @param args is a map variant containing the arguments that were passed to - * the command. - */ - void command(Variant name, Variant args); - - /** - * Function that should be called whenever a new field starts. Fields of the - * same command may not be separated by calls to - */ - void fieldStart(); - - /** - * Function that should be called whenever a field ends. 
- */ - void fieldEnd(); - - /** - * Function that shuold be called whenever character data is found in the - * input stream. - * - * @param data is a variant of any type containing the data that was parsed - * as data. - */ - void data(Variant data); - - /** - * Function that should be called whenever an annotation starts. - * - * @param name is the name of the annotation class. - * @param args is a map variant containing the arguments that were passed - * to the annotation. - */ - void annotationStart(Variant name, Variant args); - - /** - * Function that should be called whenever an annotation ends. - * - * @param name is the name of the annotation class that was ended. - * @param annotationName is the name of the annotation that was ended. - */ - void annotationEnd(Variant name, Variant annotationName); - - /** - * Function that should be called whenever a previously registered token - * is found in the input stream. - * - * @param token is string variant containing the token that was encountered. - */ - void token(Variant token); -}; -} - -#endif /* _OUSIA_PARSER_STATE_STACK_HPP_ */ - diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp new file mode 100644 index 0000000..6ebc549 --- /dev/null +++ b/src/core/parser/stack/Callbacks.cpp @@ -0,0 +1,23 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "Callbacks.hpp" + +namespace ousia { +} + diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp new file mode 100644 index 0000000..bb56e44 --- /dev/null +++ b/src/core/parser/stack/Callbacks.hpp @@ -0,0 +1,99 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Callbacks.hpp + * + * Contains an interface defining the callbacks that can be directed from a + * StateHandler to the StateStack, and from the StateStack to + * the actual parser. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_CALLBACKS_HPP_ +#define _OUSIA_PARSER_STATE_CALLBACKS_HPP_ + +#include + +#include + +namespace ousia { +namespace parser_stack { + +/** + * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. + */ +class Callbacks { +public: + /** + * Virtual descructor. + */ + virtual ~Callbacks() {}; + + /** + * Sets the whitespace mode that specifies how string data should be + * processed. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. 
+ */ + virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + + /** + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + virtual void registerToken(const std::string &token) = 0; + + /** + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + virtual void unregisterToken(const std::string &token) = 0; +}; + +/** + * Interface defining the callback functions that can be passed from a + * StateStack to the underlying parser. + */ +class ParserCallbacks : public Callbacks { + /** + * Checks whether the given token is supported by the parser. The parser + * returns true, if the token is supported, false if this token cannot be + * registered. Note that parsers that do not support the registration of + * tokens at all should always return "true". + * + * @param token is the token that should be checked for support. + * @return true if the token is generally supported (or the parser does not + * support registering tokens at all), false if the token is not supported, + * because e.g. it is a reserved token or it interferes with other tokens. + */ + virtual bool supportsToken(const std::string &token) = 0; +}; + +} +} + +#endif /* _OUSIA_PARSER_STATE_CALLBACKS_HPP_ */ + diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp new file mode 100644 index 0000000..66af2a4 --- /dev/null +++ b/src/core/parser/stack/Handler.cpp @@ -0,0 +1,90 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include "Callbacks.hpp" +#include "Handler.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerData */ + +HandlerData::HandlerData(ParserContext &ctx, Callbacks &callbacks, + std::string name, const State &state, + const SourceLocation &location) + : ctx(ctx), + callbacks(callbacks), + name(std::move(name)), + state(state), + location(location) +{ +} + +/* Class Handler */ + +Handler::Handler(const HandlerData &internalData) : internalData(internalData) +{ +} + +Handler::~Handler() {} + +ParserContext &Handler::context() { return internalData.ctx; } + +const std::string &Handler::name() { return internalData.name; } + +ParserScope &Handler::scope() { return internalData.ctx.getScope(); } + +Manager &Handler::manager() { return internalData.ctx.getManager(); } + +Logger &Handler::logger() { return internalData.ctx.getLogger(); } + +const State &Handler::state() { return internalData.state; } + +SourceLocation Handler::location() { return internalData.location; } + +void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + internalData.callbacks.setWhitespaceMode(whitespaceMode); +} + +void Handler::registerToken(const std::string &token) +{ + internalData.callbacks.registerToken(token); +} + +void Handler::unregisterToken(const std::string &token) +{ + internalData.callbacks.unregisterToken(token); +} + +/* Class DefaultHandler */ + +/*void DefaultHandler::start(Variant::mapType &args) {} + +void DefaultHandler::end() {} + +Handler *DefaultHandler::create(const data &data) +{ + return new DefaultHandler{data}; +}*/ +} +} + diff --git 
a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp new file mode 100644 index 0000000..0701343 --- /dev/null +++ b/src/core/parser/stack/Handler.hpp @@ -0,0 +1,302 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _OUSIA_PARSER_STATE_HANDLER_HPP_ +#define _OUSIA_PARSER_STATE_HANDLER_HPP_ + +#include +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class ParserContext; +class Callbacks; +class Logger; +class Project; + +namespace parser_stack { + +// More forward declarations +class State; + +/** + * Class collecting all the data that is being passed to a Handler + * instance. + */ +class HandlerData { +public: + /** + * Reference to the ParserContext instance that should be used to resolve + * references to nodes in the Graph. + */ + ParserContext &ctx; + + /** + * Reference at an instance of the Callbacks class, used for + * modifying the behaviour of the parser (like registering tokens, setting + * the data type or changing the whitespace handling mode). + */ + Callbacks &callbacks; + + /** + * Contains the name of the command that is being handled. + */ + std::string name; + + /** + * Contains the current state of the state machine. + */ + const State &state; + + /** + * Current source code location. + */ + SourceLocation location; + + /** + * Constructor of the HandlerData class. 
+ * + * @param ctx is the parser context the handler should be executed in. + * @param callbacks is an instance of Callbacks used to notify + * the parser about certain state changes. + * @param name is the name of the string. + * @param state is the state this handler was called for. + * @param location is the location at which the handler is created. + */ + HandlerData(ParserContext &ctx, Callbacks &callbacks, std::string name, + const State &state, const SourceLocation &location); +}; + +/** + * The Handler class provides a context for handling a generic stack element. + * It has to beoverridden and registered in the StateStack class to form + * handlers for concrete XML tags. + */ +class Handler { +private: + /** + * Structure containing the internal handler data. + */ + const HandlerData internalData; + +protected: + /** + * Constructor of the Handler class. + * + * @param data is a structure containing all data being passed to the + * handler. + */ + Handler(const HandlerData &internalData); + + /** + * Returns a reference at the ParserContext. + * + * @return a reference at the ParserContext. + */ + ParserContext &context(); + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name(); + + /** + * Returns a reference at the ParserScope instance. + * + * @return a reference at the ParserScope instance. + */ + ParserScope &scope(); + + /** + * Returns a reference at the Manager instance which manages all nodes. + * + * @return a referance at the Manager instance. + */ + Manager &manager(); + + /** + * Returns a reference at the Logger instance used for logging error + * messages. + * + * @return a reference at the Logger instance. + */ + Logger &logger(); + + /** + * Reference at the State descriptor for which this Handler was created. + * + * @return a const reference at the constructing State descriptor. 
+ */ + const State &state(); + + /** + * Returns the current location in the source file. + * + * @return the current location in the source file. + */ + SourceLocation location(); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); + + /** + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Calls the corresponding function in the Callbacks instance. + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + void registerToken(const std::string &token); + + /** + * Calls the corresponding function in the Callbacks instance. + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + void unregisterToken(const std::string &token); + + /** + * Called when the command that was specified in the constructor is + * instanciated. + * + * @param args is a map from strings to variants (argument name and value). + * @return true if the handler was successful in starting the element it + * represents, false otherwise. + */ + virtual bool start(Variant::mapType &args) = 0; + + /** + * Called before the command for which this handler is defined ends (is + * forever removed from the stack). + */ + virtual void end() = 0; + + /** + * Called when a new field starts, while the handler is active. This + * function should return true if the field is supported, false otherwise. 
+ * No error should be logged if the field cannot be started, the caller will + * take care of that (since it is always valid to start a default field, + * even though the corresponding structure does not have a field, as long as + * no data is fed into the field). + * + * @param isDefaultField is set to true if the field that is being started + * is the default/tree field. The handler should set the value of this + * variable to true if the referenced field is indeed the default field. + * @param isImplicit is set to true if the field is implicitly being started + * by the stack (this field always implies isDefaultField being set to + * true). + * @param fieldIndex is the numerical index of the field. + */ + virtual bool fieldStart(bool &isDefaultField, bool isImplicit, + size_t fieldIndex) = 0; + + /** + * Called when a previously opened field ends, while the handler is active. + * Note that a "fieldStart" and "fieldEnd" are always called alternately. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever an annotation starts while this handler is active. The + * function should return true if starting the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the name in the source code. + * @param args is a map from strings to variants (argument name and value). + * @return true if the mentioned annotation could be started here, false + * if an error occurred. + */ + virtual bool annotationStart(Variant className, Variant::mapType &args) = 0; + + /** + * Called whenever an annotation ends while this handler is active. The + * function should return true if ending the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the class name in the source code. 
+ * @param elementName is a string variant containing the name of the + * annotation class and the location of the element name in the source code. + * @return true if the mentioned annotation could be started here, false if + * an error occurred. + */ + virtual bool annotationEnd(Variant className, Variant elementName) = 0; + + /** + * Called whenever raw data (int the form of a string) is available for the + * Handler instance. + * + * @param data is a string variant containing the character data and its + * location. + */ + virtual void data(Variant data) = 0; +}; + +/** + * HandlerConstructor is a function pointer type used to create concrete + * instances of the Handler class. + * + * @param handlerData is the data that should be passed to the new handler + * instance. + * @return a newly created handler instance. + */ +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * The DefaultHandler class is used in case no element handler is specified in + * the State descriptor. + */ +/*class EmptyHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override; + + void end() override; + + static Handler *create(const HandlerData &handlerData); +};*/ + +} +} + +#endif /* _OUSIA_PARSER_STATE_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp new file mode 100644 index 0000000..1d83a68 --- /dev/null +++ b/src/core/parser/stack/Stack.cpp @@ -0,0 +1,188 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "Stack.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class StateStack */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + */ +static LoggableException InvalidCommand(const std::string &name, + const std::set &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +StateStack::StateStack( + ParserContext &ctx, + const std::multimap &states) + : ctx(ctx), states(states) +{ +} + +bool StateStack::deduceState() +{ + // Assemble all states + std::vector states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector possibleStates = + StateDeductor(ctx.getScope().getStackTypeSignature(), states) + .deduce(); + if (possibleStates.size() != 1) { + ctx.getLogger().error( + "Error while including file: Cannot deduce parser state."); + return false; + } + + // Switch to this state by creating a dummy handler + const State *state = possibleStates[0]; + Handler *handler = + DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); + stack.emplace(handler); + return true; +} + +std::set 
StateStack::expectedCommands() +{ + const State *currentState = &(this->currentState()); + std::set res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const State &StateStack::currentState() +{ + return stack.empty() ? States::None : stack.top()->state(); +} + +std::string StateStack::currentCommandName() +{ + return stack.empty() ? std::string{} : stack.top()->name(); +} + +const State *StateStack::findTargetState(const std::string &name) +{ + const State *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const StateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&States::All)) { + return it->second; + } + } + + return nullptr; +} + +void StateStack::start(const std::string &name, Variant::mapType &args, + const SourceLocation &location) +{ + State const *targetState = findTargetState(name); +// TODO: Andreas, please improve this. +// if (!Utils::isIdentifier(name)) { +// throw LoggableException(std::string("Invalid identifier \"") + name + +// std::string("\"")); +// } + + if (targetState == nullptr) { + targetState = findTargetState("*"); + } + if (targetState == nullptr) { + throw InvalidCommand(name, expectedCommands()); + } + + // Fetch the associated constructor + HandlerConstructor ctor = targetState->elementHandler + ? 
targetState->elementHandler + : DefaultHandler::create; + + // Canonicalize the arguments, allow additional arguments + targetState->arguments.validateMap(args, ctx.getLogger(), true); + + // Instantiate the handler and call its start function + Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); + handler->start(args); + stack.emplace(handler); +} + +void StateStack::start(std::string name, const Variant::mapType &args, + const SourceLocation &location) +{ + Variant::mapType argsCopy(args); + start(name, argsCopy); +} + +void StateStack::end() +{ + // Check whether the current command could be ended + if (stack.empty()) { + throw LoggableException{"No command to end."}; + } + + // Remove the current HandlerInstance from the stack + std::shared_ptr inst{stack.top()}; + stack.pop(); + + // Call the end function of the last Handler + inst->end(); +} + +void StateStack::data(const std::string &data, int field) +{ + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException{"No command to receive data."}; + } + + // Pass the data to the current Handler instance + stack.top()->data(data, field); +} +} +} + diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp new file mode 100644 index 0000000..b106475 --- /dev/null +++ b/src/core/parser/stack/Stack.hpp @@ -0,0 +1,191 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file ParserStateStack.hpp + * + * Helper classes for document or description parsers. Contains the + * ParserStateStack class, which is an pushdown automaton responsible for + * accepting commands in the correct order and calling specified handlers. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_STACK_HPP_ +#define _OUSIA_PARSER_STATE_STACK_HPP_ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "Parser.hpp" +#include "ParserContext.hpp" +#include "ParserState.hpp" + +namespace ousia { + +/** + * The ParserStateStack class is a pushdown automaton responsible for turning a + * command stream into a tree of Node instances. + */ +class ParserStateStack { +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap &states; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::stack> stack; + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from for the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + *state + * otherwise. + */ + const ParserState *findTargetState(const std::string &name); + +public: + /** + * Creates a new instance of the ParserStateStack class. + * + * @param ctx is the parser context the parser stack is working on. 
+ * @param states is a map containing the command names and pointers at the + * corresponding ParserState instances. + */ + ParserStateStack( + ParserContext &ctx, + const std::multimap &states); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + + an equivalent state as the one in the including file. + * + * @param scope is the ParserScope instance from which the ParserState + * should be reconstructed. + * @param logger is the logger instance to which error messages should be + * written. + * @return true if the operation was sucessful, false otherwise. + */ + bool deduceState(); + + /** + * Returns the state the ParserStateStack instance currently is in. + * + * @return the state of the currently active Handler instance or STATE_NONE + * if no handler is on the stack. + */ + const ParserState ¤tState(); + + /** + * Returns the command name that is currently being handled. + * + * @return the name of the command currently being handled by the active + * Handler instance or an empty string if no handler is currently active. + */ + std::string currentCommandName(); + + /** + * Function that should be called whenever a new command is reached. + * + * @param name is the name of the command (including the namespace + * separator ':') and its corresponding location. Must be a string variant. + * @param args is a map variant containing the arguments that were passed to + * the command. + */ + void command(Variant name, Variant args); + + /** + * Function that should be called whenever a new field starts. Fields of the + * same command may not be separated by calls to + */ + void fieldStart(); + + /** + * Function that should be called whenever a field ends. + */ + void fieldEnd(); + + /** + * Function that shuold be called whenever character data is found in the + * input stream. 
+ * + * @param data is a variant of any type containing the data that was parsed + * as data. + */ + void data(Variant data); + + /** + * Function that should be called whenever an annotation starts. + * + * @param name is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + */ + void annotationStart(Variant name, Variant args); + + /** + * Function that should be called whenever an annotation ends. + * + * @param name is the name of the annotation class that was ended. + * @param annotationName is the name of the annotation that was ended. + */ + void annotationEnd(Variant name, Variant annotationName); + + /** + * Function that should be called whenever a previously registered token + * is found in the input stream. + * + * @param token is string variant containing the token that was encountered. + */ + void token(Variant token); +}; +} + +#endif /* _OUSIA_PARSER_STATE_STACK_HPP_ */ + diff --git a/src/core/parser/stack/State.cpp b/src/core/parser/stack/State.cpp new file mode 100644 index 0000000..d72f533 --- /dev/null +++ b/src/core/parser/stack/State.cpp @@ -0,0 +1,171 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class State */ + +State::State() : elementHandler(nullptr) {} + +State::State(StateSet parents, Arguments arguments, + RttiSet createdNodeTypes, + HandlerConstructor elementHandler, + bool supportsAnnotations) + : parents(parents), + arguments(arguments), + createdNodeTypes(createdNodeTypes), + elementHandler(elementHandler), + supportsAnnotations(supportsAnnotations) +{ +} + +State::State(const StateBuilder &builder) + : State(builder.build()) +{ +} + +/* Class StateBuilder */ + +StateBuilder &StateBuilder::copy(const State &state) +{ + this->state = state; + return *this; +} + +StateBuilder &StateBuilder::parent(const State *parent) +{ + state.parents = StateSet{parent}; + return *this; +} + +StateBuilder &StateBuilder::parents(const StateSet &parents) +{ + state.parents = parents; + return *this; +} + +StateBuilder &StateBuilder::arguments(const Arguments &arguments) +{ + state.arguments = arguments; + return *this; +} + +StateBuilder &StateBuilder::createdNodeType(const Rtti *type) +{ + state.createdNodeTypes = RttiSet{type}; + return *this; +} + +StateBuilder &StateBuilder::createdNodeTypes(const RttiSet &types) +{ + state.createdNodeTypes = types; + return *this; +} + +StateBuilder &StateBuilder::elementHandler( + HandlerConstructor elementHandler) +{ + state.elementHandler = elementHandler; + return *this; +} + +StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations) +{ + state.supportsAnnotations = supportsAnnotations; + return *this; +} + +const State &StateBuilder::build() const { return state; } + +/* Class StateDeductor */ + +StateDeductor::StateDeductor( + std::vector signature, + std::vector states) + : tbl(signature.size()), + signature(std::move(signature)), + states(std::move(states)) +{ +} + +bool StateDeductor::isActive(size_t d, const State *s) +{ + // Lookup the "active" state of (d, s), if it was not already set + // (e.second is true) we'll have 
to calculate it + auto e = tbl[d].emplace(s, false); + bool &res = e.first->second; + if (!e.second) { + return res; + } + + // Check whether this node is generative (may have produced the Node + // described by the current Signature element) + bool isGenerative = signature[d]->isOneOf(s->createdNodeTypes); + + if (isGenerative && d == 0) { + // End of recursion -- the last signature element is reached and the + // node was generative + res = true; + } else { + // Try repetition of this node + if (isGenerative && isActive(d - 1, s)) { + res = true; + } else { + // Check whether any of the parent nodes were active -- either for + // the previous element (if this one is generative) or for the + // current element (assuming this node was not generative) + for (const State *parent : s->parents) { + if ((isGenerative && isActive(d - 1, parent)) || + isActive(d, parent)) { + res = true; + break; + } + } + } + } + + return res; +} + +std::vector StateDeductor::deduce() +{ + std::vector res; + if (!signature.empty()) { + const size_t D = signature.size(); + for (auto s : states) { + if (signature[D - 1]->isOneOf(s->createdNodeTypes) && + isActive(D - 1, s)) { + res.push_back(s); + } + } + } + return res; +} + +/* Constant initializations */ + +namespace States { +const State All; +const State None; +} +} +} + diff --git a/src/core/parser/stack/State.hpp b/src/core/parser/stack/State.hpp new file mode 100644 index 0000000..ea326ec --- /dev/null +++ b/src/core/parser/stack/State.hpp @@ -0,0 +1,307 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file State.hpp + * + * Defines the State class used within the ParserStack pushdown + * automaton and the StateBuilder class for convenient construction of + * such classes. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STATE_HPP_ +#define _OUSIA_PARSER_STATE_HPP_ + +#include + +#include +#include + +namespace ousia { +namespace parser_stack { + +// Forward declarations +class StateBuilder; +class State; +class HandlerData; +class Handler; +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * Set of pointers of parser states -- used for specifying a set of parent + * states. + */ +using StateSet = std::unordered_set; + +/** + * Class used for the complete specification of a State. Stores possible + * parent states, state handlers and arguments to be passed to that state. + */ +struct State { + /** + * Vector containing all possible parent states. + */ + StateSet parents; + + /** + * Descriptor of the arguments that should be passed to the handler. + */ + Arguments arguments; + + /** + * Set containing the types of the nodes that may be created in this + * State. This information is needed for Parsers to reconstruct the + * current State from a given ParserScope when a file is included. + */ + RttiSet createdNodeTypes; + + /** + * Pointer at a function which creates a new concrete Handler instance for + * the elements described by this state. May be nullptr in which case no + * handler instance is created. + */ + HandlerConstructor elementHandler; + + /** + * Set to true if this handler does support annotations. 
This is almost + * always false (e.g. all description handlers), except for document + * element handlers. + */ + bool supportsAnnotations; + + /** + * Default constructor, initializes the handlers with nullptr. + */ + State(); + + /** + * Constructor taking values for all fields. Use the StateBuilder + * class for a more convenient construction of State instances. + * + * @param parents is a vector containing all possible parent states. + * @param arguments is a descriptor of arguments that should be passed to + * the handler. + * @param createdNodeTypes is a set containing the types of the nodes that + * may be created in this State. This information is needed for + * Parsers to reconstruct the current State from a given ParserScope + * when a file is included. + * @param elementHandler is a pointer at a function which creates a new + * concrete Handler instance for the elements described by this state. May + * be nullptr in which case no handler instance is created. + * @param supportsAnnotations specifies whether annotations are supported + * here at all. + */ + State(StateSet parents, Arguments arguments = Arguments{}, + RttiSet createdNodeTypes = RttiSet{}, + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false); + + /** + * Creates this State from the given StateBuilder instance. + */ + State(const StateBuilder &builder); +}; + +/** + * The StateBuilder class is a class used for conveniently building new + * State instances. + */ +class StateBuilder { +private: + /** + * State instance that is currently being built by the + * StateBuilder. + */ + State state; + +public: + /** + * Copies the State instance and uses it as internal state. Overrides + * all changes made by the StateBuilder. + * + * @param state is the state that should be copied. + * @return a reference at this StateBuilder instance for method + * chaining.
+ */ + StateBuilder &copy(const State &state); + + /** + * Sets the possible parent states to the single given parent element. + * + * @param parent is a pointer at the parent State instance that should + * be the possible parent state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &parent(const State *parent); + + /** + * Sets the State instances in the given StateSet as the list of + * supported parent states. + * + * @param parents is a set of pointers at State instances that should + * be the possible parent states. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &parents(const StateSet &parents); + + /** + * Sets the arguments that should be passed to the parser state handler to + * those given as argument. + * + * @param arguments is the Arguments instance describing the Arguments that + * should be passed to a Handler for this State. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &arguments(const Arguments &arguments); + + /** + * Sets the Node types this state may produce to the given Rtti descriptor. + * + * @param type is the Rtti descriptor of the Type that may be produced by + * this state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &createdNodeType(const Rtti *type); + + /** + * Sets the Node types this state may produce to the given Rtti descriptors. + * + * @param types is a set of Rtti descriptors of the Types that may be + * produced by this state. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &createdNodeTypes(const RttiSet &types); + + /** + * Sets the constructor for the element handler. The constructor creates a + * new concrete Handler instance for the elements described by this state. + * May be nullptr in which case no handler instance is created (this is + * the default value).
+ * + * @param elementHandler is the HandlerConstructor that should create a + * new Handler instance. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &elementHandler(HandlerConstructor elementHandler); + + /** + * Sets the state of the "supportsAnnotations" flags (default value is + * false) + * + * @param supportsAnnotations should be set to true, if annotations are + * supported for the handlers associated with this document. + * @return a reference at this StateBuilder instance for method + * chaining. + */ + StateBuilder &supportsAnnotations(bool supportsAnnotations); + + /** + * Returns a reference at the internal State instance that was built + * using the StateBuilder. + * + * @return the built State. + */ + const State &build() const; +}; + +/** + * Class used to deduce the State a Parser is currently in based on the + * types of the Nodes that currently are on the ParserStack. Uses dynamic + * programming in order to solve this problem. + */ +class StateDeductor { +public: + /** + * Type containing the dynamic programming table. + */ + using Table = std::vector>; + +private: + /** + * Dynamic programming table. + */ + Table tbl; + + /** + * Signature given in the constructor. + */ + const std::vector signature; + + /** + * List of states that should be checked for being active. + */ + const std::vector states; + + /** + * Used internally to check whether the given parser state s may have been + * active for signature element d. + * + * @param d is the signature element. + * @param s is the parser state. + * @return true if the given State may have been active. + */ + bool isActive(size_t d, const State *s); + +public: + /** + * Constructor of the StateDeductor class. + * + * @param signature a Node type signature describing the types of the nodes + * which currently reside on e.g. the ParserScope stack. + * @param states is a list of states that should be checked.
+ */ + StateDeductor(std::vector signature, + std::vector states); + + /** + * Selects all active states from the given states. Only considers those + * states that may have produced the last signature element. + * + * @return a list of states that may actually have been active. + */ + std::vector deduce(); +}; + +/** + * The States namespace contains all the global state constants used + * in the ParserStack class. + */ +namespace States { +/** + * State representing all states. + */ +extern const State All; + +/** + * State representing the initial state. + */ +extern const State None; +} +} +} + +#endif /* _OUSIA_PARSER_STATE_HPP_ */ + diff --git a/test/core/parser/ParserStateTest.cpp b/test/core/parser/ParserStateTest.cpp deleted file mode 100644 index 91d8dcd..0000000 --- a/test/core/parser/ParserStateTest.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include -#include - -namespace ousia { - -static const Rtti t1; -static const Rtti t2; -static const Rtti t3; -static const Rtti t4; -static const Rtti t5; - -static const ParserState s1 = ParserStateBuilder().createdNodeType(&t1); -static const ParserState s2a = - ParserStateBuilder().parent(&s1).createdNodeType(&t2); -static const ParserState s2b = - ParserStateBuilder().parent(&s1).createdNodeType(&t2); -static const ParserState s3 = - ParserStateBuilder().parents({&s2a, &s1}).createdNodeType(&t3); -static const ParserState s4 = - ParserStateBuilder().parent(&s3).createdNodeType(&t4); -static const ParserState s5 = - ParserStateBuilder().parent(&s2b).createdNodeType(&t5); - -TEST(ParserStateDeductor, deduce) -{ - using Result = std::vector; - using Signature = std::vector; - std::vector states{&s1, &s2a, &s2b, &s3, &s4, &s5}; - - // Should not crash on empty signature - ASSERT_EQ(Result{}, ParserStateDeductor(Signature{}, states).deduce()); - - // Try repeating signature elements - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1}), states).deduce()); - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1, &t1}), states).deduce()); - ASSERT_EQ(Result({&s1}), - ParserStateDeductor(Signature({&t1, &t1, &t1}), states).deduce()); - - // Go to another state - ASSERT_EQ(Result({&s2a, &s2b}), - ParserStateDeductor(Signature({&t1, &t1, &t2}), states).deduce()); - ASSERT_EQ(Result({&s4}), - ParserStateDeductor(Signature({&t1, &t3, &t4}), states).deduce()); - - // Skip one state - ASSERT_EQ(Result({&s4}), - ParserStateDeductor(Signature({&t2, &t4}), states).deduce()); - - // Impossible signature - ASSERT_EQ(Result({}), - ParserStateDeductor(Signature({&t4, &t5}), states).deduce()); - -} -} - diff --git a/test/core/parser/stack/StateTest.cpp b/test/core/parser/stack/StateTest.cpp new file mode 100644 index 0000000..e503d30 --- /dev/null +++ b/test/core/parser/stack/StateTest.cpp @@ -0,0 +1,79 @@ +/* + Ousía + Copyright (C) 2014, 
2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { +namespace parser_stack { + +static const Rtti t1; +static const Rtti t2; +static const Rtti t3; +static const Rtti t4; +static const Rtti t5; + +static const State s1 = StateBuilder().createdNodeType(&t1); +static const State s2a = + StateBuilder().parent(&s1).createdNodeType(&t2); +static const State s2b = + StateBuilder().parent(&s1).createdNodeType(&t2); +static const State s3 = + StateBuilder().parents({&s2a, &s1}).createdNodeType(&t3); +static const State s4 = + StateBuilder().parent(&s3).createdNodeType(&t4); +static const State s5 = + StateBuilder().parent(&s2b).createdNodeType(&t5); + +TEST(StateDeductor, deduce) +{ + using Result = std::vector; + using Signature = std::vector; + std::vector states{&s1, &s2a, &s2b, &s3, &s4, &s5}; + + // Should not crash on empty signature + ASSERT_EQ(Result{}, StateDeductor(Signature{}, states).deduce()); + + // Try repeating signature elements + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1}), states).deduce()); + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1, &t1}), states).deduce()); + ASSERT_EQ(Result({&s1}), + StateDeductor(Signature({&t1, &t1, &t1}), states).deduce()); + + // Go to another state + ASSERT_EQ(Result({&s2a, &s2b}), + StateDeductor(Signature({&t1, &t1, &t2}), states).deduce()); + 
ASSERT_EQ(Result({&s4}), + StateDeductor(Signature({&t1, &t3, &t4}), states).deduce()); + + // Skip one state + ASSERT_EQ(Result({&s4}), + StateDeductor(Signature({&t2, &t4}), states).deduce()); + + // Impossible signature + ASSERT_EQ(Result({}), + StateDeductor(Signature({&t4, &t5}), states).deduce()); + +} +} +} + -- cgit v1.2.3 From 9acab70815a0f62bdaf2c7f01e588066b818d330 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 22:45:19 +0100 Subject: Fixed isIdentifier and isNamespacedIdentifier, added and used isIdentifierOrEmpty for use in Node --- src/core/common/Utils.cpp | 13 +++++++++---- src/core/common/Utils.hpp | 5 +++++ src/core/model/Node.cpp | 2 +- test/core/common/UtilsTest.cpp | 39 +++++++++++++++++++++++++++++++++------ 4 files changed, 48 insertions(+), 11 deletions(-) (limited to 'test/core') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index fc8ee00..f8b53c6 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -37,22 +37,27 @@ bool Utils::isIdentifier(const std::string &name) } first = false; } - return true; + return !first; } -bool Utils::isNamespaceIdentifier(const std::string &name) +bool Utils::isIdentifierOrEmpty(const std::string &name) +{ + return name.empty() || isIdentifier(name); +} + +bool Utils::isNamespacedIdentifier(const std::string &name) { bool first = true; for (char c : name) { if (first && !isIdentifierStartCharacter(c)) { return false; } - if (!first && (!isIdentifierCharacter(c) || c == ':')) { + if (!first && (!isIdentifierCharacter(c) && c != ':')) { return false; } first = (c == ':'); } - return true; + return !first; } bool Utils::hasNonWhitepaceChar(const std::string &s) diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index b5cd178..b5a54fc 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -85,6 +85,11 @@ public: */ static bool isIdentifier(const std::string &name); + /** + * Returns true if the given string is an 
identifier or an empty string. + */ + static bool isIdentifierOrEmpty(const std::string &name); + /** * Returns true if the given string is in * \code{.txt} diff --git a/src/core/model/Node.cpp b/src/core/model/Node.cpp index 39ee2e4..ce15cad 100644 --- a/src/core/model/Node.cpp +++ b/src/core/model/Node.cpp @@ -448,7 +448,7 @@ bool Node::doValidate(Logger &logger) const { return true; } bool Node::validateName(Logger &logger) const { - if (!Utils::isIdentifier(name)) { + if (!Utils::isIdentifierOrEmpty(name)) { logger.error(type()->name + std::string(" name \"") + name + std::string("\" is not a valid identifier"), this); diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index a4bf4b2..7801296 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -24,14 +24,40 @@ namespace ousia { TEST(Utils, isIdentifier) { - ASSERT_TRUE(Utils::isIdentifier("test")); - ASSERT_TRUE(Utils::isIdentifier("t0-_est")); - ASSERT_FALSE(Utils::isIdentifier("_t0-_EST")); - ASSERT_FALSE(Utils::isIdentifier("-t0-_EST")); - ASSERT_FALSE(Utils::isIdentifier("0t-_EST")); - ASSERT_FALSE(Utils::isIdentifier("invalid key")); + EXPECT_TRUE(Utils::isIdentifier("test")); + EXPECT_TRUE(Utils::isIdentifier("t0-_est")); + EXPECT_FALSE(Utils::isIdentifier("_t0-_EST")); + EXPECT_FALSE(Utils::isIdentifier("-t0-_EST")); + EXPECT_FALSE(Utils::isIdentifier("0t-_EST")); + EXPECT_FALSE(Utils::isIdentifier("_A")); + EXPECT_FALSE(Utils::isIdentifier("invalid key")); + EXPECT_FALSE(Utils::isIdentifier("")); } + +TEST(Utils, isNamespacedIdentifier) +{ + EXPECT_TRUE(Utils::isNamespacedIdentifier("test")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("_t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("-t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("0t-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("invalid key")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("_A")); + 
EXPECT_FALSE(Utils::isNamespacedIdentifier("")); + EXPECT_FALSE(Utils::isNamespacedIdentifier(":")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("test:a")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est:b")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("test:test")); + EXPECT_TRUE(Utils::isNamespacedIdentifier("t0-_est:t0-_est")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("test:_A")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("test::a")); + EXPECT_FALSE(Utils::isNamespacedIdentifier(":test")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("t0-_est:_t0-_EST")); + EXPECT_FALSE(Utils::isNamespacedIdentifier("t0-_est: b")); +} + + TEST(Utils, split) { ASSERT_EQ(std::vector({"ab"}), Utils::split("ab", '.')); @@ -82,5 +108,6 @@ TEST(Utils, endsWith) ASSERT_TRUE(Utils::endsWith("foobar", "bar")); ASSERT_TRUE(Utils::endsWith("foo", "")); } + } -- cgit v1.2.3 From 02995f1f9b5a0905ed8f79a5149f4b6375a622bf Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 22:45:49 +0100 Subject: Fixed gcc 4.9 warnings --- test/core/RangeSetTest.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'test/core') diff --git a/test/core/RangeSetTest.cpp b/test/core/RangeSetTest.cpp index cbf8f59..446ee51 100644 --- a/test/core/RangeSetTest.cpp +++ b/test/core/RangeSetTest.cpp @@ -110,7 +110,7 @@ TEST(RangeSet, Merge) s.merge(Range(40, 50)); s.merge(Range(60, 70)); { - ASSERT_EQ(ranges.size(), 4); + ASSERT_EQ(ranges.size(), 4U); auto it = ranges.begin(); ASSERT_EQ((*it).start, 0); @@ -132,7 +132,7 @@ TEST(RangeSet, Merge) // Now insert an element which spans the second and third element s.merge(Range(15, 55)); { - ASSERT_EQ(ranges.size(), 3); + ASSERT_EQ(ranges.size(), 3U); auto it = ranges.begin(); ASSERT_EQ((*it).start, 0); @@ -150,7 +150,7 @@ TEST(RangeSet, Merge) // Now insert an element which expands the first element s.merge(Range(-10, 11)); { - ASSERT_EQ(ranges.size(), 3); + ASSERT_EQ(ranges.size(), 3U); auto it = ranges.begin(); 
ASSERT_EQ((*it).start, -10); @@ -168,7 +168,7 @@ TEST(RangeSet, Merge) // Now insert an element which merges the last two elements s.merge(Range(13, 70)); { - ASSERT_EQ(ranges.size(), 2); + ASSERT_EQ(ranges.size(), 2U); auto it = ranges.begin(); ASSERT_EQ((*it).start, -10); @@ -182,7 +182,7 @@ TEST(RangeSet, Merge) // Now insert an element which merges the remaining elements s.merge(Range(-9, 12)); { - ASSERT_EQ(ranges.size(), 1); + ASSERT_EQ(ranges.size(), 1U); auto it = ranges.begin(); ASSERT_EQ((*it).start, -10); -- cgit v1.2.3 From 0a8a012850bb7c730ccac4c91c7aca5c88cbedc9 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:14:58 +0100 Subject: Implemented most of the desired behaviour of the Stack class, added unit tests --- src/core/parser/stack/Stack.cpp | 437 ++++++++++++++++++++---- src/core/parser/stack/Stack.hpp | 85 ++++- test/core/parser/stack/StackTest.cpp | 639 +++++++++++++++++++++++++++++++++++ 3 files changed, 1075 insertions(+), 86 deletions(-) create mode 100644 test/core/parser/stack/StackTest.cpp (limited to 'test/core') diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index b0df39b..d84a19c 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -37,10 +38,28 @@ HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} HandlerInfo::HandlerInfo(std::shared_ptr handler) : handler(handler), fieldIdx(0), + valid(true), + implicit(false), inField(false), inDefaultField(false), inImplicitDefaultField(false), - hasDefaultField(false) + inValidField(false), + hadDefaultField(false) +{ +} + +HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, + bool inDefaultField, bool inImplicitDefaultField, + bool inValidField) + : handler(nullptr), + fieldIdx(0), + valid(valid), + implicit(implicit), + inField(inField), + inDefaultField(inDefaultField), + inImplicitDefaultField(inImplicitDefaultField), + 
inValidField(inValidField), + hadDefaultField(false) { } @@ -55,7 +74,7 @@ void HandlerInfo::fieldStart(bool isDefault, bool isImplicit, bool isValid) inDefaultField = isDefault || isImplicit; inImplicitDefaultField = isImplicit; inValidField = isValid; - hasDefaultField = hasDefaultField || inDefaultField; + hadDefaultField = hadDefaultField || inDefaultField; fieldIdx++; } @@ -65,11 +84,13 @@ void HandlerInfo::fieldEnd() inDefaultField = false; inImplicitDefaultField = false; inValidField = false; - if (fieldIdx > 0) { - fieldIdx--; - } } +/** + * Stub instance of HandlerInfo containing no handler information. + */ +static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; + /* Helper functions */ /** @@ -110,9 +131,31 @@ Stack::Stack(ParserContext &ctx, } } -Stack::~Stack() {} +Stack::~Stack() +{ + while (!stack.empty()) { + // Fetch the topmost stack element + HandlerInfo &info = currentInfo(); + + // It is an error if we're still in a field of an element while the + // Stack instance is destroyed. Log that + if (handlersValid()) { + if (info.inField && !info.implicit && + !info.inImplicitDefaultField) { + logger().error( + std::string("Reached end of stream, but command \"") + + info.handler->getName() + + "\" has not ended yet. 
Command was started here:", + info.handler->getLocation()); + } + } -bool Stack::deduceState() + // Remove the command from the stack + endCurrentHandler(); + } +} + +void Stack::deduceState() { // Assemble all states std::vector states; @@ -125,23 +168,24 @@ bool Stack::deduceState() std::vector possibleStates = StateDeductor(ctx.getScope().getStackTypeSignature(), states).deduce(); if (possibleStates.size() != 1U) { - throw LoggableException{ - "Error while including file: Cannot deduce parser state."}; + throw LoggableException( + "Error while including file: Cannot deduce parser state."); } - // Switch to this state by creating a dummy handler - const State *state = possibleStates[0]; - stack.emplace(std::shared_ptr{EmptyHandler::create({ctx, "", *state, *state, SourceLocation{}})}); -} + // Switch to this state by creating a handler, but do not call its start + // function + const State &state = *possibleStates[0]; + HandlerConstructor ctor = + state.elementHandler ? state.elementHandler : EmptyHandler::create; -bool Stack::handlersValid() -{ - for (auto it = stack.crbegin(); it != stack.crend(); it++) { - if (!it->valid) { - return false; - } - } - return true; + std::shared_ptr handler = + std::shared_ptr{ctor({ctx, "", state, SourceLocation{}})}; + stack.emplace_back(handler); + + // Set the correct flags for this implicit handler + HandlerInfo &info = currentInfo(); + info.implicit = true; + info.fieldStart(true, false, true); } std::set Stack::expectedCommands() @@ -158,12 +202,12 @@ std::set Stack::expectedCommands() const State &Stack::currentState() { - return stack.empty() ? States::None : stack.top()->state(); + return stack.empty() ? States::None : stack.back().handler->getState(); } std::string Stack::currentCommandName() { - return stack.empty() ? std::string{} : stack.top()->name(); + return stack.empty() ? 
std::string{} : stack.back().handler->getName(); } const State *Stack::findTargetState(const std::string &name) @@ -180,77 +224,330 @@ const State *Stack::findTargetState(const std::string &name) return nullptr; } +const State *Stack::findTargetStateOrWildcard(const std::string &name) +{ + // Try to find the target state with the given name, if none is found, try + // to find a matching "*" state. + State const *targetState = findTargetState(name); + if (targetState == nullptr) { + return findTargetState("*"); + } + return targetState; +} + +HandlerInfo &Stack::currentInfo() +{ + return stack.empty() ? EmptyHandlerInfo : stack.back(); +} +HandlerInfo &Stack::lastInfo() +{ + return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; +} + +void Stack::endCurrentHandler() +{ + if (!stack.empty()) { + // Fetch the handler info for the current top-level element + HandlerInfo &info = stack.back(); + + // Do not call any callback functions while the stack is marked as + // invalid or this is an element marked as "implicit" + if (!info.implicit && handlersValid()) { + // Make sure the fieldEnd handler is called if the element still + // is in a field + if (info.inField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // Call the "end" function of the corresponding Handler instance + info.handler->end(); + } + + // Remove the element from the stack + stack.pop_back(); + } +} + +bool Stack::ensureHandlerIsInField() +{ + // If the current handler is not in a field (and actually has a handler) + // try to start a default field + HandlerInfo &info = currentInfo(); + if (!info.inField && info.handler != nullptr) { + // Abort if the element already had a default field + if (info.hadDefaultField) { + return false; + } + + // Try to start a new default field, abort if this did not work + bool isDefault = true; + if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { + info.handler->fieldEnd(); + endCurrentHandler(); + return false; + } + + // Mark the field as
started + info.fieldStart(true, true, true); + } + return true; +} + +bool Stack::handlersValid() +{ + for (auto it = stack.crbegin(); it != stack.crend(); it++) { + if (!it->valid) { + return false; + } + } + return true; +} + +Logger &Stack::logger() { return ctx.getLogger(); } + void Stack::command(const Variant &name, const Variant::mapType &args) { - // Make sure the given identifier is valid + // Make sure the given identifier is valid (preventing "*" from being + // maliciously passed to this function) if (!Utils::isNamespacedIdentifier(name.asString())) { throw LoggableException(std::string("Invalid identifier \"") + - name.asString() + std::string("\""), name); + name.asString() + std::string("\""), + name); } - // Try to find a target state for the given command - State const *targetState = findTargetState(name.asString()); + State const *lastTargetState = nullptr; + Variant::mapType canonicalArgs; + while (true) { + // Try to find a target state for the given command, if none can be + // found and the current command does not have an open field, then try + // to create an empty default field, otherwise this is an exception + const State *targetState = findTargetStateOrWildcard(name.asString()); + if (targetState == nullptr) { + if (!currentInfo().inField) { + endCurrentHandler(); + continue; + } else { + throw buildInvalidCommandException(name.asString(), + expectedCommands()); + } + } + + // Make sure we're currently inside a field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } - // No target state is found, try to find a wildcard handler for the current - // state - if (targetState == nullptr) { - targetState = findTargetState("*"); - } + // Fork the logger.
We do not want any validation errors to skip + LoggerFork loggerFork = logger().fork(); - // No handler has been found at all, - if (targetState == nullptr) { - throw buildInvalidCommandException(name.asString(), expectedCommands()); + // Canonicalize the arguments (if this has not already been done), allow + // additional arguments + if (lastTargetState != targetState) { + canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + lastTargetState = targetState; + } + + // Instantiate the handler and push it onto the stack + HandlerConstructor ctor = targetState->elementHandler + ? targetState->elementHandler + : EmptyHandler::create; + std::shared_ptr handler{ + ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + stack.emplace_back(handler); + + // Fetch the HandlerInfo for the parent element and the current element + HandlerInfo &parentInfo = lastInfo(); + HandlerInfo &info = currentInfo(); + + // Call the "start" method of the handler, store the result of the start + // method as the validity of the handler -- do not call the start method + // if the stack is currently invalid (as this may cause further, + // unwanted errors) + bool validStack = handlersValid(); + info.valid = false; + if (validStack) { + handler->setLogger(loggerFork); + try { + info.valid = handler->start(canonicalArgs); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + handler->resetLogger(); + } + + // We started the command within an implicit default field and it is not + // valid -- remove both the new handler and the parent field from the + // stack + if (!info.valid && parentInfo.inImplicitDefaultField) { + endCurrentHandler(); + endCurrentHandler(); + continue; + } + + // If we ended up here, starting the command may or may not have worked, + // but after all, we cannot unroll the stack any further. Update the + // "valid" flag, commit any potential error messages and return. 
+ info.valid = parentInfo.valid && info.valid; + loggerFork.commit(); + return; } +} + +void Stack::data(const Variant &data) +{ + while (true) { + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException("No command here to receive data."); + } + + // Fetch the current command handler information + HandlerInfo &info = currentInfo(); + + // Make sure the current handler has an open field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // If this field should not get any data, log an error and do not call + // the "data" handler + if (!info.inValidField) { + logger().error("Did not expect any data here", data); + } + + if (handlersValid() && info.inValidField) { + // Fork the logger and set it as temporary logger for the "start" + // method. We only want to keep error messages if this was not a try + // to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + valid = info.handler->data(data); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid + // flag. + loggerFork.commit(); + } - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? 
targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and push it onto the stack - Handler *handler = - ctor({ctx, name.asString(), *targetState, currentState(), name.getLocation()}); - stack.emplace_back(std::shared_ptr{handler}); - - // Call the "start" method of the handler, store the result of the start - // method as the validity of the handler -- do not call the start method - // if the stack is currently invalid (as this may cause further, unwanted - // errors) - try { - stack.back().valid = handlersValid() && handler->start(args); - } catch (LoggableException ex) { - stack.back().valid = false; - logger.log(ex, ) + // There was no reason to unroll the stack any further, so continue + return; } } -void Stack::end() +void Stack::fieldStart(bool isDefault) { - // Check whether the current command could be ended + // Make sure the current handler stack is not empty if (stack.empty()) { - throw LoggableException{"No command to end."}; + throw LoggableException( + "No command for which a field could be started"); } - // Remove the current HandlerInstance from the stack - std::shared_ptr inst{stack.top()}; - stack.pop(); + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (info.inField) { + logger().error( + "Got field start, but there is no command for which to start the " + "field."); + return; + } + + // Copy the isDefault flag to a local variable, the fieldStart method will + // write into this variable + bool defaultField = isDefault; + + // Do not call the "fieldStart" function if we're in an invalid subtree + bool valid = false; + if (handlersValid()) { + try { + valid = info.handler->fieldStart(defaultField, info.fieldIdx); + } + catch (LoggableException ex) { + logger().log(ex); + } + if (!valid && !defaultField) { + logger().error( + 
std::string("Cannot start a new field here (index ") + + std::to_string(info.fieldIdx + 1) + + std::string("), field does not exist")); + } + } - // Call the end function of the last Handler - inst->end(); + // Mark the field as started + info.fieldStart(defaultField, false, valid); } -void Stack::data(const std::string &data, int field) +void Stack::fieldEnd() { - // Check whether there is any command the data can be sent to + // Make sure the current handler stack is not empty if (stack.empty()) { - throw LoggableException{"No command to receive data."}; + throw LoggableException("No command for which a field could be ended"); } - // Pass the data to the current Handler instance - stack.top()->data(data, field); + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (!info.inField) { + logger().error( + "Got field end, but there is no command for which to end the " + "field."); + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + try { + info.handler->fieldEnd(); + } + catch (LoggableException ex) { + logger().log(ex); + } + } + + // This command no longer is in a field + info.fieldEnd(); + + // As soon as this command had a default field, remove it from the stack + if (info.hadDefaultField) { + endCurrentHandler(); + } +} + +void Stack::annotationStart(const Variant &className, const Variant &args) +{ + // TODO +} + +void Stack::annotationEnd(const Variant &className, const Variant &elementName) +{ + // TODO +} + +void Stack::token(Variant token) +{ + // TODO } } } diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index 294f7ec..76eefd9 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -43,6 +43,7 @@ namespace ousia { // Forward declarations class ParserContext; +class Logger; namespace parser_stack { @@ -75,7 +76,13 @@ 
public: bool valid : 1; /** - * Set to true if the handler currently is in a filed. + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handler currently is in a field. */ bool inField : 1; @@ -99,12 +106,17 @@ public: /** * Set to true, if the default field was already started. */ - bool hasDefaultField : 1; + bool hadDefaultField : 1; /** * Default constructor of the HandlerInfo class. */ HandlerInfo(); + /** + * Constructor of the HandlerInfo class, allows to set all flags manually. + */ + HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField, bool inValidField); /** * Constructor of the HandlerInfo class, taking a shared_ptr to the handler @@ -129,7 +141,6 @@ public: void fieldEnd(); }; - /** * The Stack class is a pushdown automaton responsible for turning a command * stream into a tree of Node instances. It does so by following a state @@ -154,6 +165,11 @@ private: */ std::vector stack; + /** + * Return the reference in the Logger instance stored within the context. + */ + Logger &logger(); + /** * Used internally to get all expected command names for the current state. * This function is used to build error messages. @@ -164,7 +180,7 @@ private: /** * Returns the targetState for a command with the given name that can be - * reached from for the current state. + * reached from the current state. * * @param name is the name of the requested command. * @return nullptr if no target state was found, a pointer at the target @@ -172,6 +188,17 @@ private: */ const State *findTargetState(const std::string &name); + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. 
+ * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + /** * Tries to reconstruct the parser state from the Scope instance of the * ParserContext given in the constructor. This functionality is needed for @@ -180,6 +207,33 @@ private: */ void deduceState(); + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo &currentInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + */ + void endCurrentHandler(); + + /** + * Tries to start a default field for the current handler, if currently the + * handler is not inside a field and did not have a default field yet. + * + * @return true if the handler is inside a field, false if no field could + * be started. + */ + bool ensureHandlerIsInField(); + /** * Returns true if all handlers on the stack are currently valid, or false * if at least one handler is invalid. @@ -196,9 +250,8 @@ public: * @param states is a map containing the command names and pointers at the * corresponding State instances. */ - Stack( - ParserContext &ctx, - const std::multimap &states); + Stack(ParserContext &ctx, + const std::multimap &states); /** * Destructor of the Stack class. @@ -231,6 +284,15 @@ public: */ void command(const Variant &name, const Variant::mapType &args); + /** + * Function that should be called whenever character data is found in the + * input stream. May only be called if there currently is a command on the + * stack. + * + * @param data is a string variant containing the data that has been found. + */ + void data(const Variant &data); + /** * Function that should be called whenever a new field starts. 
Fields of the * same command may not be separated by calls to data or annotations. Doing @@ -247,15 +309,6 @@ public: */ void fieldEnd(); - /** - * Function that shuold be called whenever character data is found in the - * input stream. May only be called if the currently is a command on the - * stack. - * - * @param data is a string variant containing the data that has been found. - */ - void data(const Variant &data); - /** * Function that should be called whenever an annotation starts. * diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp new file mode 100644 index 0000000..7cc8bc5 --- /dev/null +++ b/test/core/parser/stack/StackTest.cpp @@ -0,0 +1,639 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include + +#include + +#include +#include +#include +#include + +#include + +namespace ousia { +namespace parser_stack { + +// Build an instance of the StandaloneEnvironment used for this unit test +static TerminalLogger logger(std::cerr, true); +// static ConcreteLogger logger; +static StandaloneEnvironment env(logger); + +namespace { + +struct Tracker { + int startCount; + int endCount; + int fieldStartCount; + int fieldEndCount; + int annotationStartCount; + int annotationEndCount; + int dataCount; + + Variant::mapType startArgs; + bool fieldStartIsDefault; + size_t fieldStartIdx; + Variant annotationStartClassName; + Variant::mapType annotationStartArgs; + Variant annotationEndClassName; + Variant annotationEndElementName; + Variant dataData; + + bool startResult; + bool fieldStartSetIsDefault; + bool fieldStartResult; + bool annotationStartResult; + bool annotationEndResult; + bool dataResult; + + Tracker() { reset(); } + + void reset() + { + startCount = 0; + endCount = 0; + fieldStartCount = 0; + fieldEndCount = 0; + annotationStartCount = 0; + annotationEndCount = 0; + dataCount = 0; + + startArgs = Variant::mapType{}; + fieldStartIsDefault = false; + fieldStartIdx = 0; + annotationStartClassName = Variant::fromString(std::string{}); + annotationStartArgs = Variant::mapType{}; + annotationEndClassName = Variant::fromString(std::string{}); + annotationEndElementName = Variant::fromString(std::string{}); + dataData = Variant::fromString(std::string{}); + + startResult = true; + fieldStartSetIsDefault = false; + fieldStartResult = true; + annotationStartResult = true; + annotationEndResult = true; + dataResult = true; + } + + void expect(int startCount, int endCount, int fieldStartCount, + int fieldEndCount, int annotationStartCount, + int annotationEndCount, int dataCount) + { + EXPECT_EQ(startCount, this->startCount); + EXPECT_EQ(endCount, this->endCount); + EXPECT_EQ(fieldStartCount, this->fieldStartCount); + EXPECT_EQ(fieldEndCount, 
this->fieldEndCount); + EXPECT_EQ(annotationStartCount, this->annotationStartCount); + EXPECT_EQ(annotationEndCount, this->annotationEndCount); + EXPECT_EQ(dataCount, this->dataCount); + } +}; + +static Tracker tracker; + +class TestHandler : public Handler { +private: + TestHandler(const HandlerData &handlerData) : Handler(handlerData) {} + +public: + bool start(const Variant::mapType &args) + { + tracker.startCount++; + tracker.startArgs = args; + return tracker.startResult; + } + + void end() { tracker.endCount++; } + + bool fieldStart(bool &isDefault, size_t fieldIdx) + { + tracker.fieldStartCount++; + tracker.fieldStartIsDefault = isDefault; + tracker.fieldStartIdx = fieldIdx; + if (tracker.fieldStartSetIsDefault) { + isDefault = true; + } + return tracker.fieldStartResult; + } + + void fieldEnd() { tracker.fieldEndCount++; } + + bool annotationStart(const Variant &className, const Variant::mapType &args) + { + tracker.annotationStartCount++; + tracker.annotationStartClassName = className; + tracker.annotationStartArgs = args; + return tracker.annotationStartResult; + } + + bool annotationEnd(const Variant &className, const Variant &elementName) + { + tracker.annotationEndCount++; + tracker.annotationEndClassName = className; + tracker.annotationEndElementName = elementName; + return tracker.annotationEndResult; + } + + bool data(const Variant &data) + { + tracker.dataCount++; + tracker.dataData = data; + return tracker.dataResult; + } + + static Handler *create(const HandlerData &handlerData) + { + return new TestHandler(handlerData); + } +}; +} + +namespace States { +static const State Document = + StateBuilder().parent(&None).elementHandler(TestHandler::create); +static const State Body = + StateBuilder().parent(&Document).elementHandler(TestHandler::create); +static const State Empty = + StateBuilder().parent(&Document).elementHandler(TestHandler::create); +static const State Special = + StateBuilder().parent(&All).elementHandler(TestHandler::create); 
+static const State Arguments = + StateBuilder().parent(&None).elementHandler(TestHandler::create).arguments( + {Argument::Int("a"), Argument::String("b")}); +static const State BodyChildren = + StateBuilder().parent(&Body).elementHandler(TestHandler::create); +static const State Any = + StateBuilder().parents({&None, &Any}).elementHandler(TestHandler::create); + +static const std::multimap TestHandlers{ + {"document", &Document}, + {"body", &Body}, + {"empty", &Empty}, + {"special", &Special}, + {"arguments", &Arguments}, + {"*", &BodyChildren}}; + +static const std::multimap AnyHandlers{{"*", &Any}}; +} + +TEST(Stack, basicTest) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::TestHandlers}; + + EXPECT_EQ("", s.currentCommandName()); + EXPECT_EQ(&States::None, &s.currentState()); + + s.command("document", {}); + s.fieldStart(true); + s.data("test1"); + + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + + s.command("body", {}); + s.fieldStart(true); + s.data("test2"); + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + tracker.expect(2, 0, 2, 0, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.command("inner", {}); + s.fieldStart(true); + EXPECT_EQ("inner", s.currentCommandName()); + EXPECT_EQ(&States::BodyChildren, &s.currentState()); + + s.fieldEnd(); + tracker.expect(3, 1, 3, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + tracker.expect(3, 2, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.command("body", {}); + s.fieldStart(true); + s.data("test3"); + EXPECT_EQ("body", s.currentCommandName()); + EXPECT_EQ(&States::Body, &s.currentState()); + s.fieldEnd(); + tracker.expect(4, 3, 4, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + + 
EXPECT_EQ("document", s.currentCommandName()); + EXPECT_EQ(&States::Document, &s.currentState()); + + s.fieldEnd(); + tracker.expect(4, 4, 4, 4, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + + EXPECT_EQ("", s.currentCommandName()); + EXPECT_EQ(&States::None, &s.currentState()); + } + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, errorInvalidCommands) +{ + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("document", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("document", {}), LoggableException); + s.command("empty", {}); + s.fieldStart(true); + EXPECT_THROW(s.command("body", {}), LoggableException); + s.command("special", {}); + s.fieldStart(true); + s.fieldEnd(); + s.fieldEnd(); + s.fieldEnd(); + EXPECT_EQ(&States::None, &s.currentState()); + ASSERT_THROW(s.fieldEnd(), LoggableException); + ASSERT_THROW(s.data("test"), LoggableException); +} + +TEST(Stack, validation) +{ + Stack s{env.context, States::TestHandlers}; + tracker.reset(); + logger.reset(); + + s.command("arguments", {}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}}); + EXPECT_TRUE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); + + logger.reset(); + s.command("arguments", {{"a", 5}, {"b", "test"}}); + EXPECT_FALSE(logger.hasError()); + s.fieldStart(true); + s.fieldEnd(); +} + +TEST(Stack, invalidCommandName) +{ + Stack s{env.context, States::AnyHandlers}; + tracker.reset(); + logger.reset(); + + s.command("a", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("a_:b", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + 
ASSERT_THROW(s.command("_a", {}), LoggableException); + ASSERT_THROW(s.command("a:", {}), LoggableException); + ASSERT_THROW(s.command("a:_b", {}), LoggableException); + tracker.expect(3, 3, 3, 3, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, multipleFields) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {{"a", false}}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("a", s.currentCommandName()); + EXPECT_EQ(Variant::mapType({{"a", false}}), tracker.startArgs); + + s.fieldStart(false); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(0U, tracker.fieldStartIdx); + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + tracker.expect(1, 0, 2, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_FALSE(tracker.fieldStartIsDefault); + EXPECT_EQ(1U, tracker.fieldStartIdx); + + s.data("test2"); + tracker.expect(1, 0, 2, 1, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test2", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 0, 2, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(true); + tracker.expect(1, 0, 3, 2, 0, 0, 2); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_TRUE(tracker.fieldStartIsDefault); + EXPECT_EQ(2U, tracker.fieldStartIdx); + + s.data("test3"); + tracker.expect(1, 0, 3, 2, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + EXPECT_EQ("test3", tracker.dataData); + + s.fieldEnd(); + tracker.expect(1, 1, 3, 3, 0, 0, 3); // sc, ec, fsc, fse, asc, aec, dc + } + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnNewCommand) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", 
{}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnNewCommandWithExplicitDefaultField) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(2, 1, 2, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + } + tracker.expect(2, 2, 2, 2, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, noImplicitDefaultFieldOnIncompatibleCommand) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.fieldStartResult = false; + s.command("b", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, noImplicitDefaultFieldIfDefaultFieldGiven) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + s.fieldStart(true); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); 
+ s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("", s.currentCommandName()); + + s.command("b", {}); + tracker.expect(2, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(2, 2, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, implicitDefaultFieldOnData) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.data("test"); + tracker.expect(1, 0, 1, 0, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 1); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, autoFieldEnd) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, autoImplicitFieldEnd) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + s.command("b", {}); + s.command("c", {}); + s.command("d", {}); + s.command("e", {}); + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(5, 1, 5, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(5, 5, 5, 5, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, invalidDefaultField) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + 
ASSERT_FALSE(logger.hasError()); +} + +TEST(Stack, errorInvalidDefaultFieldData) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + s.fieldStart(true); + ASSERT_FALSE(logger.hasError()); + s.data("test"); + ASSERT_TRUE(logger.hasError()); + s.fieldEnd(); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorInvalidFieldData) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.fieldStartResult = false; + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + s.data("test"); + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorFieldStartNoCommand) +{ + tracker.reset(); + logger.reset(); + + Stack s{env.context, States::AnyHandlers}; + ASSERT_THROW(s.fieldStart(false), LoggableException); + ASSERT_THROW(s.fieldStart(true), LoggableException); + tracker.expect(0, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorMutlipleFieldStarts) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + s.fieldStart(false); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldEnd(); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorMutlipleFieldEnds) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, 
States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + s.fieldEnd(); + ASSERT_FALSE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + s.fieldEnd(); + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 0, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + } + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} + +TEST(Stack, errorOpenField) +{ + tracker.reset(); + logger.reset(); + + { + Stack s{env.context, States::AnyHandlers}; + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + + s.fieldStart(false); + ASSERT_FALSE(logger.hasError()); + } + ASSERT_TRUE(logger.hasError()); + tracker.expect(1, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc +} +} +} + -- cgit v1.2.3 From 36b712c9f9af5c008fbd193392546fd472a35189 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:15:08 +0100 Subject: Added lonely comment to StandaloneEnvironment --- test/core/StandaloneEnvironment.hpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'test/core') diff --git a/test/core/StandaloneEnvironment.hpp b/test/core/StandaloneEnvironment.hpp index a9dcdce..790bad4 100644 --- a/test/core/StandaloneEnvironment.hpp +++ b/test/core/StandaloneEnvironment.hpp @@ -31,6 +31,10 @@ namespace ousia { +/** + * StandaloneEnvironment is a class used for quickly setting up an entire + * environment needed for running an Ousia instance. 
+ */ struct StandaloneEnvironment { ConcreteLogger &logger; Manager manager; -- cgit v1.2.3 From b1aade072781b0eca9b4c2fd15c360ec7d3ed25f Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:53:34 +0100 Subject: Removed legacy test file --- test/core/parser/ParserStackTest.cpp | 177 ----------------------------------- 1 file changed, 177 deletions(-) delete mode 100644 test/core/parser/ParserStackTest.cpp (limited to 'test/core') diff --git a/test/core/parser/ParserStackTest.cpp b/test/core/parser/ParserStackTest.cpp deleted file mode 100644 index 3a0decb..0000000 --- a/test/core/parser/ParserStackTest.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include -#include - -namespace ousia { - -ConcreteLogger logger; - -static int startCount = 0; -static int endCount = 0; -static int dataCount = 0; - -class TestHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override { startCount++; } - - void end() override { endCount++; } - - void data(const std::string &data, int field) override { dataCount++; } - - static Handler *create(const HandlerData &data) - { - return new TestHandler(data); - } -}; - -namespace ParserStates { -static const ParserState Document = - ParserStateBuilder().parent(&None).elementHandler(TestHandler::create); -static const ParserState Body = ParserStateBuilder() - .parent(&Document) - .elementHandler(TestHandler::create); -static const ParserState Empty = - ParserStateBuilder().parent(&Document).elementHandler(TestHandler::create); -static const ParserState Special = - ParserStateBuilder().parent(&All).elementHandler(TestHandler::create); -static const ParserState Arguments = - ParserStateBuilder() - .parent(&None) - .elementHandler(TestHandler::create) - .arguments({Argument::Int("a"), Argument::String("b")}); -static const ParserState BodyChildren = - ParserStateBuilder() - .parent(&Body) - .elementHandler(TestHandler::create); - -static const std::multimap TestHandlers{ - {"document", &Document}, - {"body", &Body}, - {"empty", &Empty}, - {"special", &Special}, - {"arguments", &Arguments}, - {"*", &BodyChildren}}; -} - -TEST(ParserStack, simpleTest) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - startCount = 0; - endCount = 0; - dataCount = 0; - - EXPECT_EQ("", s.currentCommandName()); - EXPECT_EQ(&ParserStates::None, &s.currentState()); - - s.start("document", {}); - s.data("test1"); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - EXPECT_EQ(1, startCount); - EXPECT_EQ(1, dataCount); - - 
s.start("body", {}); - s.data("test2"); - EXPECT_EQ("body", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Body, &s.currentState()); - EXPECT_EQ(2, startCount); - EXPECT_EQ(2, dataCount); - - s.start("inner", {}); - EXPECT_EQ("inner", s.currentCommandName()); - EXPECT_EQ(&ParserStates::BodyChildren, &s.currentState()); - s.end(); - EXPECT_EQ(3, startCount); - EXPECT_EQ(1, endCount); - - s.end(); - EXPECT_EQ(2, endCount); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - - s.start("body", {}); - s.data("test3"); - EXPECT_EQ("body", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Body, &s.currentState()); - s.end(); - EXPECT_EQ(4, startCount); - EXPECT_EQ(3, dataCount); - EXPECT_EQ(3, endCount); - - EXPECT_EQ("document", s.currentCommandName()); - EXPECT_EQ(&ParserStates::Document, &s.currentState()); - - s.end(); - EXPECT_EQ(4, endCount); - - EXPECT_EQ("", s.currentCommandName()); - EXPECT_EQ(&ParserStates::None, &s.currentState()); -} - -TEST(ParserStack, errorHandling) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - EXPECT_THROW(s.start("body", {}), OusiaException); - s.start("document", {}); - EXPECT_THROW(s.start("document", {}), OusiaException); - s.start("empty", {}); - EXPECT_THROW(s.start("body", {}), OusiaException); - s.start("special", {}); - s.end(); - s.end(); - s.end(); - EXPECT_EQ(&ParserStates::None, &s.currentState()); - ASSERT_THROW(s.end(), OusiaException); - ASSERT_THROW(s.data("test", 1), OusiaException); -} - -TEST(ParserStack, validation) -{ - StandaloneEnvironment env(logger); - ParserStack s{env.context, ParserStates::TestHandlers}; - - logger.reset(); - s.start("arguments", {}); - EXPECT_TRUE(logger.hasError()); - s.end(); - - s.start("arguments", {{"a", 5}}); - EXPECT_TRUE(logger.hasError()); - s.end(); - - logger.reset(); - s.start("arguments", {{"a", 5}, {"b", "test"}}); - EXPECT_FALSE(logger.hasError()); - s.end(); 
-} -} - -- cgit v1.2.3 From 53c92aea125a439858d03245a914e20f55e5bcba Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:53:50 +0100 Subject: Fixed GCC 4.9 warnings --- test/core/model/DomainTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'test/core') diff --git a/test/core/model/DomainTest.cpp b/test/core/model/DomainTest.cpp index 8fcbdf2..4cb4331 100644 --- a/test/core/model/DomainTest.cpp +++ b/test/core/model/DomainTest.cpp @@ -242,7 +242,7 @@ TEST(Descriptor, getDefaultFields) A->createPrimitiveFieldDescriptor(sys->getStringType(), logger); // now we should find that. auto fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(A_prim_field, fields[0]); // remove that field from A and add it to another class. @@ -258,7 +258,7 @@ TEST(Descriptor, getDefaultFields) // but we should find it again if we set B as superclass of A. A->setSuperclass(B, logger); fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(A_prim_field, fields[0]); // and we should not be able to find it if we override the field. @@ -277,7 +277,7 @@ TEST(Descriptor, getDefaultFields) // now we should find that. fields = A->getDefaultFields(); - ASSERT_EQ(1, fields.size()); + ASSERT_EQ(1U, fields.size()); ASSERT_EQ(C_field, fields[0]); // add another transparent child class to A with a daughter class that has @@ -296,7 +296,7 @@ TEST(Descriptor, getDefaultFields) // now we should find both primitive fields, but the C field first. fields = A->getDefaultFields(); - ASSERT_EQ(2, fields.size()); + ASSERT_EQ(2U, fields.size()); ASSERT_EQ(C_field, fields[0]); ASSERT_EQ(F_field, fields[1]); } @@ -321,7 +321,7 @@ TEST(Descriptor, getPermittedChildren) * in between. 
*/ NodeVector children = book->getPermittedChildren(); - ASSERT_EQ(3, children.size()); + ASSERT_EQ(3U, children.size()); ASSERT_EQ(section, children[0]); ASSERT_EQ(paragraph, children[1]); ASSERT_EQ(text, children[2]); @@ -331,7 +331,7 @@ TEST(Descriptor, getPermittedChildren) mgr, "Subclass", domain, Cardinality::any(), text, true, false)}; // And that should be in the result list as well now. children = book->getPermittedChildren(); - ASSERT_EQ(4, children.size()); + ASSERT_EQ(4U, children.size()); ASSERT_EQ(section, children[0]); ASSERT_EQ(paragraph, children[1]); ASSERT_EQ(text, children[2]); -- cgit v1.2.3 From 69ebaddbeaea1aa651a0f0babbf9283240d9c07b Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 14:58:46 +0100 Subject: Slightly adapted Handler instances to new Handler, once again passing non-const references to data and start, using "parseGenericString" in DocumentHandler for resolving non-string values, added unit test for testing whether "end()" is not called if "start()" fails. 
--- src/core/parser/stack/DocumentHandler.cpp | 141 +++++++++++++++---------- src/core/parser/stack/DocumentHandler.hpp | 96 ++++++++++++++--- src/core/parser/stack/DomainHandler.cpp | 51 +++++---- src/core/parser/stack/DomainHandler.hpp | 28 +++-- src/core/parser/stack/Handler.cpp | 20 ++-- src/core/parser/stack/Handler.hpp | 31 +++--- src/core/parser/stack/ImportIncludeHandler.cpp | 54 ++-------- src/core/parser/stack/ImportIncludeHandler.hpp | 13 ++- src/core/parser/stack/Stack.cpp | 18 ++-- src/core/parser/stack/TypesystemHandler.cpp | 48 ++++----- src/core/parser/stack/TypesystemHandler.hpp | 131 +++++++++++++++++------ test/core/parser/stack/StackTest.cpp | 41 +++++-- 12 files changed, 422 insertions(+), 250 deletions(-) (limited to 'test/core') diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index ba7430d..b28f0fb 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -22,22 +22,28 @@ #include #include +#include #include #include +#include #include #include +#include namespace ousia { +namespace parser_stack { /* DocumentHandler */ -void DocumentHandler::start(Variant::mapType &args) +bool DocumentHandler::start(Variant::mapType &args) { Rooted document = - project()->createDocument(args["name"].asString()); + context().getProject()->createDocument(args["name"].asString()); document->setLocation(location()); scope().push(document); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void DocumentHandler::end() { scope().pop(); } @@ -48,7 +54,7 @@ void DocumentChildHandler::preamble(Handle parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField) { - // check if the parent in the structure tree was an explicit field + // Check if the parent in the structure tree was an explicit field // reference. 
inField = parentNode->isa(&RttiTypes::DocumentField); if (inField) { @@ -56,10 +62,11 @@ void DocumentChildHandler::preamble(Handle parentNode, parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); } else { - // if it wasn't an explicit reference, we use the default field. + // If it wasn't an explicit reference, we use the default field. fieldName = DEFAULT_FIELD_NAME; } - // reference the parent entity explicitly. + + // Reference the parent entity explicitly. parent = nullptr; if (parentNode->isa(&RttiTypes::StructuredEntity)) { parent = static_cast( @@ -73,6 +80,8 @@ void DocumentChildHandler::preamble(Handle parentNode, void DocumentChildHandler::createPath(const NodeVector &path, DocumentEntity *&parent) { + // TODO (@benjamin): These should be pushed onto the scope and popped once + // the scope is left. Otherwise stuff may not be correctly resolved. size_t S = path.size(); for (size_t p = 1; p < S; p = p + 2) { parent = static_cast( @@ -82,7 +91,7 @@ void DocumentChildHandler::createPath(const NodeVector &path, } } -void DocumentChildHandler::start(Variant::mapType &args) +bool DocumentChildHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); Rooted parentNode = scope().selectOrThrow( @@ -95,7 +104,7 @@ void DocumentChildHandler::start(Variant::mapType &args) preamble(parentNode, fieldName, parent, inField); - // try to find a FieldDescriptor for the given tag if we are not in a + // Try to find a FieldDescriptor for the given tag if we are not in a // field already. This does _not_ try to construct transparent paths // in between. 
if (!inField && parent != nullptr && @@ -104,7 +113,7 @@ void DocumentChildHandler::start(Variant::mapType &args) new DocumentField(parentNode->getManager(), fieldName, parentNode)}; field->setLocation(location()); scope().push(field); - return; + return true; } // Otherwise create a new StructuredEntity @@ -147,27 +156,39 @@ void DocumentChildHandler::start(Variant::mapType &args) } entity->setLocation(location()); scope().push(entity); + return true; } void DocumentChildHandler::end() { scope().pop(); } -std::pair DocumentChildHandler::convertData( - Handle field, Logger &logger, const std::string &data) +bool DocumentChildHandler::convertData(Handle field, + Variant &data, Logger &logger) { - // if the content is supposed to be of type string, we can finish - // directly. - auto vts = field->getPrimitiveType()->getVariantTypes(); - if (std::find(vts.begin(), vts.end(), VariantType::STRING) != vts.end()) { - return std::make_pair(true, Variant::fromString(data)); + bool valid = true; + Rooted type = field->getPrimitiveType(); + + // If the content is supposed to be of type string, we only need to check + // for "magic" values -- otherwise just call the "parseGenericString" + // function on the string data + if (type->isa(&RttiTypes::StringType)) { + const std::string &str = data.asString(); + // TODO: Referencing constants with "." separator should also work + if (Utils::isIdentifier(str)) { + data.markAsMagic(); + } + } else { + // Parse the string as generic string, assign the result + auto res = VariantReader::parseGenericString( + data.asString(), logger, data.getLocation().getSourceId(), + data.getLocation().getStart()); + data = res.second; } - // then try to parse the content using the type specification. 
- auto res = field->getPrimitiveType()->read( - data, logger, location().getSourceId(), location().getStart()); - return res; + // Now try to resolve the value for the primitive type + return valid && scope().resolveValue(data, type, logger); } -void DocumentChildHandler::data(const std::string &data, int fieldIdx) +bool DocumentChildHandler::data(Variant &data) { Rooted parentNode = scope().selectOrThrow( {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, @@ -180,11 +201,10 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) preamble(parentNode, fieldName, parent, inField); Rooted desc = parent->getDescriptor(); - /* - * We distinguish two cases here: One for fields that are given. - */ + + // We distinguish two cases here: One for fields that are given. if (fieldName != DEFAULT_FIELD_NAME) { - // retrieve the actual FieldDescriptor + // Retrieve the actual FieldDescriptor Rooted field = desc->getFieldDescriptor(fieldName); if (field == nullptr) { logger().error( @@ -192,49 +212,57 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) fieldName + "\" exists in descriptor\"" + desc->getName() + "\".", location()); - return; + return false; } - // if it is not primitive at all, we can't parse the content. + // If it is not primitive at all, we can't parse the content. if (!field->isPrimitive()) { logger().error(std::string("Can't handle data because field \"") + fieldName + "\" of descriptor \"" + desc->getName() + "\" is not primitive!", location()); - return; + return false; } - // then try to parse the content using the type specification. - auto res = convertData(field, logger(), data); - // add it as primitive content. 
- if (res.first) { - parent->createChildDocumentPrimitive(res.second, fieldName); + + // Try to convert the data variable to the correct format, abort if this + // does not work + if (!convertData(field, data, logger())) { + return false; } + + // Add it as primitive content + parent->createChildDocumentPrimitive(data, fieldName); + return true; } else { - /* - * The second case is for primitive fields. Here we search through - * all FieldDescriptors that allow primitive content at this point - * and could be constructed via transparent intermediate entities. - * We then try to parse the data using the type specified by the - * respective field. If that does not work we proceed to the next - * possible field. - */ - // retrieve all fields. + // The second case is for primitive fields. Here we search through + // all FieldDescriptors that allow primitive content at this point + // and could be constructed via transparent intermediate entities. + // We then try to parse the data using the type specified by the + // respective field. If that does not work we proceed to the next + // possible field. NodeVector fields = desc->getDefaultFields(); std::vector forks; for (auto field : fields) { - // then try to parse the content using the type specification. + // Then try to parse the content using the type specification forks.emplace_back(logger().fork()); - auto res = convertData(field, forks.back(), data); - if (res.first) { - forks.back().commit(); - // if that worked, construct the necessary path. - auto pathRes = desc->pathTo(field, logger()); - assert(pathRes.second); - NodeVector path = pathRes.first; - createPath(path, parent); - // then create the primitive element. 
- parent->createChildDocumentPrimitive(res.second, fieldName); - return; + + // Try to convert the data variable to the correct format, abort if + // this does not work + if (!convertData(field, data, forks.back())) { + return false; } + + // Show possible warnings that were emitted by this type conversion + forks.back().commit(); + + // If that worked, construct the necessary path + auto pathRes = desc->pathTo(field, logger()); + assert(pathRes.second); + NodeVector path = pathRes.first; + createPath(path, parent); + + // Then create the primitive element + parent->createChildDocumentPrimitive(data, fieldName); + return true; } logger().error("Could not read data with any of the possible fields:"); for (size_t f = 0; f < fields.size(); f++) { @@ -242,11 +270,14 @@ void DocumentChildHandler::data(const std::string &data, int fieldIdx) SourceLocation{}, MessageMode::NO_CONTEXT); forks[f].commit(); } + return false; } + return true; +} } namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder("DocumentField").parent(&Node); +const Rtti DocumentField = RttiBuilder( + "DocumentField").parent(&Node); } } diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 475fe69..7dc4c86 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -19,13 +19,19 @@ /** * @file DocumentHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler instances used for parsing actual documents. This file + * declares two classes: The Document handler which parses the "document" command + * that introduces a new document and the "DocumentChildHandler" which parses + * the actual user defined tags. 
+ * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_DOCUMENT_HANDLER_HPP_ -#define _OUSIA_DOCUMENT_HANDLER_HPP_ +#ifndef _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ +#define _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ #include +#include #include "Handler.hpp" @@ -36,53 +42,117 @@ class Rtti; class DocumentEntity; class FieldDescriptor; +namespace parser_stack { +/** + * The DocumentHandler class parses the "document" tag that is used to introduce + * a new document. Note that this tag is not mandatory in osml files -- if the + * first command is not a typesystem, domain or any other declarative command, + * the DocumentHandler will be implicitly called. + */ class DocumentHandler : public StaticHandler { public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; + /** + * Creates a new instance of the DocumentHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new DocumentHandler{handlerData}; } }; +/** + * Temporary Node that is being pushed onto the ParserScope in order to indicate + * the field the parser is currently in. The name of the Node is stored in the + * "name" field of the parent Node class. + */ class DocumentField : public Node { public: using Node::Node; }; +/** + * The DocumentChildHandler class performs the actual parsing of the user + * defined elements in an Ousía document. + */ class DocumentChildHandler : public StaticHandler { private: + /** + * Code shared by both the start() and the data() method. Checks whether the + * parser currently is in a field and returns the name of this field. + * + * @param parentNode is the next possible parent node (a document, + * a structured entity, an annotation entity or a field). 
+ * @param fieldName is an output parameter to which the name of the current + * field is written (or unchanged if we're not in a field). + * @param parent is an output parameter to which the parent document entity + * will be written. + * @param inField is set to true if we actually are in a field. + */ void preamble(Handle parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField); + /** + * Constructs all structured entities along the given path and inserts them + * into the document graph. + * + * @param path is a path containing an alternating series of structured + * classes and fields. + * @param parent is the root entity from which the process should be started. + */ void createPath(const NodeVector &path, DocumentEntity *&parent); - std::pair convertData(Handle field, - Logger &logger, - const std::string &data); + /** + * Tries to convert the given data to the type that is specified in the + * given primitive field. + * + * @param field is the primitive field for which the data is intended. + * @param data is the data that should be converted, the result is + * written into this argument as output variable. + * @param logger is the Logger instance to which error messages should be + * written. Needed to allow the convertData function to write to a forked + * Logger instance. + * @return true if the operation was successful, false otherwise. + */ + bool convertData(Handle field, Variant &data, + Logger &logger); public: - using Handler::Handler; + using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; - - bool data(const Variant &data) override; - + bool data(Variant &data) override; + + /** + * Creates a new instance of the DocumentChildHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. 
+ */ static Handler *create(const HandlerData &handlerData) { return new DocumentChildHandler{handlerData}; } }; +} namespace RttiTypes { +/** + * RttiType for the internally used DocumentField class. + */ extern const Rtti DocumentField; } } -#endif + +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/DomainHandler.cpp b/src/core/parser/stack/DomainHandler.cpp index 6571717..cb12543 100644 --- a/src/core/parser/stack/DomainHandler.cpp +++ b/src/core/parser/stack/DomainHandler.cpp @@ -20,25 +20,30 @@ #include #include +#include #include +#include namespace ousia { +namespace parser_stack { /* DomainHandler */ -void DomainHandler::start(Variant::mapType &args) +bool DomainHandler::start(Variant::mapType &args) { - Rooted domain = project()->createDomain(args["name"].asString()); + Rooted domain = + context().getProject()->createDomain(args["name"].asString()); domain->setLocation(location()); scope().push(domain); + return true; } void DomainHandler::end() { scope().pop(); } /* DomainStructHandler */ -void DomainStructHandler::start(Variant::mapType &args) +bool DomainStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -63,12 +68,13 @@ void DomainStructHandler::start(Variant::mapType &args) } scope().push(structuredClass); + return true; } void DomainStructHandler::end() { scope().pop(); } /* DomainAnnotationHandler */ -void DomainAnnotationHandler::start(Variant::mapType &args) +bool DomainAnnotationHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -79,13 +85,14 @@ void DomainAnnotationHandler::start(Variant::mapType &args) annotationClass->setLocation(location()); scope().push(annotationClass); + return true; } void DomainAnnotationHandler::end() { scope().pop(); } /* DomainAttributesHandler */ -void DomainAttributesHandler::start(Variant::mapType &args) +bool DomainAttributesHandler::start(Variant::mapType &args) { // Fetch the current 
typesystem and create the struct node Rooted parent = scope().selectOrThrow(); @@ -94,13 +101,14 @@ void DomainAttributesHandler::start(Variant::mapType &args) attrDesc->setLocation(location()); scope().push(attrDesc); + return true; } void DomainAttributesHandler::end() { scope().pop(); } /* DomainFieldHandler */ -void DomainFieldHandler::start(Variant::mapType &args) +bool DomainFieldHandler::start(Variant::mapType &args) { FieldDescriptor::FieldType type; if (args["isSubtree"].asBool()) { @@ -116,13 +124,14 @@ void DomainFieldHandler::start(Variant::mapType &args) field->setLocation(location()); scope().push(field); + return true; } void DomainFieldHandler::end() { scope().pop(); } /* DomainFieldRefHandler */ -void DomainFieldRefHandler::start(Variant::mapType &args) +bool DomainFieldRefHandler::start(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -135,13 +144,14 @@ void DomainFieldRefHandler::start(Variant::mapType &args) field.cast(), logger); } }); + return true; } void DomainFieldRefHandler::end() {} /* DomainPrimitiveHandler */ -void DomainPrimitiveHandler::start(Variant::mapType &args) +bool DomainPrimitiveHandler::start(Variant::mapType &args) { Rooted parent = scope().selectOrThrow(); @@ -167,13 +177,14 @@ void DomainPrimitiveHandler::start(Variant::mapType &args) }); scope().push(field); + return true; } void DomainPrimitiveHandler::end() { scope().pop(); } /* DomainChildHandler */ -void DomainChildHandler::start(Variant::mapType &args) +bool DomainChildHandler::start(Variant::mapType &args) { Rooted field = scope().selectOrThrow(); @@ -186,13 +197,12 @@ void DomainChildHandler::start(Variant::mapType &args) child.cast()); } }); + return true; } -void DomainChildHandler::end() {} - /* DomainParentHandler */ -void DomainParentHandler::start(Variant::mapType &args) +bool DomainParentHandler::start(Variant::mapType &args) { Rooted strct = scope().selectOrThrow(); @@ -200,12 +210,14 @@ void DomainParentHandler::start(Variant::mapType 
&args) new DomainParent(strct->getManager(), args["ref"].asString(), strct)}; parent->setLocation(location()); scope().push(parent); + return true; } void DomainParentHandler::end() { scope().pop(); } /* DomainParentFieldHandler */ -void DomainParentFieldHandler::start(Variant::mapType &args) + +bool DomainParentFieldHandler::start(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); FieldDescriptor::FieldType type; @@ -233,13 +245,12 @@ void DomainParentFieldHandler::start(Variant::mapType &args) field->addChild(strct.cast()); } }); + return true; } -void DomainParentFieldHandler::end() {} - /* DomainParentFieldRefHandler */ -void DomainParentFieldRefHandler::start(Variant::mapType &args) +bool DomainParentFieldRefHandler::start(Variant::mapType &args) { Rooted parentNameNode = scope().selectOrThrow(); @@ -265,12 +276,12 @@ void DomainParentFieldRefHandler::start(Variant::mapType &args) field->addChild(strct.cast()); } }); + return true; +} } - -void DomainParentFieldRefHandler::end() {} namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder("DomainParent").parent(&Node); +const Rtti DomainParent = RttiBuilder( + "DomainParent").parent(&Node); } } diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 5e8ea60..917d65d 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -19,17 +19,24 @@ /** * @file DomainHandler.hpp * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + * Contains the Handler classes used for parsing Domain descriptors. This + * includes the "domain" tag and all describing tags below the "domain" tag. 
+ * + * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_DOMAIN_HANDLER_HPP_ #define _OUSIA_DOMAIN_HANDLER_HPP_ #include +#include #include "Handler.hpp" namespace ousia { +namespace parser_stack { + +// TODO: Documentation // Forward declarations class Rtti; @@ -39,7 +46,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -53,7 +59,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -67,7 +72,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -81,7 +85,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -95,7 +98,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -109,7 +111,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -123,7 +124,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -138,8 +138,6 @@ public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainChildHandler{handlerData}; @@ -160,7 +158,6 @@ public: using StaticHandler::StaticHandler; bool start(Variant::mapType &args) override; - void end() override; static Handler *create(const HandlerData &handlerData) @@ -175,8 +172,6 @@ 
public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainParentFieldHandler{handlerData}; @@ -189,12 +184,15 @@ public: bool start(Variant::mapType &args) override; - void end() override; - static Handler *create(const HandlerData &handlerData) { return new DomainParentFieldRefHandler{handlerData}; } }; } + +namespace RttiTypes { +extern const Rtti DomainParent; +} +} #endif diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index a608f7f..86000c4 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -65,6 +65,8 @@ Logger &Handler::logger() const SourceLocation &Handler::location() const { return handlerData.location; } +const std::string &Handler::name() const { return handlerData.name; } + void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) { /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ @@ -80,7 +82,7 @@ void Handler::unregisterToken(const std::string &token) /*handlerData.callbacks.unregisterToken(token);*/ } -const std::string &Handler::getName() const { return handlerData.name; } +const std::string &Handler::getName() const { return name(); } const State &Handler::getState() const { return handlerData.state; } @@ -92,7 +94,7 @@ const SourceLocation &Handler::getLocation() const { return location(); } /* Class EmptyHandler */ -bool EmptyHandler::start(const Variant::mapType &args) +bool EmptyHandler::start(Variant::mapType &args) { // Just accept anything return true; @@ -115,7 +117,7 @@ void EmptyHandler::fieldEnd() } bool EmptyHandler::annotationStart(const Variant &className, - const Variant::mapType &args) + Variant::mapType &args) { // Accept any data return true; @@ -128,7 +130,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool EmptyHandler::data(const Variant &data) +bool EmptyHandler::data(Variant &data) { // Support any data return 
true; @@ -141,7 +143,7 @@ Handler *EmptyHandler::create(const HandlerData &handlerData) /* Class StaticHandler */ -bool StaticHandler::start(const Variant::mapType &args) +bool StaticHandler::start(Variant::mapType &args) { // Do nothing in the default implementation, accept anything return true; @@ -169,7 +171,7 @@ void StaticHandler::fieldEnd() } bool StaticHandler::annotationStart(const Variant &className, - const Variant::mapType &args) + Variant::mapType &args) { // No annotations supported return false; @@ -182,7 +184,7 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(const Variant &data) +bool StaticHandler::data(Variant &data) { logger().error("Did not expect any data here", data); return false; @@ -196,7 +198,7 @@ StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, { } -bool StaticFieldHandler::start(const Variant::mapType &args) +bool StaticFieldHandler::start(Variant::mapType &args) { if (!argName.empty()) { auto it = args.find(argName); @@ -225,7 +227,7 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(const Variant &data) +bool StaticFieldHandler::data(Variant &data) { // Call the doHandle function if this has not been done before if (!handled) { diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index eeaf555..7cda7a4 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -151,6 +151,13 @@ protected: */ const SourceLocation &location() const; + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &name() const; + public: /** * Virtual destructor. @@ -229,7 +236,7 @@ public: * @return true if the handler was successful in starting the element it * represents, false otherwise. 
*/ - virtual bool start(const Variant::mapType &args) = 0; + virtual bool start(Variant::mapType &args) = 0; /** * Called before the command for which this handler is defined ends (is @@ -270,7 +277,7 @@ public: * if an error occurred. */ virtual bool annotationStart(const Variant &className, - const Variant::mapType &args) = 0; + Variant::mapType &args) = 0; /** * Called whenever an annotation ends while this handler is active. The @@ -296,7 +303,7 @@ public: * location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(const Variant &data) = 0; + virtual bool data(Variant &data) = 0; }; /** @@ -318,15 +325,15 @@ protected: using Handler::Handler; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; bool annotationStart(const Variant &className, - const Variant::mapType &args) override; + Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(const Variant &data) override; + bool data(Variant &data) override; /** * Creates an instance of the EmptyHandler class. @@ -344,15 +351,15 @@ protected: using Handler::Handler; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; void fieldEnd() override; bool annotationStart(const Variant &className, - const Variant::mapType &args) override; + Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(const Variant &data) override; + bool data(Variant &data) override; }; /** @@ -400,12 +407,12 @@ protected: * @param args are the arguments that were given in the "start" function. 
*/ virtual void doHandle(const Variant &fieldData, - const Variant::mapType &args) = 0; + Variant::mapType &args) = 0; public: - bool start(const Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; - bool data(const Variant &data) override; + bool data(Variant &data) override; }; } } diff --git a/src/core/parser/stack/ImportIncludeHandler.cpp b/src/core/parser/stack/ImportIncludeHandler.cpp index 94ee82d..797dd8d 100644 --- a/src/core/parser/stack/ImportIncludeHandler.cpp +++ b/src/core/parser/stack/ImportIncludeHandler.cpp @@ -18,48 +18,16 @@ #include "ImportIncludeHandler.hpp" +#include #include +#include namespace ousia { - -/* ImportIncludeHandler */ - -void ImportIncludeHandler::start(Variant::mapType &args) -{ - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); -} - -void ImportIncludeHandler::data(const std::string &data, int field) -{ - if (srcInArgs) { - logger().error("\"src\" attribute has already been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); -} +namespace parser_stack { /* ImportHandler */ -void ImportHandler::start(Variant::mapType &args) -{ - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } -} - -void ImportHandler::end() +void ImportHandler::doHandle(const Variant &fieldData, Variant::mapType &args) { // Fetch the last node and check whether an import is valid at this // position @@ -75,8 +43,9 @@ void ImportHandler::end() // Perform the actual import, register the imported node within the leaf // node - Rooted imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); + Rooted imported = context().import( + fieldData.asString(), args["type"].asString(), 
args["rel"].asString(), + leafRootNode->getReferenceTypes()); if (imported != nullptr) { leafRootNode->reference(imported); } @@ -84,13 +53,10 @@ void ImportHandler::end() /* IncludeHandler */ -void IncludeHandler::start(Variant::mapType &args) +void IncludeHandler::doHandle(const Variant &fieldData, Variant::mapType &args) { - ImportIncludeHandler::start(args); + context().include(fieldData.asString(), args["type"].asString(), + args["rel"].asString(), {&RttiTypes::Node}); } - -void IncludeHandler::end() -{ - context().include(src, type, rel, {&RttiTypes::Node}); } } diff --git a/src/core/parser/stack/ImportIncludeHandler.hpp b/src/core/parser/stack/ImportIncludeHandler.hpp index f9abe55..8f3d3d0 100644 --- a/src/core/parser/stack/ImportIncludeHandler.hpp +++ b/src/core/parser/stack/ImportIncludeHandler.hpp @@ -29,9 +29,11 @@ #define _OUSIA_IMPORT_INCLUDE_HANDLER_HPP_ #include -#include + +#include "Handler.hpp" namespace ousia { +namespace parser_stack { /** * The ImportHandler is responsible for handling the "import" command. An import @@ -46,7 +48,7 @@ public: using StaticFieldHandler::StaticFieldHandler; void doHandle(const Variant &fieldData, - const Variant::mapType &args) override; + Variant::mapType &args) override; /** * Creates a new instance of the ImportHandler. @@ -57,7 +59,7 @@ public: */ static Handler *create(const HandlerData &handlerData) { - return new ImportHandler{handlerData}; + return new ImportHandler{handlerData, "src"}; } }; @@ -72,7 +74,7 @@ public: using StaticFieldHandler::StaticFieldHandler; void doHandle(const Variant &fieldData, - const Variant::mapType &args) override; + Variant::mapType &args) override; /** * Creates a new instance of the IncludeHandler. 
@@ -83,8 +85,9 @@ public: */ static Handler *create(const HandlerData &handlerData) { - return new IncludeHandler{handlerData}; + return new IncludeHandler{handlerData, "src"}; } }; } +} #endif diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index d84a19c..47f7d2c 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -316,8 +316,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args) name); } - State const *lastTargetState = nullptr; - Variant::mapType canonicalArgs; while (true) { // Try to find a target state for the given command, if none can be // found and the current command does not have an open field, then try @@ -342,14 +340,6 @@ void Stack::command(const Variant &name, const Variant::mapType &args) // Fork the logger. We do not want any validation errors to skip LoggerFork loggerFork = logger().fork(); - // Canonicalize the arguments (if this has not already been done), allow - // additional arguments - if (lastTargetState != targetState) { - canonicalArgs = args; - targetState->arguments.validateMap(canonicalArgs, loggerFork, true); - lastTargetState = targetState; - } - // Instantiate the handler and push it onto the stack HandlerConstructor ctor = targetState->elementHandler ? 
targetState->elementHandler @@ -369,6 +359,11 @@ void Stack::command(const Variant &name, const Variant::mapType &args) bool validStack = handlersValid(); info.valid = false; if (validStack) { + // Canonicalize the arguments (if this has not already been done), + // allow additional arguments + Variant::mapType canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + handler->setLogger(loggerFork); try { info.valid = handler->start(canonicalArgs); @@ -430,7 +425,8 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - valid = info.handler->data(data); + Variant dataCopy = data; + valid = info.handler->data(dataCopy); } catch (LoggableException ex) { loggerFork.log(ex); diff --git a/src/core/parser/stack/TypesystemHandler.cpp b/src/core/parser/stack/TypesystemHandler.cpp index 2cc7dfb..34f64f9 100644 --- a/src/core/parser/stack/TypesystemHandler.cpp +++ b/src/core/parser/stack/TypesystemHandler.cpp @@ -20,28 +20,33 @@ #include #include +#include + namespace ousia { +namespace parser_stack { /* TypesystemHandler */ -void TypesystemHandler::start(Variant::mapType &args) +bool TypesystemHandler::start(Variant::mapType &args) { // Create the typesystem instance Rooted typesystem = - project()->createTypesystem(args["name"].asString()); + context().getProject()->createTypesystem(args["name"].asString()); typesystem->setLocation(location()); // Push the typesystem onto the scope, set the POST_HEAD flag to true scope().push(typesystem); scope().setFlag(ParserFlag::POST_HEAD, false); + + return true; } void TypesystemHandler::end() { scope().pop(); } /* TypesystemEnumHandler */ -void TypesystemEnumHandler::start(Variant::mapType &args) +bool TypesystemEnumHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -52,33 +57,24 @@ void TypesystemEnumHandler::start(Variant::mapType &args) enumType->setLocation(location()); scope().push(enumType); 
+ + return true; } void TypesystemEnumHandler::end() { scope().pop(); } /* TypesystemEnumEntryHandler */ -void TypesystemEnumEntryHandler::start(Variant::mapType &args) {} - -void TypesystemEnumEntryHandler::end() +void TypesystemEnumEntryHandler::doHandle(const Variant &fieldData, + Variant::mapType &args) { Rooted enumType = scope().selectOrThrow(); - enumType->addEntry(entry, logger()); -} - -void TypesystemEnumEntryHandler::data(const std::string &data, int field) -{ - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - entry.append(data); + enumType->addEntry(fieldData.asString(), logger()); } /* TypesystemStructHandler */ -void TypesystemStructHandler::start(Variant::mapType &args) +bool TypesystemStructHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -103,13 +99,15 @@ void TypesystemStructHandler::start(Variant::mapType &args) }); } scope().push(structType); + + return true; } void TypesystemStructHandler::end() { scope().pop(); } /* TypesystemStructFieldHandler */ -void TypesystemStructFieldHandler::start(Variant::mapType &args) +bool TypesystemStructFieldHandler::start(Variant::mapType &args) { // Read the argument values const std::string &name = args["name"].asString(); @@ -142,13 +140,13 @@ void TypesystemStructFieldHandler::start(Variant::mapType &args) } }); } -} -void TypesystemStructFieldHandler::end() {} + return true; +} /* TypesystemConstantHandler */ -void TypesystemConstantHandler::start(Variant::mapType &args) +bool TypesystemConstantHandler::start(Variant::mapType &args) { scope().setFlag(ParserFlag::POST_HEAD, true); @@ -169,7 +167,9 @@ void TypesystemConstantHandler::start(Variant::mapType &args) constant.cast()->setType(type.cast(), logger); } }); -} -void TypesystemConstantHandler::end() {} + return true; } +} +} + diff --git a/src/core/parser/stack/TypesystemHandler.hpp 
b/src/core/parser/stack/TypesystemHandler.hpp index 76a7bc9..55277a1 100644 --- a/src/core/parser/stack/TypesystemHandler.hpp +++ b/src/core/parser/stack/TypesystemHandler.hpp @@ -19,6 +19,9 @@ /** * @file TypesystemHandler.hpp * + * Contains the Handler classes used to parse Typesystem descriptions. The + * Handlers parse all the tags found below and including the "typesystem" tag. + * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -26,96 +29,154 @@ #define _OUSIA_TYPESYSTEM_HANDLER_HPP_ #include -#include + +#include "Handler.hpp" namespace ousia { +namespace parser_stack { -class TypesystemHandler : public Handler { +/** + * Handles the occurrence of the "typesystem" tag. Creates a new Typesystem + * instance and places it on the ParserScope. + */ +class TypesystemHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemHandler{handlerData}; } }; -class TypesystemEnumHandler : public Handler { +/** + * Handles the occurrence of the "enum" tag. Creates a new EnumType instance and + * places it on the ParserScope. + */ +class TypesystemEnumHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemEnumHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. 
access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemEnumHandler{handlerData}; } }; -class TypesystemEnumEntryHandler : public Handler { +/** + * Handles the occurrence of the "entry" tag within an "enum" tag. Adds a new + * entry to the EnumType instance selected from the ParserScope. + */ +class TypesystemEnumEntryHandler : public StaticFieldHandler { public: - using Handler::Handler; - - std::string entry; - - void start(Variant::mapType &args) override; - - void end() override; - - void data(const std::string &data, int field) override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + Variant::mapType &args) override; + + /** + * Creates a new instance of the TypesystemEnumEntryHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { - return new TypesystemEnumEntryHandler{handlerData}; + return new TypesystemEnumEntryHandler{handlerData, "name"}; } }; -class TypesystemStructHandler : public Handler { +/** + * Handles the occurrence of the "struct" tag within a typesystem description. + * Creates a new StructType instance and places it on the ParserScope. + */ +class TypesystemStructHandler : public StaticHandler { public: - using Handler::Handler; - - void start(Variant::mapType &args) override; + using StaticHandler::StaticHandler; + bool start(Variant::mapType &args) override; void end() override; + /** + * Creates a new instance of the TypesystemStructHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. 
+ */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructHandler{handlerData}; } }; -class TypesystemStructFieldHandler : public Handler { +/** + * Handles the occurrence of the "field" tag within a typesystem structure + * description. Places a new Attribute instance in the StructType instance + * that is currently at the top of the scope. + */ +class TypesystemStructFieldHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemStructFieldHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new TypesystemStructFieldHandler{handlerData}; } }; -class TypesystemConstantHandler : public Handler { +/** + * Handles the occurrence of the "constant" tag within a typesystem structure + * description. Places a new Constant instance in the current typesystem. + */ +class TypesystemConstantHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; - - void end() override; + bool start(Variant::mapType &args) override; + /** + * Creates a new instance of the TypesystemConstantHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. 
+ */ static Handler *create(const HandlerData &handlerData) { return new TypesystemConstantHandler{handlerData}; } }; } +} #endif diff --git a/test/core/parser/stack/StackTest.cpp b/test/core/parser/stack/StackTest.cpp index 7cc8bc5..321d471 100644 --- a/test/core/parser/stack/StackTest.cpp +++ b/test/core/parser/stack/StackTest.cpp @@ -112,16 +112,21 @@ private: TestHandler(const HandlerData &handlerData) : Handler(handlerData) {} public: - bool start(const Variant::mapType &args) + bool start(Variant::mapType &args) override { tracker.startCount++; tracker.startArgs = args; + if (!tracker.startResult) { + logger().error( + "The TestHandler was told not to allow a field start. So it " + "doesn't. The TestHandler always obeys its master."); + } return tracker.startResult; } - void end() { tracker.endCount++; } + void end() override { tracker.endCount++; } - bool fieldStart(bool &isDefault, size_t fieldIdx) + bool fieldStart(bool &isDefault, size_t fieldIdx) override { tracker.fieldStartCount++; tracker.fieldStartIsDefault = isDefault; @@ -132,9 +137,10 @@ public: return tracker.fieldStartResult; } - void fieldEnd() { tracker.fieldEndCount++; } + void fieldEnd() override { tracker.fieldEndCount++; } - bool annotationStart(const Variant &className, const Variant::mapType &args) + bool annotationStart(const Variant &className, + Variant::mapType &args) override { tracker.annotationStartCount++; tracker.annotationStartClassName = className; @@ -142,7 +148,8 @@ public: return tracker.annotationStartResult; } - bool annotationEnd(const Variant &className, const Variant &elementName) + bool annotationEnd(const Variant &className, + const Variant &elementName) override { tracker.annotationEndCount++; tracker.annotationEndClassName = className; @@ -150,7 +157,7 @@ public: return tracker.annotationEndResult; } - bool data(const Variant &data) + bool data(Variant &data) override { tracker.dataCount++; tracker.dataData = data; @@ -458,6 +465,26 @@ TEST(Stack, 
noImplicitDefaultFieldIfDefaultFieldGiven) ASSERT_FALSE(logger.hasError()); } +TEST(Stack, noEndIfStartFails) +{ + tracker.reset(); + logger.reset(); + { + Stack s{env.context, States::AnyHandlers}; + + s.command("a", {}); + tracker.expect(1, 0, 0, 0, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("a", s.currentCommandName()); + + tracker.startResult = false; + s.command("b", {}); + tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_EQ("b", s.currentCommandName()); + } + tracker.expect(3, 1, 1, 1, 0, 0, 0); // sc, ec, fsc, fse, asc, aec, dc + ASSERT_TRUE(logger.hasError()); +} + TEST(Stack, implicitDefaultFieldOnData) { tracker.reset(); -- cgit v1.2.3