From 84c9abc3e9762c4486ddc5ca0352a5d697a51987 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Wed, 25 Feb 2015 23:09:26 +0100 Subject: start of branch, commit log will be rewritten --- src/core/common/SourceContextReader.cpp | 5 +- src/core/common/Token.cpp | 24 +++ src/core/common/Token.hpp | 181 ++++++++++++++++++++ src/core/common/Utils.cpp | 6 - src/core/common/Utils.hpp | 53 +++--- src/core/common/WhitespaceHandler.hpp | 284 -------------------------------- 6 files changed, 240 insertions(+), 313 deletions(-) create mode 100644 src/core/common/Token.cpp create mode 100644 src/core/common/Token.hpp delete mode 100644 src/core/common/WhitespaceHandler.hpp (limited to 'src/core/common') diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp index d5d379c..f7dbdf3 100644 --- a/src/core/common/SourceContextReader.cpp +++ b/src/core/common/SourceContextReader.cpp @@ -149,8 +149,9 @@ SourceContext SourceContextReader::readContext(CharReader &reader, ctx.relLen = end - start; // end >= start (I2) // Remove linebreaks at the beginning and the end - const std::pair b = - Utils::trim(lineBuf, Utils::isLinebreak); + const std::pair b = Utils::trim( + lineBuf, + [&lineBuf](size_t i) { return Utils::isLinebreak(lineBuf[i]); }); ssize_t s = b.first, e = b.second; s = std::min(s, static_cast(ctx.relPos)); diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp new file mode 100644 index 0000000..8bcdbb5 --- /dev/null +++ b/src/core/common/Token.cpp @@ -0,0 +1,24 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Token.hpp" + +namespace ousia { +// Stub to make sure Tokens.hpp is valid +} + diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp new file mode 100644 index 0000000..0cf56b0 --- /dev/null +++ b/src/core/common/Token.hpp @@ -0,0 +1,181 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Token.hpp + * + * Definition of the TokenId id and constants for some special tokens. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_HPP_ +#define _OUSIA_TOKEN_HPP_ + +#include +#include +#include +#include + +#include + +namespace ousia { + +/** + * The TokenId is used to give each token id a unique id. 
+ */ +using TokenId = uint32_t; + +/** + * Type used for storing token lengths. + */ +using TokenLength = uint16_t; + +/** + * Type used for storing token sets. + */ +using TokenSet = std::unordered_set; + +/** + * Namespace containing constants for TokenId instances with special meaning. + */ +namespace Tokens { +/** + * Token which is not a token. + */ +constexpr TokenId Empty = std::numeric_limits::max(); + +/** + * Token which represents data (represented as TokenizedData). + */ +constexpr TokenId Data = std::numeric_limits::max() - 1; + +/** + * Token which represents a newline token. + */ +constexpr TokenId Newline = std::numeric_limits::max() - 2; + +/** + * Token which represents a paragraph token -- issued if two consecutive + * newlines occur with optionally any amout of whitespace between them. The + * paragraph token is not repeated until more text is reached. + */ +constexpr TokenId Paragraph = std::numeric_limits::max() - 3; + +/** + * Token which represents a section token -- issued if three or more + * consecutive newlines occur with optionally any amout of whitespace between + * them. The section token is not repeated until more text is reached. + */ +constexpr TokenId Section = std::numeric_limits::max() - 4; + +/** + * Token which represents an indentation token -- issued if the indentation of + * this line is larger than the indentation of the previous line. + */ +constexpr TokenId Indent = std::numeric_limits::max() - 5; + +/** + * Token which represents an dedentation -- issued if the indentation of + * this line is smaller than the indentation of the previous line. + */ +constexpr TokenId Dedent = std::numeric_limits::max() - 6; + +/** + * Maximum token id to be used. Tokens allocated for users should not surpass + * this value. + */ +constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; +} + +/** + * The Token structure describes a token discovered by the Tokenizer or read + * from the TokenizedData struct. + */ +struct Token { + /** + * Id of the id of this token. + */ + TokenId id; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : id(Tokens::Empty) {} + + /** + * Constructor of a "data" token with no explicit content. + * + * @param location is the location of the extracted string content in the + * source file. + */ + Token(SourceLocation location) + : id(Tokens::Data), location(location) + { + } + + /** + * Constructor of the Token struct. + * + * @param id represents the token id. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenId id, const std::string &content, SourceLocation location) + : id(id), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token id + * + * @param id is the id corresponding to the id of the token. + */ + Token(TokenId id) : id(id) {} + + /** + * Returns true if this token is special. + * + * @return true if the TokenId indicates that this token is a "special" + * token. + */ + bool isSpecial() const {return id > Tokens::MaxTokenId;} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. 
+ * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; +} + +#endif /* _OUSIA_TOKENS_HPP_ */ + diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index a77951e..85d2c28 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -108,12 +108,6 @@ std::string Utils::extractFileExtension(const std::string &filename) return std::string{}; } -std::string Utils::trim(const std::string &s) -{ - std::pair bounds = trim(s, Utils::isWhitespace); - return s.substr(bounds.first, bounds.second - bounds.first); -} - bool Utils::startsWith(const std::string &s, const std::string &prefix) { return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix; diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 7d96562..82a8f8c 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -123,14 +123,6 @@ public: */ static bool hasNonWhitepaceChar(const std::string &s); - /** - * Removes whitespace at the beginning and the end of the given string. - * - * @param s is the string that should be trimmed. - * @return a trimmed copy of s. - */ - static std::string trim(const std::string &s); - /** * Trims the given string or vector of chars by returning the start and end * index. @@ -153,8 +145,8 @@ public: * * @param s is the container that should be trimmed. * @param len is the number of elements in the container. - * @param f is a function that returns true for values that should be - * removed. + * @param f is a function that returns true for values at a certain index + * that should be removed. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" */ @@ -163,7 +155,7 @@ public: { size_t start = 0; for (size_t i = 0; i < len; i++) { - if (!f(s[i])) { + if (!f(i)) { start = i; break; } @@ -171,7 +163,7 @@ public: size_t end = 0; for (ssize_t i = len - 1; i >= static_cast(start); i--) { - if (!f(s[i])) { + if (!f(i)) { end = i + 1; break; } @@ -198,16 +190,32 @@ public: * the collapsed version of the string ends. * @return start and end index. Note that "end" points at the character * beyond the end, thus "end" minus "start" + * @param f is a function that returns true for values at a certain index + * that should be removed. */ - template - static std::string trim(const T &s, size_t len, size_t &start, size_t &end) + template + static std::string trim(const T &s, size_t len, size_t &start, size_t &end, + Filter f) { - auto res = trim(s, len, isWhitespace); + auto res = trim(s, len, f); start = res.first; end = res.second; return std::string(&s[start], end - start); } + /** + * Removes whitespace at the beginning and the end of the given string. + * + * @param s is the string that should be trimmed. + * @return a trimmed copy of s. + */ + static std::string trim(const std::string &s) + { + std::pair bounds = + trim(s, [&s](size_t i) { return isWhitespace(s[i]); }); + return s.substr(bounds.first, bounds.second - bounds.first); + } + /** * Collapses the whitespaces in the given string (trims the string and * replaces all whitespace characters by a single one). 
@@ -219,7 +227,8 @@ public: { size_t start; size_t end; - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -236,7 +245,8 @@ public: static std::string collapse(const std::string &s, size_t &start, size_t &end) { - return collapse(s, s.size(), start, end); + return collapse(s, s.size(), start, end, + [&s](size_t i) { return isWhitespace(s[i]); }); } /** @@ -244,6 +254,8 @@ public: * replaces all whitespace characters by a single one). * * @tparam T is the string type that should be used. + * @tparam Filter is a filter function used for detecting the character + * indices that might be removed. * @param s is the string in which the whitespace should be collapsed. * @param len is the length of the input string * @param start is an output parameter which is set to the offset at which @@ -252,9 +264,9 @@ public: * the collapsed version of the string ends. * @return a copy of s with collapsed whitespace. */ - template + template static std::string collapse(const T &s, size_t len, size_t &start, - size_t &end) + size_t &end, Filter f) { // Result vector std::vector res; @@ -268,8 +280,7 @@ public: bool hadWhitespace = false; for (size_t i = 0; i < len; i++) { const char c = s[i]; - const bool whitespace = isWhitespace(c); - if (whitespace) { + if (f(i)) { hadWhitespace = !res.empty(); } else { // Adapt the start and end position diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp deleted file mode 100644 index ed52ea3..0000000 --- a/src/core/common/WhitespaceHandler.hpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file WhitespaceHandler.hpp - * - * Contains the WhitespaceHandler classes which are used in multiple places to - * trim, compact or preserve whitespaces while at the same time maintaining the - * position information associated with the input strings. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_ -#define _OUSIA_WHITESPACE_HANDLER_HPP_ - -#include -#include - -#include "Utils.hpp" - -namespace ousia { - -/** - * WhitespaceHandler is a based class that can be used to collect text on a - * character-by-character basis. Note that this class and its descendants are - * hoped to be inlined by the compiler (and used in conjunction with templates), - * thus they are fully defined inside this header. - */ -class WhitespaceHandler { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. - */ - std::vector textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. 
- */ - WhitespaceHandler() : textStart(0), textEnd(0) {} - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } - - /** - * Returns the content of the WhitespaceHandler as string. - */ - std::string toString() const - { - return std::string(textBuf.data(), textBuf.size()); - } -}; - -/** - * The PreservingWhitespaceHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd); - } - - /** - * Static version of PreservingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); - } - - /** - * Static version of TrimmingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param whitespaceBuf is a reference at the buffer for storing whitespace - * characters. 
- */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, std::vector &whitespaceBuf) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingWhitespaceHandler : public WhitespaceHandler { -public: - /** - * Flag set to true if a whitespace character was reached. - */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); - } - - /** - * Static version of CollapsingWhitespaceHandler append - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - * @param textBuf is a reference at the text buffer that is to be used. - * @param textStart is a reference at the text start variable that is to be - * used. - * @param textEnd is a reference at the text end variable that is to be - * used. - * @param hasWhitespace is a reference at the "hasWhitespace" flag. - */ - static void append(char c, size_t start, size_t end, - std::vector &textBuf, size_t &textStart, - size_t &textEnd, bool &hasWhitespace) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); - } -}; - -/** - * Function that can be used to append the given buffer (e.g. a string or a - * vector) to the whitespace handler. - * - * @tparam WhitespaceHandler is one of the WhitespaceHandler classes. - * @tparam Buffer is an iterable type. - * @param handler is the handler to which the characters of the Buffer should be - * appended. - * @param buf is the buffer from which the characters should be read. - * @param start is the start byte offset. Each character is counted as one byte. 
- */ -template -inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf, - size_t start) -{ - for (auto elem : buf) { - handler.append(elem, start, start + 1); - start++; - } -} -} - -#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */ - -- cgit v1.2.3 From 596fdab71b8bd116e20e33647d68f1d7a567696e Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:34:15 +0100 Subject: Wrote isUserDefinedToken function which checks whether a token is a valid user defined token and added unit tests --- src/core/common/Utils.cpp | 24 ++++++++++++++++++++++++ src/core/common/Utils.hpp | 19 +++++++++++++++++++ test/core/common/UtilsTest.cpp | 31 ++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) (limited to 'src/core/common') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 85d2c28..219b437 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -118,5 +118,29 @@ bool Utils::endsWith(const std::string &s, const std::string &suffix) return suffix.size() <= s.size() && s.substr(s.size() - suffix.size(), suffix.size()) == suffix; } + +bool Utils::isUserDefinedToken(const std::string &token) +{ + // Make sure the token meets is neither empty, nor starts or ends with an + // alphanumeric character + const size_t len = token.size(); + if (len == 0 || isAlphanumeric(token[0]) || isAlphanumeric(token[len - 1])) { + return false; + } + + // Make sure the token is not any special OSML token + if (token == "\\" || token == "%" || token == "%{" || token == "}%" || + token == "{!" || token == "<\\" || token == "\\>") { + return false; + } + + // Make sure the token contains other characters but { and } + for (char c: token) { + if (c != '{' && c != '}') { + return true; + } + } + return false; +} } diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 82a8f8c..25a4de5 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -102,6 +102,25 @@ public: */ static bool isNamespacedIdentifier(const std::string &name); + /** + * Returns true if the given characters form a valid user-defined token. + * This function returns true under the following circumstances: + *
+ *   - The given token is not empty
+ *   - The given token starts and ends with a non-alphanumeric character
+ *   - The token is none of the following character sequences (which are
+ *     special in OSML):
+ *       - '{', '}' or any combined repetition of these characters
+ *       - '\', '{!', '<\', '\>'
+ *       - '%', '%{', '}%'
+ */ + static bool isUserDefinedToken(const std::string &token); + /** * Returns true if the given character is a linebreak character. */ diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 4bf1587..54890ee 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -131,4 +131,33 @@ TEST(Utils, collapse) ASSERT_EQ("long test", Utils::collapse(" long test ")); } -} \ No newline at end of file +TEST(Utils, isUserDefinedToken) +{ + EXPECT_FALSE(Utils::isUserDefinedToken("")); + EXPECT_FALSE(Utils::isUserDefinedToken("a")); + EXPECT_TRUE(Utils::isUserDefinedToken(":")); + EXPECT_TRUE(Utils::isUserDefinedToken("::")); + EXPECT_TRUE(Utils::isUserDefinedToken("!?")); + EXPECT_TRUE(Utils::isUserDefinedToken(".")); + EXPECT_TRUE(Utils::isUserDefinedToken("<<")); + EXPECT_TRUE(Utils::isUserDefinedToken(">>")); + EXPECT_TRUE(Utils::isUserDefinedToken("''")); + EXPECT_TRUE(Utils::isUserDefinedToken("``")); + EXPECT_TRUE(Utils::isUserDefinedToken("´´")); + EXPECT_TRUE(Utils::isUserDefinedToken("´")); + EXPECT_TRUE(Utils::isUserDefinedToken("`")); + EXPECT_TRUE(Utils::isUserDefinedToken("<")); + EXPECT_TRUE(Utils::isUserDefinedToken(">")); + EXPECT_FALSE(Utils::isUserDefinedToken("a:")); + EXPECT_FALSE(Utils::isUserDefinedToken("a:a")); + EXPECT_FALSE(Utils::isUserDefinedToken(":a")); + EXPECT_FALSE(Utils::isUserDefinedToken("{")); + EXPECT_FALSE(Utils::isUserDefinedToken("{{")); + EXPECT_FALSE(Utils::isUserDefinedToken("}}")); + EXPECT_FALSE(Utils::isUserDefinedToken("{{}{}")); + EXPECT_FALSE(Utils::isUserDefinedToken("<\\")); + EXPECT_FALSE(Utils::isUserDefinedToken("\\>")); + EXPECT_FALSE(Utils::isUserDefinedToken("{!")); +} + +} -- cgit v1.2.3 From 88afbcc2a4c4cb9956e4459cf1c5aa08e349835e Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 00:41:35 +0100 Subject: Implemented TokenSyntaxDescriptor structure --- src/core/common/Token.cpp | 16 ++++++++++- src/core/common/Token.hpp | 72 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 6 deletions(-) (limited to 'src/core/common') diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp index 8bcdbb5..e454ae4 100644 --- a/src/core/common/Token.cpp +++ b/src/core/common/Token.cpp @@ -19,6 +19,20 @@ #include "Token.hpp" namespace ousia { -// Stub to make sure Tokens.hpp is valid + +/* Class TokenSyntaxDescriptor */ + +void TokenSyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const +{ + if (start != Tokens::Empty) { + set.insert(start); + } + if (end != Tokens::Empty) { + set.insert(end); + } + if (shortForm != Tokens::Empty) { + set.insert(shortForm); + } +} } diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index 0cf56b0..f89a0ce 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -134,10 +134,7 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(SourceLocation location) - : id(Tokens::Data), location(location) - { - } + Token(SourceLocation location) : id(Tokens::Data), location(location) {} /** * Constructor of the Token struct. @@ -165,7 +162,7 @@ struct Token { * @return true if the TokenId indicates that this token is a "special" * token. 
*/ - bool isSpecial() const {return id > Tokens::MaxTokenId;} + bool isSpecial() const { return id > Tokens::MaxTokenId; } /** * The getLocation function allows the tokens to be directly passed as @@ -175,6 +172,71 @@ struct Token { */ const SourceLocation &getLocation() const { return location; } }; + +/** + * Class describing the user defined syntax for a single field or annotation. + */ +struct TokenSyntaxDescriptor { + /** + * Possible start token or Tokens::Empty if no token is set. + */ + TokenId start; + + /** + * Possible end token or Tokens::Empty if no token is set. + */ + TokenId end; + + /** + * Possible representation token or Tokens::Empty if no token is set. + */ + TokenId shortForm; + + /** + * Flag specifying whether this TokenSyntaxDescriptor describes an + * annotation. + */ + bool isAnnotation; + + /** + * Default constructor, sets all token ids to Tokens::Empty and isAnnotation + * to false. + */ + TokenSyntaxDescriptor() + : start(Tokens::Empty), + end(Tokens::Empty), + shortForm(Tokens::Empty), + isAnnotation(false) + { + } + + /** + * Member initializer constructor. + * + * @param start is a possible start token. + * @param end is a possible end token. + * @param shortForm is a possible short form token. + * @param isAnnotation is set to true if this syntax descriptor describes an + * annotation. + */ + TokenSyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, + bool isAnnotation) + : start(start), + end(end), + shortForm(shortForm), + isAnnotation(isAnnotation) + { + } + + /** + * Inserts all tokens referenced in this TokenSyntaxDescriptor into the + * given TokenSet. Skips token ids set to Tokens::Empty. + * + * @param set is the TokenSet instance into which the Tokens should be + * inserted. + */ + void insertIntoTokenSet(TokenSet &set) const; +}; } #endif /* _OUSIA_TOKENS_HPP_ */ -- cgit v1.2.3 From 5d6ee07995c7f59e66e0df558c8ebe7d2a8d1f68 Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:52:13 +0100 Subject: refactored SyntaxDescriptor to Token.hpp and added TokenDescriptor class. 
--- CMakeLists.txt | 1 + src/core/common/Token.cpp | 14 --- src/core/common/Token.hpp | 67 +----------- src/core/model/Syntax.cpp | 58 +++++++++++ src/core/model/Syntax.hpp | 196 +++++++++++++++++++++++++++++++++++ src/core/parser/stack/Callbacks.hpp | 3 +- src/core/parser/stack/Handler.cpp | 2 +- src/core/parser/stack/Handler.hpp | 3 +- src/core/parser/stack/TokenStack.cpp | 4 +- src/core/parser/stack/TokenStack.hpp | 5 +- 10 files changed, 266 insertions(+), 87 deletions(-) create mode 100644 src/core/model/Syntax.cpp create mode 100644 src/core/model/Syntax.hpp (limited to 'src/core/common') diff --git a/CMakeLists.txt b/CMakeLists.txt index b206458..13de9ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ ADD_LIBRARY(ousia_core src/core/model/Project src/core/model/RootNode src/core/model/Style + src/core/model/Syntax src/core/model/Typesystem src/core/parser/Parser src/core/parser/ParserContext diff --git a/src/core/common/Token.cpp b/src/core/common/Token.cpp index e454ae4..17ce03e 100644 --- a/src/core/common/Token.cpp +++ b/src/core/common/Token.cpp @@ -20,19 +20,5 @@ namespace ousia { -/* Class TokenSyntaxDescriptor */ - -void TokenSyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const -{ - if (start != Tokens::Empty) { - set.insert(start); - } - if (end != Tokens::Empty) { - set.insert(end); - } - if (shortForm != Tokens::Empty) { - set.insert(shortForm); - } -} } diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index f89a0ce..f37151f 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -173,71 +173,6 @@ struct Token { const SourceLocation &getLocation() const { return location; } }; -/** - * Class describing the user defined syntax for a single field or annotation. - */ -struct TokenSyntaxDescriptor { - /** - * Possible start token or Tokens::Empty if no token is set. - */ - TokenId start; - - /** - * Possible end token or Tokens::Empty if no token is set. - */ - TokenId end; - - /** - * Possible representation token or Tokens::Empty if no token is set. - */ - TokenId shortForm; - - /** - * Flag specifying whether this TokenSyntaxDescriptor describes an - * annotation. - */ - bool isAnnotation; - - /** - * Default constructor, sets all token ids to Tokens::Empty and isAnnotation - * to false. - */ - TokenSyntaxDescriptor() - : start(Tokens::Empty), - end(Tokens::Empty), - shortForm(Tokens::Empty), - isAnnotation(false) - { - } - - /** - * Member initializer constructor. - * - * @param start is a possible start token. - * @param end is a possible end token. - * @param shortForm is a possible short form token. - * @param isAnnotation is set to true if this syntax descriptor describes an - * annotation. - */ - TokenSyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, - bool isAnnotation) - : start(start), - end(end), - shortForm(shortForm), - isAnnotation(isAnnotation) - { - } - - /** - * Inserts all tokens referenced in this TokenSyntaxDescriptor into the - * given TokenSet. Skips token ids set to Tokens::Empty. - * - * @param set is the TokenSet instance into which the Tokens should be - * inserted. 
- */ - void insertIntoTokenSet(TokenSet &set) const; -}; } -#endif /* _OUSIA_TOKENS_HPP_ */ - +#endif /* _OUSIA_TOKENS_HPP_ */ \ No newline at end of file diff --git a/src/core/model/Syntax.cpp b/src/core/model/Syntax.cpp new file mode 100644 index 0000000..9dbaccc --- /dev/null +++ b/src/core/model/Syntax.cpp @@ -0,0 +1,58 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "Syntax.hpp" + +#include "Domain.hpp" + +namespace ousia { + +/* Class TokenSyntaxDescriptor */ + +bool SyntaxDescriptor::isAnnotation() const +{ + return descriptor->isa(&RttiTypes::AnnotationClass); +} +bool SyntaxDescriptor::isFieldDescriptor() const +{ + return descriptor->isa(&RttiTypes::FieldDescriptor); +} +bool SyntaxDescriptor::isStruct() const +{ + return descriptor->isa(&RttiTypes::StructuredClass); +} + +void SyntaxDescriptor::insertIntoTokenSet(TokenSet &set) const +{ + if (start != Tokens::Empty) { + set.insert(start); + } + if (end != Tokens::Empty) { + set.insert(end); + } + if (shortForm != Tokens::Empty) { + set.insert(shortForm); + } +} + +bool SyntaxDescriptor::isEmpty() const +{ + return start == Tokens::Empty && end == Tokens::Empty && + shortForm == Tokens::Empty; +} +} \ No newline at end of file diff --git a/src/core/model/Syntax.hpp b/src/core/model/Syntax.hpp new file mode 100644 index 0000000..4da3408 --- /dev/null +++ b/src/core/model/Syntax.hpp @@ -0,0 +1,196 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file Syntax.hpp + * + * This header contains the Descriptor classes for user definable syntax for + * Document entities or fields. These classes are referenced in Ontology.hpp. + */ + +#ifndef _OUSIA_MODEL_SYNTAX_HPP_ +#define _OUSIA_MODEL_SYNTAX_HPP_ + +#include +#include "Node.hpp" + +namespace ousia { + +/** + * Class to describe a single token that shall be used as user-defined syntax. + */ +struct TokenDescriptor { + /** + * The string content of this token, if it is not a special one. + */ + std::string token; + /** + * A flag to be set true if this TokenDescriptor uses a special token. + */ + bool special; + /** + * An id to uniquely identify this token. + */ + TokenId id; + + /** + * Constructor for non-special tokens. The special flag is set to false and + * the id to Tokens::Empty. 
+ * + * @param token The string content of this token, if it is not a special + * one. + */ + TokenDescriptor(std::string token = std::string()) + : token(std::move(token)), special(false), id(Tokens::Empty) + { + } + + /** + * Constructor for special tokens. The token is set to an empty string and + * the special flag to true. + * + * @param id the id of the special token. + */ + TokenDescriptor(TokenId id) : special(true), id(id) {} + + /** + * Returns true if and only if neither a string nor an ID is given. + * + * @return true if and only if neither a string nor an ID is given. + */ + bool isEmpty() const { return token.empty() && id == Tokens::Empty; } +}; + +/** + * Class describing the user defined syntax for a StructuredClass, + * AnnotationClass or FieldDescriptor. + * + * This class is used during parsing of a Document. It is used to describe + * the tokens relevant for one Descriptor that could be created at this point + * during parsing. + */ +struct SyntaxDescriptor { + /** + * Possible start token or Tokens::Empty if no token is set. + */ + TokenId start; + + /** + * Possible end token or Tokens::Empty if no token is set. + */ + TokenId end; + + /** + * Possible representation token or Tokens::Empty if no token is set. + */ + TokenId shortForm; + + /* + * The Descriptor this SyntaxDescriptor belongs to. As this may be + * a FieldDescriptor as well as a class Descriptor (StructuredClass or + * AnnotationClass) we can only use the class Node as inner argument here. + */ + Rooted descriptor; + /* + * Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + ssize_t depth; + + /** + * Default constructor, sets all token ids to Tokens::Empty and the + * descriptor handle to nullptr. + */ + SyntaxDescriptor() + : start(Tokens::Empty), + end(Tokens::Empty), + shortForm(Tokens::Empty), + descriptor(nullptr), + depth(-1) + { + } + + /** + * Member initializer constructor. + * + * @param start is a possible start token. + * @param end is a possible end token. + * @param shortForm is a possible short form token. + * @param descriptor The Descriptor this SyntaxDescriptor belongs to. + * @param depth Given the current leaf in the parsed document the depth of a + * SyntaxDescriptor is defined as the number of transparent elements that + * would be needed to construct an instance of the referenced descriptor. + */ + SyntaxDescriptor(TokenId start, TokenId end, TokenId shortForm, + Handle descriptor, ssize_t depth) + : start(start), + end(end), + shortForm(shortForm), + descriptor(descriptor), + depth(depth) + { + } + + /** + * Inserts all tokens referenced in this SyntaxDescriptor into the + * given TokenSet. Skips token ids set to Tokens::Empty. + * + * @param set is the TokenSet instance into which the Tokens should be + * inserted. + */ + void insertIntoTokenSet(TokenSet &set) const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + * + * @return true if and only if this SyntaxDescriptor belongs to an + * AnnotationClass. + */ + bool isAnnotation() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + * + * @return true if and only if this SyntaxDescriptor belongs to a + * StrcturedClass. + */ + bool isStruct() const; + + /** + * Returns true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. 
+ * + * @return true if and only if this SyntaxDescriptor belongs to a + * FieldDescriptor. + */ + bool isFieldDescriptor() const; + + /** + * Returns true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + * + * @return true if and only if this SyntaxDescriptor has only empty + * entries in start, end and short. + */ + bool isEmpty() const; +}; +} +#endif \ No newline at end of file diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp index d7b2547..e471881 100644 --- a/src/core/parser/stack/Callbacks.hpp +++ b/src/core/parser/stack/Callbacks.hpp @@ -34,6 +34,7 @@ #include #include +#include namespace ousia { @@ -96,7 +97,7 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Removes the previously pushed list of tokens from the stack. diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index 734976a..12df0fd 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -74,7 +74,7 @@ Variant Handler::readData() return handlerData.callbacks.readData(); } -void Handler::pushTokens(const std::vector &tokens) +void Handler::pushTokens(const std::vector &tokens) { handlerData.callbacks.pushTokens(tokens); } diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 848d395..19660d0 100644 --- a/src/core/parser/stack/Handler.hpp +++ b/src/core/parser/stack/Handler.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace ousia { @@ -200,7 +201,7 @@ protected: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Calls the corresponding function in the HandlerCallbacks instance. diff --git a/src/core/parser/stack/TokenStack.cpp b/src/core/parser/stack/TokenStack.cpp index 6afeaed..ac1d94e 100644 --- a/src/core/parser/stack/TokenStack.cpp +++ b/src/core/parser/stack/TokenStack.cpp @@ -21,7 +21,7 @@ namespace ousia { namespace parser_stack { -void TokenStack::pushTokens(const std::vector &tokens) +void TokenStack::pushTokens(const std::vector &tokens) { stack.push_back(tokens); } @@ -35,7 +35,7 @@ TokenSet TokenStack::tokens() const } TokenSet res; - for (const TokenSyntaxDescriptor &descr : stack.back()) { + for (const SyntaxDescriptor &descr : stack.back()) { descr.insertIntoTokenSet(res); } return res; diff --git a/src/core/parser/stack/TokenStack.hpp b/src/core/parser/stack/TokenStack.hpp index 9669f50..af734bb 100644 --- a/src/core/parser/stack/TokenStack.hpp +++ b/src/core/parser/stack/TokenStack.hpp @@ -32,6 +32,7 @@ #include #include +#include namespace ousia { namespace parser_stack { @@ -52,7 +53,7 @@ private: * Stack containing vectors of TokenSyntaxDescriptor instances as given by * the user. */ - std::vector> stack; + std::vector> stack; /** * Constructor of the TokenStack class. @@ -86,7 +87,7 @@ public: * @param tokens is a list of TokenSyntaxDescriptor instances that should be * stored on the stack. */ - void pushTokens(const std::vector &tokens); + void pushTokens(const std::vector &tokens); /** * Removes the previously pushed list of tokens from the stack. 
-- cgit v1.2.3 From 522580cfdfc9e6dc3448240448c29533e68f240f Mon Sep 17 00:00:00 2001 From: Benjamin Paassen Date: Mon, 2 Mar 2015 15:52:34 +0100 Subject: added check for witespace characters in Utils::isUserDefinedToken --- src/core/common/Utils.cpp | 15 +++++++++++---- src/core/common/Utils.hpp | 1 + test/core/common/UtilsTest.cpp | 2 ++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'src/core/common') diff --git a/src/core/common/Utils.cpp b/src/core/common/Utils.cpp index 219b437..a87ff6d 100644 --- a/src/core/common/Utils.cpp +++ b/src/core/common/Utils.cpp @@ -124,7 +124,8 @@ bool Utils::isUserDefinedToken(const std::string &token) // Make sure the token meets is neither empty, nor starts or ends with an // alphanumeric character const size_t len = token.size(); - if (len == 0 || isAlphanumeric(token[0]) || isAlphanumeric(token[len - 1])) { + if (len == 0 || isAlphanumeric(token[0]) || + isAlphanumeric(token[len - 1])) { return false; } @@ -134,13 +135,19 @@ bool Utils::isUserDefinedToken(const std::string &token) return false; } + // Make sure the token does not contain any whitespaces. + for (char c : token) { + if (isWhitespace(c)) { + return false; + } + } + // Make sure the token contains other characters but { and } - for (char c: token) { + for (char c : token) { if (c != '{' && c != '}') { return true; } } return false; } -} - +} \ No newline at end of file diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 25a4de5..d9e26da 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -117,6 +117,7 @@ public: *
 *       - '%', '%{', '}%'
 *
+ *   - The token does not contain any whitespaces.
  • * */ static bool isUserDefinedToken(const std::string &token); diff --git a/test/core/common/UtilsTest.cpp b/test/core/common/UtilsTest.cpp index 54890ee..2aaa430 100644 --- a/test/core/common/UtilsTest.cpp +++ b/test/core/common/UtilsTest.cpp @@ -148,6 +148,7 @@ TEST(Utils, isUserDefinedToken) EXPECT_TRUE(Utils::isUserDefinedToken("`")); EXPECT_TRUE(Utils::isUserDefinedToken("<")); EXPECT_TRUE(Utils::isUserDefinedToken(">")); + EXPECT_TRUE(Utils::isUserDefinedToken("<+>")); EXPECT_FALSE(Utils::isUserDefinedToken("a:")); EXPECT_FALSE(Utils::isUserDefinedToken("a:a")); EXPECT_FALSE(Utils::isUserDefinedToken(":a")); @@ -158,6 +159,7 @@ TEST(Utils, isUserDefinedToken) EXPECT_FALSE(Utils::isUserDefinedToken("<\\")); EXPECT_FALSE(Utils::isUserDefinedToken("\\>")); EXPECT_FALSE(Utils::isUserDefinedToken("{!")); + EXPECT_FALSE(Utils::isUserDefinedToken("< + >")); } } -- cgit v1.2.3 From e31968c9e073c64cf718fbcaebbc83ee2bee48c8 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Mon, 2 Mar 2015 18:09:34 +0100 Subject: Added additional constructor to Token --- src/core/common/Token.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'src/core/common') diff --git a/src/core/common/Token.hpp b/src/core/common/Token.hpp index f37151f..4b56f1a 100644 --- a/src/core/common/Token.hpp +++ b/src/core/common/Token.hpp @@ -134,7 +134,9 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(SourceLocation location) : id(Tokens::Data), location(location) {} + Token(const SourceLocation &location) : id(Tokens::Data), location(location) + { + } /** * Constructor of the Token struct. @@ -144,11 +146,25 @@ struct Token { * @param location is the location of the extracted string content in the * source file. */ - Token(TokenId id, const std::string &content, SourceLocation location) + Token(TokenId id, const std::string &content, + const SourceLocation &location) : id(id), content(content), location(location) { } + /** + * Constructor of the a "data" Token with the given string data and + * location. + * + * @param content is the string content that should be stored in the token. + * @param location is the location of the content within the source file. + */ + Token(const std::string &content, + const SourceLocation &location = SourceLocation{}) + : id(Tokens::Data), content(content), location(location) + { + } + /** * Constructor of the Token struct, only initializes the token id * @@ -172,7 +188,6 @@ struct Token { */ const SourceLocation &getLocation() const { return location; } }; - } -#endif /* _OUSIA_TOKENS_HPP_ */ \ No newline at end of file +#endif /* _OUSIA_TOKENS_HPP_ */ -- cgit v1.2.3
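The first commit in this series reworks Utils::trim and Utils::collapse so that the filter predicate receives a character *index* instead of the character itself, letting callers close over the underlying buffer (see the lambdas such as `[&s](size_t i) { return isWhitespace(s[i]); }` in the patch). The following standalone sketch, which is not part of the patch series, mirrors that index-based pattern; the names `trimToString` and `isWs` are placeholders and do not exist in the code base.

```cpp
#include <cstddef>
#include <iostream>
#include <string>

// Index-based trim in the style of the refactored Utils::trim above:
// the filter is called with an index, so it can inspect the buffer itself.
template <class T, class Filter>
std::string trimToString(const T &s, std::size_t len, Filter f)
{
	// Find the first index that the filter does not flag for removal
	std::size_t start = 0;
	for (std::size_t i = 0; i < len; i++) {
		if (!f(i)) {
			start = i;
			break;
		}
	}
	// Find the index one past the last non-flagged character
	std::size_t end = start;
	for (std::size_t i = len; i > start; i--) {
		if (!f(i - 1)) {
			end = i;
			break;
		}
	}
	return std::string(&s[start], end - start);
}

int main()
{
	const std::string s = "  hello world \n";
	// The predicate closes over "s" and decides per index, just like the
	// lambdas passed to Utils::trim/collapse in the patch.
	auto isWs = [&s](std::size_t i) {
		const char c = s[i];
		return c == ' ' || c == '\t' || c == '\n' || c == '\r';
	};
	std::cout << "'" << trimToString(s, s.size(), isWs) << "'" << std::endl;
	// prints 'hello world'
	return 0;
}
```

The presumable benefit of passing indices rather than characters is that the same trim/collapse core can be driven by any position-dependent criterion over the original buffer, not only by per-character whitespace tests.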
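Similarly, the rules established by the two Utils::isUserDefinedToken commits (non-empty, non-alphanumeric boundaries, none of the special OSML sequences, no whitespace, and at least one character other than '{' or '}') can be summarized in a compact, self-contained approximation. The helpers `looksLikeUserDefinedToken`, `isAlnum` and `isWs` below are illustrative stand-ins for the project's Utils functions, not the actual API.

```cpp
#include <cctype>
#include <iostream>
#include <string>

static bool isAlnum(char c)
{
	return std::isalnum(static_cast<unsigned char>(c)) != 0;
}
static bool isWs(char c)
{
	return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

// Approximation of the checks performed by Utils::isUserDefinedToken
bool looksLikeUserDefinedToken(const std::string &token)
{
	const std::size_t len = token.size();
	// Must be non-empty and must not start or end with an alphanumeric char
	if (len == 0 || isAlnum(token[0]) || isAlnum(token[len - 1])) {
		return false;
	}
	// Must not collide with the special OSML sequences
	if (token == "\\" || token == "%" || token == "%{" || token == "}%" ||
	    token == "{!" || token == "<\\" || token == "\\>") {
		return false;
	}
	// Must not contain any whitespace
	for (char c : token) {
		if (isWs(c)) {
			return false;
		}
	}
	// Must contain at least one character other than '{' and '}'
	for (char c : token) {
		if (c != '{' && c != '}') {
			return true;
		}
	}
	return false;
}

int main()
{
	std::cout << looksLikeUserDefinedToken("<<") << " "    // 1
	          << looksLikeUserDefinedToken("a:") << " "    // 0
	          << looksLikeUserDefinedToken("{{") << "\n";  // 0
	return 0;
}
```

The expected outputs match the behaviour exercised in test/core/common/UtilsTest.cpp for the same inputs.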