diff options
Diffstat (limited to 'src/core/parser')
-rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 286 | ||||
-rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 70 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 7 | ||||
-rw-r--r-- | src/core/parser/utils/Tokenizer.hpp | 2 |
4 files changed, 251 insertions, 114 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index 0ec56af..aeefa26 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -48,6 +48,17 @@ struct TokenMark { TokenLength len; /** + * Specifies whether the token is special or not. + */ + bool special; + + /** + * Maximum token length. + */ + static constexpr TokenLength MaxTokenLength = + std::numeric_limits<TokenLength>::max(); + + /** * Constructor of the TokenMark structure, initializes all members with the * given values. * @@ -55,9 +66,10 @@ struct TokenMark { * @param bufStart is the start position of the TokenMark in the internal * character buffer. * @param len is the length of the token. + * @param special modifies the sort order, special tokens are prefered. */ - TokenMark(TokenId id, size_t bufStart, TokenLength len) - : bufStart(bufStart), id(id), len(len) + TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special) + : bufStart(bufStart), id(id), len(len), special(special) { } @@ -72,7 +84,8 @@ struct TokenMark { TokenMark(size_t bufStart) : bufStart(bufStart), id(Tokens::Empty), - len(std::numeric_limits<TokenLength>::max()) + len(MaxTokenLength), + special(true) { } @@ -86,8 +99,22 @@ struct TokenMark { */ friend bool operator<(const TokenMark &m1, const TokenMark &m2) { - return (m1.bufStart < m2.bufStart) || - (m1.bufStart == m2.bufStart && m1.len > m2.len); + // Prefer the mark with the smaller bufStart + if (m1.bufStart < m2.bufStart) { + return true; + } + + // Special handling for marks with the same bufStart + if (m1.bufStart == m2.bufStart) { + // If exactly one of the two marks is special, return true if this + // one is special + if (m1.special != m2.special) { + return m1.special; + } + // Otherwise prefer longer marks + return m1.len > m2.len; + } + return false; } }; } @@ -110,6 +137,11 @@ private: std::vector<char> buf; /** + * Buffset storing the "protected" flag of the character data. + */ + std::vector<bool> protectedChars; + + /** * Vector storing all the character offsets efficiently. */ SourceOffsetVector offsets; @@ -120,6 +152,26 @@ private: mutable std::vector<TokenMark> marks; /** + * Position of the first linebreak in a sequence of linebreaks. + */ + size_t firstLinebreak; + + /** + * Current indentation level. + */ + uint16_t currentIndentation; + + /** + * Last indentation level. + */ + uint16_t lastIndentation; + + /** + * Number of linebreaks without any content between them. + */ + uint16_t numLinebreaks; + + /** * Flag indicating whether the internal "marks" vector is sorted. */ mutable bool sorted; @@ -132,7 +184,7 @@ public: * @param sourceId is the source identifier that should be used for * constructing the location when returning tokens. */ - TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); } /** * Appends a complete string to the internal character buffer and extends @@ -140,25 +192,22 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart) - { // Append the data to the internal buffer - buf.insert(buf.end(), data.begin(), data.end()); - - // Extend the text regions, interpolate the source position (this may - // yield incorrect results) - const size_t size = buf.size(); + size_t append(const std::string &data, SourceOffset offsStart, bool protect) + { for (size_t i = 0; i < data.size(); i++) { if (offsStart != InvalidSourceOffset) { - offsets.storeOffset(offsStart + i, offsStart + i + 1); + append(data[i], offsStart + i, offsStart + i + 1, protect); } else { - offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); + append(data[i], InvalidSourceOffset, InvalidSourceOffset, + protect); } } - - return size; + return size(); } /** @@ -168,16 +217,86 @@ public: * @param c is the character that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. * @param offsEnd is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect) { // Add the character to the list and store the location of the character // in the source file buf.push_back(c); + protectedChars.push_back(protect); offsets.storeOffset(offsStart, offsEnd); - return buf.size(); + + // Insert special tokens + const size_t size = buf.size(); + const bool isWhitespace = Utils::isWhitespace(c); + const bool isLinebreak = Utils::isLinebreak(c); + + // Handle linebreaks + if (isLinebreak) { + // Mark linebreaks as linebreak + mark(Tokens::Newline, size - 1, 1, false); + + // The linebreak sequence started at the previous character + if (numLinebreaks == 0) { + firstLinebreak = size - 1; + } + + // Reset the indentation + currentIndentation = 0; + + // Increment the number of linebreaks + numLinebreaks++; + + const size_t markStart = firstLinebreak; + const size_t markLength = size - firstLinebreak; + + // Issue two consecutive linebreaks as paragraph token + if (numLinebreaks == 2) { + mark(Tokens::Paragraph, markStart, markLength, false); + } + + // Issue three consecutive linebreaks as paragraph token + if (numLinebreaks >= 3) { + mark(Tokens::Section, markStart, markLength, false); + } + } else if (isWhitespace) { + // Count the whitespace characters at the beginning of the line + if (numLinebreaks > 0) { + // Implement the UNIX/Pyhton rule for tabs: Tabs extend to the + // next multiple of eight. + if (c == '\t') { + currentIndentation = (currentIndentation + 8) & ~7; + } else { + currentIndentation++; + } + } + } + + // Issue indent and unindent tokens + if (!isWhitespace && numLinebreaks > 0) { + // Issue a larger indentation than that in the previous line as + // "Indent" token + if (currentIndentation > lastIndentation) { + mark(Tokens::Indent, size - 1, 0, true); + } + + // Issue a smaller indentation than that in the previous line as + // "Dedent" token + if (currentIndentation < lastIndentation) { + mark(Tokens::Dedent, size - 1, 0, true); + } + + // Reset the internal state machine + lastIndentation = currentIndentation; + numLinebreaks = 0; + } + + return size; } /** @@ -187,11 +306,12 @@ public: * @param bufStart is the start position in the internal buffer. Use the * values returned by append to calculate the start position. * @param len is the length of the token. + * @param special tags the mark as "special", prefering it in the sort order */ - void mark(TokenId id, size_t bufStart, TokenLength len) + void mark(TokenId id, size_t bufStart, TokenLength len, bool special) { // Push the new instance back onto the list - marks.emplace_back(id, bufStart, len); + marks.emplace_back(id, bufStart, len, special); // Update the sorted flag as soon as more than one element is in the // list @@ -215,9 +335,13 @@ public: * @return true if a token was returned, false if no more tokens are * available. */ - bool next(Token &token, WhitespaceMode mode, - const std::unordered_set<TokenId> &tokens, size_t &cursor) const + bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens, + TokenizedDataCursor &cursor) const { + // Some variables for convenient access + size_t &bufPos = cursor.bufPos; + size_t &markPos = cursor.markPos; + // Sort the "marks" vector if it has not been sorted yet. if (!sorted) { std::sort(marks.begin(), marks.end()); @@ -226,8 +350,8 @@ public: // Fetch the next larger TokenMark instance, make sure the token is in // the "enabled" list and within the buffer range - auto it = - std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); + auto it = std::lower_bound(marks.begin() + markPos, marks.end(), + TokenMark(bufPos)); while (it != marks.end() && (tokens.count(it->id) == 0 || it->bufStart + it->len > buf.size())) { it++; @@ -240,15 +364,15 @@ public: // Depending on the whitespace mode, fetch all the data between the // cursor position and the calculated end position and return a token // containing that data. - if (cursor < end && cursor < buf.size()) { + if (bufPos < end && bufPos < buf.size()) { switch (mode) { case WhitespaceMode::PRESERVE: { token = Token( - Tokens::Data, std::string(&buf[cursor], end - cursor), + Tokens::Data, std::string(&buf[bufPos], end - bufPos), SourceLocation(sourceId, - offsets.loadOffset(cursor).first, + offsets.loadOffset(bufPos).first, offsets.loadOffset(end).first)); - cursor = end; + bufPos = end; return true; } case WhitespaceMode::TRIM: @@ -258,30 +382,35 @@ public: size_t stringStart; size_t stringEnd; std::string content; + const char *cBuf = &buf[bufPos]; + auto filter = [cBuf, this](size_t i) -> bool { + return Utils::isWhitespace(cBuf[i]) && + !protectedChars[i]; + }; if (mode == WhitespaceMode::TRIM) { - content = Utils::trim(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::trim(cBuf, end - bufPos, stringStart, + stringEnd, filter); } else { - content = Utils::collapse(&buf[cursor], end - cursor, - stringStart, stringEnd); + content = Utils::collapse( + cBuf, end - bufPos, stringStart, stringEnd, filter); } // If the resulting string is empty (only whitespaces), // abort if (content.empty()) { - cursor = end; + bufPos = end; break; } // Calculate the absolute positions and return the token - stringStart += cursor; - stringEnd += cursor; + stringStart += bufPos; + stringEnd += bufPos; token = Token( Tokens::Data, content, SourceLocation(sourceId, offsets.loadOffset(stringStart).first, offsets.loadOffset(stringEnd).first)); - cursor = end; + bufPos = end; return true; } } @@ -290,14 +419,18 @@ public: // If start equals end, we're currently directly at a token // instance. Return this token and advance the cursor to the end of // the token. - if (cursor == end && it != marks.end()) { + if (bufPos == end && it != marks.end()) { const size_t tokenStart = it->bufStart; const size_t tokenEnd = it->bufStart + it->len; token = Token( it->id, std::string(&buf[tokenStart], it->len), SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, offsets.loadOffset(tokenEnd).first)); - cursor = tokenEnd; + + // Update the cursor, consume the token by incrementing the marks + // pos counter + bufPos = tokenEnd; + markPos = it - marks.begin() + 1; return true; } @@ -314,8 +447,12 @@ public: void clear() { buf.clear(); - marks.clear(); + protectedChars.clear(); offsets.clear(); + marks.clear(); + currentIndentation = 0; + lastIndentation = 0; + numLinebreaks = 1; // Assume the stream starts with a linebreak sorted = true; } @@ -367,39 +504,35 @@ public: TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} TokenizedData::TokenizedData(SourceId sourceId) - : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) + : impl(std::make_shared<TokenizedDataImpl>(sourceId)) { } TokenizedData::~TokenizedData() {} -size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart, + bool protect) { - return impl->append(data, offsStart); + return impl->append(data, offsStart, protect); } size_t TokenizedData::append(char c, SourceOffset offsStart, - SourceOffset offsEnd) + SourceOffset offsEnd, bool protect) { - return impl->append(c, offsStart, offsEnd); + return impl->append(c, offsStart, offsEnd, protect); } void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->size() - len, len); + impl->mark(id, impl->size() - len, len, false); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) { - impl->mark(id, bufStart, len); + impl->mark(id, bufStart, len, false); } -void TokenizedData::clear() -{ - impl->clear(); - tokens.clear(); - cursor = 0; -} +void TokenizedData::clear() { impl->clear(); } void TokenizedData::trim(size_t length) { impl->trim(length); } @@ -412,49 +545,42 @@ SourceLocation TokenizedData::getLocation() const return impl->getLocation(); } -TokenizedDataReader reader() const +TokenizedDataReader TokenizedData::reader() const { - return TokenizedDataReader(impl, std::unordered_set<TokenId>{}, 0, 0); + return TokenizedDataReader(impl, TokenizedDataCursor(), + TokenizedDataCursor()); } /* Class TokenizedDataReader */ +TokenizedDataReader::TokenizedDataReader( + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) + : impl(impl), readCursor(readCursor), peekCursor(peekCursor) +{ +} + TokenizedDataReaderFork TokenizedDataReader::fork() { - return TokenizedDataReaderFork(*this, impl, tokens, readCursor, peekCursor); + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); } -bool TokenizedDataReader::atEnd() const { return readCursor >= size(); } +bool TokenizedDataReader::atEnd() const +{ + return readCursor.bufPos >= impl->size(); +} -bool TokenizedData::read(Token &token, const TokenSet &tokens, - WhitespaceMode mode) +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) { peekCursor = readCursor; return impl->next(token, mode, tokens, readCursor); } -bool TokenizedData::peek(Token &token, const TokenSet &tokens, - WhitespaceMode mode) +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) { return impl->next(token, mode, tokens, peekCursor); } - -Variant TokenizedData::text(WhitespaceMode mode) -{ - // Copy the current cursor position to not update the actual cursor position - // if the operation was not successful - size_t cursorCopy = cursor; - Token token; - if (!impl->next(token, mode, tokens, cursorCopy) || - token.id != Tokens::Data) { - return Variant{nullptr}; - } - - // There is indeed a text token, update the internal cursor position and - // return the token as variant. - cursor = cursorCopy; - Variant res = Variant::fromString(token.content); - res.setLocation(token.getLocation()); - return res; -} } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 85b80ae..b72ca02 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,7 +36,6 @@ #include <unordered_set> #include <core/common/Location.hpp> -#include <core/common/Variant.hpp> #include <core/common/Whitespace.hpp> #include <core/common/Token.hpp> @@ -48,6 +47,28 @@ class TokenizedDataReader; class TokenizedDataReaderFork; /** + * Internally used structure representing a cursor within the TokenizedData + * stream. + */ +struct TokenizedDataCursor { + /** + * Position within the byte buffer. + */ + size_t bufPos; + + /** + * Position within the token mark buffer. + */ + size_t markPos; + + /** + * Default constructor. The resulting cursor points at the beginning of the + * stream. + */ + TokenizedDataCursor() : bufPos(0), markPos(0) {} +}; + +/** * The TokenizedData class stores data extracted from a user defined document. * The data stored in TokenizedData */ @@ -88,10 +109,13 @@ public: * * @param data is the string that should be appended to the buffer. * @param offsStart is the start offset in bytes in the input file. + * @param protect if set to true, the appended characters will not be + * affected by whitespace handling, they will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(const std::string &data, SourceOffset offsStart = 0); + size_t append(const std::string &data, SourceOffset offsStart = 0, + bool protect = false); /** * Appends a single character to the internal character buffer. @@ -99,10 +123,13 @@ public: * @param c is the character that should be appended to the buffer. * @param start is the start offset in bytes in the input file. * @param end is the end offset in bytes in the input file. + * @param protect if set to true, the appended character will not be + * affected by whitespace handling, it will be returned as is. * @return the current size of the internal byte buffer. The returned value * is intended to be used for the "mark" function. */ - size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd, + bool protect = false); /** * Stores a token ending at the last character of the current buffer. @@ -187,15 +214,16 @@ private: /** * Position from which the last element was read from the internal buffer. */ - size_t readCursor; + TokenizedDataCursor readCursor; /** * Position from which the last element was peeked from the internal buffer. */ - size_t peekCursor; + TokenizedDataCursor peekCursor; +protected: /** - * Private constructor of TokenizedDataReader, taking a reference to the + * Protected constructor of TokenizedDataReader, taking a reference to the * internal TokenizedDataImpl structure storing the data that is accessed by * the reader. * @@ -205,8 +233,9 @@ private: * @param peekCursor is the cursor position from which tokens and text are * peeked. */ - TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl, - size_t readCursor, size_t peekCursor); + TokenizedDataReader(std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor); public: /** @@ -237,7 +266,7 @@ public: * false if there are no more tokens. */ bool read(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::COLLAPSE); + WhitespaceMode mode = WhitespaceMode::TRIM); /** * Stores the next token in the given token reference, returns true if the @@ -253,7 +282,7 @@ public: * false if there are no more tokens. */ bool peek(Token &token, const TokenSet &tokens = TokenSet{}, - WhitespaceMode mode = WhitespaceMode::COLLAPSE); + WhitespaceMode mode = WhitespaceMode::TRIM); /** * Consumes the peeked tokens, the read cursor will now be at the position @@ -265,20 +294,6 @@ public: * Resets the peek cursor to the position of the read cursor. */ void resetPeek() { peekCursor = readCursor; } - - /** - * Stores the next text token in the given token reference, returns true if - * the operation was successful (there was indeed a text token), false if - * the next token is not a text token or there were no more tokens. - * - * @param token is an output parameter into which the read token will be - * stored. The TokenId is set to Tokens::Empty if there are no more tokens. - * @param mode is the whitespace mode that should be used when a text token - * is returned. - * @return a string variant with the data if there is any data or a nullptr - * variant if there is no text. - */ - Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE); }; /** @@ -309,8 +324,9 @@ private: * peeked. */ TokenizedDataReaderFork(TokenizedDataReader &parent, - std::shared_ptr<TokenizedDataImpl> impl, - size_t readCursor, size_t peekCursor) + std::shared_ptr<const TokenizedDataImpl> impl, + const TokenizedDataCursor &readCursor, + const TokenizedDataCursor &peekCursor) : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) { } @@ -320,7 +336,7 @@ public: * Commits the read/peek progress to the underlying parent. */ void commit() { parent = *this; } -} +}; } #endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 51787cd..e78b0f4 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -156,6 +156,7 @@ public: return res; } }; + } /* Class Tokenizer */ @@ -229,12 +230,6 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) } else { // Record all incomming characters data.append(c, charStart, charEnd); - - // Special token processing - // TODO: Build a special state machine for this in another class - if (c == '\n') { - data.mark(Tokens::Newline, 1); - } } // Swap the lookups and the nextLookups list diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index 2ddb9c9..74e3f0d 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -28,7 +28,7 @@ #ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ #define _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#include <set> +#include <cstdint> #include <string> #include <vector> |