Diffstat (limited to 'src/core/parser/utils/TokenizedData.cpp')
-rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 361
1 file changed, 295 insertions, 66 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp
index fc7bfaf..d8a8b37 100644
--- a/src/core/parser/utils/TokenizedData.cpp
+++ b/src/core/parser/utils/TokenizedData.cpp
@@ -26,6 +26,11 @@
 #include "TokenizedData.hpp"

 namespace ousia {
+/**
+ * Maximum token length.
+ */
+constexpr TokenLength MaxTokenLength = std::numeric_limits<TokenLength>::max();
+
 namespace {
 /**
  * Structure used to represent the position of a token in the internal
@@ -48,6 +53,11 @@ struct TokenMark {
 	TokenLength len;

 	/**
+	 * Specifies whether the token is special or not.
+	 */
+	bool special;
+
+	/**
 	 * Constructor of the TokenMark structure, initializes all members with the
 	 * given values.
 	 *
@@ -55,9 +65,10 @@ struct TokenMark {
	 * @param bufStart is the start position of the TokenMark in the internal
	 * character buffer.
	 * @param len is the length of the token.
+	 * @param special modifies the sort order; special tokens are preferred.
	 */
-	TokenMark(TokenId id, size_t bufStart, TokenLength len)
-	    : bufStart(bufStart), id(id), len(len)
+	TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
+	    : bufStart(bufStart), id(id), len(len), special(special)
 	{
 	}

@@ -72,7 +83,8 @@ struct TokenMark {
 	TokenMark(size_t bufStart)
 	    : bufStart(bufStart),
 	      id(Tokens::Empty),
-	      len(std::numeric_limits<TokenLength>::max())
+	      len(MaxTokenLength),
+	      special(true)
 	{
 	}

@@ -86,8 +98,22 @@ struct TokenMark {
 	 */
 	friend bool operator<(const TokenMark &m1, const TokenMark &m2)
 	{
-		return (m1.bufStart < m2.bufStart) ||
-		       (m1.bufStart == m2.bufStart && m1.len > m2.len);
+		// Prefer the mark with the smaller bufStart
+		if (m1.bufStart < m2.bufStart) {
+			return true;
+		}
+
+		// Special handling for marks with the same bufStart
+		if (m1.bufStart == m2.bufStart) {
+			// If exactly one of the two marks is special, return true if
+			// this one is special
+			if (m1.special != m2.special) {
+				return m1.special;
+			}
+			// Otherwise prefer longer marks
+			return m1.len > m2.len;
+		}
+		return false;
 	}
 };
 }
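Note on the new ordering: "special" marks (such as the zero-length Indent and Dedent marks introduced further down) sort before ordinary marks at the same buffer position, so std::lower_bound finds them first. A minimal standalone sketch of the comparison, reduced for illustration (the struct below is a stand-in, not the TokenMark above):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Mark {
        std::size_t bufStart;  // start position in the character buffer
        std::uint16_t len;     // token length, 0 for special marks
        bool special;          // special marks win ties at equal bufStart
    };

    static bool operator<(const Mark &m1, const Mark &m2)
    {
        if (m1.bufStart < m2.bufStart) {
            return true;
        }
        if (m1.bufStart == m2.bufStart) {
            if (m1.special != m2.special) {
                return m1.special;   // special marks sort first
            }
            return m1.len > m2.len;  // then longer marks
        }
        return false;
    }

    int main()
    {
        std::vector<Mark> marks{{10, 2, false}, {10, 0, true}, {4, 1, false}};
        std::sort(marks.begin(), marks.end());
        // Resulting order: {4,1,false}, {10,0,true}, {10,2,false}
        return 0;
    }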
@@ -110,9 +136,9 @@ private:
 	std::vector<char> buf;

 	/**
-	 * Vector containing all token marks.
+	 * Bitset storing the "protected" flag of the character data.
 	 */
-	std::vector<TokenMark> marks;
+	std::vector<bool> protectedChars;

 	/**
 	 * Vector storing all the character offsets efficiently.
@@ -120,9 +146,34 @@ private:
 	SourceOffsetVector offsets;

 	/**
+	 * Vector containing all token marks.
+	 */
+	mutable std::vector<TokenMark> marks;
+
+	/**
+	 * Position of the first linebreak in a sequence of linebreaks.
+	 */
+	size_t firstLinebreak;
+
+	/**
+	 * Current indentation level.
+	 */
+	uint16_t currentIndentation;
+
+	/**
+	 * Last indentation level.
+	 */
+	uint16_t lastIndentation;
+
+	/**
+	 * Number of linebreaks without any content between them.
+	 */
+	uint16_t numLinebreaks;
+
+	/**
 	 * Flag indicating whether the internal "marks" vector is sorted.
 	 */
-	bool sorted;
+	mutable bool sorted;

 public:
 	/**
@@ -132,7 +183,7 @@ public:
	 * @param sourceId is the source identifier that should be used for
	 * constructing the location when returning tokens.
	 */
-	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
+	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }

 	/**
 	 * Appends a complete string to the internal character buffer and extends
 	 *
	 * @param data is the string that should be appended to the buffer.
	 * @param offsStart is the start offset in bytes in the input file.
+	 * @param protect if set to true, the appended characters will not be
+	 * affected by whitespace handling; they will be returned as is.
	 * @return the current size of the internal byte buffer. The returned value
	 * is intended to be used for the "mark" function.
	 */
-	size_t append(const std::string &data, SourceOffset offsStart)
-	{  // Append the data to the internal buffer
-		buf.insert(buf.end(), data.begin(), data.end());
-
-		// Extend the text regions, interpolate the source position (this may
-		// yield incorrect results)
-		const size_t size = buf.size();
-		for (SourceOffset offs = offsStart; offs < offsStart + data.size();
-		     offs++) {
-			offsets.storeOffset(offs, offs + 1);
+	size_t append(const std::string &data, SourceOffset offsStart, bool protect)
+	{
+		for (size_t i = 0; i < data.size(); i++) {
+			if (offsStart != InvalidSourceOffset) {
+				append(data[i], offsStart + i, offsStart + i + 1, protect);
+			} else {
+				append(data[i], InvalidSourceOffset, InvalidSourceOffset,
+				       protect);
+			}
 		}
-
-		return size;
+		return size();
 	}

 	/**
@@ -165,16 +216,86 @@ public:
	 * @param c is the character that should be appended to the buffer.
	 * @param offsStart is the start offset in bytes in the input file.
	 * @param offsEnd is the end offset in bytes in the input file.
+	 * @param protect if set to true, the appended character will not be
+	 * affected by whitespace handling; it will be returned as is.
	 * @return the current size of the internal byte buffer. The returned value
	 * is intended to be used for the "mark" function.
	 */
-	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
+	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
+	              bool protect)
 	{
		// Add the character to the list and store the location of the character
		// in the source file
 		buf.push_back(c);
+		protectedChars.push_back(protect);
 		offsets.storeOffset(offsStart, offsEnd);
-		return buf.size();
+
+		// Insert special tokens
+		const size_t size = buf.size();
+		const bool isWhitespace = Utils::isWhitespace(c);
+		const bool isLinebreak = Utils::isLinebreak(c);
+
+		// Handle linebreaks
+		if (isLinebreak) {
+			// Mark the linebreak character itself as a "Newline" token
+			mark(Tokens::Newline, size - 1, 1, false);
+
+			// The linebreak sequence started at the previous character
+			if (numLinebreaks == 0) {
+				firstLinebreak = size - 1;
+			}
+
+			// Reset the indentation
+			currentIndentation = 0;
+
+			// Increment the number of linebreaks
+			numLinebreaks++;
+
+			const size_t markStart = firstLinebreak;
+			const size_t markLength = size - firstLinebreak;
+
+			// Issue two consecutive linebreaks as a "Paragraph" token
+			if (numLinebreaks == 2) {
+				mark(Tokens::Paragraph, markStart, markLength, false);
+			}
+
+			// Issue three or more consecutive linebreaks as a "Section" token
+			if (numLinebreaks >= 3) {
+				mark(Tokens::Section, markStart, markLength, false);
+			}
+		} else if (isWhitespace) {
+			// Count the whitespace characters at the beginning of the line
+			if (numLinebreaks > 0) {
+				// Implement the UNIX/Python rule for tabs: tabs extend to the
+				// next multiple of eight.
+				if (c == '\t') {
+					currentIndentation = (currentIndentation + 8) & ~7;
+				} else {
+					currentIndentation++;
+				}
+			}
+		}
+
+		// Issue indent and unindent tokens
+		if (!isWhitespace && numLinebreaks > 0) {
+			// Issue a larger indentation than that in the previous line as
+			// an "Indent" token
+			if (currentIndentation > lastIndentation) {
+				mark(Tokens::Indent, size - 1, 0, true);
+			}
+
+			// Issue a smaller indentation than that in the previous line as
+			// a "Dedent" token
+			if (currentIndentation < lastIndentation) {
+				mark(Tokens::Dedent, size - 1, 0, true);
+			}
+
+			// Reset the internal state machine
+			lastIndentation = currentIndentation;
+			numLinebreaks = 0;
+		}
+
+		return size;
 	}
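The indentation bookkeeping above follows the UNIX/Python tab rule: a tab advances the current indentation to the next multiple of eight, so an indentation of 3 followed by a tab yields 8, and 8 yields 16. A standalone sketch of just this arithmetic (illustrative only, not part of the file):

    #include <cassert>
    #include <cstdint>

    // Mirrors the expression (currentIndentation + 8) & ~7 used above.
    static std::uint16_t nextTabStop(std::uint16_t indentation)
    {
        return static_cast<std::uint16_t>((indentation + 8) & ~7);
    }

    int main()
    {
        assert(nextTabStop(0) == 8);
        assert(nextTabStop(3) == 8);
        assert(nextTabStop(7) == 8);
        assert(nextTabStop(8) == 16);
        return 0;
    }

Note also how the same function turns runs of linebreaks into structure: every linebreak is marked as a Newline token, the second consecutive linebreak additionally marks the whole run as a Paragraph token, and a third or further linebreak marks it as a Section token.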

 	/**
@@ -184,11 +305,12 @@
	 * @param bufStart is the start position in the internal buffer. Use the
	 * values returned by append to calculate the start position.
	 * @param len is the length of the token.
+	 * @param special tags the mark as "special", preferring it in the sort order
	 */
-	void mark(TokenId id, size_t bufStart, TokenLength len)
+	void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
 	{
 		// Push the new instance back onto the list
-		marks.emplace_back(id, bufStart, len);
+		marks.emplace_back(id, bufStart, len, special);

 		// Update the sorted flag as soon as more than one element is in the
 		// list
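Hypothetical usage of the append/mark pair through the public TokenizedData interface defined further down (the token id below is an assumption for illustration; real ids are registered with the tokenizer):

    #include "TokenizedData.hpp"

    using namespace ousia;

    static void sketch()
    {
        const TokenId colonId = 42;     // assumed id, normally obtained
                                        // from the tokenizer elsewhere
        TokenizedData data;
        data.append("foo:", 0, false);  // unprotected character data
        data.mark(colonId, 1);          // the trailing ':' becomes a token
    }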
@@ -212,9 +334,13 @@
	 * @return true if a token was returned, false if no more tokens are
	 * available.
	 */
-	bool next(Token &token, WhitespaceMode mode,
-	          const std::unordered_set<TokenId> &tokens, size_t &cursor)
+	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
+	          TokenizedDataCursor &cursor) const
 	{
+		// Some variables for convenient access
+		size_t &bufPos = cursor.bufPos;
+		size_t &markPos = cursor.markPos;
+
 		// Sort the "marks" vector if it has not been sorted yet.
 		if (!sorted) {
 			std::sort(marks.begin(), marks.end());
 		}
@@ -222,10 +348,11 @@

 		// Fetch the next larger TokenMark instance, make sure the token is in
-		// the "enabled" list
-		auto it =
-		    std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
-		while (it != marks.end() && tokens.count(it->id) == 0) {
+		// the "enabled" list and within the buffer range
+		auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
+		                           TokenMark(bufPos));
+		while (it != marks.end() && (tokens.count(it->id) == 0 ||
+		                             it->bufStart + it->len > buf.size())) {
 			it++;
 		}
@@ -236,15 +363,15 @@

 		// Depending on the whitespace mode, fetch all the data between the
 		// cursor position and the calculated end position and return a token
 		// containing that data.
-		if (cursor < end && cursor < buf.size()) {
+		if (bufPos < end && bufPos < buf.size()) {
 			switch (mode) {
 				case WhitespaceMode::PRESERVE: {
 					token = Token(
-					    Tokens::Data, std::string(&buf[cursor], end - cursor),
+					    Tokens::Data, std::string(&buf[bufPos], end - bufPos),
 					    SourceLocation(sourceId,
-					                   offsets.loadOffset(cursor).first,
+					                   offsets.loadOffset(bufPos).first,
 					                   offsets.loadOffset(end).first));
-					cursor = end;
+					bufPos = end;
 					return true;
 				}
 				case WhitespaceMode::TRIM:
 				case WhitespaceMode::COLLAPSE: {
 					size_t stringStart;
 					size_t stringEnd;
 					std::string content;
+					const char *cBuf = &buf[bufPos];
+					auto filter = [cBuf, bufPos, this](size_t i) -> bool {
+						return Utils::isWhitespace(cBuf[i]) &&
+						       !protectedChars[bufPos + i];
+					};
 					if (mode == WhitespaceMode::TRIM) {
-						content = Utils::trim(&buf[cursor], end - cursor,
-						                      stringStart, stringEnd);
+						content = Utils::trim(cBuf, end - bufPos, stringStart,
+						                      stringEnd, filter);
 					} else {
-						content = Utils::collapse(&buf[cursor], end - cursor,
-						                          stringStart, stringEnd);
+						content = Utils::collapse(
+						    cBuf, end - bufPos, stringStart, stringEnd, filter);
 					}

 					// If the resulting string is empty (only whitespaces),
 					// abort
 					if (content.empty()) {
-						cursor = end;
+						bufPos = end;
 						break;
 					}

 					// Calculate the absolute positions and return the token
-					stringStart += cursor;
-					stringEnd += cursor;
+					stringStart += bufPos;
+					stringEnd += bufPos;
 					token = Token(
 					    Tokens::Data, content,
 					    SourceLocation(sourceId,
 					                   offsets.loadOffset(stringStart).first,
 					                   offsets.loadOffset(stringEnd).first));
-					cursor = end;
+					bufPos = end;
 					return true;
 				}
 			}

@@ -286,14 +418,18 @@
 		// If start equals end, we're currently directly at a token
 		// instance. Return this token and advance the cursor to the end of
 		// the token.
-		if (cursor == end && it != marks.end()) {
+		if (bufPos == end && it != marks.end()) {
 			const size_t tokenStart = it->bufStart;
 			const size_t tokenEnd = it->bufStart + it->len;
 			token = Token(
 			    it->id, std::string(&buf[tokenStart], it->len),
 			    SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
 			                   offsets.loadOffset(tokenEnd).first));
-			cursor = tokenEnd;
+
+			// Update the cursor; consume the token by incrementing the mark
+			// position counter
+			bufPos = tokenEnd;
+			markPos = it - marks.begin() + 1;
 			return true;
 		}

@@ -304,11 +440,64 @@
 	}

 	/**
+	 * Resets the TokenizedDataImpl instance to the state it had when it was
+	 * constructed.
+	 */
+	void clear()
+	{
+		buf.clear();
+		protectedChars.clear();
+		offsets.clear();
+		marks.clear();
+		firstLinebreak = 0;
+		currentIndentation = 0;
+		lastIndentation = 0;
+		numLinebreaks = 1;  // Assume the stream starts with a linebreak
+		sorted = true;
+	}
+
+	/**
+	 * Trims the length of the TokenizedDataImpl instance to the given length.
+	 *
+	 * @param length is the number of characters to which the TokenizedData
+	 * instance should be trimmed.
+	 */
+	void trim(size_t length)
+	{
+		if (length < size()) {
+			buf.resize(length);
+			protectedChars.resize(length);
+			offsets.trim(length);
+		}
+	}
+
+	/**
	 * Returns the current size of the internal buffer.
	 *
	 * @return the size of the internal character buffer.
	 */
-	size_t getSize() { return buf.size(); }
+	size_t size() const { return buf.size(); }
+
+	/**
+	 * Returns true if no data is in the data buffer.
+	 *
+	 * @return true if the "buf" instance has no data.
+	 */
+	bool empty() const { return buf.empty(); }
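In the TRIM and COLLAPSE branches above, a character only counts as whitespace if it is not protected, so protected whitespace survives collapsing. The idea in isolation, as a simplified stand-in for Utils::collapse with the filter (not the actual implementation):

    #include <cctype>
    #include <cstddef>
    #include <string>
    #include <vector>

    // Collapse runs of unprotected whitespace to single spaces and trim
    // both ends; protected characters are kept verbatim.
    static std::string collapseWithMask(const std::string &s,
                                        const std::vector<bool> &prot)
    {
        std::string res;
        bool pendingSpace = false;
        for (std::size_t i = 0; i < s.size(); i++) {
            const bool ws =
                std::isspace(static_cast<unsigned char>(s[i])) && !prot[i];
            if (ws) {
                pendingSpace = !res.empty();  // also trims leading whitespace
            } else {
                if (pendingSpace) {
                    res.push_back(' ');
                    pendingSpace = false;
                }
                res.push_back(s[i]);
            }
        }
        return res;  // trailing unprotected whitespace is never emitted
    }

    int main()
    {
        std::vector<bool> prot(8, false);
        prot[4] = true;  // protect the second inner space
        // "  a  b  " collapses to "a  b"; the protected space survives.
        return collapseWithMask("  a  b  ", prot) == "a  b" ? 0 : 1;
    }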
+
+	/**
+	 * Returns the current location of all data in the buffer.
+	 *
+	 * @return the location of the entire data represented by this instance.
+	 */
+	SourceLocation getLocation() const
+	{
+		if (empty()) {
+			return SourceLocation{sourceId};
+		}
+		return SourceLocation{sourceId, offsets.loadOffset(0).first,
+		                      offsets.loadOffset(size()).second};
+	}
 };

 /* Class TokenizedData */

@@ -316,50 +505,90 @@
 TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}

 TokenizedData::TokenizedData(SourceId sourceId)
-    : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
+    : impl(std::make_shared<TokenizedDataImpl>(sourceId))
 {
 }

+TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart,
+                             SourceId sourceId)
+    : TokenizedData(sourceId)
+{
+	append(data, offsStart);
+}
+
 TokenizedData::~TokenizedData() {}

-size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
+size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
+                             bool protect)
 {
-	return impl->append(data, offsStart);
+	return impl->append(data, offsStart, protect);
 }

 size_t TokenizedData::append(char c, SourceOffset offsStart,
-                             SourceOffset offsEnd)
+                             SourceOffset offsEnd, bool protect)
 {
-	return impl->append(c, offsStart, offsEnd);
+	return impl->append(c, offsStart, offsEnd, protect);
 }

 void TokenizedData::mark(TokenId id, TokenLength len)
 {
-	impl->mark(id, impl->getSize() - len, len);
+	impl->mark(id, impl->size() - len, len, false);
 }

 void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
 {
-	impl->mark(id, bufStart, len);
+	impl->mark(id, bufStart, len, false);
 }

-bool TokenizedData::next(Token &token, WhitespaceMode mode)
+void TokenizedData::clear() { impl->clear(); }
+
+void TokenizedData::trim(size_t length) { impl->trim(length); }
+
+size_t TokenizedData::size() const { return impl->size(); }
+
+bool TokenizedData::empty() const { return impl->empty(); }
+
+SourceLocation TokenizedData::getLocation() const
 {
-	return impl->next(token, mode, tokens, cursor);
+	return impl->getLocation();
 }

-bool TokenizedData::text(Token &token, WhitespaceMode mode)
+TokenizedDataReader TokenizedData::reader() const
 {
-	// Copy the current cursor position to not update the actual cursor
-	// position if the operation was not successful
-	size_t cursorCopy = cursor;
-	if (!impl->next(token, mode, tokens, cursorCopy) ||
-	    token.id != Tokens::Data) {
-		return false;
-	}
+	return TokenizedDataReader(impl, TokenizedDataCursor(),
+	                           TokenizedDataCursor());
+}
+
+/* Class TokenizedDataReader */

-	// There is indeed a text token, update the internal cursor position
-	cursor = cursorCopy;
-	return true;
+TokenizedDataReader::TokenizedDataReader(
+    std::shared_ptr<const TokenizedDataImpl> impl,
+    const TokenizedDataCursor &readCursor,
+    const TokenizedDataCursor &peekCursor)
+    : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
+{
+}
+
+TokenizedDataReaderFork TokenizedDataReader::fork()
+{
+	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
+}
+
+bool TokenizedDataReader::atEnd() const
+{
+	return readCursor.bufPos >= impl->size();
+}
+
+bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
+                               WhitespaceMode mode)
+{
+	peekCursor = readCursor;
+	return impl->next(token, mode, tokens, readCursor);
+}
+
+bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
+                               WhitespaceMode mode)
+{
+	return impl->next(token, mode, tokens, peekCursor);
 }
 }
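The reader introduced here keeps all iteration state in two cursors: read consumes tokens and resets the peek cursor to the new read position, peek runs ahead without consuming anything, and fork creates a TokenizedDataReaderFork starting at the same position so that lookahead parsing does not disturb this reader. A hypothetical read loop, assuming the declarations in TokenizedData.hpp match the definitions above:

    #include "TokenizedData.hpp"

    using namespace ousia;

    static void consume(const TokenizedData &data, const TokenSet &enabled)
    {
        TokenizedDataReader reader = data.reader();
        Token token;
        while (reader.read(token, enabled, WhitespaceMode::COLLAPSE)) {
            // token.id is Tokens::Data for text segments, otherwise one
            // of the enabled token ids.
        }
    }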