/*
    Ousía
    Copyright (C) 2014  Benjamin Paaßen, Andreas Stöckel

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>

#include "SourceOffsetVector.hpp"
#include "TokenizedData.hpp"

namespace ousia {

/**
 * Maximum token length.
 */
constexpr TokenLength MaxTokenLength = std::numeric_limits<TokenLength>::max();

namespace {
/**
 * Structure used to represent the position of a token in the internal
 * character buffer.
 */
struct TokenMark {
	/**
	 * Relative position of the token in the buffer.
	 */
	size_t bufStart;

	/**
	 * TokenId of the associated token.
	 */
	TokenId id;

	/**
	 * Length of the token.
	 */
	TokenLength len;

	/**
	 * Specifies whether the token is special or not.
	 */
	bool special;

	/**
	 * Constructor of the TokenMark structure, initializes all members with
	 * the given values.
	 *
	 * @param id is the id of the token that is marked here.
	 * @param bufStart is the start position of the TokenMark in the internal
	 * character buffer.
	 * @param len is the length of the token.
	 * @param special modifies the sort order, special tokens are preferred.
	 */
	TokenMark(TokenId id, size_t bufStart, TokenLength len, bool special)
	    : bufStart(bufStart), id(id), len(len), special(special)
	{
	}

	/**
	 * Creates a dummy TokenMark instance used for comparison purposes. This
	 * TokenMark compares as smaller than any real TokenMark with the same
	 * start position, which makes it a suitable lower bound when searching
	 * for marks at or beyond a certain buffer position.
	 *
	 * @param bufStart is the start position of the TokenMark in the internal
	 * character buffer.
	 */
	TokenMark(size_t bufStart)
	    : bufStart(bufStart),
	      id(Tokens::Empty),
	      len(MaxTokenLength),
	      special(true)
	{
	}

	/**
	 * Operator used for sorting TokenMark instances. They are sorted in such
	 * a way that instances with the smallest bufStart come first and, for
	 * equal bufStart values, instances with the largest length come first.
	 *
	 * @param m1 is the left-hand side TokenMark instance of the comparison.
	 * @param m2 is the right-hand side TokenMark instance of the comparison.
	 */
	friend bool operator<(const TokenMark &m1, const TokenMark &m2)
	{
		// Prefer the mark with the smaller bufStart
		if (m1.bufStart < m2.bufStart) {
			return true;
		}

		// Special handling for marks with the same bufStart
		if (m1.bufStart == m2.bufStart) {
			// If exactly one of the two marks is special, return true if
			// this one is special
			if (m1.special != m2.special) {
				return m1.special;
			}

			// Otherwise prefer longer marks
			return m1.len > m2.len;
		}
		return false;
	}
};
}
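/*
 * Illustrative sketch (not part of the build) of the TokenMark sort order.
 * "idA" and "idB" are hypothetical token ids:
 *
 *     std::vector<TokenMark> v;
 *     v.emplace_back(idA, 4, 2, false); // short mark at position 4
 *     v.emplace_back(idB, 4, 5, false); // longer mark at the same position
 *     std::sort(v.begin(), v.end());    // idB now comes first (longer wins)
 *
 *     // The dummy mark sorts before all real marks at position 4, so
 *     // lower_bound yields the first mark at or after that position,
 *     // here the idB mark
 *     auto it = std::lower_bound(v.begin(), v.end(), TokenMark(4));
 */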
/**
 * Structure used to hold all the internal data structures that may be
 * exchanged between TokenizedData instances.
 */
class TokenizedDataImpl {
private:
	/**
	 * SourceId representing the source file from which the current content
	 * is being read.
	 */
	SourceId sourceId;

	/**
	 * Buffer containing the actual character data.
	 */
	std::vector<char> buf;

	/**
	 * Bitset storing the "protected" flag of the character data.
	 */
	std::vector<bool> protectedChars;

	/**
	 * Vector storing all the character offsets efficiently.
	 */
	SourceOffsetVector offsets;

	/**
	 * Vector containing all token marks.
	 */
	mutable std::vector<TokenMark> marks;

	/**
	 * Position of the first linebreak in a sequence of linebreaks.
	 */
	size_t firstLinebreak;

	/**
	 * Current indentation level.
	 */
	uint16_t currentIndentation;

	/**
	 * List containing the indentation levels at which an "INDENT" token was
	 * issued. This list is used to issue the right number of "DEDENT"
	 * tokens.
	 */
	std::vector<uint16_t> indentationLevels;

	/**
	 * Number of linebreaks without any content between them.
	 */
	uint16_t numLinebreaks;

	/**
	 * True if the buffer contains at least one non-whitespace character.
	 */
	bool hasNonWhitespaceChar : 1;

	/**
	 * True if the last character in the buffer is a whitespace character.
	 */
	bool lastCharIsWhitespace : 1;

	/**
	 * True if the first character in the buffer is a whitespace character.
	 */
	bool firstCharIsWhitespace : 1;

	/**
	 * Flag indicating whether the internal "marks" vector is sorted.
	 */
	mutable bool sorted : 1;

public:
	/**
	 * Constructor of TokenizedDataImpl. Takes the SourceId that should be
	 * used for returned tokens.
	 *
	 * @param sourceId is the source identifier that should be used for
	 * constructing the location when returning tokens.
	 */
	TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId) { clear(); }

	/**
	 * Appends a complete string to the internal character buffer and extends
	 * the text regions in the regions map.
	 *
	 * @param data is the string that should be appended to the buffer.
	 * @param offsStart is the start offset in bytes in the input file.
	 * @param protect if set to true, the appended characters will not be
	 * affected by whitespace handling, they will be returned as is.
	 * @return the current size of the internal byte buffer. The returned
	 * value is intended to be used for the "mark" function.
	 */
	size_t append(const std::string &data, SourceOffset offsStart,
	              bool protect)
	{
		for (size_t i = 0; i < data.size(); i++) {
			if (offsStart != InvalidSourceOffset) {
				append(data[i], offsStart + i, offsStart + i + 1, protect);
			} else {
				append(data[i], InvalidSourceOffset, InvalidSourceOffset,
				       protect);
			}
		}
		return size();
	}
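	/*
	 * Sketch (illustrative only): append("ab", 10, false) expands to the
	 * single-character overload below as
	 *
	 *     append('a', 10, 11, false);
	 *     append('b', 11, 12, false);
	 *
	 * so every character carries its own one-byte source range, and the
	 * size returned by the last call can be fed to mark().
	 */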
	/**
	 * Appends a single character to the internal character buffer and
	 * extends the text regions in the regions map.
	 *
	 * @param c is the character that should be appended to the buffer.
	 * @param offsStart is the start offset in bytes in the input file.
	 * @param offsEnd is the end offset in bytes in the input file.
	 * @param protect if set to true, the appended character will not be
	 * affected by whitespace handling, it will be returned as is.
	 * @return the current size of the internal byte buffer. The returned
	 * value is intended to be used for the "mark" function.
	 */
	size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd,
	              bool protect)
	{
		// Add the character to the list and store the location of the
		// character in the source file
		buf.push_back(c);
		protectedChars.push_back(protect);
		offsets.storeOffset(offsStart, offsEnd);

		// Fetch information about the current character
		const size_t size = buf.size();
		const bool isWhitespace = Utils::isWhitespace(c);
		const bool isLinebreak = Utils::isLinebreak(c);

		// Update the whitespace flags (the character was already appended
		// above, so the first character corresponds to a buffer size of one)
		hasNonWhitespaceChar = hasNonWhitespaceChar || !isWhitespace;
		if (size == 1 && isWhitespace) {
			firstCharIsWhitespace = true;
		}
		lastCharIsWhitespace = isWhitespace;

		// Handle linebreaks
		if (isLinebreak) {
			// Mark linebreaks as linebreak
			mark(Tokens::Newline, size - 1, 1, false);

			// Remember the start of the linebreak sequence
			if (numLinebreaks == 0) {
				firstLinebreak = size - 1;
			}

			// Reset the indentation
			currentIndentation = 0;

			// Increment the number of linebreaks
			numLinebreaks++;

			const size_t markStart = firstLinebreak;
			const size_t markLength = size - firstLinebreak;

			// Issue two consecutive linebreaks as paragraph token
			if (numLinebreaks == 2) {
				mark(Tokens::Paragraph, markStart, markLength, false);
			}

			// Issue three or more consecutive linebreaks as section token
			if (numLinebreaks >= 3) {
				mark(Tokens::Section, markStart, markLength, false);
			}
		} else if (isWhitespace) {
			// Count the whitespace characters at the beginning of the line
			if (numLinebreaks > 0) {
				// Implement the UNIX/Python rule for tabs: tabs extend to
				// the next multiple of eight
				if (c == '\t') {
					currentIndentation = (currentIndentation + 8) & ~7;
				} else {
					currentIndentation++;
				}
			}
		}

		// Issue indent and dedent tokens
		if (!isWhitespace && numLinebreaks > 0) {
			// Issue a larger indentation than that in the previous line as
			// "Indent" token
			const size_t lastIndentation =
			    indentationLevels.empty() ? 0 : indentationLevels.back();
			if (currentIndentation > lastIndentation) {
				indentationLevels.push_back(currentIndentation);
				mark(Tokens::Indent, size - 1, 0, true);
			}

			// Issue a smaller indentation than that in the previous line as
			// "Dedent" token
			while (!indentationLevels.empty() &&
			       currentIndentation < indentationLevels.back()) {
				indentationLevels.pop_back();
				mark(Tokens::Dedent, firstLinebreak, 0, true);
			}

			// Reset the linebreak counter, ending the current sequence
			numLinebreaks = 0;
		}

		return size;
	}

	/**
	 * Marks the whitespace character at the given buffer position as
	 * protected.
	 *
	 * @param bufPos is the position of the character for which the
	 * "protected" flag should be set.
	 */
	void protect(size_t bufPos) { protectedChars[bufPos] = true; }

	/**
	 * Stores a token at the given position.
	 *
	 * @param id is the token that should be stored.
	 * @param bufStart is the start position in the internal buffer. Use the
	 * values returned by append to calculate the start position.
	 * @param len is the length of the token.
	 * @param special tags the mark as "special", preferring it in the sort
	 * order.
	 */
	void mark(TokenId id, size_t bufStart, TokenLength len, bool special)
	{
		// Push the new instance back onto the list
		marks.emplace_back(id, bufStart, len, special);

		// Update the sorted flag as soon as more than one element is in the
		// list
		if (marks.size() > 1U) {
			sorted = sorted && *(marks.end() - 2) < *(marks.end() - 1);
		}
	}
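	/*
	 * Sketch (illustrative only) of the indentation handling above. Tabs
	 * advance the indentation to the next multiple of eight, any other
	 * whitespace advances it by one:
	 *
	 *     // currentIndentation == 3
	 *     // on '\t': (3 + 8) & ~7 == 8
	 *     // on ' ' : 3 + 1        == 4
	 *
	 * A line indented deeper than the previous one yields one Indent token;
	 * every abandoned level on the way back out yields one Dedent token.
	 */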
	/**
	 * Returns the next token or a text token if no explicit token is
	 * available. Advances the given cursor to the end of the returned token.
	 *
	 * @param token is the Token instance to which the token should be
	 * written.
	 * @param mode is the WhitespaceMode that should be used when extracting
	 * text data.
	 * @param tokens is a set of enabled tokens. Tokens that are not in this
	 * set are ignored and returned as text.
	 * @param cursor is the position in the character buffer from which on
	 * the next token should be read. The cursor will be updated to the
	 * position beyond the returned token.
	 * @param endAtWhitespace if true, only delivers data up to the next
	 * whitespace.
	 * @return true if a token was returned, false if no more tokens are
	 * available.
	 */
	bool next(Token &token, WhitespaceMode mode, const TokenSet &tokens,
	          TokenizedDataCursor &cursor, bool endAtWhitespace) const
	{
		// Some variables for convenient access
		size_t &bufPos = cursor.bufPos;
		size_t &markPos = cursor.markPos;

		// Sort the "marks" vector if it has not been sorted yet
		if (!sorted) {
			std::sort(marks.begin(), marks.end());
			sorted = true;
		}

		// Fetch the next larger TokenMark instance, making sure the token is
		// in the "enabled" set and within the buffer range
		auto it = std::lower_bound(marks.begin() + markPos, marks.end(),
		                           TokenMark(bufPos));
		while (it != marks.end() && (tokens.count(it->id) == 0 ||
		                             it->bufStart + it->len > buf.size())) {
			it++;
		}

		// Calculate the buffer end character based on the returned TokenMark
		// instance
		size_t end = (it != marks.end()) ? it->bufStart : buf.size();

		// Depending on the whitespace mode, fetch all the data between the
		// cursor position and the calculated end position and return a token
		// containing that data
		if (bufPos < end && bufPos < buf.size()) {
			// If endAtWhitespace is set to true, limit copying to the first
			// whitespace character after non-whitespace
			if (endAtWhitespace) {
				bool hasNonWhitespace = false;
				for (size_t i = bufPos; i < end; i++) {
					const bool isWhitespace = Utils::isWhitespace(buf[i]);
					if (isWhitespace) {
						if (hasNonWhitespace) {
							end = i;
							break;
						}
					} else {
						hasNonWhitespace = true;
					}
				}
			}

			switch (mode) {
				case WhitespaceMode::PRESERVE: {
					token = Token(
					    Tokens::Data, std::string(&buf[bufPos], end - bufPos),
					    SourceLocation(sourceId,
					                   offsets.loadOffset(bufPos).first,
					                   offsets.loadOffset(end).first));
					bufPos = end;
					return true;
				}
				case WhitespaceMode::TRIM:
				case WhitespaceMode::COLLAPSE: {
					// Calculate the collapsed string and the corresponding
					// trimmed region. Note that the filter receives indices
					// relative to cBuf, while protectedChars is indexed with
					// absolute buffer positions.
					size_t stringStart;
					size_t stringEnd;
					std::string content;
					const char *cBuf = &buf[bufPos];
					const size_t offs = bufPos;
					auto filter = [cBuf, offs, this](size_t i) -> bool {
						return Utils::isWhitespace(cBuf[i]) &&
						       !protectedChars[offs + i];
					};
					if (mode == WhitespaceMode::TRIM) {
						content = Utils::trim(cBuf, end - bufPos, stringStart,
						                      stringEnd, filter);
					} else {
						content = Utils::collapse(cBuf, end - bufPos,
						                          stringStart, stringEnd,
						                          filter);
					}

					// If the resulting string is empty (only whitespace),
					// abort
					if (content.empty()) {
						bufPos = end;
						break;
					}

					// Calculate the absolute positions and return the token
					stringStart += bufPos;
					stringEnd += bufPos;
					token = Token(
					    Tokens::Data, content,
					    SourceLocation(sourceId,
					                   offsets.loadOffset(stringStart).first,
					                   offsets.loadOffset(stringEnd).first));
					bufPos = end;
					return true;
				}
			}
		}
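		// Sketch (illustrative only): for the region "  a  b  " the three
		// whitespace modes yield
		//   PRESERVE -> "  a  b  "
		//   TRIM     -> "a  b"
		//   COLLAPSE -> "a b"
		// Characters flagged in protectedChars are exempt from the filter
		// above and thus survive trimming and collapsing.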
		// If start equals end, we're currently directly at a token
		// instance. Return this token and advance the cursor to the end of
		// the token.
		if (bufPos == end && it != marks.end()) {
			const size_t tokenStart = it->bufStart;
			const size_t tokenEnd = it->bufStart + it->len;
			token = Token(
			    it->id, std::string(&buf[tokenStart], it->len),
			    SourceLocation(sourceId,
			                   offsets.loadOffset(tokenStart).first,
			                   offsets.loadOffset(tokenEnd).first));

			// Update the cursor, consume the token by incrementing the mark
			// position counter
			bufPos = tokenEnd;
			markPos = it - marks.begin() + 1;
			return true;
		}

		// We've failed. There is no token. Only void. Reset the token and
		// return false.
		token = Token();
		return false;
	}

	/**
	 * Resets the TokenizedDataImpl instance to the state it had when it was
	 * constructed.
	 */
	void clear()
	{
		buf.clear();
		protectedChars.clear();
		offsets.clear();
		marks.clear();
		firstLinebreak = 0;
		currentIndentation = 0;
		indentationLevels.clear();
		numLinebreaks = 1;  // Assume the stream starts with a linebreak
		hasNonWhitespaceChar = false;
		lastCharIsWhitespace = false;
		firstCharIsWhitespace = false;
		sorted = true;
	}

	/**
	 * Trims the length of the TokenizedDataImpl instance to the given
	 * length.
	 *
	 * @param length is the number of characters to which the TokenizedData
	 * instance should be trimmed.
	 */
	void trim(size_t length)
	{
		if (length < size()) {
			buf.resize(length);
			protectedChars.resize(length);
			offsets.trim(length);

			// Recalculate the whitespace flags
			hasNonWhitespaceChar = false;
			lastCharIsWhitespace = false;
			firstCharIsWhitespace = false;
			if (length > 0) {
				firstCharIsWhitespace = Utils::isWhitespace(buf[0]);
				lastCharIsWhitespace = Utils::isWhitespace(buf[length - 1]);
				for (char c : buf) {
					if (!Utils::isWhitespace(c)) {
						hasNonWhitespaceChar = true;
						break;
					}
				}
			}
		}
	}

	/**
	 * Returns the current size of the internal buffer.
	 *
	 * @return the size of the internal character buffer.
	 */
	size_t size() const { return buf.size(); }

	/**
	 * Returns true if no data is in the data buffer.
	 *
	 * @return true if the "buf" instance has no data.
	 */
	bool empty() const { return buf.empty(); }

	/**
	 * Returns the location of all data in the buffer.
	 *
	 * @return the location of the entire data represented by this instance.
	 */
	SourceLocation getLocation() const
	{
		if (empty()) {
			return SourceLocation{sourceId};
		}
		return SourceLocation{sourceId, offsets.loadOffset(0).first,
		                      offsets.loadOffset(size()).second};
	}

	/**
	 * Returns true if at least one non-whitespace character is stored in the
	 * TokenizedData structure.
	 *
	 * @return true if at least one character in the TokenizedData structure
	 * is a non-whitespace character.
	 */
	bool getHasNonWhitespaceChar() const { return hasNonWhitespaceChar; }

	/**
	 * Returns true if the last character of the TokenizedData structure is a
	 * whitespace character.
	 *
	 * @return true if the last character is a whitespace character.
	 */
	bool getLastCharIsWhitespace() const { return lastCharIsWhitespace; }
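	/*
	 * Sketch (illustrative only): trim() rolls back speculative input using
	 * the size values returned by append():
	 *
	 *     size_t before = impl.size();
	 *     impl.append("maybe", InvalidSourceOffset, false);
	 *     impl.trim(before);  // buffer, offsets and flags are restored
	 *
	 * Marks pointing beyond the trimmed buffer are not removed here; next()
	 * skips them because their range exceeds buf.size().
	 */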
	/**
	 * Returns true if the first character of the TokenizedData structure is
	 * a whitespace character.
	 *
	 * @return true if the first character is a whitespace character.
	 */
	bool getFirstCharIsWhitespace() const { return firstCharIsWhitespace; }
};

/* Class TokenizedData */

TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}

TokenizedData::TokenizedData(SourceId sourceId)
    : impl(std::make_shared<TokenizedDataImpl>(sourceId))
{
}

TokenizedData::TokenizedData(const std::string &data, SourceOffset offsStart,
                             SourceId sourceId)
    : TokenizedData(sourceId)
{
	append(data, offsStart);
}

TokenizedData::~TokenizedData() {}

size_t TokenizedData::append(const std::string &data, SourceOffset offsStart,
                             bool protect)
{
	return impl->append(data, offsStart, protect);
}

size_t TokenizedData::append(char c, SourceOffset offsStart,
                             SourceOffset offsEnd, bool protect)
{
	return impl->append(c, offsStart, offsEnd, protect);
}

void TokenizedData::mark(TokenId id, TokenLength len)
{
	impl->mark(id, impl->size() - len, len, false);
}

void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
	impl->mark(id, bufStart, len, false);
}

void TokenizedData::clear() { impl->clear(); }

void TokenizedData::trim(size_t length) { impl->trim(length); }

size_t TokenizedData::size() const { return impl->size(); }

bool TokenizedData::empty() const { return impl->empty(); }

SourceLocation TokenizedData::getLocation() const
{
	return impl->getLocation();
}

TokenizedDataReader TokenizedData::reader() const
{
	return TokenizedDataReader(impl, TokenizedDataCursor(),
	                           TokenizedDataCursor());
}

void TokenizedData::protect(size_t bufPos) { impl->protect(bufPos); }

bool TokenizedData::hasNonWhitespaceChar() const
{
	return impl->getHasNonWhitespaceChar();
}

bool TokenizedData::lastCharIsWhitespace() const
{
	return impl->getLastCharIsWhitespace();
}

bool TokenizedData::firstCharIsWhitespace() const
{
	return impl->getFirstCharIsWhitespace();
}

/* Class TokenizedDataReader */

TokenizedDataReader::TokenizedDataReader(
    std::shared_ptr<const TokenizedDataImpl> impl,
    const TokenizedDataCursor &readCursor,
    const TokenizedDataCursor &peekCursor)
    : impl(impl), readCursor(readCursor), peekCursor(peekCursor)
{
}

TokenizedDataReaderFork TokenizedDataReader::fork()
{
	return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor);
}

bool TokenizedDataReader::atEnd() const
{
	return readCursor.bufPos >= impl->size();
}

bool TokenizedDataReader::read(Token &token, const TokenSet &tokens,
                               WhitespaceMode mode, bool endAtWhitespace)
{
	peekCursor = readCursor;
	return impl->next(token, mode, tokens, readCursor, endAtWhitespace);
}

bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens,
                               WhitespaceMode mode, bool endAtWhitespace)
{
	return impl->next(token, mode, tokens, peekCursor, endAtWhitespace);
}
}
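/*
 * Usage sketch (illustrative, not part of the build). "myToken" is a
 * hypothetical TokenId registered elsewhere, "sourceId" a valid SourceId:
 *
 *     TokenizedData data(sourceId);
 *     data.append("word *", 0, false);
 *     data.mark(myToken, 1);  // marks the trailing "*"
 *
 *     TokenizedDataReader reader = data.reader();
 *     Token token;
 *     while (reader.read(token, TokenSet{myToken},
 *                        WhitespaceMode::COLLAPSE, false)) {
 *         // First iteration: Tokens::Data "word"; second: myToken "*"
 *     }
 */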