/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "SourceOffsetVector.hpp"
#include "TokenizedData.hpp"
namespace ousia {
namespace {
/**
* Structure used to represent the position of a token in the internal
* character buffer.
*/
struct TokenMark {
/**
* Relative position of the token in the buffer.
*/
size_t bufStart;
/**
* TokenId of the associated token.
*/
TokenId id;
/**
* Length of the token.
*/
TokenLength len;
/**
* Constructor of the TokenMark structure, initializes all members with the
* given values.
*
* @param id is the id of the token that is marked here.
* @param bufStart is the start position of the TokenMark in the internal
* character buffer.
* @param len is the length of the token.
*/
TokenMark(TokenId id, size_t bufStart, TokenLength len)
: bufStart(bufStart), id(id), len(len)
{
}
/**
* Creates a dummy TokenMark instance used for comparison purposes. This
* TokenMark will compare to be smaller than an equal TokenMark with
* equivalent start.
*
* @param bufStart is start position of the TokenMark in the internal
* character buffer.
*/
TokenMark(size_t bufStart)
: bufStart(bufStart),
id(Tokens::Empty),
len(std::numeric_limits::max())
{
}
/**
* Operator used for sorting TokenMark instances. They are sorted in such a
* way that the instances with smallest bufStart come first and for equal
* bufStart values instances with the largest length come first.
*
* @param m1 is the left-hand side TokenMark instance of the comparison.
* @param m2 is the right-hand side TokenMark instance of the comparison.
*/
friend bool operator<(const TokenMark &m1, const TokenMark &m2)
{
return (m1.bufStart < m2.bufStart) ||
(m1.bufStart == m2.bufStart && m1.len > m2.len);
}
};
}
/**
* Structure used to hold all the internal data structures that may be
* exchanged between TokenizedData instances.
*/
class TokenizedDataImpl {
private:
/**
* SourceId representing the source file from which the current content is
* being read.
*/
SourceId sourceId;
/**
* Buffer containing the actual character data.
*/
std::vector buf;
/**
* Vector containing all token marks.
*/
std::vector marks;
/**
* Vector storing all the character offsets efficiently.
*/
SourceOffsetVector offsets;
/**
* Flag indicating whether the internal "marks" vector is sorted.
*/
bool sorted;
public:
/**
* Constructor of TokenizedDataImpl. Takes the SourceId that should be used
* for returned tokens.
*
* @param sourceId is the source identifier that should be used for
* constructing the location when returning tokens.
*/
TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {}
/**
* Appends a complete string to the internal character buffer and extends
* the text regions in the regions map.
*
* @param data is the string that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
size_t append(const std::string &data, SourceOffset offsStart)
{ // Append the data to the internal buffer
buf.insert(buf.end(), data.begin(), data.end());
// Extend the text regions, interpolate the source position (this may
// yield incorrect results)
const size_t size = buf.size();
for (SourceOffset offs = offsStart; offs < offsStart + data.size();
offs++) {
offsets.storeOffset(offs, offs + 1);
}
return size;
}
/**
* Appends a single character to the internal character buffer and extends
* the text regions in the regions map.
*
* @param c is the character that should be appended to the buffer.
* @param offsStart is the start offset in bytes in the input file.
* @param offsEnd is the end offset in bytes in the input file.
* @return the current size of the internal byte buffer. The returned value
* is intended to be used for the "mark" function.
*/
size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd)
{
// Add the character to the list and store the location of the character
// in the source file
buf.push_back(c);
offsets.storeOffset(offsStart, offsEnd);
return buf.size();
}
/**
* Stores a token at the given position.
*
* @param id is the token that should be stored.
* @param bufStart is the start position in the internal buffer. Use the
* values returned by append to calculate the start position.
* @param len is the length of the token.
*/
void mark(TokenId id, size_t bufStart, TokenLength len)
{
// Push the new instance back onto the list
marks.emplace_back(id, bufStart, len);
// Update the sorted flag as soon as more than one element is in the
// list
if (marks.size() > 1U) {
sorted = sorted && *(marks.end() - 2) < *(marks.end() - 1);
}
}
/**
* Returns the next token or a text token if no explicit token is available.
* Advances the given cursor to the end of the returned token.
*
* @param token is the Token instance to which the token should be written.
* @param mode is the WhitespaceMode to be used for extracting the text
* cursor.
* @param tokens is a set of enabled tokens. Tokens that are not in this set
* are ignored and returned as text.
* @param cursor is the position in the character buffer from which on the
* next token should be read. The cursor will be updated to the position
* beyond the returned token.
* @return true if a token was returned, false if no more tokens are
* available.
*/
bool next(Token &token, WhitespaceMode mode,
const std::unordered_set &tokens, size_t &cursor)
{
// Sort the "marks" vector if it has not been sorted yet.
if (!sorted) {
std::sort(marks.begin(), marks.end());
sorted = true;
}
// Fetch the next larger TokenMark instance, make sure the token is in
// the "enabled" list
auto it =
std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor));
while (it != marks.end() && tokens.count(it->id) == 0) {
it++;
}
// Calculate the buffer start and end character, based on the returned
// TokenMark instance
const size_t end = (it != marks.end()) ? it->bufStart : buf.size();
// Depending on the whitespace mode, fetch all the data between the
// cursor position and the calculated end position and return a token
// containing that data.
if (cursor < end && cursor < buf.size()) {
switch (mode) {
case WhitespaceMode::PRESERVE: {
token = Token(
Tokens::Data, std::string(&buf[cursor], end - cursor),
SourceLocation(sourceId,
offsets.loadOffset(cursor).first,
offsets.loadOffset(end).first));
cursor = end;
return true;
}
case WhitespaceMode::TRIM:
case WhitespaceMode::COLLAPSE: {
// Calculate the collapsed string and the corresponding
// trimmed region
size_t stringStart;
size_t stringEnd;
std::string content;
if (mode == WhitespaceMode::TRIM) {
content = Utils::trim(&buf[cursor], end - cursor,
stringStart, stringEnd);
} else {
content = Utils::collapse(&buf[cursor], end - cursor,
stringStart, stringEnd);
}
// If the resulting string is empty (only whitespaces),
// abort
if (content.empty()) {
cursor = end;
break;
}
// Calculate the absolute positions and return the token
stringStart += cursor;
stringEnd += cursor;
token = Token(
Tokens::Data, content,
SourceLocation(sourceId,
offsets.loadOffset(stringStart).first,
offsets.loadOffset(stringEnd).first));
cursor = end;
return true;
}
}
}
// If start equals end, we're currently directly at a token
// instance. Return this token and advance the cursor to the end of
// the token.
if (cursor == end && it != marks.end()) {
const size_t tokenStart = it->bufStart;
const size_t tokenEnd = it->bufStart + it->len;
token = Token(
it->id, std::string(&buf[tokenStart], it->len),
SourceLocation(sourceId, offsets.loadOffset(tokenStart).first,
offsets.loadOffset(tokenEnd).first));
cursor = tokenEnd;
return true;
}
// We've failed. There is no token. Only void. Reset token and return
// false.
token = Token();
return false;
}
/**
* Returns the current size of the internal buffer.
*
* @return the size of the internal character buffer.
*/
size_t getSize() { return buf.size(); }
};
/* Class TokenizedData */
TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {}
// Constructs a TokenizedData instance for the given source file. The shared
// implementation object holds all buffers; the read cursor starts at zero.
TokenizedData::TokenizedData(SourceId sourceId)
    : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0)
{
}
TokenizedData::~TokenizedData() {}
// Appends a string to the internal buffer, interpolating per-character source
// offsets starting at offsStart. Returns the new buffer size (for mark()).
size_t TokenizedData::append(const std::string &data, SourceOffset offsStart)
{
	return impl->append(data, offsStart);
}
// Appends a single character with an explicit source range [offsStart,
// offsEnd). Returns the new buffer size (for mark()).
size_t TokenizedData::append(char c, SourceOffset offsStart,
                             SourceOffset offsEnd)
{
	return impl->append(c, offsStart, offsEnd);
}
// Marks a token of the given length that ends at the current end of the
// internal buffer -- the start position is derived from the buffer size.
void TokenizedData::mark(TokenId id, TokenLength len)
{
	impl->mark(id, impl->getSize() - len, len);
}
// Marks a token at an explicit buffer position; bufStart should be a value
// previously returned by append().
void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len)
{
	impl->mark(id, bufStart, len);
}
// Fetches the next token (explicit or text) and advances the internal cursor
// past it. Returns false when no more tokens are available.
bool TokenizedData::next(Token &token, WhitespaceMode mode)
{
	return impl->next(token, mode, tokens, cursor);
}
// Fetches the next token only if it is a text (data) token. On success the
// internal cursor is advanced past the token; on failure (no token, or the
// next token is not a data token) the cursor is left untouched.
bool TokenizedData::text(Token &token, WhitespaceMode mode)
{
	// Probe with a temporary cursor so a failed lookup does not move the
	// real read position.
	size_t probe = cursor;
	const bool hasToken = impl->next(token, mode, tokens, probe);
	if (hasToken && token.id == Tokens::Data) {
		// A text token was found -- commit the advanced position.
		cursor = probe;
		return true;
	}
	return false;
}
}