/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
/**
* @file SourceOffsetVector.hpp
*
* Contains a helper class used for storing the SourceOffset of each character
* in a character vector in a compressed manner.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_SOURCE_OFFSET_VECTOR_HPP_
#define _OUSIA_SOURCE_OFFSET_VECTOR_HPP_
#include
#include
#include
#include
#include
#include
namespace ousia {
/**
* Class used for storing the SourceOffset of each character in the buffer using
* a delta compression.
*/
class SourceOffsetVector {
private:
/**
* Type used for representing the length of a character.
*/
using Length = uint8_t;
/**
* Maximum length that can be represented using the Length type.
*/
static constexpr size_t MAX_LEN = std::numeric_limits::max();
/**
* Interval in which the actual offset is stored, expressed as the binary
* logarithm.
*/
static constexpr size_t LOG2_OFFSET_INTERVAL = 6;
/**
* Interval in which the actual offset is stored.
*/
static constexpr size_t OFFSET_INTERVAL = (1 << LOG2_OFFSET_INTERVAL);
/**
* Bitmask for the bits that are set to zero when a new offset needs to be
* written.
*/
static constexpr size_t OFFSET_INTERVAL_MASK = OFFSET_INTERVAL - 1;
/**
* Vector containing the delta compressed offset information.
*/
std::vector lens;
/**
* Offsets containing the absolute offset for all OFFSET_INTERVAL elements.
*/
std::vector offsets;
/**
* Last position given as "end" position in the storeOffset() method.
* Used to adapt the length of the previous element in case start and end
* positions do not match.
*/
SourceOffset lastEnd;
public:
/**
* Default constructor of the SourceOffsetVector class.
*/
SourceOffsetVector() : lastEnd(0) {}
/**
* Stores the location of a character in this SourceOffsetVector.
*
* @param start is the start location of the chracter in the source file.
* @param end is the end location of the character in the source file.
*/
void storeOffset(SourceOffset start, SourceOffset end)
{
// Make sure (end - start) is smaller than MAX_LEN
assert(end - start < MAX_LEN);
// Adapt the length of the previous character in case there is a gap
if (!lens.empty() && start > lastEnd) {
lens.back() += start - lastEnd;
}
lastEnd = end;
// Store an absolute offset every OFFSET_INTERVAL elements
if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) {
offsets.push_back(start);
}
// Store the length
lens.push_back(end - start);
}
/**
* Loads the location of the character with the given index.
*
* @param idx is the index of the character for which the location should be
* read.
* @return a pair containing start and end source offset.
*/
std::pair loadOffset(size_t idx) const
{
// Special treatment for the last character
const size_t count = lens.size();
if (idx > 0 && idx == count) {
auto offs = loadOffset(count - 1);
return std::pair(offs.second,
offs.second);
}
// Calculate the start index in the lens vector and in the offsets
// vector
const size_t offsetIdx = idx >> LOG2_OFFSET_INTERVAL;
const size_t sumStartIdx = idx & ~OFFSET_INTERVAL_MASK;
// Make sure the index is valid
assert(idx < count);
assert(offsetIdx < offsets.size());
// Sum over the length starting with the start offset
SourceOffset start = offsets[offsetIdx];
for (size_t i = sumStartIdx; i < idx; i++) {
start += lens[i];
}
return std::pair(start, start + lens[idx]);
}
/**
* Returns the number of characters for which offsets are stored.
*/
size_t size() const { return lens.size(); }
/**
* Trims the length of the TokenizedData instance to the given length.
* Removes all token matches that lie within the trimmed region.
*
* @param length is the number of characters to which the TokenizedData
* instance should be trimmed.
*/
void trim(size_t length) {
if (length < size()) {
lens.resize(length);
offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
if (length > 0) {
lastEnd = loadOffset(length - 1).second;
} else {
lastEnd = 0;
}
}
}
/**
* Resets the SourceOffsetVector to the state it had when it was
* constructed.
*/
void clear() {
lens.clear();
offsets.clear();
lastEnd = 0;
}
};
}
#endif /* _OUSIA_SOURCE_OFFSET_VECTOR_HPP_ */