From cb6cacdc7eade9d4290767bafb7ccf4e935d0fbf Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 1 Mar 2015 13:49:26 +0100
Subject: allowing to store gaps in SourceOffsetVector and fixed bug with trim
 not resetting offsets correctly when the new length is zero

---
 src/core/parser/utils/SourceOffsetVector.hpp | 64 ++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 18 deletions(-)

(limited to 'src/core')

diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index 67bacef..f322a88 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include <unordered_map>
 
 #include 
 
@@ -43,6 +44,9 @@ namespace ousia {
  * a delta compression.
  */
 class SourceOffsetVector {
+public:
+	using OffsPair = std::pair<SourceOffset, SourceOffset>;
+
 private:
 	/**
 	 * Type used for representing the length of a character.
@@ -81,10 +85,13 @@ private:
 	 */
 	std::vector<SourceOffset> offsets;
 
+	/**
+	 * Map used to store discontinuities in the character offsets.
+	 */
+	std::unordered_map<size_t, OffsPair> gaps;
+
 	/**
 	 * Last position given as "end" position in the storeOffset() method.
-	 * Used to adapt the length of the previous element in case start and end
-	 * positions do not match.
 	 */
 	SourceOffset lastEnd;
 
@@ -105,19 +112,22 @@ public:
 		// Make sure (end - start) is smaller than MAX_LEN
 		assert(end - start < MAX_LEN);
 
-		// Adapt the length of the previous character in case there is a gap
-		if (!lens.empty() && start > lastEnd) {
-			lens.back() += start - lastEnd;
-		}
-		lastEnd = end;
-
 		// Store an absolute offset every OFFSET_INTERVAL elements
 		if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) {
 			offsets.push_back(start);
 		}
 
-		// Store the length
-		lens.push_back(end - start);
+		// Adapt the length of the previous character in case there is a gap
+		if (!lens.empty() && start > lastEnd) {
+			// There is a discontinuity, store the given offsets in the "gaps"
+			// map
+			gaps[lens.size()] = OffsPair(start, end);
+			lens.push_back(MAX_LEN);
+		} else {
+			// Store the length
+			lens.push_back(end - start);
+		}
+		lastEnd = end;
 	}
 
 	/**
@@ -127,14 +137,13 @@ public:
 	 * read.
 	 * @return a pair containing start and end source offset.
	 */
-	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const
+	OffsPair loadOffset(size_t idx) const
 	{
 		// Special treatment for the last character
 		const size_t count = lens.size();
 		if (idx > 0 && idx == count) {
 			auto offs = loadOffset(count - 1);
-			return std::pair<SourceOffset, SourceOffset>(offs.second,
-			                                             offs.second);
+			return OffsPair(offs.second, offs.second);
 		}
 
 		// Calculate the start index in the lens vector and in the offsets
@@ -146,12 +155,26 @@ public:
 		assert(idx < count);
 		assert(offsetIdx < offsets.size());
 
+		// If the length of the last character is MAX_LEN, the position is
+		// stored in the "gaps" list
+		if (lens[idx] == MAX_LEN) {
+			auto it = gaps.find(idx);
+			assert(it != gaps.end());
+			return it->second;
+		}
+
 		// Sum over the length starting with the start offset
 		SourceOffset start = offsets[offsetIdx];
 		for (size_t i = sumStartIdx; i < idx; i++) {
-			start += lens[i];
+			if (lens[i] == MAX_LEN) {
+				auto it = gaps.find(i);
+				assert(it != gaps.end());
+				start = it->second.first;
+			} else {
+				start += lens[i];
+			}
 		}
-		return std::pair<SourceOffset, SourceOffset>(start, start + lens[idx]);
+		return OffsPair(start, start + lens[idx]);
 	}
 
 	/**
@@ -166,13 +189,16 @@ public:
 	 * @param length is the number of characters to which the TokenizedData
 	 * instance should be trimmed.
 	 */
-	void trim(size_t length) {
+	void trim(size_t length)
+	{
 		if (length < size()) {
 			lens.resize(length);
-			offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
 			if (length > 0) {
+				offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
 				lastEnd = loadOffset(length - 1).second;
 			} else {
+				offsets.clear();
+				gaps.clear();
 				lastEnd = 0;
 			}
 		}
 	}
@@ -182,9 +208,11 @@ public:
 	 * Resets the SourceOffsetVector to the state it had when it was
 	 * constructed.
 	 */
-	void clear() {
+	void clear()
+	{
 		lens.clear();
 		offsets.clear();
+		gaps.clear();
 		lastEnd = 0;
 	}
 };
--
cgit v1.2.3
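
The following is a hypothetical usage sketch, not part of the commit, illustrating the two behavioural changes in the patch above. It assumes the include path shown below, that SourceOffsetVector is default-constructible, and that storeOffset(), loadOffset(), trim() and size() behave as declared in the patched header; only the assertions about gap handling and trim(0) follow directly from the diff.

// Hypothetical usage sketch (assumed include path and default constructor).
#include <cassert>

#include "core/parser/utils/SourceOffsetVector.hpp"

int main()
{
	ousia::SourceOffsetVector vec;

	// Two adjacent characters: the end of one equals the start of the next.
	vec.storeOffset(0, 1);
	vec.storeOffset(1, 2);

	// A discontinuity: the next character starts at offset 10, not at 2.
	// With this commit the exact pair (10, 12) is recorded in the "gaps"
	// map instead of widening the previous character.
	vec.storeOffset(10, 12);

	// The character before the gap keeps its true extent (1, 2); with the
	// pre-patch code its length was inflated to cover the gap, so its end
	// offset would have been reported as 10.
	auto prev = vec.loadOffset(1);
	assert(prev.first == 1 && prev.second == 2);

	// The character after the gap reports the offsets stored for it.
	auto cur = vec.loadOffset(2);
	assert(cur.first == 10 && cur.second == 12);

	// Trimming to length zero now also clears the absolute offsets and the
	// gaps map; previously trim(0) left one stale entry in "offsets".
	vec.trim(0);
	assert(vec.size() == 0);

	return 0;
}

The sketch mirrors the intent stated in the commit subject: gaps between characters are stored explicitly rather than folded into the previous character's length, and trimming to an empty vector fully resets the internal offset bookkeeping.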