From 84c9abc3e9762c4486ddc5ca0352a5d697a51987 Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Wed, 25 Feb 2015 23:09:26 +0100
Subject: start of branch, commit log will be rewritten

---
 src/core/parser/utils/SourceOffsetVector.hpp | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'src/core/parser/utils/SourceOffsetVector.hpp')

diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index d15055a..aaebe7d 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -127,7 +127,7 @@ public:
 	 * read.
 	 * @return a pair containing start and end source offset.
 	 */
-	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx)
+	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const
 	{
 		// Special treatment for the last character
 		const size_t count = lens.size();
@@ -157,7 +157,31 @@ public:
 	/**
 	 * Returns the number of characters for which offsets are stored.
 	 */
-	size_t size() { return lens.size(); }
+	size_t size() const { return lens.size(); }
+
+	/**
+	 * Trims the length of the TokenizedData instance to the given length.
+	 * Removes all token matches that lie within the trimmed region.
+	 *
+	 * @param length is the number of characters to which the TokenizedData
+	 * instance should be trimmed.
+	 */
+	void trim(size_t length) {
+		if (length < size()) {
+			lens.resize(length);
+			offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
+		}
+	}
+
+	/**
+	 * Resets the SourceOffsetVector to the state it had when it was
+	 * constructed.
+	 */
+	void clear() {
+		lens.clear();
+		offsets.clear();
+		lastEnd = 0;
+	}
 };
 }
 
--
cgit v1.2.3

From c18790f70beb5f52b00bc1c2b1ded2b252f1998a Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sat, 28 Feb 2015 15:46:55 +0100
Subject: Fixed potential problem in SourceOffsetVector

---
 src/core/parser/utils/SourceOffsetVector.hpp | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'src/core/parser/utils/SourceOffsetVector.hpp')

diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index aaebe7d..67bacef 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -170,6 +170,11 @@ public:
 		if (length < size()) {
 			lens.resize(length);
 			offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
+			if (length > 0) {
+				lastEnd = loadOffset(length - 1).second;
+			} else {
+				lastEnd = 0;
+			}
 		}
 	}
 
--
cgit v1.2.3

From cb6cacdc7eade9d4290767bafb7ccf4e935d0fbf Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sun, 1 Mar 2015 13:49:26 +0100
Subject: allowing to store gaps in SourceOffsetVector and fixed bug with trim
 not resetting offsets correctly when the new length is zero

---
 src/core/parser/utils/SourceOffsetVector.hpp      | 64 ++++++++++++++++-------
 test/core/parser/utils/SourceOffsetVectorTest.cpp |  2 +-
 2 files changed, 47 insertions(+), 19 deletions(-)

(limited to 'src/core/parser/utils/SourceOffsetVector.hpp')

diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp
index 67bacef..f322a88 100644
--- a/src/core/parser/utils/SourceOffsetVector.hpp
+++ b/src/core/parser/utils/SourceOffsetVector.hpp
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 
 #include
 
@@ -43,6 +44,9 @@ namespace ousia {
  * a delta compression.
  */
 class SourceOffsetVector {
+public:
+	using OffsPair = std::pair<SourceOffset, SourceOffset>;
+
 private:
 	/**
 	 * Type used for representing the length of a character.
@@ -81,10 +85,13 @@ private:
 	 */
 	std::vector<SourceOffset> offsets;
 
+	/**
+	 * Map used to store discontinuities in the character offsets.
+	 */
+	std::unordered_map<size_t, OffsPair> gaps;
+
 	/**
 	 * Last position given as "end" position in the storeOffset() method.
-	 * Used to adapt the length of the previous element in case start and end
-	 * positions do not match.
 	 */
 	SourceOffset lastEnd;
 
@@ -105,19 +112,22 @@ public:
 		// Make sure (end - start) is smaller than MAX_LEN
 		assert(end - start < MAX_LEN);
 
-		// Adapt the length of the previous character in case there is a gap
-		if (!lens.empty() && start > lastEnd) {
-			lens.back() += start - lastEnd;
-		}
-		lastEnd = end;
-
 		// Store an absolute offset every OFFSET_INTERVAL elements
 		if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) {
 			offsets.push_back(start);
 		}
 
-		// Store the length
-		lens.push_back(end - start);
+		// Adapt the length of the previous character in case there is a gap
+		if (!lens.empty() && start > lastEnd) {
+			// There is a discontinuity, store the given offsets in the "gaps"
+			// map
+			gaps[lens.size()] = OffsPair(start, end);
+			lens.push_back(MAX_LEN);
+		} else {
+			// Store the length
+			lens.push_back(end - start);
+		}
+		lastEnd = end;
 	}
 
 	/**
@@ -127,14 +137,13 @@ public:
 	 * read.
 	 * @return a pair containing start and end source offset.
 	 */
-	std::pair<SourceOffset, SourceOffset> loadOffset(size_t idx) const
+	OffsPair loadOffset(size_t idx) const
 	{
 		// Special treatment for the last character
 		const size_t count = lens.size();
 		if (idx > 0 && idx == count) {
 			auto offs = loadOffset(count - 1);
-			return std::pair<SourceOffset, SourceOffset>(offs.second,
-			                                             offs.second);
+			return OffsPair(offs.second, offs.second);
 		}
 
 		// Calculate the start index in the lens vector and in the offsets
@@ -146,12 +155,26 @@ public:
 		assert(idx < count);
 		assert(offsetIdx < offsets.size());
 
+		// If the length of the last character is MAX_LEN, the position is
+		// stored in the "gaps" list
+		if (lens[idx] == MAX_LEN) {
+			auto it = gaps.find(idx);
+			assert(it != gaps.end());
+			return it->second;
+		}
+
 		// Sum over the length starting with the start offset
 		SourceOffset start = offsets[offsetIdx];
 		for (size_t i = sumStartIdx; i < idx; i++) {
-			start += lens[i];
+			if (lens[i] == MAX_LEN) {
+				auto it = gaps.find(i);
+				assert(it != gaps.end());
+				start = it->second.first;
+			} else {
+				start += lens[i];
+			}
 		}
-		return std::pair<SourceOffset, SourceOffset>(start, start + lens[idx]);
+		return OffsPair(start, start + lens[idx]);
 	}
 
 	/**
@@ -166,13 +189,16 @@ public:
 	 * @param length is the number of characters to which the TokenizedData
 	 * instance should be trimmed.
	 */
-	void trim(size_t length) {
+	void trim(size_t length)
+	{
 		if (length < size()) {
 			lens.resize(length);
-			offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
 			if (length > 0) {
+				offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1);
 				lastEnd = loadOffset(length - 1).second;
 			} else {
+				offsets.clear();
+				gaps.clear();
 				lastEnd = 0;
 			}
 		}
@@ -182,9 +208,11 @@ public:
 	 * Resets the SourceOffsetVector to the state it had when it was
 	 * constructed.
 	 */
-	void clear() {
+	void clear()
+	{
 		lens.clear();
 		offsets.clear();
+		gaps.clear();
 		lastEnd = 0;
 	}
 };
diff --git a/test/core/parser/utils/SourceOffsetVectorTest.cpp b/test/core/parser/utils/SourceOffsetVectorTest.cpp
index 25a4163..26254f9 100644
--- a/test/core/parser/utils/SourceOffsetVectorTest.cpp
+++ b/test/core/parser/utils/SourceOffsetVectorTest.cpp
@@ -51,7 +51,7 @@ TEST(SourceOffsetVector, gaps)
 	for (size_t i = 0; i < 999; i++) {
 		auto elem = vec.loadOffset(i);
 		EXPECT_EQ(i * 3 + 5, elem.first);
-		EXPECT_EQ((i + 1) * 3 + 5, elem.second);
+		EXPECT_EQ(i * 3 + 7, elem.second);
 	}
 	auto elem = vec.loadOffset(999);
 	EXPECT_EQ(999U * 3 + 5, elem.first);
--
cgit v1.2.3
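
Taken together, the three patches give SourceOffsetVector a const query interface, trim()/clear() for rewinding the stored offsets, and explicit gap storage instead of folding gaps into the previous character's length. The following is a minimal usage sketch, not part of the patch series itself: it assumes a storeOffset(SourceOffset start, SourceOffset end) member as implied by the lastEnd documentation above, that the src/ directory is on the include path, and a C++11 compiler; loadOffset(), trim(), clear(), size() and the ousia namespace are taken directly from the diffs, and the offset pattern mirrors the "gaps" unit test adjusted in the last patch.

// Usage sketch only; storeOffset()'s exact signature is assumed from the
// member documentation shown above, not confirmed by the patches.
#include <cassert>
#include <cstddef>

#include <core/parser/utils/SourceOffsetVector.hpp>

int main()
{
	ousia::SourceOffsetVector vec;

	// Store 1000 characters of length two, each followed by a one-character
	// gap -- the same pattern as the "gaps" unit test.
	for (size_t i = 0; i < 1000; i++) {
		vec.storeOffset(i * 3 + 5, i * 3 + 7);
	}

	// With the gap map introduced in the third patch, each character keeps
	// its own start/end pair instead of having the gap added to the length
	// of the previous character.
	auto elem = vec.loadOffset(42);
	assert(elem.first == 42u * 3 + 5);
	assert(elem.second == 42u * 3 + 7);

	// trim() drops all offsets beyond the given length and restores lastEnd
	// (second patch); clear() resets the instance completely.
	vec.trim(500);
	assert(vec.size() == 500);
	vec.clear();
	assert(vec.size() == 0);
	return 0;
}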