From fcae5902752245e87b9f92583d7ab585555460cb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 22 Feb 2015 23:00:53 +0100 Subject: Implemented SourceOffsetVector -- a class for storing the SourceOffset for each character in a sequence in an fairly efficient manner --- src/core/parser/utils/SourceOffsetVector.cpp | 24 ++++ src/core/parser/utils/SourceOffsetVector.hpp | 165 +++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 src/core/parser/utils/SourceOffsetVector.cpp create mode 100644 src/core/parser/utils/SourceOffsetVector.hpp (limited to 'src/core') diff --git a/src/core/parser/utils/SourceOffsetVector.cpp b/src/core/parser/utils/SourceOffsetVector.cpp new file mode 100644 index 0000000..87ad44a --- /dev/null +++ b/src/core/parser/utils/SourceOffsetVector.cpp @@ -0,0 +1,24 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "SourceOffsetVector.hpp" + +namespace ousia { +// Do nothing here, make sure the header compiles correctly +} + diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp new file mode 100644 index 0000000..d15055a --- /dev/null +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -0,0 +1,165 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file SourceOffsetVector.hpp + * + * Contains a helper class used for storing the SourceOffset of each character + * in a character vector in a compressed manner. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_SOURCE_OFFSET_VECTOR_HPP_ +#define _OUSIA_SOURCE_OFFSET_VECTOR_HPP_ + +#include +#include +#include +#include +#include + +#include + +namespace ousia { + +/** + * Class used for storing the SourceOffset of each character in the buffer using + * a delta compression. + */ +class SourceOffsetVector { +private: + /** + * Type used for representing the length of a character. + */ + using Length = uint8_t; + + /** + * Maximum length that can be represented using the Length type. + */ + static constexpr size_t MAX_LEN = std::numeric_limits::max(); + + /** + * Interval in which the actual offset is stored, expressed as the binary + * logarithm. + */ + static constexpr size_t LOG2_OFFSET_INTERVAL = 6; + + /** + * Interval in which the actual offset is stored. + */ + static constexpr size_t OFFSET_INTERVAL = (1 << LOG2_OFFSET_INTERVAL); + + /** + * Bitmask for the bits that are set to zero when a new offset needs to be + * written. + */ + static constexpr size_t OFFSET_INTERVAL_MASK = OFFSET_INTERVAL - 1; + + /** + * Vector containing the delta compressed offset information. + */ + std::vector lens; + + /** + * Offsets containing the absolute offset for all OFFSET_INTERVAL elements. + */ + std::vector offsets; + + /** + * Last position given as "end" position in the storeOffset() method. + * Used to adapt the length of the previous element in case start and end + * positions do not match. + */ + SourceOffset lastEnd; + +public: + /** + * Default constructor of the SourceOffsetVector class. + */ + SourceOffsetVector() : lastEnd(0) {} + + /** + * Stores the location of a character in this SourceOffsetVector. + * + * @param start is the start location of the chracter in the source file. + * @param end is the end location of the character in the source file. + */ + void storeOffset(SourceOffset start, SourceOffset end) + { + // Make sure (end - start) is smaller than MAX_LEN + assert(end - start < MAX_LEN); + + // Adapt the length of the previous character in case there is a gap + if (!lens.empty() && start > lastEnd) { + lens.back() += start - lastEnd; + } + lastEnd = end; + + // Store an absolute offset every OFFSET_INTERVAL elements + if ((lens.size() & OFFSET_INTERVAL_MASK) == 0) { + offsets.push_back(start); + } + + // Store the length + lens.push_back(end - start); + } + + /** + * Loads the location of the character with the given index. + * + * @param idx is the index of the character for which the location should be + * read. + * @return a pair containing start and end source offset. + */ + std::pair loadOffset(size_t idx) + { + // Special treatment for the last character + const size_t count = lens.size(); + if (idx > 0 && idx == count) { + auto offs = loadOffset(count - 1); + return std::pair(offs.second, + offs.second); + } + + // Calculate the start index in the lens vector and in the offsets + // vector + const size_t offsetIdx = idx >> LOG2_OFFSET_INTERVAL; + const size_t sumStartIdx = idx & ~OFFSET_INTERVAL_MASK; + + // Make sure the index is valid + assert(idx < count); + assert(offsetIdx < offsets.size()); + + // Sum over the length starting with the start offset + SourceOffset start = offsets[offsetIdx]; + for (size_t i = sumStartIdx; i < idx; i++) { + start += lens[i]; + } + return std::pair(start, start + lens[idx]); + } + + /** + * Returns the number of characters for which offsets are stored. + */ + size_t size() { return lens.size(); } +}; +} + +#endif /* _OUSIA_SOURCE_OFFSET_VECTOR_HPP_ */ + -- cgit v1.2.3