summaryrefslogtreecommitdiff
path: root/src/core/common
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-01-24 03:08:16 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-01-24 03:08:16 +0100
commit67d36e699a2852ce471c4d1b8dab5992d6c01a98 (patch)
tree0ef23befe3fa5af9c5d83b3b8934e444366a8575 /src/core/common
parentf819b42057b2baea205569dd808c4fcf2bc4d630 (diff)
Implemented SourceContextReader, added unit tests, implemented SourceContextReader interface in ResourceManager, added LoggerTest
Diffstat (limited to 'src/core/common')
-rw-r--r--src/core/common/SourceContextReader.cpp198
-rw-r--r--src/core/common/SourceContextReader.hpp91
2 files changed, 289 insertions, 0 deletions
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
new file mode 100644
index 0000000..65a6281
--- /dev/null
+++ b/src/core/common/SourceContextReader.cpp
@@ -0,0 +1,198 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "SourceContextReader.hpp"
+
+namespace ousia {
+
+// The first line always starts at byte offset zero. Seeding the cache with
+// this entry also keeps it non-empty, which readContext() relies on when it
+// calls cache.back().
+SourceContextReader::SourceContextReader() : cache(1, 0) {}
+
+// Implementation outline:
+//  1. Use the line start cache to seek the reader to the beginning of a line
+//     that lies in front of the requested range.
+//  2. Read forward until the end of the line containing the end of the range,
+//     counting lines/columns, extending the cache and collecting the bytes
+//     that may become part of the context text in lineBuf.
+//  3. Trim linebreaks from the collected text and, if it is longer than
+//     maxContextLength, cut out a window centered on the range.
+// A default constructed SourceContext is returned whenever the range is
+// invalid, the reader cannot be seeked or the end of the range is not reached.
+SourceContext SourceContextReader::readContext(CharReader &reader,
+                                               const SourceRange &range,
+                                               size_t maxContextLength,
+                                               const std::string &filename)
+{
+    // Abort if the given range is not valid. The code below relies on
+    // end >= start, which range.isValid() is expected to guarantee (I2).
+    if (!range.isValid()) {
+        return SourceContext{};
+    }
+
+    // Set the filename and the range, lines and columns are one-based
+    SourceContext ctx;
+    ctx.startLine = 1;
+    ctx.startColumn = 1;
+    ctx.endLine = 1;
+    ctx.endColumn = 1;
+    ctx.range = range;
+    ctx.filename = filename;
+
+    // Some constants for convenience. lastCacheOffs is the largest line start
+    // offset that was known before this call.
+    const SourceOffset start = range.getStart();
+    const SourceOffset end = range.getEnd();
+    const SourceOffset lastCacheOffs = cache.back();
+
+    // Find the last cached line start that lies strictly below the start
+    // offset and begin reading there. If there is none, read from offset zero.
+    size_t offs = 0;
+    auto it = std::lower_bound(cache.begin(), cache.end(), start);
+    if (it != cache.begin()) {
+        --it;        // Go to the previous entry
+        offs = *it;  // Read the corresponding byte offset
+        // The cache entry with index i holds the start of line i + 1
+        const size_t line = it - cache.begin() + 1;
+        ctx.startLine = line;
+        ctx.endLine = line;
+    }
+
+    // Move the char reader to the specified offset, abort if this did not work
+    // out
+    if (offs != reader.seek(offs)) {
+        return SourceContext{};
+    }
+
+    // TODO: Handle skew introduced by linebreak processing \n\r => \n
+
+    // Read until the end of the line containing the end offset is reached,
+    // track linebreaks in the linebreak cache
+    std::vector<char> lineBuf;
+    size_t lineBufStart = offs;   // Byte offset of lineBuf[0]
+    size_t lastLineStart = offs;  // Start of the line containing "start"
+    char c;
+    while (reader.read(c)) {
+        // Fetch the offset after this character
+        const size_t nextOffs = reader.getOffset();
+
+        // "offs" is the offset of the character that was just read, check
+        // whether the range boundaries were reached
+        const bool reachedStart = offs >= start;
+        const bool reachedEnd = offs >= end;
+
+        // Handle linebreaks and update the linebreak cache
+        if (c == '\n') {
+            // The next line starts at nextOffs. Store it if it lies beyond the
+            // cached region. Comparing nextOffs (and not the offset of the
+            // linebreak itself) makes sure an empty line located exactly at
+            // the last cached line start is not skipped, which would shift
+            // all subsequently cached line numbers by one.
+            if (nextOffs > lastCacheOffs) {
+                cache.push_back(nextOffs);
+            }
+            if (!reachedStart) {
+                // Still in front of the range: a new candidate line begins,
+                // everything recorded so far is not part of the context
+                ctx.startLine++;
+                ctx.startColumn = 1;
+                lineBuf.clear();
+                lineBufStart = nextOffs;
+                lastLineStart = nextOffs;
+            } else {
+                lineBuf.push_back('\n');
+            }
+            if (!reachedEnd) {
+                ctx.endLine++;
+                ctx.endColumn = 1;
+            } else {
+                // The line containing the end of the range is complete, abort
+                break;
+            }
+        } else {
+            // Increment the start and the end column if this is not an
+            // UTF8-continuation byte (bit pattern 10xxxxxx). Note that we
+            // count unicode codepoints not actual characters, which may be
+            // more than one codepoint.
+            if ((c & 0xC0) != 0x80) {
+                if (!reachedStart) {
+                    ctx.startColumn++;
+                }
+                if (!reachedEnd) {
+                    ctx.endColumn++;
+                }
+            }
+
+            // Record all characters when start is reached or at least when
+            // the distance to start is smaller than the maximum context length
+            // TODO: This is suboptimal as parts of lineBuf are thrown away
+            // later. If the given range is really large, this will waste huge
+            // amounts of RAM.
+            if (reachedStart || (start - offs <= maxContextLength)) {
+                if (lineBuf.empty()) {
+                    lineBufStart = offs;
+                }
+                lineBuf.push_back(c);
+            }
+        }
+
+        // Set the new offset
+        offs = nextOffs;
+    }
+
+    // If we did not reach the end or for some reason the lineBufStart is larger
+    // than start (to assure invariant I1 is fulfilled), abort
+    offs = reader.getOffset();
+    if (offs < end || lineBufStart > start) {  // (I1)
+        return SourceContext{};
+    }
+
+    // Calculate a first relative position and length
+    ctx.relPos = start - lineBufStart;  // lineBufStart <= start (I1)
+    ctx.relLen = end - start;           // end >= start (I2)
+
+    // Remove linebreaks at the beginning and the end, but never cut away the
+    // position at which the range starts
+    const std::pair<size_t, size_t> b =
+        Utils::trim(lineBuf, Utils::isLinebreak);
+    ssize_t s = b.first, e = b.second;
+    s = std::min(s, static_cast<ssize_t>(ctx.relPos));
+
+    // Remember the trimmed positions, only continue if the context text did
+    // not entirely consist of linebreaks
+    const ssize_t ts = s, te = e;  // s >= 0, e >= 0, ts >= 0, te >= 0 (I3)
+    if (te > ts) {
+        // Trim the line further if it is longer than the maxContextLength
+        if (static_cast<size_t>(te - ts) > maxContextLength &&
+            maxContextLength != MAX_MAX_CONTEXT_LENGTH) {
+            // Place a window of maxContextLength bytes around the center of
+            // the range. maxContextLength < te - ts, so the cast is safe.
+            const ssize_t half = static_cast<ssize_t>(maxContextLength / 2);
+            const ssize_t center =
+                static_cast<ssize_t>(ctx.relPos + ctx.relLen / 2);
+            s = center - half;
+            e = center + half;
+
+            // Account for rounding error
+            if (static_cast<size_t>(e - s) < maxContextLength) {
+                e++;
+            }
+
+            // Redistribute available characters at the beginning or the end
+            if (s < ts) {
+                e = e + (ts - s);
+                s = ts;  // ts >= 0 => s >= 0 (I3)
+            }
+            if (e > te) {
+                // At most (s - ts) is subtracted => s >= ts >= 0 (I3)
+                s = s - std::min(s - ts, e - te);
+                e = te;  // te >= 0 => e >= 0 (I3)
+            }
+        }
+
+        // Update the relative position and length. If the range is longer
+        // than the window, the excerpt starts inside the range (us >
+        // rangeBegin). The range then starts at the first excerpt byte and
+        // the part in front of the excerpt no longer counts towards its
+        // length. Subtracting blindly would wrap around.
+        const size_t us = static_cast<size_t>(s), ue = static_cast<size_t>(e);
+        const size_t rangeBegin = start - lineBufStart;
+        size_t visibleLen = ctx.relLen;
+        if (us > rangeBegin) {
+            visibleLen -= std::min(visibleLen, us - rangeBegin);
+            ctx.relPos = 0;
+        } else {
+            ctx.relPos = rangeBegin - us;
+        }
+        ctx.relLen = std::min(visibleLen, ue - us);
+
+        // Set the "truncated" flags. The start is also truncated if recording
+        // began after the start of the line (lineBufStart > lastLineStart).
+        ctx.truncatedStart = s > ts || lastLineStart < lineBufStart;
+        ctx.truncatedEnd = e < te;
+
+        // Copy the selected area to the output string. data() + us stays
+        // well-defined even if the excerpt is empty and us equals the size.
+        ctx.text = std::string(lineBuf.data() + us, ue - us);
+    }
+
+    return ctx;
+}
+}
+
diff --git a/src/core/common/SourceContextReader.hpp b/src/core/common/SourceContextReader.hpp
new file mode 100644
index 0000000..35e71b3
--- /dev/null
+++ b/src/core/common/SourceContextReader.hpp
@@ -0,0 +1,91 @@
+/*
+ Ousía
+ Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file SourceContextReader.hpp
+ *
+ * The SourceContextReader class is used to read a SourceContext struct from
+ * a SourcePosition instance and an input stream.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_SOURCE_CONTEXT_READER_HPP_
+#define _OUSIA_SOURCE_CONTEXT_READER_HPP_
+
+#include <string>
+#include <vector>
+#include <limits>
+
+#include "Location.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
+/**
+ * The SourceContextReader extracts SourceContext structures for a given
+ * SourceRange from a CharReader. It maintains a cache of line start offsets
+ * which speeds up repeated context lookups on the same input.
+ */
+class SourceContextReader {
+public:
+    /**
+     * Special value for the maxContextLength parameter of readContext() which
+     * indicates that the length of the context text should not be limited.
+     */
+    static constexpr size_t MAX_MAX_CONTEXT_LENGTH =
+        std::numeric_limits<ssize_t>::max();
+
+    /**
+     * Default constructor. Initializes the internal line start cache with a
+     * single zero entry.
+     */
+    SourceContextReader();
+
+    /**
+     * Returns the context for the given SourceRange, read from the given
+     * CharReader. A default constructed SourceContext is returned if the
+     * given range is invalid, if the CharReader cannot be seeked to the
+     * position at which reading has to begin, or if the end of the range
+     * cannot be reached.
+     *
+     * @param reader is the CharReader instance from which the context should be
+     * read.
+     * @param range describes the range within the source file for which the
+     * context should be extracted.
+     * @param maxContextLength is the maximum length of the text stored in the
+     * returned context. Longer text is cut down to a window around the range.
+     * Pass MAX_MAX_CONTEXT_LENGTH (the default) to disable the limit.
+     * @param filename is the filename that should be stored in the returned
+     * context.
+     * @return a SourceContext instance describing the given range: line and
+     * column of its start and end and the surrounding text, together with the
+     * position and length of the range relative to this text.
+     */
+    SourceContext readContext(CharReader &reader, const SourceRange &range,
+                              size_t maxContextLength = MAX_MAX_CONTEXT_LENGTH,
+                              const std::string &filename = "");
+
+private:
+    /**
+     * Cache containing the byte offsets at which lines start, in ascending
+     * order. The first entry is zero, each further entry is the offset
+     * directly behind a line break.
+     */
+    std::vector<SourceOffset> cache;
+};
+}
+
+#endif /* _OUSIA_SOURCE_CONTEXT_READER_HPP_ */
+