Implemented SourceContextReader, added unit tests, implemented SourceContextReader interface in ResourceManager, added LoggerTest

author: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-01-24 03:08:16 +0100
committer: Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> 2015-01-24 03:08:16 +0100
commit: 67d36e699a2852ce471c4d1b8dab5992d6c01a98 (patch)
tree: 0ef23befe3fa5af9c5d83b3b8934e444366a8575 /src/core/common
parent: f819b42057b2baea205569dd808c4fcf2bc4d630 (diff)
2 files changed, 289 insertions, 0 deletions
diff --git a/src/core/common/SourceContextReader.cpp b/src/core/common/SourceContextReader.cpp
new file mode 100644
index 0000000..65a6281
--- /dev/null
+++ b/src/core/common/SourceContextReader.cpp
@@ -0,0 +1,198 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+
+#include <core/common/CharReader.hpp>
+#include <core/common/Utils.hpp>
+
+#include "SourceContextReader.hpp"
+
+namespace ousia {
+
+/**
+ * Creates a reader whose line cache holds exactly one entry: the byte offset
+ * zero, at which the first line starts.
+ */
+SourceContextReader::SourceContextReader()
+{
+	cache.push_back(0);
+}
+
+/**
+ * Builds the SourceContext (line/column of start and end, plus an excerpt of
+ * the surrounding text) for the given range.
+ *
+ * The reader is first moved to the closest cached line start in front of the
+ * range and is then read character by character until the linebreak following
+ * the end of the range. Line starts discovered on the way are appended to the
+ * internal cache, in which the entry with index i holds the byte offset at
+ * which line i + 1 starts.
+ *
+ * @param reader is the CharReader from which the text is read. It is seeked
+ * and read by this function.
+ * @param range is the range for which the context should be extracted.
+ * @param maxContextLength is the maximum length of the extracted text. The
+ * text is not shortened if MAX_MAX_CONTEXT_LENGTH is given.
+ * @param filename is copied into the returned context.
+ * @return the filled SourceContext or a default-constructed SourceContext if
+ * the range is invalid, the reader could not be seeked to the required
+ * position or the end of the range could not be reached.
+ */
+SourceContext SourceContextReader::readContext(CharReader &reader,
+                                               const SourceRange &range,
+                                               size_t maxContextLength,
+                                               const std::string &filename)
+{
+	// Abort if the given range is not valid
+	if (!range.isValid()) {  // (I2)
+		return SourceContext{};
+	}
+
+	// Set the filename and the range
+	SourceContext ctx;
+	ctx.startLine = 1;
+	ctx.startColumn = 1;
+	ctx.endLine = 1;
+	ctx.endColumn = 1;
+	ctx.range = range;
+	ctx.filename = filename;
+
+	// Some constants for convenience
+	const SourceOffset start = range.getStart();
+	const SourceOffset end = range.getEnd();
+	const SourceOffset lastCacheOffs = cache.back();
+
+	// Find the entry in the cache that is just below the given start offset
+	// and jump to this location
+	size_t offs = 0;
+	auto it = std::lower_bound(cache.begin(), cache.end(), start);
+	if (it != cache.begin()) {
+		it--;  // Go to the previous entry
+		offs = *it; // Read the corresponding byte offset
+		// The cache index directly translates to the (one-based) line number
+		size_t line = it - cache.begin() + 1;
+		ctx.startLine = line;
+		ctx.endLine = line;
+	}
+
+	// Move the char reader to the specified offset, abort if this did not work
+	// out
+	if (offs != reader.seek(offs)) {
+		return SourceContext{};
+	}
+
+	// TODO: Handle skew introduced by linebreak processing \n\r => \n
+
+	// Read until the requested byte offset is reached, track linebreaks in the
+	// linebreak cache
+	std::vector<char> lineBuf;
+	size_t lineBufStart = offs;
+	size_t lastLineStart = offs;
+	char c;
+	while (reader.read(c)) {
+		// Fetch the offset after this character
+		const size_t nextOffs = reader.getOffset();
+
+		// Fetch the current offset, check whether start was reached
+		const bool reachedStart = offs >= start;
+		const bool reachedEnd = offs >= end;
+
+		// Handle linebreaks and update the linebreak cache
+		if (c == '\n') {
+			// Update the linebreak cache if the start of the next line is not
+			// yet part of it. The stored value (nextOffs) has to be compared
+			// here, not the offset of the linebreak itself: a linebreak that
+			// is located exactly at the last cached line start (an empty
+			// line) would otherwise never be recorded and all line numbers
+			// derived from later cache entries would be one too small.
+			if (nextOffs > lastCacheOffs) {
+				cache.push_back(nextOffs);
+			}
+			if (!reachedStart) {
+				// Still in front of the range: a new line begins, so restart
+				// the start position bookkeeping and the recorded text
+				ctx.startLine++;
+				ctx.startColumn = 1;
+				lineBuf.clear();
+				lineBufStart = nextOffs;
+				lastLineStart = nextOffs;
+			} else {
+				lineBuf.push_back('\n');
+			}
+			if (!reachedEnd) {
+				ctx.endLine++;
+				ctx.endColumn = 1;
+			} else {
+				// This was the last character, abort
+				break;
+			}
+		} else {
+			// Increment the start and the end column if this is not an
+			// UTF8-continuation byte (note that we count unicode codepoints not
+			// actual characters, which may be more than one codepoint)
+			if (!((c & 0x80) && !(c & 0x40))) {
+				if (!reachedStart) {
+					ctx.startColumn++;
+				}
+				if (!reachedEnd) {
+					ctx.endColumn++;
+				}
+			}
+
+			// Record all characters when start is reached or at least when
+			// the distance to start is smaller than the maximum context length
+			// TODO: This is suboptimal as parts of lineBuf are thrown away
+			// later. If the given range is really large, this will waste huge
+			// amounts of RAM.
+			if (reachedStart || (start - offs <= maxContextLength)) {
+				if (lineBuf.empty()) {
+					lineBufStart = offs;
+				}
+				lineBuf.push_back(c);
+			}
+		}
+
+		// Set the new offset
+		offs = nextOffs;
+	}
+
+	// If we did not reach the end or for some reason the lineBufStart is larger
+	// than start (to assure invariant I1 is fulfilled), abort
+	offs = reader.getOffset();
+	if (offs < end || lineBufStart > start) {  // (I1)
+		return SourceContext{};
+	}
+
+	// Calculate a first relative position and length
+	ctx.relPos = start - lineBufStart;  // lineBufStart <= start (I1)
+	ctx.relLen = end - start;           // end >= start (I2)
+
+	// Remove linebreaks at the beginning and the end, but never cut away the
+	// start of the range itself
+	const std::pair<size_t, size_t> b =
+	    Utils::trim(lineBuf, Utils::isLinebreak);
+	ssize_t s = b.first, e = b.second;
+	s = std::min(s, static_cast<ssize_t>(ctx.relPos));
+
+	// Remember the trimmed positions, only continue if the context text did
+	// not entirely consist of linebreaks
+	const ssize_t ts = s, te = e;  // s >= 0, e >= 0, ts >= 0, te >= 0 (I3)
+	if (te > ts) {
+		// Trim the line further if it is longer than the maxContextLength
+		if (static_cast<size_t>(te - ts) > maxContextLength &&
+		    maxContextLength != MAX_MAX_CONTEXT_LENGTH) {
+			// Center the window of maxContextLength characters around the
+			// middle of the range
+			ssize_t c = (ctx.relPos + ctx.relLen / 2);
+			s = c - maxContextLength / 2;
+			e = c + maxContextLength / 2;
+
+			// Account for rounding error
+			if (static_cast<size_t>(e - s) < maxContextLength) {
+				e++;
+			}
+
+			// Redistribute available characters at the beginning or the end
+			if (s < ts) {
+				e = e + (ts - s);
+				s = ts;  // ts >= 0 => s >= 0 (I3)
+			}
+			if (e > te) {
+				s = s - std::min(s - ts, e - te);  // subtrahend <= s - ts
+				                                   // => s >= ts >= 0 (I3)
+				e = te;                            // te >= 0 => e >= 0 (I3)
+			}
+		}
+
+		// Update the relative position and length, set the "truncated" flags
+		// NOTE(review): if the range itself is longer than maxContextLength,
+		// s may end up behind the start of the range within lineBuf, in which
+		// case the unsigned subtraction below would wrap around -- confirm
+		// whether callers can trigger this and which relPos is expected then.
+		size_t us = static_cast<size_t>(s), ue = static_cast<size_t>(e);
+		ctx.relPos = start - lineBufStart - us;
+		ctx.relLen = std::min(ctx.relLen, ue - us);
+		ctx.truncatedStart = s > ts || lastLineStart < lineBufStart;
+		ctx.truncatedEnd = e < te;
+
+		// Copy the selected area to the output string
+		ctx.text = std::string{&lineBuf[s], ue - us};
+	}
+
+	return ctx;
+}
+}
+
diff --git a/src/core/common/SourceContextReader.hpp b/src/core/common/SourceContextReader.hpp
new file mode 100644
index 0000000..35e71b3
--- /dev/null
+++ b/src/core/common/SourceContextReader.hpp
@@ -0,0 +1,91 @@
+/*
+    Ousía
+    Copyright (C) 2014, 2015  Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file SourceContextReader.hpp
+ *
+ * The SourceContextReader class is used to read a SourceContext struct from
+ * a SourcePosition instance and an input stream.
+ *
+ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
+ */
+
+#ifndef _OUSIA_SOURCE_CONTEXT_READER_HPP_
+#define _OUSIA_SOURCE_CONTEXT_READER_HPP_
+
+#include <string>
+#include <vector>
+#include <limits>
+
+#include "Location.hpp"
+
+namespace ousia {
+
+// Forward declarations
+class CharReader;
+
+/**
+ * The SourceContextReader extracts SourceContext structures for a SourceRange
+ * from a CharReader. It keeps a cache of line start offsets which is filled
+ * while reading and speeds up repeated context lookups.
+ */
+class SourceContextReader {
+public:
+	/**
+	 * Largest possible context length. Passing this value as maximum context
+	 * length indicates that the extracted context text should not be
+	 * shortened.
+	 */
+	static constexpr size_t MAX_MAX_CONTEXT_LENGTH =
+	    std::numeric_limits<ssize_t>::max();
+
+	/**
+	 * Default constructor. Initializes the internal cache with a single zero
+	 * entry (the byte offset at which the first line starts).
+	 */
+	SourceContextReader();
+
+	/**
+	 * Returns the context for the char reader and the given SourceRange.
+	 * Returns an invalid source context if either the given range is invalid
+	 * or the byte offset described in the SourceRange cannot be reached, for
+	 * example because the CharReader cannot be seeked to the required
+	 * position.
+	 *
+	 * @param reader is the CharReader instance from which the context should be
+	 * read.
+	 * @param range describes the range within the source file for which the
+	 * context should be extracted.
+	 * @param maxContextLength is the maximum number of characters that should
+	 * be stored in the returned context.
+	 * @param filename is the filename that should be stored in the returned
+	 * context.
+	 * @return a SourceContext instance describing the given range or an
+	 * invalid SourceContext in the error cases listed above.
+	 */
+	SourceContext readContext(CharReader &reader, const SourceRange &range,
+	                          size_t maxContextLength = MAX_MAX_CONTEXT_LENGTH,
+	                          const std::string &filename = "");
+
+private:
+	/**
+	 * Cache containing the byte offsets at which the lines found so far
+	 * start. The entry with index i belongs to line i + 1.
+	 */
+	std::vector<SourceOffset> cache;
+};
+}
+
+#endif /* _OUSIA_SOURCE_CONTEXT_READER_HPP_ */
+
author	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-01-24 03:08:16 +0100
committer	Andreas Stöckel <astoecke@techfak.uni-bielefeld.de>	2015-01-24 03:08:16 +0100
commit	67d36e699a2852ce471c4d1b8dab5992d6c01a98 (patch)
tree	0ef23befe3fa5af9c5d83b3b8934e444366a8575 /src/core/common
parent	f819b42057b2baea205569dd808c4fcf2bc4d630 (diff)