# Reconstructed from the qwarc patch '"Freeze" log file object before writing
# to WARC to ensure that further log messages aren't picked up'
# (commit dbe1ed71ab4bc20c67bc6e814f8ff8454f10c0ff, qwarc/utils.py).
# This is a workaround for https://github.com/webrecorder/warcio/issues/90
import os


class FrozenFileView:
    '''
    A minimal frozen view for a file object.

    It fixes the bounds of the file, i.e. if something is appended to the
    underlying file object, it does not become visible in the frozen view.
    Only seek, tell, and read are implemented. All positions exposed by the
    view (seek offsets, seek return values, tell) are relative to `begin`.

    Note that seeks and reads will affect the underlying file object. The
    actual data is not really frozen either, and any changes on the
    underlying file object will affect the frozen view as well.

    Usage in the originating patch (qwarc/warc.py): the log file's length is
    measured by seeking to its end, the file is rewound, and
    `FrozenFileView(fp, 0, length)` is passed as the WARC record payload
    instead of `fp` itself.
    '''

    def __init__(self, fp, begin, end):
        '''
        fp: file-like object
        begin: int, offset from beginning of the file
        end: int, offset from beginning of the file
        '''

        self._fp = fp
        self._begin = begin
        self._end = end

    def seek(self, offset, whence=os.SEEK_SET):
        '''
        Move the cursor of the underlying file object and return the new
        position relative to the start of the view.

        offset: int, interpreted according to whence
        whence: os.SEEK_SET (relative to the view's begin), os.SEEK_CUR
            (relative to the current position), or os.SEEK_END (relative to
            the view's frozen end)

        Raises NotImplementedError for any other whence value.
        '''

        if whence == os.SEEK_SET:
            self._fp.seek(self._begin + offset, os.SEEK_SET)
        elif whence == os.SEEK_CUR:
            self._fp.seek(offset, os.SEEK_CUR)
        elif whence == os.SEEK_END:
            # self._end is an absolute offset, so the target has to be applied
            # from the start of the underlying file. Passing SEEK_END through
            # would measure from the real (possibly grown) end of the file and
            # overshoot the frozen bound.
            self._fp.seek(self._end + offset, os.SEEK_SET)
        else:
            raise NotImplementedError
        # Report the position in view coordinates so it agrees with tell().
        return self.tell()

    def tell(self):
        '''Return the current position relative to the start of the view.'''

        return self._fp.tell() - self._begin

    def read(self, size=-1):
        '''
        Read up to size items from the view, never past its frozen end.

        size: int or None; None or any negative value reads everything up to
            the frozen end.

        If the underlying cursor is before the view's begin, it is moved to
        begin first. At or beyond the frozen end, an empty result of the
        underlying file's own type is returned.
        '''

        position = self._fp.tell()
        if position < self._begin:
            self._fp.seek(self._begin)
            position = self._begin

        remaining = self._end - position
        if remaining <= 0:
            # read(0) yields an empty bytes/str matching the underlying mode.
            return self._fp.read(0)

        if size is None or size < 0:
            return self._fp.read(remaining)
        return self._fp.read(min(size, remaining))