class FrozenFileView:
    '''
    A minimal frozen view of a region of a file object.

    The view fixes the bounds of the file: anything appended to the underlying file object after
    the view was created does not become visible through it. Only seek, tell, and read are implemented.
    All offsets accepted and returned by the view are relative to `begin`.

    Note that seeks and reads affect the position of the underlying file object. The data itself is
    not frozen either; any change made to the underlying file inside [begin, end) is visible in the view.
    '''

    def __init__(self, fp, begin, end):
        '''
        fp: file-like object supporting seek, tell, and read
        begin: int, offset of the first visible byte from the beginning of the file
        end: int, offset just past the last visible byte from the beginning of the file
        '''

        self._fp = fp
        self._begin = begin
        self._end = end

    def seek(self, offset, whence = os.SEEK_SET):
        '''
        Move the position of the view and return the new position relative to the view's beginning.

        whence: os.SEEK_SET (relative to the view's beginning), os.SEEK_CUR (relative to the current
          position), or os.SEEK_END (relative to the view's frozen end). Anything else raises
          NotImplementedError.
        '''

        if whence == os.SEEK_CUR:
            # Relative moves need no translation; only the returned position does.
            return self._fp.seek(offset, os.SEEK_CUR) - self._begin
        if whence == os.SEEK_SET:
            target = self._begin + offset
        elif whence == os.SEEK_END:
            # The frozen end is an absolute offset in the underlying file, so it must be reached
            # with SEEK_SET; seeking relative to the real end of file would overshoot.
            target = self._end + offset
        else:
            raise NotImplementedError
        return self._fp.seek(target, os.SEEK_SET) - self._begin

    def tell(self):
        '''Return the current position relative to the view's beginning.'''

        return self._fp.tell() - self._begin

    def read(self, size = -1):
        '''
        Read up to size units from the current position without ever crossing the frozen end.

        size: int or None; None or a negative value reads everything up to the frozen end.
        Returns an empty result (of the underlying file's type) when positioned at or past the end.
        '''

        curPos = self._fp.tell()
        if curPos < self._begin:
            # The underlying file was moved before the view; clamp to the view's first byte.
            self._fp.seek(self._begin)
            curPos = self._begin

        remaining = self._end - curPos
        if remaining <= 0:
            # read(0) yields an empty value of the right type (bytes or str).
            return self._fp.read(0)
        if size is None or size < 0:
            # Never forward a negative size: the underlying read would run to the real end of file.
            return self._fp.read(remaining)
        return self._fp.read(min(size, remaining))
Always evaluates to False when cast to a bool.''' diff --git a/qwarc/warc.py b/qwarc/warc.py index acb081b..1b1df87 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -231,10 +231,12 @@ class WARC: fp.seek(0, io.SEEK_END) length = fp.tell() fp.seek(0) + # Work around https://github.com/webrecorder/warcio/issues/90 + payload = qwarc.utils.FrozenFileView(fp, 0, length) record = self._warcWriter.create_warc_record( f'file://{self._logFilename}', 'resource', - payload = fp, + payload = payload, length = length, warc_headers_dict = {'X-QWARC-Type': 'log', 'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID}, )