From 461cedbbde4b1c56ee7bcef597d4752a17864ad9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 11 Dec 2019 00:26:24 +0000 Subject: [PATCH] Avoid temporary files created by warcio due to not knowing the record payload length --- qwarc/warc.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/qwarc/warc.py b/qwarc/warc.py index 48be75f..b247b41 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -100,11 +100,14 @@ class WARC: for r in response.iter_all(): usec = f'{(r.rawRequestTimestamp - int(r.rawRequestTimestamp)):.6f}'[2:] requestDate = time.strftime(f'%Y-%m-%dT%H:%M:%S.{usec}Z', time.gmtime(r.rawRequestTimestamp)) + r.rawRequestData.seek(0, io.SEEK_END) + length = r.rawRequestData.tell() r.rawRequestData.seek(0) requestRecord = self._warcWriter.create_warc_record( str(r.url), 'request', payload = r.rawRequestData, + length = length, warc_headers_dict = { 'WARC-Date': requestDate, 'WARC-IP-Address': r.remoteAddress[0], @@ -112,11 +115,14 @@ class WARC: } ) requestRecordID = requestRecord.rec_headers.get_header('WARC-Record-ID') + r.rawResponseData.seek(0, io.SEEK_END) + length = r.rawResponseData.tell() r.rawResponseData.seek(0) responseRecord = self._warcWriter.create_warc_record( str(r.url), 'response', payload = r.rawResponseData, + length = length, warc_headers_dict = { 'WARC-Date': requestDate, 'WARC-IP-Address': r.remoteAddress[0], @@ -161,10 +167,14 @@ class WARC: for type_, contentType, fn in itertools.chain((('specfile', 'application/x-python', self._specFile),), map(lambda x: ('spec-dependency-file', 'application/octet-stream', x), self._specDependencies.files)): with open(fn, 'rb') as f: + f.seek(0, io.SEEK_END) + length = f.tell() + f.seek(0) record = self._warcWriter.create_warc_record( f'file://{fn}', 'resource', payload = f, + length = length, warc_headers_dict = {'X-QWARC-Type': type_, 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID, 'Content-Type': contentType}, ) self._warcWriter.write_record(record) @@ -180,10 +190,14 @@ class WARC: for handler in rootLogger.handlers: #FIXME: Uses undocumented attribute handlers handler.flush() with open(self._logFilename, 'rb') as fp: + fp.seek(0, io.SEEK_END) + length = fp.tell() + fp.seek(0) record = self._warcWriter.create_warc_record( f'file://{self._logFilename}', 'resource', payload = fp, + length = length, warc_headers_dict = {'X-QWARC-Type': 'log', 'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID}, ) self._warcWriter.write_record(record)