From 08117630b05fd6d14f5271982a9ba63187743fda Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 26 Aug 2019 12:13:27 +0000 Subject: [PATCH] Remove warcinfo record in each data WARC and refer to the process's warcinfo record in the meta WARC instead --- qwarc/warc.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/qwarc/warc.py b/qwarc/warc.py index 2263ff2..f6f9946 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -45,7 +45,6 @@ class WARC: self._setup_logger() self._logFilename = logFilename - self._dataWarcinfoRecordID = None self._metaWarcinfoRecordID = None self._write_meta_warc(self._write_initial_meta_records) @@ -78,7 +77,6 @@ class WARC: self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True, warc_version = '1.1') self._closed = False self._counter += 1 - self._dataWarcinfoRecordID = self._write_warcinfo_record() def _write_warcinfo_record(self): data = { @@ -116,7 +114,7 @@ class WARC: warc_headers_dict = { 'WARC-Date': requestDate, 'WARC-IP-Address': r.remoteAddress[0], - 'WARC-Warcinfo-ID': self._dataWarcinfoRecordID, + 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID, } ) requestRecordID = requestRecord.rec_headers.get_header('WARC-Record-ID') @@ -128,7 +126,7 @@ class WARC: 'WARC-Date': requestDate, 'WARC-IP-Address': r.remoteAddress[0], 'WARC-Concurrent-To': requestRecordID, - 'WARC-Warcinfo-ID': self._dataWarcinfoRecordID, + 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID, } ) payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest') @@ -149,7 +147,7 @@ class WARC: 'WARC-Concurrent-To': requestRecordID, 'WARC-Refers-To': refersToRecordId, 'WARC-Truncated': 'length', - 'WARC-Warcinfo-ID': self._dataWarcinfoRecordID, + 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID, } ) else: