From 413435b7fb688f8e0cada81fbec63e80800953da Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 8 Sep 2019 20:45:46 +0000 Subject: [PATCH] Work around warcio not writing the correct WARC-Profile header for revisit records on WARC/1.1 https://github.com/webrecorder/warcio/issues/94 --- qwarc/warc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qwarc/warc.py b/qwarc/warc.py index 13687de..83cc7ce 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -154,6 +154,8 @@ class WARC: 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID, } ) + # Workaround for https://github.com/webrecorder/warcio/issues/94 + responseRecord.rec_headers.replace_header('WARC-Profile', 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest') else: self._dedupeMap[payloadDigest] = (responseRecord.rec_headers.get_header('WARC-Record-ID'), str(r.url), requestDate) self._warcWriter.write_record(requestRecord)