diff --git a/qwarc/warc.py b/qwarc/warc.py index b247b41..8e71ee8 100644 --- a/qwarc/warc.py +++ b/qwarc/warc.py @@ -132,7 +132,7 @@ class WARC: ) payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest') assert payloadDigest is not None - if self._dedupe and responseRecord.payload_length > 0: # Don't "deduplicate" empty responses + if self._dedupe and responseRecord.payload_length > 100: # Don't deduplicate small responses; the additional headers are typically larger than the payload dedupe savings... if payloadDigest in self._dedupeMap: refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest] responseHttpHeaders = responseRecord.http_headers