if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
self._blockDigester = hashlib.sha1()
self._recordedBlockDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest')
self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
else:
self._blockDigester = None
self._recordedBlockDigest = None
if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
self._payloadDigester = hashlib.sha1()
self._brokenPayloadDigester = hashlib.sha1()
self._recordedPayloadDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest')
self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
else:
self._payloadDigester = None
self._brokenPayloadDigester = None
@@ -227,17 +258,17 @@ class VerifyMode(ProcessMode):
if self._brokenPayloadDigester:
self._brokenPayloadDigester.update(event.data)
elif type(event) is EndOfRecord:
if self._blockDigester:
if self._recordedBlockDigest != b'sha1:' + base64.b32encode(self._blockDigester.digest()):
print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest, base64.b32encode(self._blockDigester.digest())))
if self._blockDigester and self._recordedBlockDigest:
if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())))
if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
if self._recordedPayloadDigest != b'sha1:' + base64.b32encode(self._payloadDigester.digest()):
if self._recordedPayloadDigest == b'sha1:' + base64.b32encode(self._brokenPayloadDigester.digest()):
if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
if not self._printedBrokenPayloadWarning:
print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding')
self._printedBrokenPayloadWarning = True
else:
print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest, base64.b32encode(self._payloadDigester.digest()), base64.b32encode(self._brokenPayloadDigester.digest())))
print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())))