Browse Source

Raise an error when verification fails

master
JustAnotherArchivist 10 months ago
parent
commit
828dae2597
1 changed files with 43 additions and 4 deletions
  1. +43
    -4
      warc-tiny

+ 43
- 4
warc-tiny View File

@@ -51,7 +51,7 @@ class Event:
pass pass




class NewFile(Event):
class FileEvent(Event):
def __init__(self, filename): def __init__(self, filename):
self._filename = filename self._filename = filename


@@ -60,6 +60,10 @@ class NewFile(Event):
return self._filename return self._filename




class NewFile(FileEvent):
    """File event signalling that processing of a new file is starting.

    Carries no state of its own beyond what FileEvent provides. VerifyMode
    reacts to it by resetting its per-file flags.
    """


class BeginOfRecord(Event): class BeginOfRecord(Event):
def __init__(self, warcHeaders, rawData): def __init__(self, warcHeaders, rawData):
self._warcHeaders = warcHeaders self._warcHeaders = warcHeaders
@@ -123,6 +127,21 @@ class EndOfRecord(Event):
pass pass




class WARCParsingIssue(enum.Enum):
    """Kinds of problems that iter_warc can report while parsing a WARC."""

    # The buffer ended before the record's declared content length (plus the
    # 4 trailing bytes after the block) was available.
    TRUNCATED_FILE = enum.auto()
    # The HTTP request/response inside a record could not be parsed: missing
    # chunk line end, malformed chunk length, or malformed HTTP message.
    MALFORMED_HTTP_RECORD = enum.auto()


class WARCParsingIssueEvent(Event):
    """Event reporting a problem encountered while parsing a WARC.

    Attributes:
        issue: The WARCParsingIssue member describing the kind of problem.
        message: Optional human-readable detail; None when the issue kind
            is reported without further detail (as for truncated files).
    """

    def __init__(self, issue, message = None):
        self.issue = issue
        self.message = message


class EndOfFile(FileEvent):
    """File event signalling that all events of a file have been processed.

    Carries no state of its own beyond what FileEvent provides. VerifyMode
    uses it as the point at which accumulated verification failures are
    turned into a VerificationError.
    """


@contextlib.contextmanager @contextlib.contextmanager
def open_warc(f): def open_warc(f):
if hasattr(f, 'read'): if hasattr(f, 'read'):
@@ -169,6 +188,7 @@ def iter_warc(f):
pass pass
if len(buf) < warcContentLength + 4: if len(buf) < warcContentLength + 4:
print('Error: truncated WARC', file = sys.stderr) print('Error: truncated WARC', file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.TRUNCATED_FILE)
break break
warcContent = buf[:warcContentLength] warcContent = buf[:warcContentLength]
buf = buf[warcContentLength + 4:] buf = buf[warcContentLength + 4:]
@@ -210,7 +230,9 @@ def iter_warc(f):
try: try:
chunkLineEnd = httpBody.index(b'\r\n', pos) chunkLineEnd = httpBody.index(b'\r\n', pos)
except ValueError: except ValueError:
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
message = 'could not find chunk line end in record {}'.format(recordID)
print('Error: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
break break
chunkLine = httpBody[pos:chunkLineEnd] chunkLine = httpBody[pos:chunkLineEnd]
if b';' in chunkLine: if b';' in chunkLine:
@@ -218,7 +240,9 @@ def iter_warc(f):
else: else:
chunkLength = chunkLine.strip() chunkLength = chunkLine.strip()
if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'': if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
message = 'malformed chunk length {!r} in record {}'.format(chunkLength, recordID)
print('Error: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
break break
chunkLength = int(chunkLength, base = 16) chunkLength = int(chunkLength, base = 16)
if chunkLength == 0: if chunkLength == 0:
@@ -229,7 +253,9 @@ def iter_warc(f):
else: else:
yield HTTPBodyChunk(httpDecompressor.decompress(httpBody)) yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
else: else:
print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
message = 'malformed HTTP request or response in record {}'.format(recordID)
print('Warning: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
yield WARCBlockChunk(warcContent) yield WARCBlockChunk(warcContent)
else: else:
yield WARCBlockChunk(warcContent) yield WARCBlockChunk(warcContent)
@@ -267,6 +293,10 @@ class HexDigest(Digest):
return (digest if digest else self._digest).hex() return (digest if digest else self._digest).hex()




class VerificationError(Exception):
    """Raised when verifying a WARC file found one or more problems.

    VerifyMode.process_event raises this on an EndOfFile event if a parsing
    issue or a digest mismatch was recorded for that file.
    """


class VerifyMode(ProcessMode): class VerifyMode(ProcessMode):
def __init__(self): def __init__(self):
self._blockDigester = None self._blockDigester = None
@@ -275,6 +305,7 @@ class VerifyMode(ProcessMode):
self._brokenPayloadDigester = None self._brokenPayloadDigester = None
self._recordedPayloadDigest = None self._recordedPayloadDigest = None
self._printedBrokenPayloadWarning = False self._printedBrokenPayloadWarning = False
self._verificationFailed = False


def parse_digest(self, digest): def parse_digest(self, digest):
if not digest.startswith(b'sha1:'): if not digest.startswith(b'sha1:'):
@@ -289,6 +320,7 @@ class VerifyMode(ProcessMode):
def process_event(self, event): def process_event(self, event):
if type(event) is NewFile: if type(event) is NewFile:
self._printedBrokenPayloadWarning = False self._printedBrokenPayloadWarning = False
self._verificationFailed = False
elif type(event) is BeginOfRecord: elif type(event) is BeginOfRecord:
if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders): if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
self._blockDigester = hashlib.sha1() self._blockDigester = hashlib.sha1()
@@ -315,10 +347,13 @@ class VerifyMode(ProcessMode):
elif type(event) is RawHTTPBodyChunk: elif type(event) is RawHTTPBodyChunk:
if self._brokenPayloadDigester: if self._brokenPayloadDigester:
self._brokenPayloadDigester.update(event.data) self._brokenPayloadDigester.update(event.data)
elif type(event) is WARCParsingIssueEvent:
self._verificationFailed = True
elif type(event) is EndOfRecord: elif type(event) is EndOfRecord:
if self._blockDigester and self._recordedBlockDigest: if self._blockDigester and self._recordedBlockDigest:
if not self._recordedBlockDigest.equals(self._blockDigester.digest()): if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr) print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
self._verificationFailed = True
if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()): if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()): if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
@@ -327,6 +362,9 @@ class VerifyMode(ProcessMode):
self._printedBrokenPayloadWarning = True self._printedBrokenPayloadWarning = True
else: else:
print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr) print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
self._verificationFailed = True
elif type(event) is EndOfFile and self._verificationFailed:
raise VerificationError('one or more errors encountered while verifying {}'.format(event.filename))




class DumpResponsesMode(ProcessMode): class DumpResponsesMode(ProcessMode):
@@ -546,6 +584,7 @@ def main():
f = sys.stdin.buffer f = sys.stdin.buffer
for event in iter_warc(f): for event in iter_warc(f):
processor.process_event(event) processor.process_event(event)
processor.process_event(EndOfFile(f))
except BrokenPipeError: except BrokenPipeError:
return return




Loading…
Cancel
Save