From 49376db51bba1f843f88bd98e06e3ef004e320a8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 28 May 2020 22:32:37 +0000 Subject: [PATCH] Decode HTTP request bodies --- warc-tiny | 88 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/warc-tiny b/warc-tiny index d4b2525..e03fdcc 100755 --- a/warc-tiny +++ b/warc-tiny @@ -67,16 +67,16 @@ class WARCBlockChunk(_DataChunk): return self._isHttpHeader -class RawHTTPResponseBodyChunk(_DataChunk): +class RawHTTPBodyChunk(_DataChunk): ''' Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding. - This is like HTTPResponseBodyChunk but without transfer encoding stripping. + This is like HTTPBodyChunk but without transfer encoding stripping. ''' -class HTTPResponseBodyChunk(_DataChunk): +class HTTPBodyChunk(_DataChunk): ''' - Representing a part of the HTTP response body with transfer encoding stripped. + Representing a part of the HTTP body with transfer encoding stripped. ''' @@ -123,7 +123,7 @@ def iter_warc(f): warcContent = buf[:warcContentLength] buf = buf[warcContentLength + 4:] - # Decode HTTP response if it is one + # Decode HTTP body if appropriate if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request': httpType = 'request' elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response': @@ -146,39 +146,38 @@ def iter_warc(f): yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True) yield WARCBlockChunk(httpBody, isHttpHeader = False) - yield RawHTTPResponseBodyChunk(httpBody) - - if httpType == 'response': - # Decode body - if gzipped: - httpDecompressor = GzipDecompressor() - else: - httpDecompressor = DummyDecompressor() - if chunked: - while True: - try: - chunkLineEnd = httpBody.index(b'\r\n') - except ValueError: - print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr) - break - chunkLine = httpBody[:chunkLineEnd] - if b';' in chunkLine: - chunkLength = chunkLine[:chunkLine.index(b';')].strip() - else: - chunkLength = chunkLine.strip() - if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'': - print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr) - break - chunkLength = int(chunkLength, base = 16) - if chunkLength == 0: - break - chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength]) - yield HTTPResponseBodyChunk(chunk) - httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:] - else: - yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)) + yield RawHTTPBodyChunk(httpBody) + + # Decode body + if gzipped: + httpDecompressor = GzipDecompressor() + else: + httpDecompressor = DummyDecompressor() + if chunked: + while True: + try: + chunkLineEnd = httpBody.index(b'\r\n') + except ValueError: + print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr) + break + chunkLine = httpBody[:chunkLineEnd] + if b';' in chunkLine: + chunkLength = chunkLine[:chunkLine.index(b';')].strip() + else: + chunkLength = chunkLine.strip() + if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'': + print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr) + break + chunkLength = int(chunkLength, base = 16) + if chunkLength == 0: + break + chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength]) + yield HTTPBodyChunk(chunk) + httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:] + else: + yield HTTPBodyChunk(httpDecompressor.decompress(httpBody)) else: - print('Warning: malformed HTTP response in record {}, skipping'.format(recordID), file = sys.stderr) + print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr) yield WARCBlockChunk(warcContent) else: yield WARCBlockChunk(warcContent) @@ -253,10 +252,10 @@ class VerifyMode(ProcessMode): elif type(event) is WARCBlockChunk: if self._blockDigester: self._blockDigester.update(event.data) - elif type(event) is HTTPResponseBodyChunk: + elif type(event) is HTTPBodyChunk: if self._payloadDigester: self._payloadDigester.update(event.data) - elif type(event) is RawHTTPResponseBodyChunk: + elif type(event) is RawHTTPBodyChunk: if self._brokenPayloadDigester: self._brokenPayloadDigester.update(event.data) elif type(event) is EndOfRecord: @@ -276,13 +275,18 @@ class VerifyMode(ProcessMode): class DumpResponsesMode(ProcessMode): def __init__(self): self._printEOR = False + self._isResponse = False def process_event(self, event): if type(event) is BeginOfRecord: + warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type') + warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type') + self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response' self._printEOR = False - elif type(event) is HTTPResponseBodyChunk: - self._printEOR = True - sys.stdout.buffer.write(event.data) + elif type(event) is HTTPBodyChunk: + if self._isResponse: + self._printEOR = True + sys.stdout.buffer.write(event.data) elif type(event) is EndOfRecord: if self._printEOR: sys.stdout.buffer.write(b'\r\n')