Browse Source

Decode HTTP request bodies

master
JustAnotherArchivist 3 years ago
parent
commit
49376db51b
1 changed file with 46 additions and 42 deletions
  1. +46
    -42
      warc-tiny

+ 46
- 42
warc-tiny View File

@@ -67,16 +67,16 @@ class WARCBlockChunk(_DataChunk):
return self._isHttpHeader


class RawHTTPResponseBodyChunk(_DataChunk):
class RawHTTPBodyChunk(_DataChunk):
'''
Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
This is like HTTPResponseBodyChunk but without transfer encoding stripping.
This is like HTTPBodyChunk but without transfer encoding stripping.
'''


class HTTPResponseBodyChunk(_DataChunk):
class HTTPBodyChunk(_DataChunk):
'''
Representing a part of the HTTP response body with transfer encoding stripped.
Representing a part of the HTTP body with transfer encoding stripped.
'''


@@ -123,7 +123,7 @@ def iter_warc(f):
warcContent = buf[:warcContentLength]
buf = buf[warcContentLength + 4:]

# Decode HTTP response if it is one
# Decode HTTP body if appropriate
if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
httpType = 'request'
elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
@@ -146,39 +146,38 @@ def iter_warc(f):

yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
yield WARCBlockChunk(httpBody, isHttpHeader = False)
yield RawHTTPResponseBodyChunk(httpBody)

if httpType == 'response':
# Decode body
if gzipped:
httpDecompressor = GzipDecompressor()
else:
httpDecompressor = DummyDecompressor()
if chunked:
while True:
try:
chunkLineEnd = httpBody.index(b'\r\n')
except ValueError:
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
break
chunkLine = httpBody[:chunkLineEnd]
if b';' in chunkLine:
chunkLength = chunkLine[:chunkLine.index(b';')].strip()
else:
chunkLength = chunkLine.strip()
if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
break
chunkLength = int(chunkLength, base = 16)
if chunkLength == 0:
break
chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
yield HTTPResponseBodyChunk(chunk)
httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
else:
yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody))
yield RawHTTPBodyChunk(httpBody)

# Decode body
if gzipped:
httpDecompressor = GzipDecompressor()
else:
httpDecompressor = DummyDecompressor()
if chunked:
while True:
try:
chunkLineEnd = httpBody.index(b'\r\n')
except ValueError:
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
break
chunkLine = httpBody[:chunkLineEnd]
if b';' in chunkLine:
chunkLength = chunkLine[:chunkLine.index(b';')].strip()
else:
chunkLength = chunkLine.strip()
if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
break
chunkLength = int(chunkLength, base = 16)
if chunkLength == 0:
break
chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
yield HTTPBodyChunk(chunk)
httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
else:
yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
else:
print('Warning: malformed HTTP response in record {}, skipping'.format(recordID), file = sys.stderr)
print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
yield WARCBlockChunk(warcContent)
else:
yield WARCBlockChunk(warcContent)
@@ -253,10 +252,10 @@ class VerifyMode(ProcessMode):
elif type(event) is WARCBlockChunk:
if self._blockDigester:
self._blockDigester.update(event.data)
elif type(event) is HTTPResponseBodyChunk:
elif type(event) is HTTPBodyChunk:
if self._payloadDigester:
self._payloadDigester.update(event.data)
elif type(event) is RawHTTPResponseBodyChunk:
elif type(event) is RawHTTPBodyChunk:
if self._brokenPayloadDigester:
self._brokenPayloadDigester.update(event.data)
elif type(event) is EndOfRecord:
@@ -276,13 +275,18 @@ class VerifyMode(ProcessMode):
class DumpResponsesMode(ProcessMode):
def __init__(self):
self._printEOR = False
self._isResponse = False

def process_event(self, event):
if type(event) is BeginOfRecord:
warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
self._printEOR = False
elif type(event) is HTTPResponseBodyChunk:
self._printEOR = True
sys.stdout.buffer.write(event.data)
elif type(event) is HTTPBodyChunk:
if self._isResponse:
self._printEOR = True
sys.stdout.buffer.write(event.data)
elif type(event) is EndOfRecord:
if self._printEOR:
sys.stdout.buffer.write(b'\r\n')


Loading…
Cancel
Save