From 70b413f5c1194a5ee772c94245bf7cf8220d0a7f Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 30 Apr 2019 02:19:20 +0000 Subject: [PATCH] Better events: include raw WARC header data and separate HTTP requests into headers and body --- warc-tiny | 90 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 34 deletions(-) diff --git a/warc-tiny b/warc-tiny index 5d9d18c..250ce23 100755 --- a/warc-tiny +++ b/warc-tiny @@ -27,13 +27,18 @@ class NewFile(Event): class BeginOfRecord(Event): - def __init__(self, warcHeaders): + def __init__(self, warcHeaders, rawData): self._warcHeaders = warcHeaders + self._rawData = rawData @property def warcHeaders(self): return self._warcHeaders + @property + def rawData(self): + return self._rawData + class _DataChunk(Event): def __init__(self, data): @@ -48,7 +53,14 @@ class _DataChunk(Event): class WARCBlockChunk(_DataChunk): - pass + def __init__(self, data, isHttpHeader = None): + super().__init__(data) + self._isHttpHeader = isHttpHeader + + @property + def isHttpHeader(self): + # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record + return self._isHttpHeader class RawHTTPResponseBodyChunk(_DataChunk): @@ -70,6 +82,7 @@ class EndOfRecord(Event): def iter_warc(f): # Yields Events + # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either. with gzip.open(f, 'rb') as fp: buf = b'' @@ -91,7 +104,7 @@ def iter_warc(f): warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type') warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length')) warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type') - yield BeginOfRecord(warcHeaders) + yield BeginOfRecord(warcHeaders, warcHeaderBuf) # Read WARC block (and skip CRLFCRLF at the end of the record) if len(buf) < warcContentLength + 4: @@ -105,10 +118,14 @@ def iter_warc(f): warcContent = buf[:warcContentLength] buf = buf[warcContentLength + 4:] - yield WARCBlockChunk(warcContent) - # Decode HTTP response if it is one - if warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType in (b'request', b'response'): #TODO: Support revisit + if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request': + httpType = 'request' + elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response': + httpType = 'response' + else: + httpType = None + if httpType is not None: if b'\r\n\r\n' in warcContent: httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1) @@ -122,38 +139,43 @@ def iter_warc(f): chunked = b'chunked' in transferEncodings gzipped = b'gzip' in transferEncodings + yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True) + yield WARCBlockChunk(httpBody, isHttpHeader = False) yield RawHTTPResponseBodyChunk(httpBody) - # Decode body - if gzipped: - httpDecompressor = GzipDecompressor() - else: - httpDecompressor = DummyDecompressor() - if chunked: - while True: - try: - chunkLineEnd = httpBody.index(b'\r\n') - except ValueError: - print('Error: could not find chunk line end, skipping', file = sys.stderr) - break - chunkLine = httpBody[:chunkLineEnd] - if b';' in chunkLine: - chunkLength = chunkLine[:chunkLine.index(b';')].strip() - else: - chunkLength = chunkLine.strip() - if chunkLength.lstrip(b'0123456789abcdef') != b'': - print('Error: malformed chunk length, skipping', file = sys.stderr) - break - chunkLength = int(chunkLength, base = 16) - if chunkLength == 0: - break - chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength]) - yield HTTPResponseBodyChunk(chunk) - httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:] - else: - yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)[:50]) + if httpType == 'response': + # Decode body + if gzipped: + httpDecompressor = GzipDecompressor() + else: + httpDecompressor = DummyDecompressor() + if chunked: + while True: + try: + chunkLineEnd = httpBody.index(b'\r\n') + except ValueError: + print('Error: could not find chunk line end, skipping', file = sys.stderr) + break + chunkLine = httpBody[:chunkLineEnd] + if b';' in chunkLine: + chunkLength = chunkLine[:chunkLine.index(b';')].strip() + else: + chunkLength = chunkLine.strip() + if chunkLength.lstrip(b'0123456789abcdef') != b'': + print('Error: malformed chunk length, skipping', file = sys.stderr) + break + chunkLength = int(chunkLength, base = 16) + if chunkLength == 0: + break + chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength]) + yield HTTPResponseBodyChunk(chunk) + httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:] + else: + yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)[:50]) else: print('Warning: malformed HTTP response, skipping', file = sys.stderr) + else: + yield WARCBlockChunk(warcContent) yield EndOfRecord()