# True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
return self._isHttpHeader
class RawHTTPResponseBodyChunk(_DataChunk):
@@ -70,6 +82,7 @@ class EndOfRecord(Event):
def iter_warc(f):
# Yields Events
# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
with gzip.open(f, 'rb') as fp:
buf = b''
@@ -91,7 +104,7 @@ def iter_warc(f):
warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
yield BeginOfRecord(warcHeaders)
yield BeginOfRecord(warcHeaders, warcHeaderBuf)
# Read WARC block (and skip CRLFCRLF at the end of the record)
if len(buf) < warcContentLength + 4:
@@ -105,10 +118,14 @@ def iter_warc(f):
warcContent = buf[:warcContentLength]
buf = buf[warcContentLength + 4:]
yield WARCBlockChunk(warcContent)
# Decode the HTTP request or response if the record contains one
if warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType in (b'request', b'response'): #TODO: Support revisit
if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
httpType = 'request'
elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':