|
|
@@ -109,6 +109,7 @@ def iter_warc(f): |
|
|
|
warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length')) |
|
|
|
warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type') |
|
|
|
yield BeginOfRecord(warcHeaders, warcHeaderBuf) |
|
|
|
recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID') |
|
|
|
|
|
|
|
# Read WARC block (and skip CRLFCRLF at the end of the record) |
|
|
|
if len(buf) < warcContentLength + 4: |
|
|
@@ -158,7 +159,7 @@ def iter_warc(f): |
|
|
|
try: |
|
|
|
chunkLineEnd = httpBody.index(b'\r\n') |
|
|
|
except ValueError: |
|
|
|
print('Error: could not find chunk line end, skipping', file = sys.stderr) |
|
|
|
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr) |
|
|
|
break |
|
|
|
chunkLine = httpBody[:chunkLineEnd] |
|
|
|
if b';' in chunkLine: |
|
|
@@ -166,7 +167,7 @@ def iter_warc(f): |
|
|
|
else: |
|
|
|
chunkLength = chunkLine.strip() |
|
|
|
if chunkLength.lstrip(b'0123456789abcdef') != b'': |
|
|
|
print('Error: malformed chunk length, skipping', file = sys.stderr) |
|
|
|
print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr) |
|
|
|
break |
|
|
|
chunkLength = int(chunkLength, base = 16) |
|
|
|
if chunkLength == 0: |
|
|
@@ -177,7 +178,7 @@ def iter_warc(f): |
|
|
|
else: |
|
|
|
yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)) |
|
|
|
else: |
|
|
|
print('Warning: malformed HTTP response, skipping', file = sys.stderr) |
|
|
|
print('Warning: malformed HTTP response in record {}, skipping'.format(recordID), file = sys.stderr) |
|
|
|
else: |
|
|
|
yield WARCBlockChunk(warcContent) |
|
|
|
yield EndOfRecord() |
|
|
|