Browse Source

Prevent constantly moving bytes around for better performance on large chunked records

master
JustAnotherArchivist 3 years ago
parent
commit
01274e461a
1 changed file with 4 additions and 3 deletions
  1. +4
    -3
      warc-tiny

+ 4
- 3
warc-tiny View File

@@ -162,13 +162,14 @@ def iter_warc(f):
else:
httpDecompressor = DummyDecompressor()
if chunked:
pos = 0
while True:
try:
chunkLineEnd = httpBody.index(b'\r\n')
chunkLineEnd = httpBody.index(b'\r\n', pos)
except ValueError:
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
break
chunkLine = httpBody[:chunkLineEnd]
chunkLine = httpBody[pos:chunkLineEnd]
if b';' in chunkLine:
chunkLength = chunkLine[:chunkLine.index(b';')].strip()
else:
@@ -181,7 +182,7 @@ def iter_warc(f):
break
chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
yield HTTPBodyChunk(chunk)
httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
pos = chunkLineEnd + 2 + chunkLength + 2
else:
yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
else:


Loading…
Cancel
Save