From 01274e461a003cdf73200e6532356f0189fdb9b7 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 11 Nov 2020 01:50:50 +0000 Subject: [PATCH] Prevent constantly moving bytes around for better performance on large chunked records --- warc-tiny | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/warc-tiny b/warc-tiny index 0ec6204..796519b 100755 --- a/warc-tiny +++ b/warc-tiny @@ -162,13 +162,14 @@ def iter_warc(f): else: httpDecompressor = DummyDecompressor() if chunked: + pos = 0 while True: try: - chunkLineEnd = httpBody.index(b'\r\n') + chunkLineEnd = httpBody.index(b'\r\n', pos) except ValueError: print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr) break - chunkLine = httpBody[:chunkLineEnd] + chunkLine = httpBody[pos:chunkLineEnd] if b';' in chunkLine: chunkLength = chunkLine[:chunkLine.index(b';')].strip() else: @@ -181,7 +182,7 @@ def iter_warc(f): break chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength]) yield HTTPBodyChunk(chunk) - httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:] + pos = chunkLineEnd + 2 + chunkLength + 2 else: yield HTTPBodyChunk(httpDecompressor.decompress(httpBody)) else: