From 01274e461a003cdf73200e6532356f0189fdb9b7 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Wed, 11 Nov 2020 01:50:50 +0000
Subject: [PATCH] Avoid repeatedly re-slicing the HTTP body for better
 performance on large chunked records

---
 warc-tiny | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/warc-tiny b/warc-tiny
index 0ec6204..796519b 100755
--- a/warc-tiny
+++ b/warc-tiny
@@ -162,13 +162,14 @@ def iter_warc(f):
 					else:
 						httpDecompressor = DummyDecompressor()
 					if chunked:
+						pos = 0
 						while True:
 							try:
-								chunkLineEnd = httpBody.index(b'\r\n')
+								chunkLineEnd = httpBody.index(b'\r\n', pos)
 							except ValueError:
 								print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
 								break
-							chunkLine = httpBody[:chunkLineEnd]
+							chunkLine = httpBody[pos:chunkLineEnd]
 							if b';' in chunkLine:
 								chunkLength = chunkLine[:chunkLine.index(b';')].strip()
 							else:
@@ -181,7 +182,7 @@ def iter_warc(f):
 								break
 							chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
 							yield HTTPBodyChunk(chunk)
-							httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
+							pos = chunkLineEnd + 2 + chunkLength + 2
 					else:
 						yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
 				else: