From 49376db51bba1f843f88bd98e06e3ef004e320a8 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 28 May 2020 22:32:37 +0000
Subject: [PATCH] Decode HTTP request bodies

---
 warc-tiny | 88 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 46 insertions(+), 42 deletions(-)

diff --git a/warc-tiny b/warc-tiny
index d4b2525..e03fdcc 100755
--- a/warc-tiny
+++ b/warc-tiny
@@ -67,16 +67,16 @@ class WARCBlockChunk(_DataChunk):
 		return self._isHttpHeader
 
 
-class RawHTTPResponseBodyChunk(_DataChunk):
+class RawHTTPBodyChunk(_DataChunk):
 	'''
 	Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
-	This is like HTTPResponseBodyChunk but without transfer encoding stripping.
+	This is like HTTPBodyChunk but without transfer encoding stripping.
 	'''
 
 
-class HTTPResponseBodyChunk(_DataChunk):
+class HTTPBodyChunk(_DataChunk):
 	'''
-	Representing a part of the HTTP response body with transfer encoding stripped.
+	Represents a part of the HTTP body with transfer encoding stripped.
 	'''
 
 
@@ -123,7 +123,7 @@ def iter_warc(f):
 			warcContent = buf[:warcContentLength]
 			buf = buf[warcContentLength + 4:]
 
-			# Decode HTTP response if it is one
+			# Decode HTTP body if appropriate
 			if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
 				httpType = 'request'
 			elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
@@ -146,39 +146,38 @@ def iter_warc(f):
 
 					yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
 					yield WARCBlockChunk(httpBody, isHttpHeader = False)
-					yield RawHTTPResponseBodyChunk(httpBody)
-
-					if httpType == 'response':
-						# Decode body
-						if gzipped:
-							httpDecompressor = GzipDecompressor()
-						else:
-							httpDecompressor = DummyDecompressor()
-						if chunked:
-							while True:
-								try:
-									chunkLineEnd = httpBody.index(b'\r\n')
-								except ValueError:
-									print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
-									break
-								chunkLine = httpBody[:chunkLineEnd]
-								if b';' in chunkLine:
-									chunkLength = chunkLine[:chunkLine.index(b';')].strip()
-								else:
-									chunkLength = chunkLine.strip()
-								if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
-									print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
-									break
-								chunkLength = int(chunkLength, base = 16)
-								if chunkLength == 0:
-									break
-								chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
-								yield HTTPResponseBodyChunk(chunk)
-								httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
-						else:
-							yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody))
+					yield RawHTTPBodyChunk(httpBody)
+
+					# Decode body
+					if gzipped:
+						httpDecompressor = GzipDecompressor()
+					else:
+						httpDecompressor = DummyDecompressor()
+					if chunked:
+						while True:
+							try:
+								chunkLineEnd = httpBody.index(b'\r\n')
+							except ValueError:
+								print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
+								break
+							chunkLine = httpBody[:chunkLineEnd]
+							if b';' in chunkLine:
+								chunkLength = chunkLine[:chunkLine.index(b';')].strip()
+							else:
+								chunkLength = chunkLine.strip()
+							if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
+								print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
+								break
+							chunkLength = int(chunkLength, base = 16)
+							if chunkLength == 0:
+								break
+							chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
+							yield HTTPBodyChunk(chunk)
+							httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
+					else:
+						yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
 				else:
-					print('Warning: malformed HTTP response in record {}, skipping'.format(recordID), file = sys.stderr)
+					print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
 					yield WARCBlockChunk(warcContent)
 			else:
 				yield WARCBlockChunk(warcContent)
@@ -253,10 +252,10 @@ class VerifyMode(ProcessMode):
 		elif type(event) is WARCBlockChunk:
 			if self._blockDigester:
 				self._blockDigester.update(event.data)
-		elif type(event) is HTTPResponseBodyChunk:
+		elif type(event) is HTTPBodyChunk:
 			if self._payloadDigester:
 				self._payloadDigester.update(event.data)
-		elif type(event) is RawHTTPResponseBodyChunk:
+		elif type(event) is RawHTTPBodyChunk:
 			if self._brokenPayloadDigester:
 				self._brokenPayloadDigester.update(event.data)
 		elif type(event) is EndOfRecord:
@@ -276,13 +275,18 @@ class VerifyMode(ProcessMode):
 class DumpResponsesMode(ProcessMode):
 	def __init__(self):
 		self._printEOR = False
+		self._isResponse = False
 
 	def process_event(self, event):
 		if type(event) is BeginOfRecord:
+			warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
+			warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
+			self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
 			self._printEOR = False
-		elif type(event) is HTTPResponseBodyChunk:
-			self._printEOR = True
-			sys.stdout.buffer.write(event.data)
+		elif type(event) is HTTPBodyChunk:
+			if self._isResponse:
+				self._printEOR = True
+				sys.stdout.buffer.write(event.data)
 		elif type(event) is EndOfRecord:
 			if self._printEOR:
 				sys.stdout.buffer.write(b'\r\n')