JustAnotherArchivist
/
little-things


			
							#!/usr/bin/env python3

# Tiny tool for WARC stuff. Currently has two modes: verifying the integrity of a WARC by comparing the digests and dumping the HTTP response bodies to stdout.

import base64
import gzip
import hashlib
import sys
import zlib


def GzipDecompressor():
	return zlib.decompressobj(16 + zlib.MAX_WBITS)


class DummyDecompressor:
	def decompress(self, data):
		return data


class Event:
	pass


class NewFile(Event):
	pass


class BeginOfRecord(Event):
	def __init__(self, warcHeaders, rawData):
		self._warcHeaders = warcHeaders
		self._rawData = rawData

	@property
	def warcHeaders(self):
		return self._warcHeaders

	@property
	def rawData(self):
		return self._rawData


class _DataChunk(Event):
	def __init__(self, data):
		self._data = data

	@property
	def data(self):
		return self._data

	def __repr__(self):
		return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')


class WARCBlockChunk(_DataChunk):
	def __init__(self, data, isHttpHeader = None):
		super().__init__(data)
		self._isHttpHeader = isHttpHeader

	@property
	def isHttpHeader(self):
		# True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
		return self._isHttpHeader


class RawHTTPResponseBodyChunk(_DataChunk):
	'''
	Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
	This is like HTTPResponseBodyChunk but without transfer encoding stripping.
	'''


class HTTPResponseBodyChunk(_DataChunk):
	'''
	Representing a part of the HTTP response body with transfer encoding stripped.
	'''


class EndOfRecord(Event):
	pass


def iter_warc(f):
	# Yields Events
	# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.

	with gzip.open(f, 'rb') as fp:
		buf = b''
		while True:
			# Read WARC header
			while b'\r\n\r\n' not in buf:
				try:
					buf = buf + fp.read(4096)
				except EOFError:
					break
				if not buf:
					break
			if not buf:
				break
			warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
			assert warcHeaderBuf.startswith(b'WARC/1.0\r\n')
			assert b'\r\nContent-Length:' in warcHeaderBuf
			warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
			warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
			warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
			warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
			yield BeginOfRecord(warcHeaders, warcHeaderBuf)

			# Read WARC block (and skip CRLFCRLF at the end of the record)
			if len(buf) < warcContentLength + 4:
				try:
					buf = buf + fp.read(warcContentLength + 4 - len(buf))
				except EOFError:
					pass
			if len(buf) < warcContentLength + 4:
				print('Error: truncated WARC', file = sys.stderr)
				break
			warcContent = buf[:warcContentLength]
			buf = buf[warcContentLength + 4:]

			# Decode HTTP response if it is one
			if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
				httpType = 'request'
			elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
				httpType = 'response'
			else:
				httpType = None
			if httpType is not None:
				if b'\r\n\r\n' in warcContent:
					httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)

					# Parse headers and extract transfer encoding
					httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
					chunked = False
					gzipped = False
					if b'\r\ntransfer-encoding' in httpHeaders.lower():
						transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
						transferEncodings = map(bytes.strip, transferEncoding.split(b','))
						chunked = b'chunked' in transferEncodings
						gzipped = b'gzip' in transferEncodings

					yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
					yield WARCBlockChunk(httpBody, isHttpHeader = False)
					yield RawHTTPResponseBodyChunk(httpBody)

					if httpType == 'response':
						# Decode body
						if gzipped:
							httpDecompressor = GzipDecompressor()
						else:
							httpDecompressor = DummyDecompressor()
						if chunked:
							while True:
								try:
									chunkLineEnd = httpBody.index(b'\r\n')
								except ValueError:
									print('Error: could not find chunk line end, skipping', file = sys.stderr)
									break
								chunkLine = httpBody[:chunkLineEnd]
								if b';' in chunkLine:
									chunkLength = chunkLine[:chunkLine.index(b';')].strip()
								else:
									chunkLength = chunkLine.strip()
								if chunkLength.lstrip(b'0123456789abcdef') != b'':
									print('Error: malformed chunk length, skipping', file = sys.stderr)
									break
								chunkLength = int(chunkLength, base = 16)
								if chunkLength == 0:
									break
								chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
								yield HTTPResponseBodyChunk(chunk)
								httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
						else:
							yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)[:50])
				else:
					print('Warning: malformed HTTP response, skipping', file = sys.stderr)
			else:
				yield WARCBlockChunk(warcContent)
			yield EndOfRecord()


class ProcessMode:
	def process_event(self, event):
		raise NotImplementedError


class VerifyMode(ProcessMode):
	def __init__(self):
		self._blockDigester = None
		self._recordedBlockDigest = None
		self._payloadDigester = None
		self._brokenPayloadDigester = None
		self._recordedPayloadDigest = None
		self._printedBrokenPayloadWarning = False

	def process_event(self, event):
		if type(event) is NewFile:
			self._printedBrokenPayloadWarning = False
		elif type(event) is BeginOfRecord:
			if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
				self._blockDigester = hashlib.sha1()
				self._recordedBlockDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest')
			else:
				self._blockDigester = None
				self._recordedBlockDigest = None
			if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
				self._payloadDigester = hashlib.sha1()
				self._brokenPayloadDigester = hashlib.sha1()
				self._recordedPayloadDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest')
			else:
				self._payloadDigester = None
				self._brokenPayloadDigester = None
				self._recordedPayloadDigest = None
			self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
			self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
		elif type(event) is WARCBlockChunk:
			self._blockDigester.update(event.data)
		elif type(event) is HTTPResponseBodyChunk:
			self._payloadDigester.update(event.data)
		elif type(event) is RawHTTPResponseBodyChunk:
			self._brokenPayloadDigester.update(event.data)
		elif type(event) is EndOfRecord:
			if self._blockDigester:
				if self._recordedBlockDigest != b'sha1:' + base64.b32encode(self._blockDigester.digest()):
					print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest, base64.b32encode(self._blockDigester.digest())))
			if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
				if self._recordedPayloadDigest != b'sha1:' + base64.b32encode(self._payloadDigester.digest()):
					if self._recordedPayloadDigest == b'sha1:' + base64.b32encode(self._brokenPayloadDigester.digest()):
						if not self._printedBrokenPayloadWarning:
							print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding')
							self._printedBrokenPayloadWarning = True
					else:
						print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest, base64.b32encode(self._payloadDigester.digest()), base64.b32encode(self._brokenPayloadDigester.digest())))


class DumpResponsesMode(ProcessMode):
	def __init__(self):
		self._printEOR = False

	def process_event(self, event):
		if type(event) is BeginOfRecord:
			self._printEOR = False
		elif type(event) is HTTPResponseBodyChunk:
			self._printEOR = True
			sys.stdout.buffer.write(event.data)
		elif type(event) is EndOfRecord:
			if self._printEOR:
				sys.stdout.buffer.write(b'\r\n')


def main():
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode}

	assert len(sys.argv) - 1 >= 2
	mode = sys.argv[1]
	assert mode in processorMap
	files = sys.argv[2:]
	assert files

	processor = processorMap[mode]()

	for f in files:
		print('Info: processing {}'.format(f), file = sys.stderr)
		processor.process_event(NewFile())
		for event in iter_warc(f):
			processor.process_event(event)


if __name__ == '__main__':
	main()