#!/usr/bin/env python3
"""Decompress a .warc.zst file that carries its own dictionary and write it to stdout.

The input file must begin with a frame holding the (optionally
zstd-compressed) dictionary: a 4-byte magic, a 4-byte dictionary size, then
the dictionary itself.  The dictionary is extracted into a temporary file and
the whole input is handed to ``zstdcat -D`` for decompression.

Requires the ``zstdcat`` executable (and ``unzstd`` when the embedded
dictionary is compressed) to be available on PATH.
"""
# Provenance: commit d5f646f99550e3a66c9164933177cc41aa1dd89a ("Add zstdwarccat")
# by JustAnotherArchivist, Mon, 26 Jul 2021 23:07:56 +0000.
import io
import os
import struct
import subprocess
import sys
import tempfile


# Magic bytes expected at the very start of the input file.
DICT_FRAME_MAGIC = b'\x5D\x2A\x4D\x18'
# Magic bytes of a zstd-compressed dictionary payload.
ZSTD_FRAME_MAGIC = b'\x28\xB5\x2F\xFD'
# Magic bytes of an uncompressed dictionary payload.
ZSTD_DICT_MAGIC = b'\x37\xA4\x30\xEC'
# Sanity limit: refuse to read dictionaries of 100 MiB or more.
MAX_DICT_SIZE = 100 * 1024**2


def read_dictionary(path):
    """Return the uncompressed dictionary embedded at the start of *path*.

    Raises:
        ValueError: the file does not start with a well-formed dictionary frame.
        RuntimeError: ``unzstd`` failed to decompress a compressed dictionary.
        OSError: the file cannot be opened or read.
    """
    with open(path, 'rb') as fp:
        magic = fp.read(4)
        if magic != DICT_FRAME_MAGIC:
            raise ValueError('not a valid warc.zst with a custom dictionary')
        raw_size = fp.read(4)
        if len(raw_size) != 4:
            raise ValueError('missing dict size')
        # NOTE(review): the size is decoded as a little-endian uint32, matching
        # the zstd skippable-frame layout; the original line was garbled.
        dict_size = struct.unpack('<I', raw_size)[0]
        if dict_size < 4:
            raise ValueError('dict too small')
        if dict_size >= MAX_DICT_SIZE:
            raise ValueError('dict too large')
        data = fp.read(dict_size)

    # A short read means the file ends inside the dictionary.
    if len(data) != dict_size:
        raise ValueError('truncated dict')
    if not data.startswith((ZSTD_FRAME_MAGIC, ZSTD_DICT_MAGIC)):
        raise ValueError('not a valid dict')

    if data.startswith(ZSTD_FRAME_MAGIC):
        # Compressed dictionary: decompress it with unzstd.
        proc = subprocess.run(
            ['unzstd'],
            input=data,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f'unzstd exited non-zero: return code {proc.returncode}, stderr: {proc.stderr!r}'
            )
        data = proc.stdout
    # Otherwise the dictionary is already uncompressed; nothing to do.
    return data


def main(argv):
    """Run the command line described by *argv* and return the exit status.

    *argv* has the same layout as ``sys.argv``: program name followed by
    exactly one FILE argument.  Returns 1 on usage errors, otherwise the
    return code of ``zstdcat``.
    """
    if len(argv) != 2 or argv[1] in ('--help', '-h'):
        print('Usage: zstdwarccat FILE', file=sys.stderr)
        print('Decompresses FILE and writes its contents to stdout', file=sys.stderr)
        return 1

    dictionary = read_dictionary(argv[1])
    with tempfile.NamedTemporaryFile() as dfp:
        dfp.write(dictionary)
        # Flush so zstdcat, which opens the file by name, sees the full
        # dictionary rather than whatever has left Python's write buffer.
        dfp.flush()
        return subprocess.run(['zstdcat', '-D', dfp.name, argv[1]]).returncode


if __name__ == '__main__':
    sys.exit(main(sys.argv))