The little things give you away... A collection of various small helper stuff
25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.
 
 
 

38 satır
1.3 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. if len(sys.argv) != 2 or sys.argv[1] == '--help' or sys.argv[1] == '-h':
  9. print('Usage: unzstd-warc FILE', file = sys.stderr)
  10. print('Decompresses FILE and writes its contents to stdout', file = sys.stderr)
  11. sys.exit(1)
  12. with open(sys.argv[1], 'rb') as fp:
  13. magic = fp.read(4)
  14. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  15. dictSize = fp.read(4)
  16. assert len(dictSize) == 4, 'missing dict size'
  17. dictSize = struct.unpack('<I', dictSize)[0]
  18. assert dictSize >= 4, 'dict too small'
  19. assert dictSize < 100 * 1024**2, 'dict too large'
  20. d = fp.read(dictSize)
  21. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  22. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  23. # Decompress with unzstd
  24. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  25. out, err = p.communicate(d)
  26. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}'
  27. d = out
  28. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  29. with tempfile.NamedTemporaryFile() as dfp:
  30. dfp.write(d)
  31. pzstd = subprocess.Popen(['zstdcat', '-D', dfp.name, sys.argv[1]])
  32. pzstd.communicate()