The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

61 lines
2.0 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. def get_dict(fp):
  9. magic = fp.read(4)
  10. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  11. dictSize = fp.read(4)
  12. assert len(dictSize) == 4, 'missing dict size'
  13. dictSize = struct.unpack('<I', dictSize)[0]
  14. assert dictSize >= 4, 'dict too small'
  15. assert dictSize < 100 * 1024**2, 'dict too large'
  16. ds = []
  17. dlen = 0
  18. while dlen < dictSize:
  19. c = fp.read(dictSize - dlen)
  20. if c is None or c == b'': # EOF
  21. break
  22. ds.append(c)
  23. dlen += len(c)
  24. d = b''.join(ds)
  25. assert len(d) == dictSize, f'could not read dict fully: expected {dictSize}, got {len(d)}'
  26. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  27. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  28. # Decompress with unzstd
  29. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  30. out, err = p.communicate(d)
  31. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {err!r}'
  32. d = out
  33. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  34. return d
  35. if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']:
  36. print('Usage: unzstd-warc [FILE]', file = sys.stderr)
  37. print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr)
  38. sys.exit(1)
  39. if len(sys.argv) == 2:
  40. with open(sys.argv[1], 'rb') as fp:
  41. d = get_dict(fp)
  42. else:
  43. d = get_dict(sys.stdin.buffer.raw)
  44. # The file must be written to the file system before zstdcat is executed. The most reliable way for that is to close the file. This requires manually deleting it at the end.
  45. with tempfile.NamedTemporaryFile(delete = False) as dfp:
  46. dfp.write(d)
  47. try:
  48. args = ['zstdcat', '-D', dfp.name]
  49. if len(sys.argv) == 2:
  50. args.append(sys.argv[1])
  51. pzstd = subprocess.Popen(args)
  52. pzstd.communicate()
  53. finally:
  54. os.remove(dfp.name)