From d5874656407b688ea0459fc23cf55b1bd08f73fb Mon Sep 17 00:00:00 2001 From: Alard Date: Sat, 20 Oct 2012 23:32:50 +0200 Subject: [PATCH] Check for trailing zeroes in gzips. --- megawarc | 35 +++++++++++++++++------------------ megawarc-fix | 26 +++++++++++++++++++------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/megawarc b/megawarc index d9d6449..f931698 100755 --- a/megawarc +++ b/megawarc @@ -169,24 +169,23 @@ def test_gz(filename, offset, size, verbose=False, copy_to_file=None): f = CopyReader(f, copy_to_file) start_pos = copy_to_file.tell() try: - gz = subprocess.Popen(["gunzip", "-t", "-q"], - shell=False, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - while True: - buf = f.read(4096) - size -= len(buf) - if len(buf) > 0: - gz.stdin.write(buf) - else: - break - gz.stdin.close() - gz.stdout.close() - gz.stderr.close() - ret = gz.wait() - if ret != 0: - raise IOError("Could not decompress warc.gz. gunzip returned %d." % ret) + with open("/dev/null", "w") as dev_null: + gz = subprocess.Popen(["gunzip", "-tv"], + shell=False, + stdin=subprocess.PIPE, + stdout=dev_null, + stderr=dev_null) + while True: + buf = f.read(4096) + size -= len(buf) + if len(buf) > 0: + gz.stdin.write(buf) + else: + break + gz.stdin.close() + ret = gz.wait() + if ret != 0: + raise IOError("Could not decompress warc.gz. gunzip returned %d." % ret) except (IOError, OSError) as e: if verbose: print >>sys.stderr, e diff --git a/megawarc-fix b/megawarc-fix index 17b7d3c..5d6da22 100755 --- a/megawarc-fix +++ b/megawarc-fix @@ -14,6 +14,7 @@ import gzip import json import os.path import re +import subprocess import sys import tarfile import zlib @@ -92,16 +93,27 @@ def test_gz(filename, offset, size, verbose=False): with open(filename, "r") as f_stream: f = RangeFile(f_stream, offset, size) try: - gz = gzip.GzipFile(fileobj=f, mode="rb") - while True: - buf = gz.read(4096) - if len(buf) == 0: - break - except (IOError, ValueError, zlib.error) as e: + with open("/dev/null", "w") as dev_null: + gz = subprocess.Popen(["gunzip", "-tv"], + shell=False, + stdin=subprocess.PIPE, + stdout=dev_null, + stderr=dev_null) + while True: + buf = f.read(4096) + size -= len(buf) + if len(buf) > 0: + gz.stdin.write(buf) + else: + break + gz.stdin.close() + ret = gz.wait() + if ret != 0: + raise IOError("Could not decompress warc.gz. gunzip returned %d." % ret) + except (IOError, OSError) as e: if verbose: print >>sys.stderr, e return False - return True