From fb0ba014ff4df76411cdd426a15764695a33c59e Mon Sep 17 00:00:00 2001 From: Alard Date: Fri, 12 Oct 2012 17:41:24 +0200 Subject: [PATCH] Fixed gzip checking. --- megawarc | 101 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/megawarc b/megawarc index 86cd763..f442389 100755 --- a/megawarc +++ b/megawarc @@ -63,7 +63,6 @@ import os.path import re import sys import tarfile -import zlib from optparse import OptionParser try: @@ -99,6 +98,59 @@ def copy_to_stream(stream, input_filename, offset, size): stream.flush() +# part of a stream as a file +# (seek relative to an offset) +class RangeFile(object): + def __init__(self, stream, offset, size): + self._stream = stream + self._offset = offset + self._size = size + + self._current_rel_offset = 0 + + def tell(self): + return self._current_rel_offset + + def seek(self, pos, whence=os.SEEK_SET): + if whence == os.SEEK_SET: + self._current_rel_offset = pos + elif whence == os.SEEK_CUR: + self._current_rel_offset += pos + elif whence == os.SEEK_END: + self._current_rel_offset = self._size + pos + else: + raise Exception("Unknown whence: %d." % whence) + if self._current_rel_offset < 0 or self._current_rel_offset > self._size: + raise Exception("Seek outside file: %d." % self._current_rel_offset) + self._stream.seek(self._offset + self._current_rel_offset) + + def read(self, size): + size = min(self._size - self._current_rel_offset, size) + self._current_rel_offset += size + buf = self._stream.read(size) + if len(buf) < size: + raise Exception("Expected to read %d but received %d." % (size, len(buf))) + return buf + + +# check for gzip errors +def test_gz(filename, offset, size, verbose=False): + with open(filename, "r") as f_stream: + f = RangeFile(f_stream, offset, size) + try: + gz = gzip.GzipFile(fileobj=f, mode="rb") + while True: + buf = gz.read(4096) + if len(buf) == 0: + break + except (IOError, ValueError) as e: + if verbose: + print >>sys.stderr, e + return False + + return True + + # converting a .tar with warcs to megawarc tar+warc+json class MegawarcBuilder(object): def __init__(self, input_filename): @@ -121,28 +173,6 @@ class MegawarcBuilder(object): if padding > 0: tar_out.write("\0" * padding) - def test_gz(self, offset, size): - with open(self.input_filename, "r") as f: - z = zlib.decompressobj(15 + 32) - f.seek(offset) - - to_read = size - while to_read > 0: - buf_size = min(to_read, 4096) - buf = f.read(buf_size) - if len(buf) < buf_size: - # end of file, not a valid gz - return False - else: - z.decompress(buf) - to_read -= len(buf) - - if z.flush()!="": - # remaining uncompressed data - return False - - return True - def process_entry(self, entry, warc_out, tar_out, json_out): # calculate position of tar entry block_size = (tarfile.BLOCKSIZE + # header @@ -161,7 +191,7 @@ class MegawarcBuilder(object): if entry.isfile() and re.search(r"\.warc\.gz", entry.name): if self.verbose: print >>sys.stderr, "Checking %s" % entry.name - valid_warc_gz = self.test_gz(data_offset, entry.size) + valid_warc_gz = test_gz(self.input_filename, data_offset, entry.size, self.verbose) if not valid_warc_gz: if self.verbose: print >>sys.stderr, "Invalid gzip %s" % entry.name @@ -234,27 +264,6 @@ class MegawarcPacker(object): if padding > 0: tar_out.write("\0" * padding) - def test_gz(self, filename, size): - with open(filename, "r") as f: - z = zlib.decompressobj(15 + 32) - - to_read = size - while to_read > 0: - buf_size = min(to_read, 4096) - buf = f.read(buf_size) - if len(buf) < buf_size: - # end of file, not a valid gz - return False - else: - z.decompress(buf) - to_read -= len(buf) - - if z.flush()!="": - # remaining uncompressed data - return False - - return True - def process_file(self, filename, warc_out, tar_out, json_out): # make tar header arcname = filename @@ -294,7 +303,7 @@ class MegawarcPacker(object): if re.search(r"\.warc\.gz", filename): if self.verbose: print >>sys.stderr, "Checking %s" % filename - valid_warc_gz = self.test_gz(filename, entry.size) + valid_warc_gz = test_gz(filename, 0, entry.size, self.verbose) if not valid_warc_gz: if self.verbose: print >>sys.stderr, "Invalid gzip %s" % filename