Browse Source

Fixed gzip checking.

master
Alard 11 years ago
parent
commit
fb0ba014ff
1 changed files with 55 additions and 46 deletions
  1. +55
    -46
      megawarc

+ 55
- 46
megawarc View File

@@ -63,7 +63,6 @@ import os.path
import re
import sys
import tarfile
import zlib

from optparse import OptionParser
try:
@@ -99,6 +98,59 @@ def copy_to_stream(stream, input_filename, offset, size):
stream.flush()


# part of a stream as a file
# (seek relative to an offset)
class RangeFile(object):
def __init__(self, stream, offset, size):
self._stream = stream
self._offset = offset
self._size = size

self._current_rel_offset = 0

def tell(self):
return self._current_rel_offset

def seek(self, pos, whence=os.SEEK_SET):
if whence == os.SEEK_SET:
self._current_rel_offset = pos
elif whence == os.SEEK_CUR:
self._current_rel_offset += pos
elif whence == os.SEEK_END:
self._current_rel_offset = self._size + pos
else:
raise Exception("Unknown whence: %d." % whence)
if self._current_rel_offset < 0 or self._current_rel_offset > self._size:
raise Exception("Seek outside file: %d." % self._current_rel_offset)
self._stream.seek(self._offset + self._current_rel_offset)

def read(self, size):
size = min(self._size - self._current_rel_offset, size)
self._current_rel_offset += size
buf = self._stream.read(size)
if len(buf) < size:
raise Exception("Expected to read %d but received %d." % (size, len(buf)))
return buf


# check for gzip errors
def test_gz(filename, offset, size, verbose=False):
with open(filename, "r") as f_stream:
f = RangeFile(f_stream, offset, size)
try:
gz = gzip.GzipFile(fileobj=f, mode="rb")
while True:
buf = gz.read(4096)
if len(buf) == 0:
break
except (IOError, ValueError) as e:
if verbose:
print >>sys.stderr, e
return False

return True


# converting a .tar with warcs to megawarc tar+warc+json
class MegawarcBuilder(object):
def __init__(self, input_filename):
@@ -121,28 +173,6 @@ class MegawarcBuilder(object):
if padding > 0:
tar_out.write("\0" * padding)

def test_gz(self, offset, size):
with open(self.input_filename, "r") as f:
z = zlib.decompressobj(15 + 32)
f.seek(offset)

to_read = size
while to_read > 0:
buf_size = min(to_read, 4096)
buf = f.read(buf_size)
if len(buf) < buf_size:
# end of file, not a valid gz
return False
else:
z.decompress(buf)
to_read -= len(buf)

if z.flush()!="":
# remaining uncompressed data
return False

return True

def process_entry(self, entry, warc_out, tar_out, json_out):
# calculate position of tar entry
block_size = (tarfile.BLOCKSIZE + # header
@@ -161,7 +191,7 @@ class MegawarcBuilder(object):
if entry.isfile() and re.search(r"\.warc\.gz", entry.name):
if self.verbose:
print >>sys.stderr, "Checking %s" % entry.name
valid_warc_gz = self.test_gz(data_offset, entry.size)
valid_warc_gz = test_gz(self.input_filename, data_offset, entry.size, self.verbose)
if not valid_warc_gz:
if self.verbose:
print >>sys.stderr, "Invalid gzip %s" % entry.name
@@ -234,27 +264,6 @@ class MegawarcPacker(object):
if padding > 0:
tar_out.write("\0" * padding)

def test_gz(self, filename, size):
with open(filename, "r") as f:
z = zlib.decompressobj(15 + 32)

to_read = size
while to_read > 0:
buf_size = min(to_read, 4096)
buf = f.read(buf_size)
if len(buf) < buf_size:
# end of file, not a valid gz
return False
else:
z.decompress(buf)
to_read -= len(buf)

if z.flush()!="":
# remaining uncompressed data
return False

return True

def process_file(self, filename, warc_out, tar_out, json_out):
# make tar header
arcname = filename
@@ -294,7 +303,7 @@ class MegawarcPacker(object):
if re.search(r"\.warc\.gz", filename):
if self.verbose:
print >>sys.stderr, "Checking %s" % filename
valid_warc_gz = self.test_gz(filename, entry.size)
valid_warc_gz = test_gz(filename, 0, entry.size, self.verbose)
if not valid_warc_gz:
if self.verbose:
print >>sys.stderr, "Invalid gzip %s" % filename


Loading…
Cancel
Save