|
|
@@ -63,7 +63,6 @@ import os.path |
|
|
|
import re |
|
|
|
import sys |
|
|
|
import tarfile |
|
|
|
import zlib |
|
|
|
|
|
|
|
from optparse import OptionParser |
|
|
|
try: |
|
|
@@ -99,6 +98,59 @@ def copy_to_stream(stream, input_filename, offset, size): |
|
|
|
stream.flush() |
|
|
|
|
|
|
|
|
|
|
|
# part of a stream as a file |
|
|
|
# (seek relative to an offset) |
|
|
|
class RangeFile(object): |
|
|
|
def __init__(self, stream, offset, size): |
|
|
|
self._stream = stream |
|
|
|
self._offset = offset |
|
|
|
self._size = size |
|
|
|
|
|
|
|
self._current_rel_offset = 0 |
|
|
|
|
|
|
|
def tell(self): |
|
|
|
return self._current_rel_offset |
|
|
|
|
|
|
|
def seek(self, pos, whence=os.SEEK_SET): |
|
|
|
if whence == os.SEEK_SET: |
|
|
|
self._current_rel_offset = pos |
|
|
|
elif whence == os.SEEK_CUR: |
|
|
|
self._current_rel_offset += pos |
|
|
|
elif whence == os.SEEK_END: |
|
|
|
self._current_rel_offset = self._size + pos |
|
|
|
else: |
|
|
|
raise Exception("Unknown whence: %d." % whence) |
|
|
|
if self._current_rel_offset < 0 or self._current_rel_offset > self._size: |
|
|
|
raise Exception("Seek outside file: %d." % self._current_rel_offset) |
|
|
|
self._stream.seek(self._offset + self._current_rel_offset) |
|
|
|
|
|
|
|
def read(self, size): |
|
|
|
size = min(self._size - self._current_rel_offset, size) |
|
|
|
self._current_rel_offset += size |
|
|
|
buf = self._stream.read(size) |
|
|
|
if len(buf) < size: |
|
|
|
raise Exception("Expected to read %d but received %d." % (size, len(buf))) |
|
|
|
return buf |
|
|
|
|
|
|
|
|
|
|
|
# check for gzip errors |
|
|
|
def test_gz(filename, offset, size, verbose=False): |
|
|
|
with open(filename, "r") as f_stream: |
|
|
|
f = RangeFile(f_stream, offset, size) |
|
|
|
try: |
|
|
|
gz = gzip.GzipFile(fileobj=f, mode="rb") |
|
|
|
while True: |
|
|
|
buf = gz.read(4096) |
|
|
|
if len(buf) == 0: |
|
|
|
break |
|
|
|
except (IOError, ValueError) as e: |
|
|
|
if verbose: |
|
|
|
print >>sys.stderr, e |
|
|
|
return False |
|
|
|
|
|
|
|
return True |
|
|
|
|
|
|
|
|
|
|
|
# converting a .tar with warcs to megawarc tar+warc+json |
|
|
|
class MegawarcBuilder(object): |
|
|
|
def __init__(self, input_filename): |
|
|
@@ -121,28 +173,6 @@ class MegawarcBuilder(object): |
|
|
|
if padding > 0: |
|
|
|
tar_out.write("\0" * padding) |
|
|
|
|
|
|
|
def test_gz(self, offset, size): |
|
|
|
with open(self.input_filename, "r") as f: |
|
|
|
z = zlib.decompressobj(15 + 32) |
|
|
|
f.seek(offset) |
|
|
|
|
|
|
|
to_read = size |
|
|
|
while to_read > 0: |
|
|
|
buf_size = min(to_read, 4096) |
|
|
|
buf = f.read(buf_size) |
|
|
|
if len(buf) < buf_size: |
|
|
|
# end of file, not a valid gz |
|
|
|
return False |
|
|
|
else: |
|
|
|
z.decompress(buf) |
|
|
|
to_read -= len(buf) |
|
|
|
|
|
|
|
if z.flush()!="": |
|
|
|
# remaining uncompressed data |
|
|
|
return False |
|
|
|
|
|
|
|
return True |
|
|
|
|
|
|
|
def process_entry(self, entry, warc_out, tar_out, json_out): |
|
|
|
# calculate position of tar entry |
|
|
|
block_size = (tarfile.BLOCKSIZE + # header |
|
|
@@ -161,7 +191,7 @@ class MegawarcBuilder(object): |
|
|
|
if entry.isfile() and re.search(r"\.warc\.gz", entry.name): |
|
|
|
if self.verbose: |
|
|
|
print >>sys.stderr, "Checking %s" % entry.name |
|
|
|
valid_warc_gz = self.test_gz(data_offset, entry.size) |
|
|
|
valid_warc_gz = test_gz(self.input_filename, data_offset, entry.size, self.verbose) |
|
|
|
if not valid_warc_gz: |
|
|
|
if self.verbose: |
|
|
|
print >>sys.stderr, "Invalid gzip %s" % entry.name |
|
|
@@ -234,27 +264,6 @@ class MegawarcPacker(object): |
|
|
|
if padding > 0: |
|
|
|
tar_out.write("\0" * padding) |
|
|
|
|
|
|
|
def test_gz(self, filename, size): |
|
|
|
with open(filename, "r") as f: |
|
|
|
z = zlib.decompressobj(15 + 32) |
|
|
|
|
|
|
|
to_read = size |
|
|
|
while to_read > 0: |
|
|
|
buf_size = min(to_read, 4096) |
|
|
|
buf = f.read(buf_size) |
|
|
|
if len(buf) < buf_size: |
|
|
|
# end of file, not a valid gz |
|
|
|
return False |
|
|
|
else: |
|
|
|
z.decompress(buf) |
|
|
|
to_read -= len(buf) |
|
|
|
|
|
|
|
if z.flush()!="": |
|
|
|
# remaining uncompressed data |
|
|
|
return False |
|
|
|
|
|
|
|
return True |
|
|
|
|
|
|
|
def process_file(self, filename, warc_out, tar_out, json_out): |
|
|
|
# make tar header |
|
|
|
arcname = filename |
|
|
@@ -294,7 +303,7 @@ class MegawarcPacker(object): |
|
|
|
if re.search(r"\.warc\.gz", filename): |
|
|
|
if self.verbose: |
|
|
|
print >>sys.stderr, "Checking %s" % filename |
|
|
|
valid_warc_gz = self.test_gz(filename, entry.size) |
|
|
|
valid_warc_gz = test_gz(filename, 0, entry.size, self.verbose) |
|
|
|
if not valid_warc_gz: |
|
|
|
if self.verbose: |
|
|
|
print >>sys.stderr, "Invalid gzip %s" % filename |
|
|
|