From b7e7f571d4bbe4532210232b3ca22eec1b66fab3 Mon Sep 17 00:00:00 2001 From: Alard Date: Sat, 13 Oct 2012 00:41:58 +0200 Subject: [PATCH] Use real tar header (perhaps > 512 bytes). --- megawarc | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/megawarc b/megawarc index b326e5a..fce5604 100755 --- a/megawarc +++ b/megawarc @@ -71,16 +71,6 @@ try: except ImportError: from ordereddict import OrderedDict -# modify tarfile.TarInfo to keep the original tar headers -tarfile.TarInfo.orig_frombuf = tarfile.TarInfo.frombuf -@classmethod -def keepbuf_frombuf(cls, buf): - entry = cls.orig_frombuf(buf) - entry.buf = buf - return entry -tarfile.TarInfo.frombuf = keepbuf_frombuf - - # open input_filename and write the data from offset to # (offset+size) to stream def copy_to_stream(stream, input_filename, offset, size): @@ -175,16 +165,19 @@ class MegawarcBuilder(object): tar_out.write("\0" * padding) def process_entry(self, entry, warc_out, tar_out, json_out): + with open(self.input_filename, "r") as tar: + tar.seek(entry.offset) + tar_header = tar.read(entry.offset_data - entry.offset) + # calculate position of tar entry - block_size = (tarfile.BLOCKSIZE + # header + block_size = (len(tar_header) + # header entry.size + # data (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE) - data_offset = entry.offset + tarfile.BLOCKSIZE next_offset = entry.offset + block_size d_src_offsets = OrderedDict() d_src_offsets["entry"] = entry.offset - d_src_offsets["data"] = data_offset + d_src_offsets["data"] = entry.offset_data d_src_offsets["next_entry"] = next_offset # decide what to do with this entry @@ -192,7 +185,7 @@ class MegawarcBuilder(object): if entry.isfile() and re.search(r"\.warc\.gz", entry.name): if self.verbose: print >>sys.stderr, "Checking %s" % entry.name - valid_warc_gz = test_gz(self.input_filename, data_offset, entry.size, self.verbose) + valid_warc_gz = test_gz(self.input_filename, entry.offset_data, entry.size, self.verbose) if not valid_warc_gz: if self.verbose: print >>sys.stderr, "Invalid gzip %s" % entry.name @@ -204,7 +197,7 @@ class MegawarcBuilder(object): warc_offset = warc_out.tell() if self.verbose: print >>sys.stderr, "Copying %s to warc" % entry.name - copy_to_stream(warc_out, self.input_filename, data_offset, entry.size) + copy_to_stream(warc_out, self.input_filename, entry.offset_data, entry.size) d_target["container"] = "warc" d_target["offset"] = warc_offset @@ -226,7 +219,7 @@ class MegawarcBuilder(object): d["target"] = d_target d["src_offsets"] = d_src_offsets d["header_fields"] = entry.get_info("utf-8", {}) - d["header_string"] = entry.buf + d["header_string"] = tar_header # store metadata json.dump(d, json_out, separators=(',', ':'))