diff --git a/README.md b/README.md
index 59dd131..bb8b06c 100644
--- a/README.md
+++ b/README.md
@@ -40,12 +40,19 @@ One line with a JSON object per file in the .tar.
 
 Usage
 -----
 ```
-megawarc build FILE
+megawarc convert FILE
 ```
 Converts the tar file (containing .warc.gz files) to a megawarc.
 It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.
 
+```
+megawarc pack FILE INFILE_1 [[INFILE_2] ...]
+```
+Creates a megawarc with basename FILE and recursively adds the
+given files and directories to it, as if they were in a tar file.
+It creates FILE.warc.gz, FILE.tar and FILE.json.gz.
+
 ```
 megawarc restore FILE
 ```
diff --git a/megawarc b/megawarc
index e00d7bb..86cd763 100755
--- a/megawarc
+++ b/megawarc
@@ -43,10 +43,15 @@ One line with a JSON object per file in the .tar.
 
 USAGE
 -----
-megawarc build FILE
+megawarc convert FILE
   Converts the tar file (containing .warc.gz files) to a megawarc.
   It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.
 
+megawarc pack FILE INFILE_1 [[INFILE_2] ...]
+  Creates a megawarc with basename FILE and recursively adds the
+  given files and directories to it, as if they were in a tar file.
+  It creates FILE.warc.gz, FILE.tar and FILE.json.gz.
+
 megawarc restore FILE
   Converts the megawarc back to the original tar. It reads
   FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
@@ -197,6 +202,144 @@ class MegawarcBuilder(object):
       json_out.write("\n")
 
 
+# adding .warc.gz and other files to megawarc tar+warc+json
+class MegawarcPacker(object):
+  def __init__(self, output_basename):
+    self.verbose = False
+    self.output_basename = output_basename
+    self.output_warc_filename = output_basename + ".megawarc.warc.gz"
+    self.output_tar_filename = output_basename + ".megawarc.tar"
+    self.output_json_filename = output_basename + ".megawarc.json.gz"
+
+    self.tar_pos = 0
+
+  def process(self, filelist):
+    with open(self.output_warc_filename, "wb") as warc_out:
+      with open(self.output_tar_filename, "wb") as tar_out:
+        with gzip.open(self.output_json_filename, "wb") as json_out:
+          def each_file(arg, dirname, names):
+            for n in names:
+              n = os.path.join(dirname, n)
+              if os.path.isfile(n):
+                self.process_file(n, warc_out, tar_out, json_out)
+
+          for filename in filelist:
+            if os.path.isdir(filename):
+              os.path.walk(filename, each_file, None)
+            elif os.path.isfile(filename):
+              self.process_file(filename, warc_out, tar_out, json_out)
+
+          tar_out.flush()
+          padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
+          if padding > 0:
+            tar_out.write("\0" * padding)
+
+  def test_gz(self, filename, size):
+    with open(filename, "rb") as f:  # "rb": gzip data is binary; text mode would corrupt reads on Windows
+      z = zlib.decompressobj(15 + 32)
+
+      to_read = size
+      while to_read > 0:
+        buf_size = min(to_read, 4096)
+        buf = f.read(buf_size)
+        if len(buf) < buf_size:
+          # end of file, not a valid gz
+          return False
+        else:
+          z.decompress(buf)
+          to_read -= len(buf)
+
+      if z.flush() != "":
+        # remaining uncompressed data
+        return False
+
+    return True
+
+  def process_file(self, filename, warc_out, tar_out, json_out):
+    # make tar header
+    arcname = filename
+    arcname = arcname.replace(os.sep, "/")
+    arcname = arcname.lstrip("/")
+    entry = tarfile.TarInfo()
+    statres = os.stat(filename)
+    stmd = statres.st_mode
+    entry.name = arcname
+    entry.mode = stmd
+    entry.uid = statres.st_uid
+    entry.gid = statres.st_gid
+    entry.size = statres.st_size
+    entry.mtime = statres.st_mtime
+    entry.type = tarfile.REGTYPE
+
+    # find position in imaginary tar
+    entry.offset = self.tar_pos
+
+    # calculate position of tar entry
+    block_size = (tarfile.BLOCKSIZE +  # header
+                  entry.size +         # data
+                  (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE)
+    data_offset = entry.offset + tarfile.BLOCKSIZE
+    next_offset = entry.offset + block_size
+
+    # move to next position in imaginary tar
+    self.tar_pos = next_offset
+
+    d_src_offsets = OrderedDict()
+    d_src_offsets["entry"] = entry.offset
+    d_src_offsets["data"] = data_offset
+    d_src_offsets["next_entry"] = next_offset
+
+    # decide what to do with this file
+    valid_warc_gz = False
+    if re.search(r"\.warc\.gz", filename):  # NOTE(review): unanchored -- also matches e.g. "x.warc.gz.bak"; confirm against MegawarcBuilder's check
+      if self.verbose:
+        print >>sys.stderr, "Checking %s" % filename
+      valid_warc_gz = self.test_gz(filename, entry.size)
+      if not valid_warc_gz:
+        if self.verbose:
+          print >>sys.stderr, "Invalid gzip %s" % filename
+
+    # save in megawarc or in tar
+    d_target = OrderedDict()
+    if valid_warc_gz:
+      # a warc file.gz, add to megawarc
+      warc_offset = warc_out.tell()
+      if self.verbose:
+        print >>sys.stderr, "Copying %s to warc" % filename
+      copy_to_stream(warc_out, filename, 0, entry.size)
+
+      d_target["container"] = "warc"
+      d_target["offset"] = warc_offset
+      d_target["size"] = entry.size
+
+    else:
+      # not a warc.gz file, add to tar
+      tar_offset = tar_out.tell()
+      if self.verbose:
+        print >>sys.stderr, "Copying %s to tar" % filename
+      tar_out.write(entry.tobuf())
+      copy_to_stream(tar_out, filename, 0, entry.size)
+      padding = (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE
+      if padding > 0:
+        tar_out.write("\0" * padding)
+      tar_out.flush()
+
+      d_target["container"] = "tar"
+      d_target["offset"] = tar_offset
+      d_target["size"] = block_size
+
+    # store details
+    d = OrderedDict()
+    d["target"] = d_target
+    d["src_offsets"] = d_src_offsets
+    d["header_fields"] = entry.get_info("utf-8", {})
+    d["header_string"] = entry.tobuf()
+
+    # store metadata
+    json.dump(d, json_out, separators=(',', ':'))
+    json_out.write("\n")
+
+
 # recreate the original .tar from a megawarc tar+warc+json
 class MegawarcRestorer(object):
   def __init__(self, output_filename):
@@ -246,9 +389,10 @@ class MegawarcRestorer(object):
 
 def main():
   parser = OptionParser(
-    usage="Usage: %prog [--verbose] build FILE\n       %prog [--verbose] restore FILE",
-    description="""%prog build FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
-Use %prog build FILE to reconstruct original tar.
+    usage="Usage: %prog [--verbose] convert FILE\n       %prog [--verbose] pack FILE [INFILE [INFILE ...]]\n       %prog [--verbose] restore FILE",
+    description="""%prog convert FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
+Use %prog pack FILE INFILE ... to create a megawarc containing the files.
+Use %prog restore FILE to reconstruct original tar.
 """
   )
   parser.add_option("-v", "--verbose", dest="verbose",
@@ -256,11 +400,11 @@ Use %prog build FILE to reconstruct original tar.
                     help="print status messages", default=False)
   (options, args) = parser.parse_args()
 
-  if len(args) != 2:
+  if len(args) < 2:
     parser.print_usage()
     exit(1)
 
-  if args[0] == "build":
+  if args[0] == "convert":
     if not os.path.exists(args[1]):
       print >>sys.stderr, "Input file %s does not exist." % args[1]
       exit(1)
@@ -275,6 +419,17 @@ Use %prog build FILE to reconstruct original tar.
         os.unlink(args[1]+ext)
       raise
 
+  elif args[0] == "pack":
+    try:
+      mwb = MegawarcPacker(args[1])
+      mwb.verbose = options.verbose
+      mwb.process(args[2:])
+    except:
+      for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
+        if os.path.exists(args[1]+ext):
+          os.unlink(args[1]+ext)
+      raise
+
   elif args[0] == "restore":
     for ext in (".megawarc.warc.gz", ".megawarc.json.gz"):
       if not os.path.exists(args[1]+ext):