Browse Source

Add 'pack' option to pack files.

master
Alard 11 years ago
parent
commit
a55b7b8adc
2 changed files with 169 additions and 7 deletions
  1. +8
    -1
      README.md
  2. +161
    -6
      megawarc

+ 8
- 1
README.md View File

@@ -40,12 +40,19 @@ One line with a JSON object per file in the .tar.
Usage
-----
```
megawarc build FILE
megawarc convert FILE
```

Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz from FILE.

```
megawarc pack FILE INFILE_1 [[INFILE_2] ...]
```
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

```
megawarc restore FILE
```


+ 161
- 6
megawarc View File

@@ -43,10 +43,15 @@ One line with a JSON object per file in the .tar.

USAGE
-----
megawarc build FILE
megawarc convert FILE
Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.

megawarc pack FILE INFILE_1 [[INFILE_2] ...]
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

megawarc restore FILE
Converts the megawarc back to the original tar.
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
@@ -197,6 +202,144 @@ class MegawarcBuilder(object):
json_out.write("\n")


# adding .warc.gz and other files to megawarc tar+warc+json
class MegawarcPacker(object):
    """Packs loose files and directories into a megawarc.

    Given an output basename FILE, writes three files:
      FILE.megawarc.warc.gz  - concatenation of the valid .warc.gz inputs
      FILE.megawarc.tar      - tar containing every other input file
      FILE.megawarc.json.gz  - one JSON line per input, recording where each
                               file lives and the tar header it would have had

    Inputs are treated as if they had been members of a single tar file:
    each file is assigned an offset in that "imaginary tar" so the original
    tar can be reconstructed later.

    NOTE: this is Python 2 code (print >>stream syntax, os.path.walk).
    """

    def __init__(self, output_basename):
        """Set up output filenames for the given basename; no files are
        opened until process() is called."""
        # when True, per-file progress messages go to stderr
        self.verbose = False
        self.output_basename = output_basename
        self.output_warc_filename = output_basename + ".megawarc.warc.gz"
        self.output_tar_filename = output_basename + ".megawarc.tar"
        self.output_json_filename = output_basename + ".megawarc.json.gz"

        # current write offset in the imaginary tar (advanced per file)
        self.tar_pos = 0

    def process(self, filelist):
        """Pack every path in filelist (directories are walked recursively;
        only regular files are packed) into the three output files, then pad
        the tar to a whole number of tar records."""
        with open(self.output_warc_filename, "wb") as warc_out:
            with open(self.output_tar_filename, "wb") as tar_out:
                with gzip.open(self.output_json_filename, "wb") as json_out:
                    # callback for os.path.walk: pack each regular file found
                    def each_file(arg, dirname, names):
                        for n in names:
                            n = os.path.join(dirname, n)
                            if os.path.isfile(n):
                                self.process_file(n, warc_out, tar_out, json_out)

                    for filename in filelist:
                        if os.path.isdir(filename):
                            # Python 2 only; removed in Python 3 (os.walk there)
                            os.path.walk(filename, each_file, None)
                        elif os.path.isfile(filename):
                            self.process_file(filename, warc_out, tar_out, json_out)

                    # pad the tar so its size is a multiple of RECORDSIZE,
                    # as a real tar archive would be
                    tar_out.flush()
                    padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
                    if padding > 0:
                        tar_out.write("\0" * padding)

    def test_gz(self, filename, size):
        """Return True if the first `size` bytes of filename decompress as a
        gzip stream with no uncompressed data left over, False otherwise.

        NOTE(review): file is opened in text mode "r", not "rb" — on Windows
        this would corrupt binary reads; confirm this only runs on POSIX.
        """
        with open(filename, "r") as f:
            # 15 + 32: max window bits plus automatic gzip header detection
            z = zlib.decompressobj(15 + 32)

            to_read = size
            while to_read > 0:
                buf_size = min(to_read, 4096)
                buf = f.read(buf_size)
                if len(buf) < buf_size:
                    # end of file, not a valid gz
                    return False
                else:
                    z.decompress(buf)
                to_read -= len(buf)

            if z.flush()!="":
                # remaining uncompressed data
                return False

        return True

    def process_file(self, filename, warc_out, tar_out, json_out):
        """Pack a single file: valid .warc.gz files are appended verbatim to
        the warc output, everything else is written as a tar entry; either
        way a JSON metadata line describing the file is appended."""
        # make tar header
        arcname = filename
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")
        entry = tarfile.TarInfo()
        statres = os.stat(filename)
        stmd = statres.st_mode
        entry.name = arcname
        entry.mode = stmd
        entry.uid = statres.st_uid
        entry.gid = statres.st_gid
        entry.size = statres.st_size
        entry.mtime = statres.st_mtime
        entry.type = tarfile.REGTYPE

        # find position in imaginary tar
        entry.offset = self.tar_pos

        # calculate position of tar entry:
        # one header block, the data, then zero-padding to a block boundary
        block_size = (tarfile.BLOCKSIZE +  # header
                      entry.size +         # data
                      (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE)
        data_offset = entry.offset + tarfile.BLOCKSIZE
        next_offset = entry.offset + block_size

        # move to next position in imaginary tar
        self.tar_pos = next_offset

        # offsets of this entry within the imaginary (reconstructed) tar
        d_src_offsets = OrderedDict()
        d_src_offsets["entry"] = entry.offset
        d_src_offsets["data"] = data_offset
        d_src_offsets["next_entry"] = next_offset

        # decide what to do with this file
        valid_warc_gz = False
        # NOTE(review): unanchored search — matches ".warc.gz" anywhere in the
        # path, not only as a suffix (e.g. "x.warc.gz.bak"); confirm intended
        if re.search(r"\.warc\.gz", filename):
            if self.verbose:
                print >>sys.stderr, "Checking %s" % filename
            valid_warc_gz = self.test_gz(filename, entry.size)
            if not valid_warc_gz:
                if self.verbose:
                    print >>sys.stderr, "Invalid gzip %s" % filename

        # save in megawarc or in tar
        d_target = OrderedDict()
        if valid_warc_gz:
            # a valid .warc.gz: append its bytes to the megawarc warc
            warc_offset = warc_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to warc" % filename
            copy_to_stream(warc_out, filename, 0, entry.size)

            d_target["container"] = "warc"
            d_target["offset"] = warc_offset
            d_target["size"] = entry.size

        else:
            # not a warc.gz file, add to tar: header, data, block padding
            tar_offset = tar_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to tar" % filename
            tar_out.write(entry.tobuf())
            copy_to_stream(tar_out, filename, 0, entry.size)
            padding = (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE
            if padding > 0:
                tar_out.write("\0" * padding)
            tar_out.flush()

            d_target["container"] = "tar"
            d_target["offset"] = tar_offset
            d_target["size"] = block_size

        # store details
        d = OrderedDict()
        d["target"] = d_target
        d["src_offsets"] = d_src_offsets
        d["header_fields"] = entry.get_info("utf-8", {})
        d["header_string"] = entry.tobuf()

        # store metadata: one compact JSON object per line
        json.dump(d, json_out, separators=(',', ':'))
        json_out.write("\n")


# recreate the original .tar from a megawarc tar+warc+json
class MegawarcRestorer(object):
def __init__(self, output_filename):
@@ -246,9 +389,10 @@ class MegawarcRestorer(object):

def main():
parser = OptionParser(
usage="Usage: %prog [--verbose] build FILE\n %prog [--verbose] restore FILE",
description="""%prog build FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog build FILE to reconstruct original tar.
usage="Usage: %prog [--verbose] convert FILE\n %prog [--verbose] pack FILE [INFILE [INFILE ...]]\n %prog [--verbose] restore FILE",
description="""%prog convert FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog pack FILE INFILE ... to create a megawarc containing the files.
Use %prog restore FILE to reconstruct original tar.
"""
)
parser.add_option("-v", "--verbose", dest="verbose",
@@ -256,11 +400,11 @@ Use %prog build FILE to reconstruct original tar.
help="print status messages", default=False)
(options, args) = parser.parse_args()

if len(args) != 2:
if len(args) < 2:
parser.print_usage()
exit(1)

if args[0] == "build":
if args[0] == "convert":
if not os.path.exists(args[1]):
print >>sys.stderr, "Input file %s does not exist." % args[1]
exit(1)
@@ -275,6 +419,17 @@ Use %prog build FILE to reconstruct original tar.
os.unlink(args[1]+ext)
raise

elif args[0] == "pack":
try:
mwb = MegawarcPacker(args[1])
mwb.verbose = options.verbose
mwb.process(args[2:])
except:
for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
if os.path.exists(args[1]+ext):
os.unlink(args[1]+ext)
raise

elif args[0] == "restore":
for ext in (".megawarc.warc.gz", ".megawarc.json.gz"):
if not os.path.exists(args[1]+ext):


Loading…
Cancel
Save