Browse Source

Add 'pack' option to pack files.

master
Alard 11 years ago
parent
commit
a55b7b8adc
2 changed files with 169 additions and 7 deletions
  1. +8
    -1
      README.md
  2. +161
    -6
      megawarc

+ 8
- 1
README.md View File

@@ -40,12 +40,19 @@ One line with a JSON object per file in the .tar.
Usage Usage
----- -----
``` ```
megawarc build FILE
megawarc convert FILE
``` ```


Converts the tar file (containing .warc.gz files) to a megawarc. Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE. It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.


```
megawarc pack FILE INFILE_1 [[INFILE_2] ...]
```
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

``` ```
megawarc restore FILE megawarc restore FILE
``` ```


+ 161
- 6
megawarc View File

@@ -43,10 +43,15 @@ One line with a JSON object per file in the .tar.


USAGE USAGE
----- -----
megawarc build FILE
megawarc convert FILE
Converts the tar file (containing .warc.gz files) to a megawarc. Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE. It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.


megawarc pack FILE INFILE_1 [[INFILE_2] ...]
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

megawarc restore FILE megawarc restore FILE
Converts the megawarc back to the original tar. Converts the megawarc back to the original tar.
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE. It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
@@ -197,6 +202,144 @@ class MegawarcBuilder(object):
json_out.write("\n") json_out.write("\n")




# adding .warc.gz and other files to megawarc tar+warc+json
class MegawarcPacker(object):
    """Build a megawarc directly from loose files and directories.

    Every input file is treated as if it were a member of an imaginary
    tar archive.  Files whose name contains ".warc.gz" and that pass
    test_gz() are appended to <basename>.megawarc.warc.gz; every other
    file is written, with a tar header, to <basename>.megawarc.tar.
    One JSON line per file is written to <basename>.megawarc.json.gz.
    """

    def __init__(self, output_basename):
        """Derive the three output filenames from *output_basename*."""
        self.verbose = False
        self.output_basename = output_basename
        self.output_warc_filename = output_basename + ".megawarc.warc.gz"
        self.output_tar_filename = output_basename + ".megawarc.tar"
        self.output_json_filename = output_basename + ".megawarc.json.gz"

        # offset at which the next entry would start in the imaginary tar
        self.tar_pos = 0

    def _iter_files(self, top):
        """Yield the regular files found below directory *top*.

        os.walk() visits a directory's own entries before descending
        into its subdirectories and does not follow symlinked
        directories, which is the same order and file set the
        Python-2-only os.path.walk() callback produced.
        """
        for dirname, _subdirs, names in os.walk(top):
            for name in names:
                path = os.path.join(dirname, name)
                if os.path.isfile(path):
                    yield path

    def process(self, filelist):
        """Pack every file in *filelist* into the megawarc outputs.

        Directories are walked recursively; arguments that are neither
        a directory nor a regular file are ignored.  After the last
        file the tar output is padded with NUL bytes up to a multiple
        of tarfile.RECORDSIZE.
        """
        with open(self.output_warc_filename, "wb") as warc_out:
            with open(self.output_tar_filename, "wb") as tar_out:
                with gzip.open(self.output_json_filename, "wb") as json_out:
                    for filename in filelist:
                        if os.path.isdir(filename):
                            for path in self._iter_files(filename):
                                self.process_file(path, warc_out,
                                                  tar_out, json_out)
                        elif os.path.isfile(filename):
                            self.process_file(filename, warc_out,
                                              tar_out, json_out)

                    tar_out.flush()
                    padding = ((tarfile.RECORDSIZE - tar_out.tell())
                               % tarfile.RECORDSIZE)
                    if padding > 0:
                        # bytes literal: the stream is opened in binary mode
                        tar_out.write(b"\0" * padding)

    def test_gz(self, filename, size):
        """Return True if the first *size* bytes of *filename* are valid gzip.

        The data may consist of several concatenated gzip members (the
        usual layout of a .warc.gz); each member is decompressed with
        its own decompressor, so data following the first member is
        validated instead of piling up in memory.

        Returns False when the file is shorter than *size*, when zlib
        rejects the data, or (where the runtime exposes the ``eof``
        attribute) when the last member is truncated.  A *size* of 0
        returns True.
        """
        try:
            with open(filename, "rb") as f:
                # 15 + 32: maximum window, auto-detect gzip/zlib header
                z = zlib.decompressobj(15 + 32)

                to_read = size
                while to_read > 0:
                    buf_size = min(to_read, 4096)
                    buf = f.read(buf_size)
                    if len(buf) < buf_size:
                        # end of file, not a valid gz
                        return False
                    to_read -= len(buf)

                    # Feed the chunk; whatever follows the end of a
                    # member shows up in unused_data and is handed to
                    # a fresh decompressor for the next member.
                    chunk = buf
                    while chunk:
                        z.decompress(chunk)
                        chunk = z.unused_data
                        if chunk:
                            z = zlib.decompressobj(15 + 32)

                if z.flush():
                    # remaining uncompressed data
                    return False

                # Decompressor objects without an ``eof`` attribute
                # (older runtimes) cannot report truncation; assume
                # the stream is complete there.
                if size > 0 and not getattr(z, "eof", True):
                    return False
        except zlib.error:
            # corrupt or non-gzip data: report it as invalid so the
            # caller stores the file in the tar instead of aborting
            return False

        return True

    def process_file(self, filename, warc_out, tar_out, json_out):
        """Add one file to the megawarc and record its metadata.

        A tar header is built from os.stat(filename) and positioned at
        the current offset of the imaginary tar.  If the name contains
        ".warc.gz" and test_gz() accepts the file, its bytes are
        appended to *warc_out*; otherwise header, data and block
        padding are written to *tar_out*.  In both cases one JSON line
        describing the entry is written to *json_out*.
        """
        # make tar header
        arcname = filename.replace(os.sep, "/").lstrip("/")
        statres = os.stat(filename)
        entry = tarfile.TarInfo()
        entry.name = arcname
        entry.mode = statres.st_mode
        entry.uid = statres.st_uid
        entry.gid = statres.st_gid
        entry.size = statres.st_size
        entry.mtime = statres.st_mtime
        entry.type = tarfile.REGTYPE

        # find position in imaginary tar
        entry.offset = self.tar_pos

        # calculate position of tar entry: header block, data, and
        # padding up to the next BLOCKSIZE boundary
        data_padding = (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE
        block_size = tarfile.BLOCKSIZE + entry.size + data_padding
        data_offset = entry.offset + tarfile.BLOCKSIZE
        next_offset = entry.offset + block_size

        # move to next position in imaginary tar
        self.tar_pos = next_offset

        d_src_offsets = OrderedDict()
        d_src_offsets["entry"] = entry.offset
        d_src_offsets["data"] = data_offset
        d_src_offsets["next_entry"] = next_offset

        # decide what to do with this file
        valid_warc_gz = False
        if re.search(r"\.warc\.gz", filename):
            if self.verbose:
                print >>sys.stderr, "Checking %s" % filename
            valid_warc_gz = self.test_gz(filename, entry.size)
            if not valid_warc_gz:
                if self.verbose:
                    print >>sys.stderr, "Invalid gzip %s" % filename

        # save in megawarc or in tar
        d_target = OrderedDict()
        if valid_warc_gz:
            # a warc file.gz, add to megawarc
            warc_offset = warc_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to warc" % filename
            # copy_to_stream is defined elsewhere in this file;
            # presumably copies entry.size bytes starting at offset 0
            copy_to_stream(warc_out, filename, 0, entry.size)

            d_target["container"] = "warc"
            d_target["offset"] = warc_offset
            d_target["size"] = entry.size

        else:
            # not a warc.gz file, add to tar
            tar_offset = tar_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to tar" % filename
            tar_out.write(entry.tobuf())
            copy_to_stream(tar_out, filename, 0, entry.size)
            if data_padding > 0:
                tar_out.write(b"\0" * data_padding)
            tar_out.flush()

            d_target["container"] = "tar"
            d_target["offset"] = tar_offset
            d_target["size"] = block_size

        # store details
        d = OrderedDict()
        d["target"] = d_target
        d["src_offsets"] = d_src_offsets
        d["header_fields"] = entry.get_info("utf-8", {})
        d["header_string"] = entry.tobuf()

        # store metadata
        json.dump(d, json_out, separators=(',', ':'))
        json_out.write("\n")


# recreate the original .tar from a megawarc tar+warc+json # recreate the original .tar from a megawarc tar+warc+json
class MegawarcRestorer(object): class MegawarcRestorer(object):
def __init__(self, output_filename): def __init__(self, output_filename):
@@ -246,9 +389,10 @@ class MegawarcRestorer(object):


def main(): def main():
parser = OptionParser( parser = OptionParser(
usage="Usage: %prog [--verbose] build FILE\n %prog [--verbose] restore FILE",
description="""%prog build FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog build FILE to reconstruct original tar.
usage="Usage: %prog [--verbose] convert FILE\n %prog [--verbose] pack FILE [INFILE [INFILE ...]]\n %prog [--verbose] restore FILE",
description="""%prog convert FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog pack FILE INFILE ... to create a megawarc containing the files.
Use %prog restore FILE to reconstruct original tar.
""" """
) )
parser.add_option("-v", "--verbose", dest="verbose", parser.add_option("-v", "--verbose", dest="verbose",
@@ -256,11 +400,11 @@ Use %prog build FILE to reconstruct original tar.
help="print status messages", default=False) help="print status messages", default=False)
(options, args) = parser.parse_args() (options, args) = parser.parse_args()


if len(args) != 2:
if len(args) < 2:
parser.print_usage() parser.print_usage()
exit(1) exit(1)


if args[0] == "build":
if args[0] == "convert":
if not os.path.exists(args[1]): if not os.path.exists(args[1]):
print >>sys.stderr, "Input file %s does not exist." % args[1] print >>sys.stderr, "Input file %s does not exist." % args[1]
exit(1) exit(1)
@@ -275,6 +419,17 @@ Use %prog build FILE to reconstruct original tar.
os.unlink(args[1]+ext) os.unlink(args[1]+ext)
raise raise


elif args[0] == "pack":
try:
mwb = MegawarcPacker(args[1])
mwb.verbose = options.verbose
mwb.process(args[2:])
except:
for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
if os.path.exists(args[1]+ext):
os.unlink(args[1]+ext)
raise

elif args[0] == "restore": elif args[0] == "restore":
for ext in (".megawarc.warc.gz", ".megawarc.json.gz"): for ext in (".megawarc.warc.gz", ".megawarc.json.gz"):
if not os.path.exists(args[1]+ext): if not os.path.exists(args[1]+ext):


Loading…
Cancel
Save