Browse Source

Add 'pack' option to pack files.

master
Alard 11 years ago
parent
commit
a55b7b8adc
2 changed files with 169 additions and 7 deletions
  1. +8
    -1
      README.md
  2. +161
    -6
      megawarc

+ 8
- 1
README.md View File

@@ -40,12 +40,19 @@ One line with a JSON object per file in the .tar.
Usage
-----
```
megawarc build FILE
megawarc convert FILE
```

Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz from FILE.

```
megawarc pack FILE INFILE_1 [[INFILE_2] ...]
```
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

```
megawarc restore FILE
```


+ 161
- 6
megawarc View File

@@ -43,10 +43,15 @@ One line with a JSON object per file in the .tar.

USAGE
-----
megawarc build FILE
megawarc convert FILE
Converts the tar file (containing .warc.gz files) to a megawarc.
It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.

megawarc pack FILE INFILE_1 [[INFILE_2] ...]
Creates a megawarc with basename FILE and recursively adds the
given files and directories to it, as if they were in a tar file.
It creates FILE.megawarc.warc.gz, FILE.megawarc.tar and FILE.megawarc.json.gz.

megawarc restore FILE
Converts the megawarc back to the original tar.
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
@@ -197,6 +202,144 @@ class MegawarcBuilder(object):
json_out.write("\n")


# adding .warc.gz and other files to megawarc tar+warc+json
class MegawarcPacker(object):
    """Packs loose files and directories into a megawarc.

    Given an output basename FILE, writes three files:
      FILE.megawarc.warc.gz  - concatenation of the valid .warc.gz inputs
      FILE.megawarc.tar      - tar containing every other input file
      FILE.megawarc.json.gz  - one JSON line per input, recording where each
                               file lives and the tar header it would have had

    Inputs are treated as if they had been members of a single tar file:
    each file is assigned an offset in that "imaginary tar" so the original
    tar can be reconstructed later.

    NOTE: this is Python 2 code (print >>stream syntax, os.path.walk).
    """

    def __init__(self, output_basename):
        """Set up output filenames for the given basename; no files are
        opened until process() is called."""
        # when True, per-file progress messages go to stderr
        self.verbose = False
        self.output_basename = output_basename
        self.output_warc_filename = output_basename + ".megawarc.warc.gz"
        self.output_tar_filename = output_basename + ".megawarc.tar"
        self.output_json_filename = output_basename + ".megawarc.json.gz"

        # current write offset in the imaginary tar (advanced per file)
        self.tar_pos = 0

    def process(self, filelist):
        """Pack every path in filelist (directories are walked recursively;
        only regular files are packed) into the three output files, then pad
        the tar to a whole number of tar records."""
        with open(self.output_warc_filename, "wb") as warc_out:
            with open(self.output_tar_filename, "wb") as tar_out:
                with gzip.open(self.output_json_filename, "wb") as json_out:
                    # callback for os.path.walk: pack each regular file found
                    def each_file(arg, dirname, names):
                        for n in names:
                            n = os.path.join(dirname, n)
                            if os.path.isfile(n):
                                self.process_file(n, warc_out, tar_out, json_out)

                    for filename in filelist:
                        if os.path.isdir(filename):
                            # Python 2 only; removed in Python 3 (os.walk there)
                            os.path.walk(filename, each_file, None)
                        elif os.path.isfile(filename):
                            self.process_file(filename, warc_out, tar_out, json_out)

                    # pad the tar so its size is a multiple of RECORDSIZE,
                    # as a real tar archive would be
                    tar_out.flush()
                    padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
                    if padding > 0:
                        tar_out.write("\0" * padding)

    def test_gz(self, filename, size):
        """Return True if the first `size` bytes of filename decompress as a
        gzip stream with no uncompressed data left over, False otherwise.

        NOTE(review): file is opened in text mode "r", not "rb" — on Windows
        this would corrupt binary reads; confirm this only runs on POSIX.
        """
        with open(filename, "r") as f:
            # 15 + 32: max window bits plus automatic gzip header detection
            z = zlib.decompressobj(15 + 32)

            to_read = size
            while to_read > 0:
                buf_size = min(to_read, 4096)
                buf = f.read(buf_size)
                if len(buf) < buf_size:
                    # end of file, not a valid gz
                    return False
                else:
                    z.decompress(buf)
                to_read -= len(buf)

            if z.flush()!="":
                # remaining uncompressed data
                return False

        return True

    def process_file(self, filename, warc_out, tar_out, json_out):
        """Pack a single file: valid .warc.gz files are appended verbatim to
        the warc output, everything else is written as a tar entry; either
        way a JSON metadata line describing the file is appended."""
        # make tar header
        arcname = filename
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")
        entry = tarfile.TarInfo()
        statres = os.stat(filename)
        stmd = statres.st_mode
        entry.name = arcname
        entry.mode = stmd
        entry.uid = statres.st_uid
        entry.gid = statres.st_gid
        entry.size = statres.st_size
        entry.mtime = statres.st_mtime
        entry.type = tarfile.REGTYPE

        # find position in imaginary tar
        entry.offset = self.tar_pos

        # calculate position of tar entry:
        # one header block, the data, then zero-padding to a block boundary
        block_size = (tarfile.BLOCKSIZE +  # header
                      entry.size +         # data
                      (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE)
        data_offset = entry.offset + tarfile.BLOCKSIZE
        next_offset = entry.offset + block_size

        # move to next position in imaginary tar
        self.tar_pos = next_offset

        # offsets of this entry within the imaginary (reconstructed) tar
        d_src_offsets = OrderedDict()
        d_src_offsets["entry"] = entry.offset
        d_src_offsets["data"] = data_offset
        d_src_offsets["next_entry"] = next_offset

        # decide what to do with this file
        valid_warc_gz = False
        # NOTE(review): unanchored search — matches ".warc.gz" anywhere in the
        # path, not only as a suffix (e.g. "x.warc.gz.bak"); confirm intended
        if re.search(r"\.warc\.gz", filename):
            if self.verbose:
                print >>sys.stderr, "Checking %s" % filename
            valid_warc_gz = self.test_gz(filename, entry.size)
            if not valid_warc_gz:
                if self.verbose:
                    print >>sys.stderr, "Invalid gzip %s" % filename

        # save in megawarc or in tar
        d_target = OrderedDict()
        if valid_warc_gz:
            # a valid .warc.gz: append its bytes to the megawarc warc
            warc_offset = warc_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to warc" % filename
            copy_to_stream(warc_out, filename, 0, entry.size)

            d_target["container"] = "warc"
            d_target["offset"] = warc_offset
            d_target["size"] = entry.size

        else:
            # not a warc.gz file, add to tar: header, data, block padding
            tar_offset = tar_out.tell()
            if self.verbose:
                print >>sys.stderr, "Copying %s to tar" % filename
            tar_out.write(entry.tobuf())
            copy_to_stream(tar_out, filename, 0, entry.size)
            padding = (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE
            if padding > 0:
                tar_out.write("\0" * padding)
            tar_out.flush()

            d_target["container"] = "tar"
            d_target["offset"] = tar_offset
            d_target["size"] = block_size

        # store details
        d = OrderedDict()
        d["target"] = d_target
        d["src_offsets"] = d_src_offsets
        d["header_fields"] = entry.get_info("utf-8", {})
        d["header_string"] = entry.tobuf()

        # store metadata: one compact JSON object per line
        json.dump(d, json_out, separators=(',', ':'))
        json_out.write("\n")


# recreate the original .tar from a megawarc tar+warc+json
class MegawarcRestorer(object):
def __init__(self, output_filename):
@@ -246,9 +389,10 @@ class MegawarcRestorer(object):

def main():
parser = OptionParser(
usage="Usage: %prog [--verbose] build FILE\n %prog [--verbose] restore FILE",
description="""%prog build FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog build FILE to reconstruct original tar.
usage="Usage: %prog [--verbose] convert FILE\n %prog [--verbose] pack FILE [INFILE [INFILE ...]]\n %prog [--verbose] restore FILE",
description="""%prog convert FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
Use %prog pack FILE INFILE ... to create a megawarc containing the files.
Use %prog restore FILE to reconstruct original tar.
"""
)
parser.add_option("-v", "--verbose", dest="verbose",
@@ -256,11 +400,11 @@ Use %prog build FILE to reconstruct original tar.
help="print status messages", default=False)
(options, args) = parser.parse_args()

if len(args) != 2:
if len(args) < 2:
parser.print_usage()
exit(1)

if args[0] == "build":
if args[0] == "convert":
if not os.path.exists(args[1]):
print >>sys.stderr, "Input file %s does not exist." % args[1]
exit(1)
@@ -275,6 +419,17 @@ Use %prog build FILE to reconstruct original tar.
os.unlink(args[1]+ext)
raise

elif args[0] == "pack":
try:
mwb = MegawarcPacker(args[1])
mwb.verbose = options.verbose
mwb.process(args[2:])
except:
for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
if os.path.exists(args[1]+ext):
os.unlink(args[1]+ext)
raise

elif args[0] == "restore":
for ext in (".megawarc.warc.gz", ".megawarc.json.gz"):
if not os.path.exists(args[1]+ext):


Loading…
Cancel
Save