Browse Source

Use base64-encoded headers.

master
Alard 11 years ago
parent
commit
01fc97cde7
2 changed files with 22 additions and 6 deletions
  1. +13
    -4
      megawarc
  2. +9
    -2
      megawarc-fix

+ 13
- 4
megawarc View File

@@ -37,9 +37,12 @@ One line with a JSON object per file in the .tar.
"header_fields": {
... (parsed fields from the tar header)
},
"header_string": string (the tar header for this entry)
"header_base64": string (the base64-encoded tar header)
}

In older megawarcs the header is sometimes not base64-encoded:
"header_string": string (the tar header for this entry)


USAGE
-----
@@ -57,6 +60,7 @@ megawarc restore FILE
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
"""

import base64
import gzip
import json
import os.path
@@ -219,7 +223,7 @@ class MegawarcBuilder(object):
d["target"] = d_target
d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)

# store metadata
json.dump(d, json_out, separators=(',', ':'))
@@ -338,7 +342,7 @@ class MegawarcPacker(object):
d["target"] = d_target
d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)

# store metadata
json.dump(d, json_out, separators=(',', ':'))
@@ -371,7 +375,12 @@ class MegawarcRestorer(object):
if entry["target"]["container"] == "warc":
if self.verbose:
print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"]
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
else:
raise Exception("Missing header_string or header_base64.")
copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE


+ 9
- 2
megawarc-fix View File

@@ -9,6 +9,7 @@
# ./megawarc-fix BASENAME
# where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar)
#
import base64
import gzip
import json
import os.path
@@ -161,7 +162,10 @@ class MegawarcFixer(object):
block_size = (tarfile.BLOCKSIZE + # header
entry["target"]["size"] + # data
(tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE)
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE
@@ -192,7 +196,10 @@ class MegawarcFixer(object):
d["target"] = d_target
d["src_offsets"] = entry["src_offsets"]
d["header_fields"] = entry["header_fields"]
d["header_string"] = entry["header_string"]
if "header_base64" in entry:
d["header_base64"] = entry["header_base64"]
elif "header_string" in entry:
d["header_base64"] = base64.b64encode(entry["header_string"])

json.dump(d, json_out, separators=(',', ':'))
json_out.write("\n")


Loading…
Cancel
Save