Browse Source

Use base64-encoded headers.

master
Alard 11 years ago
parent
commit
01fc97cde7
2 changed files with 22 additions and 6 deletions
  1. +13
    -4
      megawarc
  2. +9
    -2
      megawarc-fix

+ 13
- 4
megawarc View File

@@ -37,9 +37,12 @@ One line with a JSON object per file in the .tar.
"header_fields": { "header_fields": {
... (parsed fields from the tar header) ... (parsed fields from the tar header)
}, },
"header_string": string (the tar header for this entry)
"header_base64": string (the base64-encoded tar header)
} }


Older megawarcs may instead store the raw tar header, not base64-encoded, under a different key:
"header_string": string (the tar header for this entry)



USAGE USAGE
----- -----
@@ -57,6 +60,7 @@ megawarc restore FILE
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE. It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
""" """


import base64
import gzip import gzip
import json import json
import os.path import os.path
@@ -219,7 +223,7 @@ class MegawarcBuilder(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = d_src_offsets d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {}) d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)


# store metadata # store metadata
json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
@@ -338,7 +342,7 @@ class MegawarcPacker(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = d_src_offsets d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {}) d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)


# store metadata # store metadata
json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
@@ -371,7 +375,12 @@ class MegawarcRestorer(object):
if entry["target"]["container"] == "warc": if entry["target"]["container"] == "warc":
if self.verbose: if self.verbose:
print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"] print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"]
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
else:
raise Exception("Missing header_string or header_base64.")
copy_to_stream(tar_out, self.input_warc_filename, copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"]) entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE


+ 9
- 2
megawarc-fix View File

@@ -9,6 +9,7 @@
# ./megawarc-fix BASENAME # ./megawarc-fix BASENAME
# where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar) # where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar)
# #
import base64
import gzip import gzip
import json import json
import os.path import os.path
@@ -161,7 +162,10 @@ class MegawarcFixer(object):
block_size = (tarfile.BLOCKSIZE + # header block_size = (tarfile.BLOCKSIZE + # header
entry["target"]["size"] + # data entry["target"]["size"] + # data
(tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE) (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE)
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
copy_to_stream(tar_out, self.input_warc_filename, copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"]) entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE
@@ -192,7 +196,10 @@ class MegawarcFixer(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = entry["src_offsets"] d["src_offsets"] = entry["src_offsets"]
d["header_fields"] = entry["header_fields"] d["header_fields"] = entry["header_fields"]
d["header_string"] = entry["header_string"]
if "header_base64" in entry:
d["header_base64"] = entry["header_base64"]
elif "header_string" in entry:
d["header_base64"] = base64.b64encode(entry["header_string"])


json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
json_out.write("\n") json_out.write("\n")


Loading…
Cancel
Save