From 01fc97cde7bde4a404fb35bcee8febfd14c17052 Mon Sep 17 00:00:00 2001 From: Alard Date: Mon, 15 Oct 2012 09:51:21 +0200 Subject: [PATCH] Use base64-encoded headers. --- megawarc | 17 +++++++++++++---- megawarc-fix | 11 +++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/megawarc b/megawarc index fce5604..0340bd6 100755 --- a/megawarc +++ b/megawarc @@ -37,9 +37,12 @@ One line with a JSON object per file in the .tar. "header_fields": { ... (parsed fields from the tar header) }, - "header_string": string (the tar header for this entry) + "header_base64": string (the base64-encoded tar header) } +In older megawarcs the header is sometimes not base64-encoded: + "header_string": string (the tar header for this entry) + USAGE ----- @@ -57,6 +60,7 @@ megawarc restore FILE It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE. """ +import base64 import gzip import json import os.path @@ -219,7 +223,7 @@ class MegawarcBuilder(object): d["target"] = d_target d["src_offsets"] = d_src_offsets d["header_fields"] = entry.get_info("utf-8", {}) - d["header_string"] = tar_header + d["header_base64"] = base64.b64encode(tar_header) # store metadata json.dump(d, json_out, separators=(',', ':')) @@ -338,7 +342,7 @@ class MegawarcPacker(object): d["target"] = d_target d["src_offsets"] = d_src_offsets d["header_fields"] = entry.get_info("utf-8", {}) - d["header_string"] = tar_header + d["header_base64"] = base64.b64encode(tar_header) # store metadata json.dump(d, json_out, separators=(',', ':')) @@ -371,7 +375,12 @@ class MegawarcRestorer(object): if entry["target"]["container"] == "warc": if self.verbose: print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"] - tar_out.write(entry["header_string"]) + if "header_base64" in entry: + tar_out.write(base64.b64decode(entry["header_base64"])) + elif "header_string" in entry: + tar_out.write(entry["header_string"]) + else: + raise Exception("Missing header_string or header_base64.") copy_to_stream(tar_out, self.input_warc_filename, entry["target"]["offset"], entry["target"]["size"]) padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE diff --git a/megawarc-fix b/megawarc-fix index b71d456..17b7d3c 100755 --- a/megawarc-fix +++ b/megawarc-fix @@ -9,6 +9,7 @@ # ./megawarc-fix BASENAME # where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar) # +import base64 import gzip import json import os.path @@ -161,7 +162,10 @@ class MegawarcFixer(object): block_size = (tarfile.BLOCKSIZE + # header entry["target"]["size"] + # data (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE) - tar_out.write(entry["header_string"]) + if "header_base64" in entry: + tar_out.write(base64.b64decode(entry["header_base64"])) + elif "header_string" in entry: + tar_out.write(entry["header_string"]) copy_to_stream(tar_out, self.input_warc_filename, entry["target"]["offset"], entry["target"]["size"]) padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE @@ -192,7 +196,10 @@ class MegawarcFixer(object): d["target"] = d_target d["src_offsets"] = entry["src_offsets"] d["header_fields"] = entry["header_fields"] - d["header_string"] = entry["header_string"] + if "header_base64" in entry: + d["header_base64"] = entry["header_base64"] + elif "header_string" in entry: + d["header_base64"] = base64.b64encode(entry["header_string"]) json.dump(d, json_out, separators=(',', ':')) json_out.write("\n")