You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

472 lines
16 KiB

  1. #!/usr/bin/env python
  2. """
  3. megawarc is useful if you have .tar full of .warc.gz files and
  4. you really want one big .warc.gz. With megawarc you get your
  5. .warc.gz, but you can still restore the original .tar.
  6. The megawarc tool looks for .warc.gz in the .tar file and
  7. creates three files, the megawarc:
  8. FILE.warc.gz is the concatenated .warc.gz
  9. FILE.tar contains any non-warc files from the .tar
  10. FILE.json.gz contains metadata
  11. You need the JSON file to reconstruct the original .tar from
  12. the .warc.gz and .tar files. The JSON file has the location
  13. of every file from the original .tar file.
  14. METADATA FORMAT
  15. ---------------
  16. One line with a JSON object per file in the .tar.
  17. {
  18. "target": {
  19. "container": "warc" or "tar", (where is this file?)
  20. "offset": number, (where in the tar/warc does this
  21. file start? for files in the tar
  22. this includes the tar header,
  23. which is copied to the tar.)
  24. "size": size (where does this file end?
  25. for files in the tar, this includes
  26. the padding to 512 bytes)
  27. },
  28. "src_offsets": {
  29. "entry": number, (where is this file in the original tar?)
  30. "data": number, (where does the data start? entry+512)
  31. "next_entry": number (where does the next tar entry start)
  32. },
  33. "header_fields": {
  34. ... (parsed fields from the tar header)
  35. },
  36. "header_base64": string (the base64-encoded tar header)
  37. }
  38. In older megawarcs the header is sometimes not base64-encoded:
  39. "header_string": string (the tar header for this entry)
  40. USAGE
  41. -----
  42. megawarc convert FILE
  43. Converts the tar file (containing .warc.gz files) to a megawarc.
  44. It creates FILE.warc.gz, FILE.tar and FILE.json.gz from FILE.
  45. megawarc pack FILE INFILE_1 [[INFILE_2] ...]
  46. Creates a megawarc with basename FILE and recursively adds the
  47. given files and directories to it, as if they were in a tar file.
  48. It creates FILE.warc.gz, FILE.tar and FILE.json.gz.
  49. megawarc restore FILE
  50. Converts the megawarc back to the original tar.
  51. It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
  52. """
  53. import base64
  54. import gzip
  55. import json
  56. import os.path
  57. import re
  58. import sys
  59. import tarfile
  60. import zlib
  61. from optparse import OptionParser
  62. try:
  63. from collections import OrderedDict
  64. except ImportError:
  65. from ordereddict import OrderedDict
  66. # open input_filename and write the data from offset to
  67. # (offset+size) to stream
  68. def copy_to_stream(stream, input_filename, offset, size):
  69. with open(input_filename, "r") as f:
  70. f.seek(offset)
  71. to_read = size
  72. while to_read > 0:
  73. buf_size = min(to_read, 4096)
  74. buf = f.read(buf_size)
  75. if len(buf) < buf_size:
  76. raise Exception("End of file: %d bytes expected, but %d bytes read." % (buf_size, len(buf)))
  77. stream.write(buf)
  78. to_read -= len(buf)
  79. stream.flush()
  80. # part of a stream as a file
  81. # (seek relative to an offset)
  82. class RangeFile(object):
  83. def __init__(self, stream, offset, size):
  84. self._stream = stream
  85. self._offset = offset
  86. self._size = size
  87. self._current_rel_offset = 0
  88. def tell(self):
  89. return self._current_rel_offset
  90. def seek(self, pos, whence=os.SEEK_SET):
  91. if whence == os.SEEK_SET:
  92. self._current_rel_offset = pos
  93. elif whence == os.SEEK_CUR:
  94. self._current_rel_offset += pos
  95. elif whence == os.SEEK_END:
  96. self._current_rel_offset = self._size + pos
  97. else:
  98. raise Exception("Unknown whence: %d." % whence)
  99. if self._current_rel_offset < 0 or self._current_rel_offset > self._size:
  100. raise Exception("Seek outside file: %d." % self._current_rel_offset)
  101. self._stream.seek(self._offset + self._current_rel_offset)
  102. def read(self, size):
  103. size = min(self._size - self._current_rel_offset, size)
  104. self._current_rel_offset += size
  105. buf = self._stream.read(size)
  106. if len(buf) < size:
  107. raise Exception("Expected to read %d but received %d." % (size, len(buf)))
  108. return buf
  109. # check for gzip errors
  110. def test_gz(filename, offset, size, verbose=False):
  111. with open(filename, "r") as f_stream:
  112. f = RangeFile(f_stream, offset, size)
  113. try:
  114. gz = gzip.GzipFile(fileobj=f, mode="rb")
  115. while True:
  116. buf = gz.read(4096)
  117. if len(buf) == 0:
  118. break
  119. except (IOError, ValueError, zlib.error) as e:
  120. if verbose:
  121. print >>sys.stderr, e
  122. return False
  123. return True
  124. # converting a .tar with warcs to megawarc tar+warc+json
  125. class MegawarcBuilder(object):
  126. def __init__(self, input_filename):
  127. self.verbose = False
  128. self.input_filename = input_filename
  129. self.output_warc_filename = input_filename + ".megawarc.warc.gz"
  130. self.output_tar_filename = input_filename + ".megawarc.tar"
  131. self.output_json_filename = input_filename + ".megawarc.json.gz"
  132. def process(self):
  133. with open(self.output_warc_filename, "wb") as warc_out:
  134. with open(self.output_tar_filename, "wb") as tar_out:
  135. with gzip.open(self.output_json_filename, "wb") as json_out:
  136. with tarfile.open(self.input_filename, "r") as tar:
  137. for tarinfo in tar:
  138. self.process_entry(tarinfo, warc_out, tar_out, json_out)
  139. tar_out.flush()
  140. padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
  141. if padding > 0:
  142. tar_out.write("\0" * padding)
  143. def process_entry(self, entry, warc_out, tar_out, json_out):
  144. with open(self.input_filename, "r") as tar:
  145. tar.seek(entry.offset)
  146. tar_header = tar.read(entry.offset_data - entry.offset)
  147. # calculate position of tar entry
  148. block_size = (len(tar_header) + # header
  149. entry.size + # data
  150. (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE)
  151. next_offset = entry.offset + block_size
  152. d_src_offsets = OrderedDict()
  153. d_src_offsets["entry"] = entry.offset
  154. d_src_offsets["data"] = entry.offset_data
  155. d_src_offsets["next_entry"] = next_offset
  156. # decide what to do with this entry
  157. valid_warc_gz = False
  158. if entry.isfile() and re.search(r"\.warc\.gz", entry.name):
  159. if self.verbose:
  160. print >>sys.stderr, "Checking %s" % entry.name
  161. valid_warc_gz = test_gz(self.input_filename, entry.offset_data, entry.size, self.verbose)
  162. if not valid_warc_gz:
  163. if self.verbose:
  164. print >>sys.stderr, "Invalid gzip %s" % entry.name
  165. # save in megawarc or in tar
  166. d_target = OrderedDict()
  167. if valid_warc_gz:
  168. # a warc file.gz, add to megawarc
  169. warc_offset = warc_out.tell()
  170. if self.verbose:
  171. print >>sys.stderr, "Copying %s to warc" % entry.name
  172. copy_to_stream(warc_out, self.input_filename, entry.offset_data, entry.size)
  173. d_target["container"] = "warc"
  174. d_target["offset"] = warc_offset
  175. d_target["size"] = entry.size
  176. else:
  177. # not a warc.gz file, add to tar
  178. tar_offset = tar_out.tell()
  179. if self.verbose:
  180. print >>sys.stderr, "Copying %s to tar" % entry.name
  181. copy_to_stream(tar_out, self.input_filename, entry.offset, block_size)
  182. d_target["container"] = "tar"
  183. d_target["offset"] = tar_offset
  184. d_target["size"] = block_size
  185. # store details
  186. d = OrderedDict()
  187. d["target"] = d_target
  188. d["src_offsets"] = d_src_offsets
  189. d["header_fields"] = entry.get_info("utf-8", {})
  190. d["header_base64"] = base64.b64encode(tar_header)
  191. # store metadata
  192. json.dump(d, json_out, separators=(',', ':'))
  193. json_out.write("\n")
  194. # adding .warc.gz and other files to megawarc tar+warc+json
  195. class MegawarcPacker(object):
  196. def __init__(self, output_basename):
  197. self.verbose = False
  198. self.output_basename = output_basename
  199. self.output_warc_filename = output_basename + ".megawarc.warc.gz"
  200. self.output_tar_filename = output_basename + ".megawarc.tar"
  201. self.output_json_filename = output_basename + ".megawarc.json.gz"
  202. self.tar_pos = 0
  203. def process(self, filelist):
  204. with open(self.output_warc_filename, "wb") as warc_out:
  205. with open(self.output_tar_filename, "wb") as tar_out:
  206. with gzip.open(self.output_json_filename, "wb") as json_out:
  207. def each_file(arg, dirname, names):
  208. for n in names:
  209. n = os.path.join(dirname, n)
  210. if os.path.isfile(n):
  211. self.process_file(n, warc_out, tar_out, json_out)
  212. for filename in filelist:
  213. if os.path.isdir(filename):
  214. os.path.walk(filename, each_file, None)
  215. elif os.path.isfile(filename):
  216. self.process_file(filename, warc_out, tar_out, json_out)
  217. tar_out.flush()
  218. padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
  219. if padding > 0:
  220. tar_out.write("\0" * padding)
  221. def process_file(self, filename, warc_out, tar_out, json_out):
  222. # make tar header
  223. arcname = filename
  224. arcname = arcname.replace(os.sep, "/")
  225. arcname = arcname.lstrip("/")
  226. entry = tarfile.TarInfo()
  227. statres = os.stat(filename)
  228. stmd = statres.st_mode
  229. entry.name = arcname
  230. entry.mode = stmd
  231. entry.uid = statres.st_uid
  232. entry.gid = statres.st_gid
  233. entry.size = statres.st_size
  234. entry.mtime = statres.st_mtime
  235. entry.type = tarfile.REGTYPE
  236. tar_header = entry.tobuf()
  237. # find position in imaginary tar
  238. entry.offset = self.tar_pos
  239. # calculate position of tar entry
  240. block_size = (len(tar_header) + # header
  241. entry.size + # data
  242. (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE)
  243. data_offset = entry.offset + len(tar_header)
  244. next_offset = entry.offset + block_size
  245. # move to next position in imaginary tar
  246. self.tar_pos = next_offset
  247. d_src_offsets = OrderedDict()
  248. d_src_offsets["entry"] = entry.offset
  249. d_src_offsets["data"] = data_offset
  250. d_src_offsets["next_entry"] = next_offset
  251. # decide what to do with this file
  252. valid_warc_gz = False
  253. if re.search(r"\.warc\.gz", filename):
  254. if self.verbose:
  255. print >>sys.stderr, "Checking %s" % filename
  256. valid_warc_gz = test_gz(filename, 0, entry.size, self.verbose)
  257. if not valid_warc_gz:
  258. if self.verbose:
  259. print >>sys.stderr, "Invalid gzip %s" % filename
  260. # save in megawarc or in tar
  261. d_target = OrderedDict()
  262. if valid_warc_gz:
  263. # a warc file.gz, add to megawarc
  264. warc_offset = warc_out.tell()
  265. if self.verbose:
  266. print >>sys.stderr, "Copying %s to warc" % filename
  267. copy_to_stream(warc_out, filename, 0, entry.size)
  268. d_target["container"] = "warc"
  269. d_target["offset"] = warc_offset
  270. d_target["size"] = entry.size
  271. else:
  272. # not a warc.gz file, add to tar
  273. tar_offset = tar_out.tell()
  274. if self.verbose:
  275. print >>sys.stderr, "Copying %s to tar" % filename
  276. tar_out.write(tar_header)
  277. copy_to_stream(tar_out, filename, 0, entry.size)
  278. padding = (tarfile.BLOCKSIZE - entry.size) % tarfile.BLOCKSIZE
  279. if padding > 0:
  280. tar_out.write("\0" * padding)
  281. tar_out.flush()
  282. d_target["container"] = "tar"
  283. d_target["offset"] = tar_offset
  284. d_target["size"] = block_size
  285. # store details
  286. d = OrderedDict()
  287. d["target"] = d_target
  288. d["src_offsets"] = d_src_offsets
  289. d["header_fields"] = entry.get_info("utf-8", {})
  290. d["header_base64"] = base64.b64encode(tar_header)
  291. # store metadata
  292. json.dump(d, json_out, separators=(',', ':'))
  293. json_out.write("\n")
  294. # recreate the original .tar from a megawarc tar+warc+json
  295. class MegawarcRestorer(object):
  296. def __init__(self, output_filename):
  297. self.verbose = False
  298. self.output_filename = output_filename
  299. self.input_warc_filename = output_filename + ".megawarc.warc.gz"
  300. self.input_tar_filename = output_filename + ".megawarc.tar"
  301. self.input_json_filename = output_filename + ".megawarc.json.gz"
  302. def process(self):
  303. with gzip.open(self.input_json_filename, "rb") as json_in:
  304. with open(self.output_filename, "wb") as tar_out:
  305. for line in json_in:
  306. entry = json.loads(line)
  307. self.process_entry(entry, tar_out)
  308. tar_out.flush()
  309. padding = (tarfile.RECORDSIZE - tar_out.tell()) % tarfile.RECORDSIZE
  310. if padding > 0:
  311. tar_out.write("\0" * padding)
  312. def process_entry(self, entry, tar_out):
  313. if entry["target"]["container"] == "warc":
  314. if self.verbose:
  315. print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"]
  316. if "header_base64" in entry:
  317. tar_out.write(base64.b64decode(entry["header_base64"]))
  318. elif "header_string" in entry:
  319. tar_out.write(entry["header_string"])
  320. else:
  321. raise Exception("Missing header_string or header_base64.")
  322. copy_to_stream(tar_out, self.input_warc_filename,
  323. entry["target"]["offset"], entry["target"]["size"])
  324. padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE
  325. if padding > 0:
  326. tar_out.write("\0" * padding)
  327. elif entry["target"]["container"] == "tar":
  328. if self.verbose:
  329. print >>sys.stderr, "Copying %s from tar" % entry["header_fields"]["name"]
  330. copy_to_stream(tar_out, self.input_tar_filename,
  331. entry["target"]["offset"], entry["target"]["size"])
  332. padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE
  333. if padding > 0:
  334. tar_out.write("\0" * padding)
  335. else:
  336. raise Exception("Unkown container: %s for %s" %
  337. (entry["target"]["container"], entry["header_fields"]["name"]))
  338. def main():
  339. parser = OptionParser(
  340. usage="Usage: %prog [--verbose] convert FILE\n %prog [--verbose] pack FILE [INFILE [INFILE ...]]\n %prog [--verbose] restore FILE",
  341. description="""%prog convert FILE converts the tar file (containing .warc.gz files) to a megawarc. A megawarc has three parts: 1. a .warc.gz of the concatenated warc files; 2. a .tar with the non-warc files from the original tar; 3. a .json.gz with metadata that can be used to reconstruct the original tar.
  342. Use %prog pack FILE INFILE ... to create a megawarc containing the files.
  343. Use %prog restore FILE to reconstruct original tar.
  344. """
  345. )
  346. parser.add_option("-v", "--verbose", dest="verbose",
  347. action="store_true",
  348. help="print status messages", default=False)
  349. (options, args) = parser.parse_args()
  350. if len(args) < 2:
  351. parser.print_usage()
  352. exit(1)
  353. if args[0] == "convert":
  354. if not os.path.exists(args[1]):
  355. print >>sys.stderr, "Input file %s does not exist." % args[1]
  356. exit(1)
  357. try:
  358. mwb = MegawarcBuilder(args[1])
  359. mwb.verbose = options.verbose
  360. mwb.process()
  361. except:
  362. for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
  363. if os.path.exists(args[1]+ext):
  364. os.unlink(args[1]+ext)
  365. raise
  366. elif args[0] == "pack":
  367. try:
  368. mwb = MegawarcPacker(args[1])
  369. mwb.verbose = options.verbose
  370. mwb.process(args[2:])
  371. except:
  372. for ext in (".megawarc.warc.gz", ".megawarc.json.gz", ".megawarc.tar"):
  373. if os.path.exists(args[1]+ext):
  374. os.unlink(args[1]+ext)
  375. raise
  376. elif args[0] == "restore":
  377. for ext in (".megawarc.warc.gz", ".megawarc.json.gz"):
  378. if not os.path.exists(args[1]+ext):
  379. print >>sys.stderr, "Input file %s does not exist." % (args[1] + ext)
  380. exit(1)
  381. if os.path.exists(args[1]):
  382. print >>sys.stderr, "Outputfile %s already exists." % args[1]
  383. exit(1)
  384. try:
  385. mwr = MegawarcRestorer(args[1])
  386. mwr.verbose = options.verbose
  387. mwr.process()
  388. except:
  389. if os.path.exists(args[1]):
  390. os.unlink(args[1])
  391. raise
  392. else:
  393. parser.print_usage()
  394. exit(1)
# run the command-line interface when executed as a script
if __name__ == "__main__":
    main()