A framework for quick web archiving
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

283 lines
9.0 KiB

  1. import base64
  2. import fcntl
  3. import hashlib
  4. import io
  5. import itertools
  6. import json
  7. import logging
  8. import qwarc.utils
  9. import time
  10. import uuid
  11. import warcio
  12. class _WARCRecord:
  13. def __init__(self, headers, body, length):
  14. self.headers = headers
  15. self.body = body
  16. self.length = length
  17. class _Digester:
  18. def __init__(self):
  19. self._digester = hashlib.sha1()
  20. def update(self, data):
  21. self._digester.update(data)
  22. def __str__(self):
  23. return f'sha1:{base64.b32encode(self._digester.digest()).decode("ascii")}'
  24. class WARC:
  25. def __init__(self, prefix, maxFileSize, dedupe, command, specFile, specDependencies, logFilename):
  26. '''
  27. Initialise the WARC writer
  28. prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
  29. maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
  30. dedupe: bool, whether to enable record deduplication
  31. command: list, the command line call for qwarc
  32. specFile: str, path to the spec file
  33. specDependencies: qwarc.utils.SpecDependencies
  34. logFilename: str, name of the log file written by this process
  35. '''
  36. self._prefix = prefix
  37. self._counter = 0
  38. self._maxFileSize = maxFileSize
  39. self._closed = True
  40. self._file = None
  41. self._dedupe = dedupe
  42. self._dedupeMap = {}
  43. self._command = command
  44. self._specFile = specFile
  45. self._specDependencies = specDependencies
  46. self._logFilename = logFilename
  47. self._metaWarcinfoRecordID = None
  48. self._write_meta_warc(self._write_initial_meta_records)
  49. def _ensure_opened(self):
  50. '''Open the next file that doesn't exist yet if there is currently no file opened'''
  51. if not self._closed:
  52. return
  53. while True:
  54. filename = f'{self._prefix}-{self._counter:05d}.warc.gz'
  55. try:
  56. # Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
  57. self._file = open(filename, 'xb')
  58. fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
  59. except FileExistsError:
  60. logging.info(f'{filename} already exists, skipping')
  61. self._counter += 1
  62. else:
  63. break
  64. logging.info(f'Opened {filename}')
  65. self._closed = False
  66. self._counter += 1
  67. def _create_warc_record(self, recordType, headers, body, length = None):
  68. startPos = body.tell()
  69. if 'WARC-Record-ID' not in headers:
  70. headers['WARC-Record-ID'] = f'<urn:uuid:{uuid.uuid4()}>'
  71. headers['WARC-Type'] = recordType
  72. digester = _Digester()
  73. for buf in qwarc.utils.iter_file(body, length = length):
  74. digester.update(buf)
  75. body.seek(startPos)
  76. headers['WARC-Block-Digest'] = str(digester)
  77. if 'WARC-Payload-Digest' not in headers and headers['Content-Type'].startswith('application/http;'):
  78. digester = _Digester()
  79. httpHeaders = qwarc.utils.read_http_headers(body)
  80. for buf in qwarc.utils.read_http_body(body, length = (length - body.tell()) if length is not None else None, headers = httpHeaders):
  81. digester.update(buf)
  82. body.seek(startPos)
  83. headers['WARC-Payload-Digest'] = str(digester)
  84. if not length:
  85. body.seek(0, io.SEEK_END)
  86. length = body.tell() - startPos
  87. body.seek(startPos)
  88. headers['Content-Length'] = str(length)
  89. return _WARCRecord(headers, body, length)
  90. def _write_warc_record(self, record):
  91. with qwarc.utils.GzipWrapper(self._file) as fp:
  92. fp.write(b'WARC/1.1\r\n')
  93. fp.write(b'\r\n'.join(k.encode('utf-8') + b': ' + v.encode('utf-8') for k, v in record.headers.items()))
  94. fp.write(b'\r\n\r\n') # Trailing CRLF for last header line plus end of headers
  95. for buf in qwarc.utils.iter_file(record.body, length = record.length):
  96. fp.write(buf)
  97. fp.write(b'\r\n\r\n') # Record separator
  98. def _write_warcinfo_record(self):
  99. data = {
  100. 'software': qwarc.utils.get_software_info(self._specFile, self._specDependencies),
  101. 'command': self._command,
  102. 'files': {
  103. 'spec': self._specFile,
  104. 'spec-dependencies': self._specDependencies.files
  105. },
  106. 'extra': self._specDependencies.extra,
  107. }
  108. payload = io.BytesIO(json.dumps(data, indent = 2).encode('utf-8'))
  109. record = self._create_warc_record(
  110. 'warcinfo',
  111. {
  112. 'Content-Type': 'application/json; charset=utf-8',
  113. },
  114. payload
  115. )
  116. self._write_warc_record(record)
  117. return record.headers['WARC-Record-ID']
  118. def write_client_response(self, response):
  119. '''
  120. Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
  121. A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
  122. '''
  123. self._ensure_opened()
  124. for r in response.iter_all():
  125. usec = f'{(r.rawRequestTimestamp - int(r.rawRequestTimestamp)):.6f}'[2:]
  126. requestDate = time.strftime(f'%Y-%m-%dT%H:%M:%S.{usec}Z', time.gmtime(r.rawRequestTimestamp))
  127. r.rawRequestData.seek(0)
  128. requestRecord = self._create_warc_record(
  129. 'request',
  130. {
  131. 'WARC-Date': requestDate,
  132. 'WARC-Target-URI': str(r.url),
  133. 'WARC-IP-Address': r.remoteAddress[0],
  134. 'Content-Type': 'application/http; msgtype=request',
  135. 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID,
  136. },
  137. r.rawRequestData
  138. )
  139. requestRecordID = requestRecord.headers['WARC-Record-ID']
  140. r.rawResponseData.seek(0)
  141. responseRecord = self._create_warc_record(
  142. 'response',
  143. {
  144. 'WARC-Date': requestDate,
  145. 'WARC-Target-URI': str(r.url),
  146. 'WARC-IP-Address': r.remoteAddress[0],
  147. 'Content-Type': 'application/http; msgtype=response',
  148. 'WARC-Concurrent-To': requestRecordID,
  149. 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID,
  150. },
  151. r.rawResponseData
  152. )
  153. payloadDigest = responseRecord.headers['WARC-Payload-Digest']
  154. assert payloadDigest is not None
  155. if self._dedupe and responseRecord.length > 1024: # Don't deduplicate small responses; the additional headers are typically larger than the payload dedupe savings...
  156. if payloadDigest in self._dedupeMap:
  157. refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest]
  158. httpHeaderData = io.BytesIO()
  159. qwarc.utils.read_http_headers(r.rawResponseData, copy = httpHeaderData)
  160. httpHeaderData.seek(0)
  161. responseRecord = self._create_warc_record(
  162. 'revisit',
  163. {
  164. 'WARC-Date': requestDate,
  165. 'WARC-Target-URI': str(r.url),
  166. 'WARC-IP-Address': r.remoteAddress[0],
  167. 'WARC-Concurrent-To': requestRecordID,
  168. 'Content-Type': 'application/http; msgtype=response',
  169. 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID,
  170. 'WARC-Profile': 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
  171. 'WARC-Refers-To-Target-URI': refersToUri,
  172. 'WARC-Refers-To-Date': refersToDate,
  173. 'WARC-Refers-To': refersToRecordId,
  174. 'WARC-Payload-Digest': payloadDigest,
  175. 'WARC-Truncated': 'length',
  176. },
  177. httpHeaderData
  178. )
  179. else:
  180. self._dedupeMap[payloadDigest] = (responseRecord.headers['WARC-Record-ID'], str(r.url), requestDate)
  181. self._write_warc_record(requestRecord)
  182. self._write_warc_record(responseRecord)
  183. if self._maxFileSize and self._file.tell() > self._maxFileSize:
  184. self._close_file()
  185. def _write_resource_records(self):
  186. '''Write spec file and dependencies'''
  187. assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'
  188. for type_, contentType, fn in itertools.chain((('specfile', 'application/x-python', self._specFile),), map(lambda x: ('spec-dependency-file', 'application/octet-stream', x), self._specDependencies.files)):
  189. with open(fn, 'rb') as f:
  190. record = self._create_warc_record(
  191. 'resource',
  192. {
  193. 'WARC-Target-URI': f'file://{fn}',
  194. 'X-QWARC-Type': type_,
  195. 'Content-Type': contentType,
  196. 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID,
  197. },
  198. f
  199. )
  200. self._write_warc_record(record)
  201. def _write_initial_meta_records(self):
  202. self._metaWarcinfoRecordID = self._write_warcinfo_record()
  203. self._write_resource_records()
  204. def _write_log_record(self):
  205. assert self._metaWarcinfoRecordID is not None, 'write_warcinfo_record must be called first'
  206. rootLogger = logging.getLogger()
  207. for handler in rootLogger.handlers: #FIXME: Uses undocumented attribute handlers
  208. handler.flush()
  209. with open(self._logFilename, 'rb') as fp:
  210. record = self._create_warc_record(
  211. 'resource',
  212. {
  213. 'WARC-Target-URI': f'file://{self._logFilename}',
  214. 'X-QWARC-Type': 'log',
  215. 'Content-Type': 'text/plain; charset=utf-8',
  216. 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID,
  217. },
  218. fp
  219. )
  220. self._write_warc_record(record)
  221. def _close_file(self):
  222. '''Close the currently opened WARC'''
  223. if not self._closed:
  224. self._file.close()
  225. self._file = None
  226. self._closed = True
  227. def _write_meta_warc(self, callback):
  228. filename = f'{self._prefix}-meta.warc.gz'
  229. #TODO: Handle OSError on fcntl.flock and retry
  230. self._file = open(filename, 'ab')
  231. try:
  232. fcntl.flock(self._file.fileno(), fcntl.LOCK_EX)
  233. logging.info(f'Opened {filename}')
  234. self._closed = False
  235. callback()
  236. finally:
  237. self._close_file()
  238. def close(self):
  239. '''Clean up everything.'''
  240. self._close_file()
  241. self._write_meta_warc(self._write_log_record)