A framework for quick web archiving
Nelze vybrat více než 25 témat. Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

94 řádky
3.0 KiB

  1. import fcntl
  2. import io
  3. import logging
  4. import time
  5. import warcio
  6. class WARCWriter(warcio.warcwriter.WARCWriter):
  7. def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
  8. # Write request before response, like wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
  9. self._write_warc_record(self.out, req)
  10. self._write_warc_record(self.out, resp)
  11. class WARC:
  12. def __init__(self, prefix, maxFileSize):
  13. '''
  14. Initialise the WARC writer
  15. prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
  16. maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
  17. '''
  18. self._prefix = prefix
  19. self._counter = 0
  20. self._maxFileSize = maxFileSize
  21. self._closed = True
  22. self._file = None
  23. self._warcWriter = None
  24. self._cycle()
  25. def _cycle(self):
  26. '''Close the current file, open the next file that doesn't exist yet'''
  27. #TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
  28. self.close()
  29. while True:
  30. filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
  31. try:
  32. # Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
  33. self._file = open(filename, 'xb')
  34. fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
  35. except FileExistsError:
  36. logging.info('{} already exists, skipping'.format(filename))
  37. self._counter += 1
  38. else:
  39. break
  40. logging.info('Opened {}'.format(filename))
  41. self._warcWriter = WARCWriter(self._file, gzip = True)
  42. self._closed = False
  43. self._counter += 1
  44. def write_client_response(self, response):
  45. '''
  46. Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
  47. A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
  48. '''
  49. for r in response.iter_all():
  50. requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
  51. requestRecord = self._warcWriter.create_warc_record(
  52. str(r.url),
  53. 'request',
  54. payload = io.BytesIO(r.rawRequestData),
  55. warc_headers_dict = {
  56. 'WARC-Date': requestDate,
  57. 'WARC-IP-Address': r.remoteAddress[0],
  58. }
  59. )
  60. responseRecord = self._warcWriter.create_warc_record(
  61. str(r.url),
  62. 'response',
  63. payload = io.BytesIO(r.rawResponseData),
  64. warc_headers_dict = {
  65. 'WARC-Date': requestDate,
  66. 'WARC-IP-Address': r.remoteAddress[0],
  67. }
  68. )
  69. self._warcWriter.write_request_response_pair(requestRecord, responseRecord)
  70. if self._maxFileSize and self._file.tell() > self._maxFileSize:
  71. self._cycle()
  72. def close(self):
  73. '''Close the currently opened WARC'''
  74. if not self._closed:
  75. self._file.close()
  76. self._warcWriter = None
  77. self._file = None
  78. self._closed = True