A framework for quick web archiving
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

94 lignes
3.0 KiB

  1. import fcntl
  2. import io
  3. import logging
  4. import time
  5. import warcio
  class WARCWriter(warcio.warcwriter.WARCWriter):
      '''warcio WARC writer variant that emits the request record ahead of the response record.'''

      def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
          '''
          Write a request/response record pair to the output stream, request first.

          req: the request record to write
          resp: the response record to write
          params: accepted for signature compatibility with the parent class; not used here.
          '''
          # Request goes out before the response, matching wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
          for record in (req, resp):
              self._write_warc_record(self.out, record)
  class WARC:
      '''
      Writer for a numbered series of gzip-compressed WARC files sharing a common path prefix.

      Files are named "<prefix>-<NNNNN>.warc.gz". Each file is created exclusively (existing files are never
      overwritten) and locked with a non-blocking exclusive flock. Instances can be used as context managers;
      leaving the `with` block closes the currently opened file.
      '''

      def __init__(self, prefix, maxFileSize):
          '''
          Initialise the WARC writer and open the first WARC file.

          prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
          maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
          '''

          self._prefix = prefix
          self._counter = 0
          self._maxFileSize = maxFileSize

          self._closed = True
          self._file = None
          self._warcWriter = None

          self._cycle()

      def __enter__(self):
          '''Return the writer itself so it can be bound by a `with` statement.'''
          return self

      def __exit__(self, excType, excValue, traceback):
          '''Close the currently opened WARC; exceptions raised inside the `with` block are not suppressed.'''
          self.close()
          return False

      def _cycle(self):
          '''
          Close the current file, open the next file that doesn't exist yet.

          Raises OSError if the exclusive lock on the newly created file cannot be obtained; the file handle is
          closed before the error propagates.
          '''

          #TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
          self.close()

          while True:
              filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
              try:
                  # Mode 'x' requires that the file does not exist yet, so existing WARCs are never overwritten.
                  fp = open(filename, 'xb')
              except FileExistsError:
                  logging.info('{} already exists, skipping'.format(filename))
                  self._counter += 1
                  continue
              try:
                  # Attempt to get an exclusive, non-blocking lock on the new file
                  fcntl.flock(fp.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
              except OSError:
                  # Do not leak the handle if locking fails; the instance stays in its closed state.
                  fp.close()
                  raise
              break

          logging.info('Opened {}'.format(filename))
          # Publish the open file before constructing the writer so that close() can still release it
          # if the writer's constructor raises.
          self._file = fp
          self._closed = False
          self._warcWriter = WARCWriter(self._file, gzip = True)
          self._counter += 1

      def write_client_response(self, response):
          '''
          Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
          A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
          '''

          for r in response.iter_all():
              requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
              # Both records of a pair carry the request timestamp and the remote IP address.
              commonHeaders = {
                  'WARC-Date': requestDate,
                  'WARC-IP-Address': r.remoteAddress[0],
              }
              requestRecord = self._warcWriter.create_warc_record(
                  str(r.url),
                  'request',
                  payload = io.BytesIO(r.rawRequestData),
                  warc_headers_dict = dict(commonHeaders),  # fresh dict per record
              )
              responseRecord = self._warcWriter.create_warc_record(
                  str(r.url),
                  'response',
                  payload = io.BytesIO(r.rawResponseData),
                  warc_headers_dict = dict(commonHeaders),  # fresh dict per record
              )
              self._warcWriter.write_request_response_pair(requestRecord, responseRecord)

          # The size check happens only once per `response`, so a single file may exceed the limit.
          if self._maxFileSize and self._file.tell() > self._maxFileSize:
              self._cycle()

      def close(self):
          '''Close the currently opened WARC. Calling this on an already closed writer does nothing.'''

          if not self._closed:
              self._file.close()
              self._warcWriter = None
              self._file = None
              self._closed = True