A framework for quick web archiving
Não pode escolher mais do que 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

115 linhas
3.8 KiB

  1. import fcntl
  2. import io
  3. import logging
  4. import time
  5. import warcio
  6. class WARC:
  7. def __init__(self, prefix, maxFileSize, dedupe):
  8. '''
  9. Initialise the WARC writer
  10. prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
  11. maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
  12. dedupe: bool, whether to enable record deduplication
  13. '''
  14. self._prefix = prefix
  15. self._counter = 0
  16. self._maxFileSize = maxFileSize
  17. self._closed = True
  18. self._file = None
  19. self._warcWriter = None
  20. self._dedupe = dedupe
  21. self._dedupeMap = {}
  22. def _ensure_opened(self):
  23. '''Open the next file that doesn't exist yet if there is currently no file opened'''
  24. if not self._closed:
  25. return
  26. while True:
  27. filename = f'{self._prefix}-{self._counter:05d}.warc.gz'
  28. try:
  29. # Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
  30. self._file = open(filename, 'xb')
  31. fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
  32. except FileExistsError:
  33. logging.info(f'{filename} already exists, skipping')
  34. self._counter += 1
  35. else:
  36. break
  37. logging.info(f'Opened {filename}')
  38. self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True)
  39. self._closed = False
  40. self._counter += 1
  41. def write_client_response(self, response):
  42. '''
  43. Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.
  44. A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
  45. '''
  46. self._ensure_opened()
  47. for r in response.iter_all():
  48. requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
  49. requestRecord = self._warcWriter.create_warc_record(
  50. str(r.url),
  51. 'request',
  52. payload = io.BytesIO(r.rawRequestData),
  53. warc_headers_dict = {
  54. 'WARC-Date': requestDate,
  55. 'WARC-IP-Address': r.remoteAddress[0],
  56. }
  57. )
  58. requestRecordID = requestRecord.rec_headers.get_header('WARC-Record-ID')
  59. responseRecord = self._warcWriter.create_warc_record(
  60. str(r.url),
  61. 'response',
  62. payload = io.BytesIO(r.rawResponseData),
  63. warc_headers_dict = {
  64. 'WARC-Date': requestDate,
  65. 'WARC-IP-Address': r.remoteAddress[0],
  66. 'WARC-Concurrent-To': requestRecordID,
  67. }
  68. )
  69. payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest')
  70. assert payloadDigest is not None
  71. if self._dedupe and responseRecord.payload_length > 0: # Don't "deduplicate" empty responses
  72. if payloadDigest in self._dedupeMap:
  73. refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest]
  74. responseHttpHeaders = responseRecord.http_headers
  75. responseRecord = self._warcWriter.create_revisit_record(
  76. str(r.url),
  77. digest = payloadDigest,
  78. refers_to_uri = refersToUri,
  79. refers_to_date = refersToDate,
  80. http_headers = responseHttpHeaders,
  81. warc_headers_dict = {
  82. 'WARC-Date': requestDate,
  83. 'WARC-IP-Address': r.remoteAddress[0],
  84. 'WARC-Concurrent-To': requestRecordID,
  85. 'WARC-Refers-To': refersToRecordId,
  86. 'WARC-Truncated': 'length',
  87. }
  88. )
  89. else:
  90. self._dedupeMap[payloadDigest] = (responseRecord.rec_headers.get_header('WARC-Record-ID'), str(r.url), requestDate)
  91. self._warcWriter.write_record(requestRecord)
  92. self._warcWriter.write_record(responseRecord)
  93. if self._maxFileSize and self._file.tell() > self._maxFileSize:
  94. self.close()
  95. def close(self):
  96. '''Close the currently opened WARC'''
  97. if not self._closed:
  98. self._file.close()
  99. self._warcWriter = None
  100. self._file = None
  101. self._closed = True