The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

247 lines
8.0 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff. Currently has two modes: verifying the integrity of a WARC by comparing the digests and dumping the HTTP response bodies to stdout.
  3. import base64
  4. import gzip
  5. import hashlib
  6. import sys
  7. import zlib
  8. def GzipDecompressor():
  9. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  10. class DummyDecompressor:
  11. def decompress(self, data):
  12. return data
  13. class Event:
  14. pass
  15. class NewFile(Event):
  16. pass
  17. class BeginOfRecord(Event):
  18. def __init__(self, warcHeaders):
  19. self._warcHeaders = warcHeaders
  20. @property
  21. def warcHeaders(self):
  22. return self._warcHeaders
  23. class _DataChunk(Event):
  24. def __init__(self, data):
  25. self._data = data
  26. @property
  27. def data(self):
  28. return self._data
  29. def __repr__(self):
  30. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  31. class WARCBlockChunk(_DataChunk):
  32. pass
  33. class RawHTTPResponseBodyChunk(_DataChunk):
  34. '''
  35. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  36. This is like HTTPResponseBodyChunk but without transfer encoding stripping.
  37. '''
  38. class HTTPResponseBodyChunk(_DataChunk):
  39. '''
  40. Representing a part of the HTTP response body with transfer encoding stripped.
  41. '''
  42. class EndOfRecord(Event):
  43. pass
  44. def iter_warc(f):
  45. # Yields Events
  46. with gzip.open(f, 'rb') as fp:
  47. buf = b''
  48. while True:
  49. # Read WARC header
  50. while b'\r\n\r\n' not in buf:
  51. try:
  52. buf = buf + fp.read(4096)
  53. except EOFError:
  54. break
  55. if not buf:
  56. break
  57. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  58. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n')
  59. assert b'\r\nContent-Length:' in warcHeaderBuf
  60. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  61. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  62. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  63. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  64. yield BeginOfRecord(warcHeaders)
  65. # Read WARC block (and skip CRLFCRLF at the end of the record)
  66. if len(buf) < warcContentLength + 4:
  67. try:
  68. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  69. except EOFError:
  70. pass
  71. if len(buf) < warcContentLength + 4:
  72. print('Error: truncated WARC', file = sys.stderr)
  73. break
  74. warcContent = buf[:warcContentLength]
  75. buf = buf[warcContentLength + 4:]
  76. yield WARCBlockChunk(warcContent)
  77. # Decode HTTP response if it is one
  78. if warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType in (b'request', b'response'): #TODO: Support revisit
  79. if b'\r\n\r\n' in warcContent:
  80. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  81. # Parse headers and extract transfer encoding
  82. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  83. chunked = False
  84. gzipped = False
  85. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  86. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  87. transferEncodings = map(bytes.strip, transferEncoding.split(b','))
  88. chunked = b'chunked' in transferEncodings
  89. gzipped = b'gzip' in transferEncodings
  90. yield RawHTTPResponseBodyChunk(httpBody)
  91. # Decode body
  92. if gzipped:
  93. httpDecompressor = GzipDecompressor()
  94. else:
  95. httpDecompressor = DummyDecompressor()
  96. if chunked:
  97. while True:
  98. try:
  99. chunkLineEnd = httpBody.index(b'\r\n')
  100. except ValueError:
  101. print('Error: could not find chunk line end, skipping', file = sys.stderr)
  102. break
  103. chunkLine = httpBody[:chunkLineEnd]
  104. if b';' in chunkLine:
  105. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  106. else:
  107. chunkLength = chunkLine.strip()
  108. if chunkLength.lstrip(b'0123456789abcdef') != b'':
  109. print('Error: malformed chunk length, skipping', file = sys.stderr)
  110. break
  111. chunkLength = int(chunkLength, base = 16)
  112. if chunkLength == 0:
  113. break
  114. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  115. yield HTTPResponseBodyChunk(chunk)
  116. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  117. else:
  118. yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)[:50])
  119. else:
  120. print('Warning: malformed HTTP response, skipping', file = sys.stderr)
  121. yield EndOfRecord()
  122. class ProcessMode:
  123. def process_event(self, event):
  124. raise NotImplementedError
  125. class VerifyMode(ProcessMode):
  126. def __init__(self):
  127. self._blockDigester = None
  128. self._recordedBlockDigest = None
  129. self._payloadDigester = None
  130. self._brokenPayloadDigester = None
  131. self._recordedPayloadDigest = None
  132. self._printedBrokenPayloadWarning = False
  133. def process_event(self, event):
  134. if type(event) is NewFile:
  135. self._printedBrokenPayloadWarning = False
  136. elif type(event) is BeginOfRecord:
  137. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  138. self._blockDigester = hashlib.sha1()
  139. self._recordedBlockDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest')
  140. else:
  141. self._blockDigester = None
  142. self._recordedBlockDigest = None
  143. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  144. self._payloadDigester = hashlib.sha1()
  145. self._brokenPayloadDigester = hashlib.sha1()
  146. self._recordedPayloadDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest')
  147. else:
  148. self._payloadDigester = None
  149. self._brokenPayloadDigester = None
  150. self._recordedPayloadDigest = None
  151. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  152. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  153. elif type(event) is WARCBlockChunk:
  154. self._blockDigester.update(event.data)
  155. elif type(event) is HTTPResponseBodyChunk:
  156. self._payloadDigester.update(event.data)
  157. elif type(event) is RawHTTPResponseBodyChunk:
  158. self._brokenPayloadDigester.update(event.data)
  159. elif type(event) is EndOfRecord:
  160. if self._blockDigester:
  161. if self._recordedBlockDigest != b'sha1:' + base64.b32encode(self._blockDigester.digest()):
  162. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest, base64.b32encode(self._blockDigester.digest())))
  163. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  164. if self._recordedPayloadDigest != b'sha1:' + base64.b32encode(self._payloadDigester.digest()):
  165. if self._recordedPayloadDigest == b'sha1:' + base64.b32encode(self._brokenPayloadDigester.digest()):
  166. if not self._printedBrokenPayloadWarning:
  167. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding')
  168. self._printedBrokenPayloadWarning = True
  169. else:
  170. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest, base64.b32encode(self._payloadDigester.digest()), base64.b32encode(self._brokenPayloadDigester.digest())))
  171. class DumpResponsesMode(ProcessMode):
  172. def __init__(self):
  173. self._printEOR = False
  174. def process_event(self, event):
  175. if type(event) is BeginOfRecord:
  176. self._printEOR = False
  177. elif type(event) is HTTPResponseBodyChunk:
  178. self._printEOR = True
  179. sys.stdout.buffer.write(event.data)
  180. elif type(event) is EndOfRecord:
  181. if self._printEOR:
  182. sys.stdout.buffer.write(b'\r\n')
  183. def main():
  184. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode}
  185. assert len(sys.argv) - 1 >= 2
  186. mode = sys.argv[1]
  187. assert mode in processorMap
  188. files = sys.argv[2:]
  189. assert files
  190. processor = processorMap[mode]()
  191. for f in files:
  192. print('Info: processing {}'.format(f), file = sys.stderr)
  193. processor.process_event(NewFile())
  194. for event in iter_warc(f):
  195. processor.process_event(event)
  196. if __name__ == '__main__':
  197. main()