The little things give you away... A collection of various small helper stuff
Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.
 
 
 

271 rinda
9.0 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff. Currently has two modes: verifying the integrity of a WARC by comparing the digests and dumping the HTTP response bodies to stdout.
  3. import base64
  4. import gzip
  5. import hashlib
  6. import sys
  7. import zlib
  8. def GzipDecompressor():
  9. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  10. class DummyDecompressor:
  11. def decompress(self, data):
  12. return data
  13. class Event:
  14. pass
  15. class NewFile(Event):
  16. pass
  17. class BeginOfRecord(Event):
  18. def __init__(self, warcHeaders, rawData):
  19. self._warcHeaders = warcHeaders
  20. self._rawData = rawData
  21. @property
  22. def warcHeaders(self):
  23. return self._warcHeaders
  24. @property
  25. def rawData(self):
  26. return self._rawData
  27. class _DataChunk(Event):
  28. def __init__(self, data):
  29. self._data = data
  30. @property
  31. def data(self):
  32. return self._data
  33. def __repr__(self):
  34. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  35. class WARCBlockChunk(_DataChunk):
  36. def __init__(self, data, isHttpHeader = None):
  37. super().__init__(data)
  38. self._isHttpHeader = isHttpHeader
  39. @property
  40. def isHttpHeader(self):
  41. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  42. return self._isHttpHeader
  43. class RawHTTPResponseBodyChunk(_DataChunk):
  44. '''
  45. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  46. This is like HTTPResponseBodyChunk but without transfer encoding stripping.
  47. '''
  48. class HTTPResponseBodyChunk(_DataChunk):
  49. '''
  50. Representing a part of the HTTP response body with transfer encoding stripped.
  51. '''
  52. class EndOfRecord(Event):
  53. pass
  54. def iter_warc(f):
  55. # Yields Events
  56. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  57. with gzip.open(f, 'rb') as fp:
  58. buf = b''
  59. while True:
  60. # Read WARC header
  61. while b'\r\n\r\n' not in buf:
  62. try:
  63. buf = buf + fp.read(4096)
  64. except EOFError:
  65. break
  66. if not buf:
  67. break
  68. if not buf:
  69. break
  70. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  71. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n')
  72. assert b'\r\nContent-Length:' in warcHeaderBuf
  73. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  74. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  75. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  76. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  77. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  78. # Read WARC block (and skip CRLFCRLF at the end of the record)
  79. if len(buf) < warcContentLength + 4:
  80. try:
  81. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  82. except EOFError:
  83. pass
  84. if len(buf) < warcContentLength + 4:
  85. print('Error: truncated WARC', file = sys.stderr)
  86. break
  87. warcContent = buf[:warcContentLength]
  88. buf = buf[warcContentLength + 4:]
  89. # Decode HTTP response if it is one
  90. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  91. httpType = 'request'
  92. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  93. httpType = 'response'
  94. else:
  95. httpType = None
  96. if httpType is not None:
  97. if b'\r\n\r\n' in warcContent:
  98. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  99. # Parse headers and extract transfer encoding
  100. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  101. chunked = False
  102. gzipped = False
  103. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  104. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  105. transferEncodings = map(bytes.strip, transferEncoding.split(b','))
  106. chunked = b'chunked' in transferEncodings
  107. gzipped = b'gzip' in transferEncodings
  108. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  109. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  110. yield RawHTTPResponseBodyChunk(httpBody)
  111. if httpType == 'response':
  112. # Decode body
  113. if gzipped:
  114. httpDecompressor = GzipDecompressor()
  115. else:
  116. httpDecompressor = DummyDecompressor()
  117. if chunked:
  118. while True:
  119. try:
  120. chunkLineEnd = httpBody.index(b'\r\n')
  121. except ValueError:
  122. print('Error: could not find chunk line end, skipping', file = sys.stderr)
  123. break
  124. chunkLine = httpBody[:chunkLineEnd]
  125. if b';' in chunkLine:
  126. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  127. else:
  128. chunkLength = chunkLine.strip()
  129. if chunkLength.lstrip(b'0123456789abcdef') != b'':
  130. print('Error: malformed chunk length, skipping', file = sys.stderr)
  131. break
  132. chunkLength = int(chunkLength, base = 16)
  133. if chunkLength == 0:
  134. break
  135. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  136. yield HTTPResponseBodyChunk(chunk)
  137. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  138. else:
  139. yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody)[:50])
  140. else:
  141. print('Warning: malformed HTTP response, skipping', file = sys.stderr)
  142. else:
  143. yield WARCBlockChunk(warcContent)
  144. yield EndOfRecord()
  145. class ProcessMode:
  146. def process_event(self, event):
  147. raise NotImplementedError
  148. class VerifyMode(ProcessMode):
  149. def __init__(self):
  150. self._blockDigester = None
  151. self._recordedBlockDigest = None
  152. self._payloadDigester = None
  153. self._brokenPayloadDigester = None
  154. self._recordedPayloadDigest = None
  155. self._printedBrokenPayloadWarning = False
  156. def process_event(self, event):
  157. if type(event) is NewFile:
  158. self._printedBrokenPayloadWarning = False
  159. elif type(event) is BeginOfRecord:
  160. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  161. self._blockDigester = hashlib.sha1()
  162. self._recordedBlockDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest')
  163. else:
  164. self._blockDigester = None
  165. self._recordedBlockDigest = None
  166. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  167. self._payloadDigester = hashlib.sha1()
  168. self._brokenPayloadDigester = hashlib.sha1()
  169. self._recordedPayloadDigest = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest')
  170. else:
  171. self._payloadDigester = None
  172. self._brokenPayloadDigester = None
  173. self._recordedPayloadDigest = None
  174. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  175. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  176. elif type(event) is WARCBlockChunk:
  177. self._blockDigester.update(event.data)
  178. elif type(event) is HTTPResponseBodyChunk:
  179. self._payloadDigester.update(event.data)
  180. elif type(event) is RawHTTPResponseBodyChunk:
  181. self._brokenPayloadDigester.update(event.data)
  182. elif type(event) is EndOfRecord:
  183. if self._blockDigester:
  184. if self._recordedBlockDigest != b'sha1:' + base64.b32encode(self._blockDigester.digest()):
  185. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest, base64.b32encode(self._blockDigester.digest())))
  186. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  187. if self._recordedPayloadDigest != b'sha1:' + base64.b32encode(self._payloadDigester.digest()):
  188. if self._recordedPayloadDigest == b'sha1:' + base64.b32encode(self._brokenPayloadDigester.digest()):
  189. if not self._printedBrokenPayloadWarning:
  190. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding')
  191. self._printedBrokenPayloadWarning = True
  192. else:
  193. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest, base64.b32encode(self._payloadDigester.digest()), base64.b32encode(self._brokenPayloadDigester.digest())))
  194. class DumpResponsesMode(ProcessMode):
  195. def __init__(self):
  196. self._printEOR = False
  197. def process_event(self, event):
  198. if type(event) is BeginOfRecord:
  199. self._printEOR = False
  200. elif type(event) is HTTPResponseBodyChunk:
  201. self._printEOR = True
  202. sys.stdout.buffer.write(event.data)
  203. elif type(event) is EndOfRecord:
  204. if self._printEOR:
  205. sys.stdout.buffer.write(b'\r\n')
  206. def main():
  207. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode}
  208. assert len(sys.argv) - 1 >= 2
  209. mode = sys.argv[1]
  210. assert mode in processorMap
  211. files = sys.argv[2:]
  212. assert files
  213. processor = processorMap[mode]()
  214. for f in files:
  215. print('Info: processing {}'.format(f), file = sys.stderr)
  216. processor.process_event(NewFile())
  217. for event in iter_warc(f):
  218. processor.process_event(event)
  219. if __name__ == '__main__':
  220. main()