The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

374 lines
12 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses FILES -- dump the HTTP response bodies to stdout
  6. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  7. import base64
  8. import gzip
  9. import hashlib
  10. import sys
  11. import zlib
  def GzipDecompressor():
      '''
      Create a fresh streaming decompressor object for gzip-framed data.

      The returned zlib object can be fed the compressed data piece by piece via its decompress method.
      '''
      # Adding 16 to the window bits tells zlib to expect a gzip header and trailer.
      gzipWindowBits = zlib.MAX_WBITS + 16
      return zlib.decompressobj(gzipWindowBits)
  class DummyDecompressor:
      '''
      Pass-through object offering the same decompress method as a real decompressor.

      It is used in place of a decompressor when the data is not compressed.
      '''

      def decompress(self, data):
          '''Hand data back exactly as it was received.'''
          return data
  class Event:
      '''Common base class of all events that are passed to the processing modes.'''
  class NewFile(Event):
      '''Event announcing that the events following it belong to another input file.'''
  class BeginOfRecord(Event):
      '''Event marking the start of a WARC record; it carries the record's header section.'''

      def __init__(self, warcHeaders, rawData):
          '''
          warcHeaders: the parsed header lines of the record
          rawData: the unparsed bytes of the header section
          '''
          self._rawData = rawData
          self._warcHeaders = warcHeaders

      @property
      def rawData(self):
          '''The unparsed header bytes as given to the constructor (read-only).'''
          return self._rawData

      @property
      def warcHeaders(self):
          '''The parsed header lines as given to the constructor (read-only).'''
          return self._warcHeaders
  class _DataChunk(Event):
      '''Base class of the events that transport a piece of data.'''

      def __init__(self, data):
          self._data = data

      @property
      def data(self):
          '''The data given to the constructor (read-only).'''
          return self._data

      def __repr__(self):
          # Show at most the first 50 items of the data and mark any omission with an ellipsis.
          preview = self._data[:50]
          suffix = '...' if len(self._data) > 50 else ''
          return '%s(%r%s)' % (type(self).__name__, preview, suffix)
  class WARCBlockChunk(_DataChunk):
      '''A piece of the block of a WARC record, optionally tagged with its role inside an HTTP message.'''

      def __init__(self, data, isHttpHeader = None):
          super().__init__(data)
          self._isHttpHeader = isHttpHeader

      @property
      def isHttpHeader(self):
          '''
          Role of this chunk within an HTTP record:
          True: the chunk represents (part of) the HTTP header
          False: the chunk represents (part of) the HTTP body
          None: the chunk is not part of an HTTP record
          '''
          return self._isHttpHeader
  class RawHTTPResponseBodyChunk(_DataChunk):
      '''
      A part of the HTTP body exactly as stored in the WARC, i.e. with the transfer encoding still applied.

      Many tools misread the WARC specification and calculated the Payload-Digest over the body without stripping the transfer encoding.
      This event is the counterpart of HTTPResponseBodyChunk without that stripping, so such digests can be recognised.
      '''
  class HTTPResponseBodyChunk(_DataChunk):
      '''
      A part of the body of an HTTP response after the transfer encoding has been stripped.
      '''
  class EndOfRecord(Event):
      '''Event marking that the current WARC record is complete.'''
  def _parse_header_lines(headerBuf):
      '''
      Split a CRLF-separated header section into a tuple of tuples of bytes.

      Every line is split at its first colon and the parts are stripped. A regular header line becomes (name, value);
      a line without a colon (e.g. the leading version line) becomes a 1-tuple.
      '''
      return tuple(tuple(map(bytes.strip, line.split(b':', 1))) for line in headerBuf.split(b'\r\n'))


  def _header_value(headers, name, default = None, ignoreCase = False):
      '''
      Return the value of the first header called name, or default if there is no such header.

      headers is a sequence as produced by _parse_header_lines; entries that have no value part are ignored.
      With ignoreCase, the header names are lower-cased before the comparison, so name has to be given in lower-case.
      '''
      for header in headers:
          if len(header) != 2:
              continue
          key = header[0].lower() if ignoreCase else header[0]
          if key == name:
              return header[1]
      return default


  def _iter_http_chunks(httpBody):
      '''
      Yield the data of each chunk of an HTTP body that uses the chunked transfer coding.

      Iteration ends at the terminating zero-length chunk. If a chunk line is missing or malformed, an error is printed to stderr and iteration ends there.
      Chunk extensions (anything after a semicolon on the chunk line) are ignored.
      '''
      pos = 0
      while True:
          chunkLineEnd = httpBody.find(b'\r\n', pos)
          if chunkLineEnd < 0:
              print('Error: could not find chunk line end, skipping', file = sys.stderr)
              return
          chunkLength = httpBody[pos:chunkLineEnd].split(b';', 1)[0].strip()
          # Hex digits are case-insensitive; an empty length is rejected explicitly because int() would raise on it.
          if not chunkLength or chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
              print('Error: malformed chunk length, skipping', file = sys.stderr)
              return
          chunkLength = int(chunkLength, base = 16)
          if chunkLength == 0:
              return
          dataStart = chunkLineEnd + 2
          yield httpBody[dataStart : dataStart + chunkLength]
          # Skip the chunk data and the CRLF that follows it
          pos = dataStart + chunkLength + 2


  def iter_warc(f):
      '''
      Parse a gzipped WARC file and yield Events describing its records.

      f is passed to gzip.open, i.e. it is a filename or a file object.

      For every record, the following events are yielded in this order:
      - BeginOfRecord with the parsed WARC headers and the raw header bytes. The raw data does not include the CRLF CRLF at the end of the headers.
      - For a record that is not an HTTP request or response: one WARCBlockChunk with the entire block.
      - For an HTTP request or response: one WARCBlockChunk with the HTTP header (including its terminating CRLF CRLF), one WARCBlockChunk with the HTTP body,
        and one RawHTTPResponseBodyChunk with the body as stored. For responses, this is followed by HTTPResponseBodyChunk events containing the body
        with the chunked and gzip transfer encodings removed.
      - EndOfRecord.
      The CRLF CRLF after the block is never part of any chunk.

      A truncated file is reported on stderr and ends the iteration. An HTTP block without CRLF CRLF is reported on stderr and yielded as one undecoded WARCBlockChunk.
      '''
      with gzip.open(f, 'rb') as fp:
          buf = b''
          while True:
              # Read WARC header
              while b'\r\n\r\n' not in buf:
                  try:
                      data = fp.read(4096)
                  except EOFError:
                      # Raised by gzip when the compressed stream ends prematurely; handle it like the end of the data.
                      data = b''
                  if not data:
                      # Without this check, leftover bytes lacking a CRLF CRLF would keep the loop spinning forever at the end of the file.
                      break
                  buf = buf + data
              if not buf:
                  break
              if b'\r\n\r\n' not in buf:
                  print('Error: truncated WARC', file = sys.stderr)
                  break
              warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
              assert warcHeaderBuf.startswith(b'WARC/1.0\r\n')
              assert b'\r\nContent-Length:' in warcHeaderBuf
              warcHeaders = _parse_header_lines(warcHeaderBuf)
              # A bare next() on a missing header would surface as a RuntimeError inside this generator, hence the lookups with a default.
              warcContentType = _header_value(warcHeaders, b'Content-Type')
              warcContentLength = int(_header_value(warcHeaders, b'Content-Length'))
              warcType = _header_value(warcHeaders, b'WARC-Type')
              yield BeginOfRecord(warcHeaders, warcHeaderBuf)

              # Read WARC block (and skip CRLFCRLF at the end of the record)
              recordEnd = warcContentLength + 4
              if len(buf) < recordEnd:
                  try:
                      buf = buf + fp.read(recordEnd - len(buf))
                  except EOFError:
                      pass
              if len(buf) < recordEnd:
                  print('Error: truncated WARC', file = sys.stderr)
                  break
              warcContent = buf[:warcContentLength]
              buf = buf[recordEnd:]

              # Decode HTTP response if it is one
              if warcType == b'request' and warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request'):
                  httpType = 'request'
              elif warcType == b'response' and warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response'):
                  httpType = 'response'
              else:
                  httpType = None

              if httpType is None:
                  yield WARCBlockChunk(warcContent)
              elif b'\r\n\r\n' not in warcContent:
                  print('Warning: malformed HTTP response, skipping', file = sys.stderr)
                  # The block is still emitted (undecoded) so that consumers hashing the block see all of its bytes.
                  yield WARCBlockChunk(warcContent)
              else:
                  httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)

                  # Parse headers and extract transfer encoding
                  # The first line is the request or status line, not a header.
                  httpHeaderLines = _parse_header_lines(httpHeaders)[1:]
                  transferEncoding = _header_value(httpHeaderLines, b'transfer-encoding', default = b'', ignoreCase = True)
                  # This has to be a list, not a one-shot iterator, because it is searched twice. Coding names are compared case-insensitively.
                  transferEncodings = [x.strip().lower() for x in transferEncoding.split(b',')]
                  chunked = b'chunked' in transferEncodings
                  gzipped = b'gzip' in transferEncodings

                  yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
                  yield WARCBlockChunk(httpBody, isHttpHeader = False)
                  yield RawHTTPResponseBodyChunk(httpBody)

                  if httpType == 'response':
                      # Decode body
                      httpDecompressor = GzipDecompressor() if gzipped else DummyDecompressor()
                      if chunked:
                          for chunk in _iter_http_chunks(httpBody):
                              yield HTTPResponseBodyChunk(httpDecompressor.decompress(chunk))
                      else:
                          # The complete body is passed on; truncating it would break payload digests and response dumps.
                          yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody))
              yield EndOfRecord()
  class ProcessMode:
      '''Interface of the operating modes: each mode receives the events one at a time.'''

      def process_event(self, event):
          '''Handle a single event. Subclasses have to override this method.'''
          raise NotImplementedError()
  class Digest:
      '''
      Holder of a digest value.

      Subclasses define how a digest is rendered by overriding format.
      '''

      def __init__(self, digest):
          self._digest = digest

      def equals(self, digest):
          '''Tell whether the stored digest compares equal to digest.'''
          return self._digest == digest

      def format(self, digest = None):
          '''Render digest or, by default, the stored digest. Has to be implemented by subclasses.'''
          raise NotImplementedError()
  class Base32Digest(Digest):
      '''Digest that is rendered in Base32.'''

      def format(self, digest = None):
          '''Return the Base32 encoding (as bytes) of digest, or of the stored digest if digest is not given or empty.'''
          if not digest:
              digest = self._digest
          return base64.b32encode(digest)
  class HexDigest(Digest):
      '''Digest that is rendered as hexadecimal digits.'''

      def format(self, digest = None):
          '''Return the hexadecimal representation (as str) of digest, or of the stored digest if digest is not given or empty.'''
          if not digest:
              digest = self._digest
          return digest.hex()
  class VerifyMode(ProcessMode):
      '''
      Verify the integrity of WARC records by comparing the recorded digests with digests calculated from the data.

      For each record, a SHA-1 digest is calculated over the WARCBlockChunk data and compared with the WARC-Block-Digest header,
      and one over the HTTPResponseBodyChunk data is compared with the WARC-Payload-Digest header. Mismatches are printed to stdout.
      A second payload digest is calculated over the RawHTTPResponseBodyChunk data; if the recorded value matches that one instead,
      a single warning per file is printed rather than a mismatch for every record.
      '''

      def __init__(self):
          self._blockDigester = None
          self._recordedBlockDigest = None
          self._payloadDigester = None
          self._brokenPayloadDigester = None
          self._recordedPayloadDigest = None
          self._printedBrokenPayloadWarning = False
          self._recordID = None
          self._recordType = None

      @staticmethod
      def _header(warcHeaders, name):
          '''Return the value of the first WARC header called name, or None if there is no such header. Header lines without a value are ignored.'''
          for header in warcHeaders:
              if len(header) == 2 and header[0] == name:
                  return header[1]
          return None

      def parse_digest(self, digest):
          '''
          Turn the value of a digest header into a Digest object.

          Supported are SHA-1 digests in Base32 ('sha1:' followed by 32 characters) and in lower-case hexadecimal ('sha1:' followed by 40 characters).
          For anything else, a warning is printed to stderr and None is returned.
          '''
          if digest.startswith(b'sha1:'):
              if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
                  return Base32Digest(base64.b32decode(digest[5:]))
              if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:': # 5 for 'sha1:' + 40 for hex hash
                  return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
          # Also reached for a 'sha1:' value of unexpected shape, so that an unverifiable digest never goes unnoticed.
          print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
          return None

      def process_event(self, event):
          '''Update the digest state with event; on EndOfRecord, compare the calculated digests with the recorded ones and print mismatches.'''
          if isinstance(event, NewFile):
              self._printedBrokenPayloadWarning = False
          elif isinstance(event, BeginOfRecord):
              blockDigest = self._header(event.warcHeaders, b'WARC-Block-Digest')
              if blockDigest is not None:
                  self._blockDigester = hashlib.sha1()
                  self._recordedBlockDigest = self.parse_digest(blockDigest)
              else:
                  self._blockDigester = None
                  self._recordedBlockDigest = None
              payloadDigest = self._header(event.warcHeaders, b'WARC-Payload-Digest')
              if payloadDigest is not None:
                  self._payloadDigester = hashlib.sha1()
                  self._brokenPayloadDigester = hashlib.sha1()
                  self._recordedPayloadDigest = self.parse_digest(payloadDigest)
              else:
                  self._payloadDigester = None
                  self._brokenPayloadDigester = None
                  self._recordedPayloadDigest = None
              # These are None if the record lacks the header.
              self._recordID = self._header(event.warcHeaders, b'WARC-Record-ID')
              self._recordType = self._header(event.warcHeaders, b'WARC-Type')
          elif isinstance(event, WARCBlockChunk):
              if self._blockDigester is not None:
                  self._blockDigester.update(event.data)
          elif isinstance(event, HTTPResponseBodyChunk):
              if self._payloadDigester is not None:
                  self._payloadDigester.update(event.data)
          elif isinstance(event, RawHTTPResponseBodyChunk):
              if self._brokenPayloadDigester is not None:
                  self._brokenPayloadDigester.update(event.data)
          elif isinstance(event, EndOfRecord):
              if self._blockDigester is not None and self._recordedBlockDigest is not None:
                  calculatedBlock = self._blockDigester.digest()
                  if not self._recordedBlockDigest.equals(calculatedBlock):
                      print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(calculatedBlock)))
              # The recorded digest is None if it could not be parsed; there is nothing to compare against in that case.
              if self._payloadDigester is not None and self._recordedPayloadDigest is not None and self._recordType in (b'request', b'response'): #TODO: Support revisit
                  calculatedPayload = self._payloadDigester.digest()
                  calculatedBroken = self._brokenPayloadDigester.digest()
                  if not self._recordedPayloadDigest.equals(calculatedPayload):
                      if self._recordedPayloadDigest.equals(calculatedBroken):
                          if not self._printedBrokenPayloadWarning:
                              print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding')
                              self._printedBrokenPayloadWarning = True
                      else:
                          print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(calculatedPayload), self._recordedPayloadDigest.format(calculatedBroken)))
  class DumpResponsesMode(ProcessMode):
      '''Write the data of all HTTPResponseBodyChunk events to stdout, followed by CRLF after every record that had any.'''

      def __init__(self):
          # Whether the current record produced output and therefore needs the trailing CRLF
          self._printEOR = False

      def process_event(self, event):
          '''Write response body data to stdout and terminate the output of a record with CRLF.'''
          kind = type(event)
          if kind is HTTPResponseBodyChunk:
              self._printEOR = True
              sys.stdout.buffer.write(event.data)
          elif kind is BeginOfRecord:
              self._printEOR = False
          elif kind is EndOfRecord and self._printEOR:
              sys.stdout.buffer.write(b'\r\n')
  class COLOURS:
      '''Terminal escape sequences, as bytes, used for the coloured output.'''

      # Attribute control
      RESET = b'\x1b[0m'
      INVERTED = b'\x1b[7m'

      # Foreground colours, each (except red) with a light variant
      GREEN = b'\x1b[0;32m'
      LIGHTGREEN = b'\x1b[1;32m'
      PURPLE = b'\x1b[0;35m'
      LIGHTPURPLE = b'\x1b[1;35m'
      RED = b'\x1b[0;31m'
  class ColourMode(ProcessMode):
      '''
      Write the records to stdout with colours for easier reading.

      The WARC version line, the WARC header names, the HTTP request or status line, the HTTP header names, and the HTTP body each get their own colour.
      CRLF line endings are written as LF. ESC bytes in the data are replaced by a visible marker so that the data cannot emit escape sequences itself.
      '''

      def __init__(self):
          # Whether the first line of the HTTP header of the current record has been printed already
          self._hadHttpStatusLine = False

      def _replace_esc(self, data):
          '''Return data with every ESC byte replaced by the text ESC in inverted colours.'''
          return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)

      def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
          '''
          Write one line to stdout.

          line: the bytes to write, without line terminator
          colour: escape sequence written before the coloured part
          withLF: whether to write a LF after the line
          colourOnlyBeforeColon: colour only the part before the first colon (nothing if there is no colon) instead of the whole line
          '''
          out = sys.stdout.buffer
          if colourOnlyBeforeColon:
              # find returns -1 if there is no colon; nothing is coloured in that case.
              offset = max(line.find(b':'), 0)
          else:
              offset = len(line)
          if offset > 0:
              out.write(colour)
              out.write(self._replace_esc(line[:offset]))
              out.write(COLOURS.RESET)
          # The uncoloured remainder (e.g. a header value) comes from the file just as well and must not reach the terminal with raw ESC bytes.
          out.write(self._replace_esc(line[offset:]))
          if withLF:
              out.write(b'\n')

      def _print_data(self, data, colour, colourOnlyBeforeColon):
          '''Write data line by line, separating the lines with LF; no LF is written after the last line.'''
          for index, line in enumerate(data.split(b'\r\n')):
              if index > 0:
                  sys.stdout.buffer.write(b'\n')
              self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)

      def process_event(self, event):
          '''Write the coloured representation of event to stdout.'''
          if isinstance(event, BeginOfRecord):
              firstNewline = event.rawData.index(b'\r\n')
              self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
              self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
              sys.stdout.buffer.write(b'\n\n') # separator between header and block
              self._hadHttpStatusLine = False
          elif isinstance(event, WARCBlockChunk):
              if event.isHttpHeader is True:
                  if not self._hadHttpStatusLine:
                      # The first line of the HTTP header is highlighted separately.
                      firstNewline = event.data.index(b'\r\n')
                      self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
                      offset = firstNewline + 2
                      self._hadHttpStatusLine = True
                  else:
                      offset = 0
                  self._print_data(event.data[offset:], COLOURS.PURPLE, True)
              elif event.isHttpHeader is False:
                  self._print_data(event.data, COLOURS.RED, False)
              elif event.isHttpHeader is None:
                  sys.stdout.buffer.write(self._replace_esc(event.data))
          elif isinstance(event, EndOfRecord):
              sys.stdout.buffer.write(b'\n\n')
  def main():
      '''
      Command-line entry point: warc-tiny MODE FILES

      MODE selects the processor (verify, dump-responses, or colour). Every file is announced on stderr
      and then fed to the processor event by event, preceded by a NewFile event.
      If the mode is unknown or no file is given, a usage message is printed to stderr and the exit status is 1.
      '''
      processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}

      # Explicit validation instead of assert, which is stripped when Python runs with -O.
      if len(sys.argv) < 3 or sys.argv[1] not in processorMap:
          sys.exit('Usage: warc-tiny {{{}}} FILES'.format('|'.join(sorted(processorMap))))
      mode = sys.argv[1]
      files = sys.argv[2:]

      processor = processorMap[mode]()
      for f in files:
          print('Info: processing {}'.format(f), file = sys.stderr)
          processor.process_event(NewFile())
          for event in iter_warc(f):
              processor.process_event(event)


  if __name__ == '__main__':
      main()