The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

379 lines
13 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses FILES -- dump the HTTP response bodies to stdout
  6. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  7. import base64
  8. import gzip
  9. import hashlib
  10. import sys
  11. import zlib
  12. def GzipDecompressor():
  13. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  14. class DummyDecompressor:
  15. def decompress(self, data):
  16. return data
  17. class Event:
  18. pass
  19. class NewFile(Event):
  20. pass
  21. class BeginOfRecord(Event):
  22. def __init__(self, warcHeaders, rawData):
  23. self._warcHeaders = warcHeaders
  24. self._rawData = rawData
  25. @property
  26. def warcHeaders(self):
  27. return self._warcHeaders
  28. @property
  29. def rawData(self):
  30. return self._rawData
  31. class _DataChunk(Event):
  32. def __init__(self, data):
  33. self._data = data
  34. @property
  35. def data(self):
  36. return self._data
  37. def __repr__(self):
  38. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  39. class WARCBlockChunk(_DataChunk):
  40. def __init__(self, data, isHttpHeader = None):
  41. super().__init__(data)
  42. self._isHttpHeader = isHttpHeader
  43. @property
  44. def isHttpHeader(self):
  45. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  46. return self._isHttpHeader
  47. class RawHTTPResponseBodyChunk(_DataChunk):
  48. '''
  49. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  50. This is like HTTPResponseBodyChunk but without transfer encoding stripping.
  51. '''
  52. class HTTPResponseBodyChunk(_DataChunk):
  53. '''
  54. Representing a part of the HTTP response body with transfer encoding stripped.
  55. '''
  56. class EndOfRecord(Event):
  57. pass
  58. def iter_warc(f):
  59. # Yields Events
  60. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  61. with gzip.open(f, 'rb') as fp:
  62. buf = b''
  63. while True:
  64. # Read WARC header
  65. while b'\r\n\r\n' not in buf:
  66. try:
  67. buf = buf + fp.read(4096)
  68. except EOFError:
  69. break
  70. if not buf:
  71. break
  72. if not buf:
  73. break
  74. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  75. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  76. assert b'\r\nContent-Length:' in warcHeaderBuf
  77. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  78. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  79. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  80. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  81. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  82. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  83. # Read WARC block (and skip CRLFCRLF at the end of the record)
  84. if len(buf) < warcContentLength + 4:
  85. try:
  86. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  87. except EOFError:
  88. pass
  89. if len(buf) < warcContentLength + 4:
  90. print('Error: truncated WARC', file = sys.stderr)
  91. break
  92. warcContent = buf[:warcContentLength]
  93. buf = buf[warcContentLength + 4:]
  94. # Decode HTTP response if it is one
  95. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  96. httpType = 'request'
  97. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  98. httpType = 'response'
  99. else:
  100. httpType = None
  101. if httpType is not None:
  102. if b'\r\n\r\n' in warcContent:
  103. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  104. # Parse headers and extract transfer encoding
  105. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  106. chunked = False
  107. gzipped = False
  108. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  109. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  110. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  111. chunked = b'chunked' in transferEncodings
  112. gzipped = b'gzip' in transferEncodings
  113. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  114. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  115. yield RawHTTPResponseBodyChunk(httpBody)
  116. if httpType == 'response':
  117. # Decode body
  118. if gzipped:
  119. httpDecompressor = GzipDecompressor()
  120. else:
  121. httpDecompressor = DummyDecompressor()
  122. if chunked:
  123. while True:
  124. try:
  125. chunkLineEnd = httpBody.index(b'\r\n')
  126. except ValueError:
  127. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  128. break
  129. chunkLine = httpBody[:chunkLineEnd]
  130. if b';' in chunkLine:
  131. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  132. else:
  133. chunkLength = chunkLine.strip()
  134. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  135. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  136. break
  137. chunkLength = int(chunkLength, base = 16)
  138. if chunkLength == 0:
  139. break
  140. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  141. yield HTTPResponseBodyChunk(chunk)
  142. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  143. else:
  144. yield HTTPResponseBodyChunk(httpDecompressor.decompress(httpBody))
  145. else:
  146. print('Warning: malformed HTTP response in record {}, skipping'.format(recordID), file = sys.stderr)
  147. yield WARCBlockChunk(warcContent)
  148. else:
  149. yield WARCBlockChunk(warcContent)
  150. yield EndOfRecord()
  151. class ProcessMode:
  152. def process_event(self, event):
  153. raise NotImplementedError
  154. class Digest:
  155. def __init__(self, digest):
  156. self._digest = digest
  157. def format(self, digest = None):
  158. raise NotImplementedError
  159. def equals(self, digest):
  160. return self._digest == digest
  161. class Base32Digest(Digest):
  162. def format(self, digest = None):
  163. return base64.b32encode(digest if digest else self._digest)
  164. class HexDigest(Digest):
  165. def format(self, digest = None):
  166. return (digest if digest else self._digest).hex()
  167. class VerifyMode(ProcessMode):
  168. def __init__(self):
  169. self._blockDigester = None
  170. self._recordedBlockDigest = None
  171. self._payloadDigester = None
  172. self._brokenPayloadDigester = None
  173. self._recordedPayloadDigest = None
  174. self._printedBrokenPayloadWarning = False
  175. def parse_digest(self, digest):
  176. if not digest.startswith(b'sha1:'):
  177. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  178. return None
  179. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  180. return Base32Digest(base64.b32decode(digest[5:]))
  181. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  182. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  183. return None
  184. def process_event(self, event):
  185. if type(event) is NewFile:
  186. self._printedBrokenPayloadWarning = False
  187. elif type(event) is BeginOfRecord:
  188. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  189. self._blockDigester = hashlib.sha1()
  190. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  191. else:
  192. self._blockDigester = None
  193. self._recordedBlockDigest = None
  194. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  195. self._payloadDigester = hashlib.sha1()
  196. self._brokenPayloadDigester = hashlib.sha1()
  197. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  198. else:
  199. self._payloadDigester = None
  200. self._brokenPayloadDigester = None
  201. self._recordedPayloadDigest = None
  202. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  203. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  204. elif type(event) is WARCBlockChunk:
  205. if self._blockDigester:
  206. self._blockDigester.update(event.data)
  207. elif type(event) is HTTPResponseBodyChunk:
  208. if self._payloadDigester:
  209. self._payloadDigester.update(event.data)
  210. elif type(event) is RawHTTPResponseBodyChunk:
  211. if self._brokenPayloadDigester:
  212. self._brokenPayloadDigester.update(event.data)
  213. elif type(event) is EndOfRecord:
  214. if self._blockDigester and self._recordedBlockDigest:
  215. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  216. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  217. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  218. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  219. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  220. if not self._printedBrokenPayloadWarning:
  221. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  222. self._printedBrokenPayloadWarning = True
  223. else:
  224. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  225. class DumpResponsesMode(ProcessMode):
  226. def __init__(self):
  227. self._printEOR = False
  228. def process_event(self, event):
  229. if type(event) is BeginOfRecord:
  230. self._printEOR = False
  231. elif type(event) is HTTPResponseBodyChunk:
  232. self._printEOR = True
  233. sys.stdout.buffer.write(event.data)
  234. elif type(event) is EndOfRecord:
  235. if self._printEOR:
  236. sys.stdout.buffer.write(b'\r\n')
  237. class COLOURS:
  238. RESET = b'\x1b[0m'
  239. GREEN = b'\x1b[0;32m'
  240. LIGHTGREEN = b'\x1b[1;32m'
  241. PURPLE = b'\x1b[0;35m'
  242. LIGHTPURPLE = b'\x1b[1;35m'
  243. RED = b'\x1b[0;31m'
  244. INVERTED = b'\x1b[7m'
  245. class ColourMode(ProcessMode):
  246. def __init__(self):
  247. self._hadHttpStatusLine = False
  248. def _replace_esc(self, data):
  249. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  250. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  251. if colourOnlyBeforeColon:
  252. if b':' in line:
  253. offset = line.index(b':')
  254. else:
  255. offset = 0
  256. else:
  257. offset = len(line)
  258. if offset > 0:
  259. sys.stdout.buffer.write(colour)
  260. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  261. sys.stdout.buffer.write(COLOURS.RESET)
  262. sys.stdout.buffer.write(line[offset:])
  263. if withLF:
  264. sys.stdout.buffer.write(b'\n')
  265. def _print_data(self, data, colour, colourOnlyBeforeColon):
  266. later = False
  267. for line in data.split(b'\r\n'):
  268. if later:
  269. sys.stdout.buffer.write(b'\n')
  270. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  271. later = True
  272. def process_event(self, event):
  273. if type(event) is BeginOfRecord:
  274. firstNewline = event.rawData.index(b'\r\n')
  275. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  276. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  277. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  278. self._hadHttpStatusLine = False
  279. elif type(event) is WARCBlockChunk:
  280. if event.isHttpHeader is True:
  281. if not self._hadHttpStatusLine:
  282. firstNewline = event.data.index(b'\r\n')
  283. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  284. offset = firstNewline + 2
  285. self._hadHttpStatusLine = True
  286. else:
  287. offset = 0
  288. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  289. elif event.isHttpHeader is False:
  290. self._print_data(event.data, COLOURS.RED, False)
  291. elif event.isHttpHeader is None:
  292. sys.stdout.buffer.write(self._replace_esc(event.data))
  293. elif type(event) is EndOfRecord:
  294. sys.stdout.buffer.write(b'\n\n')
  295. def main():
  296. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
  297. assert len(sys.argv) - 1 >= 2
  298. mode = sys.argv[1]
  299. assert mode in processorMap
  300. files = sys.argv[2:]
  301. assert files
  302. processor = processorMap[mode]()
  303. try:
  304. for f in files:
  305. print('Info: processing {}'.format(f), file = sys.stderr)
  306. processor.process_event(NewFile())
  307. for event in iter_warc(f):
  308. processor.process_event(event)
  309. except BrokenPipeError:
  310. return
  311. if __name__ == '__main__':
  312. main()