The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

383 lines
13 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses FILES -- dump the HTTP response bodies to stdout
  6. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  7. import base64
  8. import gzip
  9. import hashlib
  10. import sys
  11. import zlib
  12. def GzipDecompressor():
  13. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  14. class DummyDecompressor:
  15. def decompress(self, data):
  16. return data
  17. class Event:
  18. pass
  19. class NewFile(Event):
  20. pass
  21. class BeginOfRecord(Event):
  22. def __init__(self, warcHeaders, rawData):
  23. self._warcHeaders = warcHeaders
  24. self._rawData = rawData
  25. @property
  26. def warcHeaders(self):
  27. return self._warcHeaders
  28. @property
  29. def rawData(self):
  30. return self._rawData
  31. class _DataChunk(Event):
  32. def __init__(self, data):
  33. self._data = data
  34. @property
  35. def data(self):
  36. return self._data
  37. def __repr__(self):
  38. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  39. class WARCBlockChunk(_DataChunk):
  40. def __init__(self, data, isHttpHeader = None):
  41. super().__init__(data)
  42. self._isHttpHeader = isHttpHeader
  43. @property
  44. def isHttpHeader(self):
  45. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  46. return self._isHttpHeader
  47. class RawHTTPBodyChunk(_DataChunk):
  48. '''
  49. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  50. This is like HTTPBodyChunk but without transfer encoding stripping.
  51. '''
  52. class HTTPBodyChunk(_DataChunk):
  53. '''
  54. Representing a part of the HTTP body with transfer encoding stripped.
  55. '''
  56. class EndOfRecord(Event):
  57. pass
  58. def iter_warc(f):
  59. # Yields Events
  60. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  61. with gzip.open(f, 'rb') as fp:
  62. buf = b''
  63. while True:
  64. # Read WARC header
  65. while b'\r\n\r\n' not in buf:
  66. try:
  67. buf = buf + fp.read(4096)
  68. except EOFError:
  69. break
  70. if not buf:
  71. break
  72. if not buf:
  73. break
  74. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  75. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  76. assert b'\r\nContent-Length:' in warcHeaderBuf
  77. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  78. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  79. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  80. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  81. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  82. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  83. # Read WARC block (and skip CRLFCRLF at the end of the record)
  84. if len(buf) < warcContentLength + 4:
  85. try:
  86. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  87. except EOFError:
  88. pass
  89. if len(buf) < warcContentLength + 4:
  90. print('Error: truncated WARC', file = sys.stderr)
  91. break
  92. warcContent = buf[:warcContentLength]
  93. buf = buf[warcContentLength + 4:]
  94. # Decode HTTP body if appropriate
  95. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  96. httpType = 'request'
  97. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  98. httpType = 'response'
  99. else:
  100. httpType = None
  101. if httpType is not None:
  102. if b'\r\n\r\n' in warcContent:
  103. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  104. # Parse headers and extract transfer encoding
  105. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  106. chunked = False
  107. gzipped = False
  108. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  109. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  110. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  111. chunked = b'chunked' in transferEncodings
  112. gzipped = b'gzip' in transferEncodings
  113. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  114. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  115. yield RawHTTPBodyChunk(httpBody)
  116. # Decode body
  117. if gzipped:
  118. httpDecompressor = GzipDecompressor()
  119. else:
  120. httpDecompressor = DummyDecompressor()
  121. if chunked:
  122. while True:
  123. try:
  124. chunkLineEnd = httpBody.index(b'\r\n')
  125. except ValueError:
  126. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  127. break
  128. chunkLine = httpBody[:chunkLineEnd]
  129. if b';' in chunkLine:
  130. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  131. else:
  132. chunkLength = chunkLine.strip()
  133. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  134. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  135. break
  136. chunkLength = int(chunkLength, base = 16)
  137. if chunkLength == 0:
  138. break
  139. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  140. yield HTTPBodyChunk(chunk)
  141. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  142. else:
  143. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  144. else:
  145. print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
  146. yield WARCBlockChunk(warcContent)
  147. else:
  148. yield WARCBlockChunk(warcContent)
  149. yield EndOfRecord()
  150. class ProcessMode:
  151. def process_event(self, event):
  152. raise NotImplementedError
  153. class Digest:
  154. def __init__(self, digest):
  155. self._digest = digest
  156. def format(self, digest = None):
  157. raise NotImplementedError
  158. def equals(self, digest):
  159. return self._digest == digest
  160. class Base32Digest(Digest):
  161. def format(self, digest = None):
  162. return base64.b32encode(digest if digest else self._digest)
  163. class HexDigest(Digest):
  164. def format(self, digest = None):
  165. return (digest if digest else self._digest).hex()
  166. class VerifyMode(ProcessMode):
  167. def __init__(self):
  168. self._blockDigester = None
  169. self._recordedBlockDigest = None
  170. self._payloadDigester = None
  171. self._brokenPayloadDigester = None
  172. self._recordedPayloadDigest = None
  173. self._printedBrokenPayloadWarning = False
  174. def parse_digest(self, digest):
  175. if not digest.startswith(b'sha1:'):
  176. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  177. return None
  178. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  179. return Base32Digest(base64.b32decode(digest[5:]))
  180. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  181. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  182. return None
  183. def process_event(self, event):
  184. if type(event) is NewFile:
  185. self._printedBrokenPayloadWarning = False
  186. elif type(event) is BeginOfRecord:
  187. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  188. self._blockDigester = hashlib.sha1()
  189. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  190. else:
  191. self._blockDigester = None
  192. self._recordedBlockDigest = None
  193. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  194. self._payloadDigester = hashlib.sha1()
  195. self._brokenPayloadDigester = hashlib.sha1()
  196. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  197. else:
  198. self._payloadDigester = None
  199. self._brokenPayloadDigester = None
  200. self._recordedPayloadDigest = None
  201. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  202. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  203. elif type(event) is WARCBlockChunk:
  204. if self._blockDigester:
  205. self._blockDigester.update(event.data)
  206. elif type(event) is HTTPBodyChunk:
  207. if self._payloadDigester:
  208. self._payloadDigester.update(event.data)
  209. elif type(event) is RawHTTPBodyChunk:
  210. if self._brokenPayloadDigester:
  211. self._brokenPayloadDigester.update(event.data)
  212. elif type(event) is EndOfRecord:
  213. if self._blockDigester and self._recordedBlockDigest:
  214. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  215. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  216. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  217. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  218. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  219. if not self._printedBrokenPayloadWarning:
  220. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  221. self._printedBrokenPayloadWarning = True
  222. else:
  223. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  224. class DumpResponsesMode(ProcessMode):
  225. def __init__(self):
  226. self._printEOR = False
  227. self._isResponse = False
  228. def process_event(self, event):
  229. if type(event) is BeginOfRecord:
  230. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  231. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  232. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  233. self._printEOR = False
  234. elif type(event) is HTTPBodyChunk:
  235. if self._isResponse:
  236. self._printEOR = True
  237. sys.stdout.buffer.write(event.data)
  238. elif type(event) is EndOfRecord:
  239. if self._printEOR:
  240. sys.stdout.buffer.write(b'\r\n')
  241. class COLOURS:
  242. RESET = b'\x1b[0m'
  243. GREEN = b'\x1b[0;32m'
  244. LIGHTGREEN = b'\x1b[1;32m'
  245. PURPLE = b'\x1b[0;35m'
  246. LIGHTPURPLE = b'\x1b[1;35m'
  247. RED = b'\x1b[0;31m'
  248. INVERTED = b'\x1b[7m'
  249. class ColourMode(ProcessMode):
  250. def __init__(self):
  251. self._hadHttpStatusLine = False
  252. def _replace_esc(self, data):
  253. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  254. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  255. if colourOnlyBeforeColon:
  256. if b':' in line:
  257. offset = line.index(b':')
  258. else:
  259. offset = 0
  260. else:
  261. offset = len(line)
  262. if offset > 0:
  263. sys.stdout.buffer.write(colour)
  264. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  265. sys.stdout.buffer.write(COLOURS.RESET)
  266. sys.stdout.buffer.write(line[offset:])
  267. if withLF:
  268. sys.stdout.buffer.write(b'\n')
  269. def _print_data(self, data, colour, colourOnlyBeforeColon):
  270. later = False
  271. for line in data.split(b'\r\n'):
  272. if later:
  273. sys.stdout.buffer.write(b'\n')
  274. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  275. later = True
  276. def process_event(self, event):
  277. if type(event) is BeginOfRecord:
  278. firstNewline = event.rawData.index(b'\r\n')
  279. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  280. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  281. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  282. self._hadHttpStatusLine = False
  283. elif type(event) is WARCBlockChunk:
  284. if event.isHttpHeader is True:
  285. if not self._hadHttpStatusLine:
  286. firstNewline = event.data.index(b'\r\n')
  287. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  288. offset = firstNewline + 2
  289. self._hadHttpStatusLine = True
  290. else:
  291. offset = 0
  292. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  293. elif event.isHttpHeader is False:
  294. self._print_data(event.data, COLOURS.RED, False)
  295. elif event.isHttpHeader is None:
  296. sys.stdout.buffer.write(self._replace_esc(event.data))
  297. elif type(event) is EndOfRecord:
  298. sys.stdout.buffer.write(b'\n\n')
  299. def main():
  300. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
  301. assert len(sys.argv) - 1 >= 2
  302. mode = sys.argv[1]
  303. assert mode in processorMap
  304. files = sys.argv[2:]
  305. assert files
  306. processor = processorMap[mode]()
  307. try:
  308. for f in files:
  309. print('Info: processing {}'.format(f), file = sys.stderr)
  310. processor.process_event(NewFile())
  311. for event in iter_warc(f):
  312. processor.process_event(event)
  313. except BrokenPipeError:
  314. return
# Run the tool only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()