The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

426 lines
15 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
  6. # With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:https://example.org/: foobar'
  7. # The record offset may be -1 if it is not known.
  8. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  9. import base64
  10. import gzip
  11. import hashlib
  12. import sys
  13. import zlib
  14. def GzipDecompressor():
  15. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  16. class DummyDecompressor:
  17. def decompress(self, data):
  18. return data
  19. class Event:
  20. pass
  21. class NewFile(Event):
  22. def __init__(self, filename):
  23. self._filename = filename
  24. @property
  25. def filename(self):
  26. return self._filename
  27. class BeginOfRecord(Event):
  28. def __init__(self, warcHeaders, rawData):
  29. self._warcHeaders = warcHeaders
  30. self._rawData = rawData
  31. @property
  32. def warcHeaders(self):
  33. return self._warcHeaders
  34. @property
  35. def rawData(self):
  36. return self._rawData
  37. class _DataChunk(Event):
  38. def __init__(self, data):
  39. self._data = data
  40. @property
  41. def data(self):
  42. return self._data
  43. def __repr__(self):
  44. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  45. class WARCBlockChunk(_DataChunk):
  46. def __init__(self, data, isHttpHeader = None):
  47. super().__init__(data)
  48. self._isHttpHeader = isHttpHeader
  49. @property
  50. def isHttpHeader(self):
  51. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  52. return self._isHttpHeader
  53. class RawHTTPBodyChunk(_DataChunk):
  54. '''
  55. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  56. This is like HTTPBodyChunk but without transfer encoding stripping.
  57. '''
  58. class HTTPBodyChunk(_DataChunk):
  59. '''
  60. Representing a part of the HTTP body with transfer encoding stripped.
  61. '''
  62. class EndOfRecord(Event):
  63. pass
  64. def iter_warc(f):
  65. # Yields Events
  66. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  67. with gzip.open(f, 'rb') as fp:
  68. buf = b''
  69. while True:
  70. # Read WARC header
  71. while b'\r\n\r\n' not in buf:
  72. try:
  73. buf = buf + fp.read(4096)
  74. except EOFError:
  75. break
  76. if not buf:
  77. break
  78. if not buf:
  79. break
  80. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  81. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  82. assert b'\r\nContent-Length:' in warcHeaderBuf
  83. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  84. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  85. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  86. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  87. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  88. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  89. # Read WARC block (and skip CRLFCRLF at the end of the record)
  90. if len(buf) < warcContentLength + 4:
  91. try:
  92. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  93. except EOFError:
  94. pass
  95. if len(buf) < warcContentLength + 4:
  96. print('Error: truncated WARC', file = sys.stderr)
  97. break
  98. warcContent = buf[:warcContentLength]
  99. buf = buf[warcContentLength + 4:]
  100. # Decode HTTP body if appropriate
  101. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  102. httpType = 'request'
  103. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  104. httpType = 'response'
  105. else:
  106. httpType = None
  107. if httpType is not None:
  108. if b'\r\n\r\n' in warcContent:
  109. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  110. # Parse headers and extract transfer encoding
  111. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  112. chunked = False
  113. gzipped = False
  114. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  115. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  116. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  117. chunked = b'chunked' in transferEncodings
  118. gzipped = b'gzip' in transferEncodings
  119. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  120. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  121. yield RawHTTPBodyChunk(httpBody)
  122. # Decode body
  123. if gzipped:
  124. httpDecompressor = GzipDecompressor()
  125. else:
  126. httpDecompressor = DummyDecompressor()
  127. if chunked:
  128. while True:
  129. try:
  130. chunkLineEnd = httpBody.index(b'\r\n')
  131. except ValueError:
  132. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  133. break
  134. chunkLine = httpBody[:chunkLineEnd]
  135. if b';' in chunkLine:
  136. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  137. else:
  138. chunkLength = chunkLine.strip()
  139. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  140. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  141. break
  142. chunkLength = int(chunkLength, base = 16)
  143. if chunkLength == 0:
  144. break
  145. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  146. yield HTTPBodyChunk(chunk)
  147. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  148. else:
  149. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  150. else:
  151. print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
  152. yield WARCBlockChunk(warcContent)
  153. else:
  154. yield WARCBlockChunk(warcContent)
  155. yield EndOfRecord()
  156. class ProcessMode:
  157. @classmethod
  158. def split_args(cls, args):
  159. '''Split args into arguments to be passed into __init__ and filenames'''
  160. return (), args
  161. def process_event(self, event):
  162. raise NotImplementedError
  163. class Digest:
  164. def __init__(self, digest):
  165. self._digest = digest
  166. def format(self, digest = None):
  167. raise NotImplementedError
  168. def equals(self, digest):
  169. return self._digest == digest
  170. class Base32Digest(Digest):
  171. def format(self, digest = None):
  172. return base64.b32encode(digest if digest else self._digest)
  173. class HexDigest(Digest):
  174. def format(self, digest = None):
  175. return (digest if digest else self._digest).hex()
  176. class VerifyMode(ProcessMode):
  177. def __init__(self):
  178. self._blockDigester = None
  179. self._recordedBlockDigest = None
  180. self._payloadDigester = None
  181. self._brokenPayloadDigester = None
  182. self._recordedPayloadDigest = None
  183. self._printedBrokenPayloadWarning = False
  184. def parse_digest(self, digest):
  185. if not digest.startswith(b'sha1:'):
  186. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  187. return None
  188. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  189. return Base32Digest(base64.b32decode(digest[5:]))
  190. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  191. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  192. return None
  193. def process_event(self, event):
  194. if type(event) is NewFile:
  195. self._printedBrokenPayloadWarning = False
  196. elif type(event) is BeginOfRecord:
  197. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  198. self._blockDigester = hashlib.sha1()
  199. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  200. else:
  201. self._blockDigester = None
  202. self._recordedBlockDigest = None
  203. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  204. self._payloadDigester = hashlib.sha1()
  205. self._brokenPayloadDigester = hashlib.sha1()
  206. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  207. else:
  208. self._payloadDigester = None
  209. self._brokenPayloadDigester = None
  210. self._recordedPayloadDigest = None
  211. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  212. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  213. elif type(event) is WARCBlockChunk:
  214. if self._blockDigester:
  215. self._blockDigester.update(event.data)
  216. elif type(event) is HTTPBodyChunk:
  217. if self._payloadDigester:
  218. self._payloadDigester.update(event.data)
  219. elif type(event) is RawHTTPBodyChunk:
  220. if self._brokenPayloadDigester:
  221. self._brokenPayloadDigester.update(event.data)
  222. elif type(event) is EndOfRecord:
  223. if self._blockDigester and self._recordedBlockDigest:
  224. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  225. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  226. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  227. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  228. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  229. if not self._printedBrokenPayloadWarning:
  230. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  231. self._printedBrokenPayloadWarning = True
  232. else:
  233. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  234. class DumpResponsesMode(ProcessMode):
  235. @classmethod
  236. def split_args(cls, args):
  237. if args[0] == '-m' or args[0] == '--meta':
  238. return (True,), args[1:]
  239. return (False,), args
  240. def __init__(self, withMeta):
  241. self._printEOR = False
  242. self._isResponse = False
  243. self._withMeta = withMeta
  244. if withMeta:
  245. self._recordID = None
  246. self._targetURI = None
  247. self._buffer = b''
  248. def _write(self, data):
  249. if not self._withMeta:
  250. sys.stdout.buffer.write(data)
  251. return
  252. buf = self._buffer + data
  253. lines = buf.split(b'\n')
  254. self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well
  255. for line in lines:
  256. sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, self._targetURI, '')).encode('utf-8'))
  257. sys.stdout.buffer.write(line)
  258. sys.stdout.buffer.write(b'\n')
  259. def process_event(self, event):
  260. if type(event) is NewFile:
  261. self._filename = event.filename
  262. elif type(event) is BeginOfRecord:
  263. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  264. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  265. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  266. self._printEOR = False
  267. if self._withMeta:
  268. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  269. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  270. self._targetURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  271. self._buffer = b''
  272. elif type(event) is HTTPBodyChunk:
  273. if self._isResponse:
  274. self._printEOR = True
  275. self._write(event.data)
  276. elif type(event) is EndOfRecord:
  277. if self._printEOR:
  278. self._write(b'\r\n')
  279. class COLOURS:
  280. RESET = b'\x1b[0m'
  281. GREEN = b'\x1b[0;32m'
  282. LIGHTGREEN = b'\x1b[1;32m'
  283. PURPLE = b'\x1b[0;35m'
  284. LIGHTPURPLE = b'\x1b[1;35m'
  285. RED = b'\x1b[0;31m'
  286. INVERTED = b'\x1b[7m'
  287. class ColourMode(ProcessMode):
  288. def __init__(self):
  289. self._hadHttpStatusLine = False
  290. def _replace_esc(self, data):
  291. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  292. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  293. if colourOnlyBeforeColon:
  294. if b':' in line:
  295. offset = line.index(b':')
  296. else:
  297. offset = 0
  298. else:
  299. offset = len(line)
  300. if offset > 0:
  301. sys.stdout.buffer.write(colour)
  302. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  303. sys.stdout.buffer.write(COLOURS.RESET)
  304. sys.stdout.buffer.write(line[offset:])
  305. if withLF:
  306. sys.stdout.buffer.write(b'\n')
  307. def _print_data(self, data, colour, colourOnlyBeforeColon):
  308. later = False
  309. for line in data.split(b'\r\n'):
  310. if later:
  311. sys.stdout.buffer.write(b'\n')
  312. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  313. later = True
  314. def process_event(self, event):
  315. if type(event) is BeginOfRecord:
  316. firstNewline = event.rawData.index(b'\r\n')
  317. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  318. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  319. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  320. self._hadHttpStatusLine = False
  321. elif type(event) is WARCBlockChunk:
  322. if event.isHttpHeader is True:
  323. if not self._hadHttpStatusLine:
  324. firstNewline = event.data.index(b'\r\n')
  325. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  326. offset = firstNewline + 2
  327. self._hadHttpStatusLine = True
  328. else:
  329. offset = 0
  330. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  331. elif event.isHttpHeader is False:
  332. self._print_data(event.data, COLOURS.RED, False)
  333. elif event.isHttpHeader is None:
  334. sys.stdout.buffer.write(self._replace_esc(event.data))
  335. elif type(event) is EndOfRecord:
  336. sys.stdout.buffer.write(b'\n\n')
  337. def main():
  338. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
  339. assert len(sys.argv) - 1 >= 2
  340. mode = sys.argv[1]
  341. assert mode in processorMap
  342. processorArgs, files = processorMap[mode].split_args(sys.argv[2:])
  343. assert files
  344. processor = processorMap[mode](*processorArgs)
  345. try:
  346. for f in files:
  347. print('Info: processing {}'.format(f), file = sys.stderr)
  348. processor.process_event(NewFile(f))
  349. for event in iter_warc(f):
  350. processor.process_event(event)
  351. except BrokenPipeError:
  352. return
  353. if __name__ == '__main__':
  354. main()