The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

430 lines
15 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
  6. # With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
  7. # The record offset may be -1 if it is not known.
  8. # The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
  9. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  10. import base64
  11. import gzip
  12. import hashlib
  13. import sys
  14. import zlib
  15. def GzipDecompressor():
  16. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  17. class DummyDecompressor:
  18. def decompress(self, data):
  19. return data
  20. class Event:
  21. pass
  22. class NewFile(Event):
  23. def __init__(self, filename):
  24. self._filename = filename
  25. @property
  26. def filename(self):
  27. return self._filename
  28. class BeginOfRecord(Event):
  29. def __init__(self, warcHeaders, rawData):
  30. self._warcHeaders = warcHeaders
  31. self._rawData = rawData
  32. @property
  33. def warcHeaders(self):
  34. return self._warcHeaders
  35. @property
  36. def rawData(self):
  37. return self._rawData
  38. class _DataChunk(Event):
  39. def __init__(self, data):
  40. self._data = data
  41. @property
  42. def data(self):
  43. return self._data
  44. def __repr__(self):
  45. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  46. class WARCBlockChunk(_DataChunk):
  47. def __init__(self, data, isHttpHeader = None):
  48. super().__init__(data)
  49. self._isHttpHeader = isHttpHeader
  50. @property
  51. def isHttpHeader(self):
  52. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  53. return self._isHttpHeader
  54. class RawHTTPBodyChunk(_DataChunk):
  55. '''
  56. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  57. This is like HTTPBodyChunk but without transfer encoding stripping.
  58. '''
  59. class HTTPBodyChunk(_DataChunk):
  60. '''
  61. Representing a part of the HTTP body with transfer encoding stripped.
  62. '''
  63. class EndOfRecord(Event):
  64. pass
  65. def iter_warc(f):
  66. # Yields Events
  67. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  68. with gzip.open(f, 'rb') as fp:
  69. buf = b''
  70. while True:
  71. # Read WARC header
  72. while b'\r\n\r\n' not in buf:
  73. try:
  74. buf = buf + fp.read(4096)
  75. except EOFError:
  76. break
  77. if not buf:
  78. break
  79. if not buf:
  80. break
  81. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  82. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  83. assert b'\r\nContent-Length:' in warcHeaderBuf
  84. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  85. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  86. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  87. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  88. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  89. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  90. # Read WARC block (and skip CRLFCRLF at the end of the record)
  91. if len(buf) < warcContentLength + 4:
  92. try:
  93. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  94. except EOFError:
  95. pass
  96. if len(buf) < warcContentLength + 4:
  97. print('Error: truncated WARC', file = sys.stderr)
  98. break
  99. warcContent = buf[:warcContentLength]
  100. buf = buf[warcContentLength + 4:]
  101. # Decode HTTP body if appropriate
  102. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  103. httpType = 'request'
  104. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  105. httpType = 'response'
  106. else:
  107. httpType = None
  108. if httpType is not None:
  109. if b'\r\n\r\n' in warcContent:
  110. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  111. # Parse headers and extract transfer encoding
  112. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  113. chunked = False
  114. gzipped = False
  115. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  116. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  117. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  118. chunked = b'chunked' in transferEncodings
  119. gzipped = b'gzip' in transferEncodings
  120. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  121. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  122. yield RawHTTPBodyChunk(httpBody)
  123. # Decode body
  124. if gzipped:
  125. httpDecompressor = GzipDecompressor()
  126. else:
  127. httpDecompressor = DummyDecompressor()
  128. if chunked:
  129. pos = 0
  130. while True:
  131. try:
  132. chunkLineEnd = httpBody.index(b'\r\n', pos)
  133. except ValueError:
  134. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  135. break
  136. chunkLine = httpBody[pos:chunkLineEnd]
  137. if b';' in chunkLine:
  138. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  139. else:
  140. chunkLength = chunkLine.strip()
  141. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  142. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  143. break
  144. chunkLength = int(chunkLength, base = 16)
  145. if chunkLength == 0:
  146. break
  147. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  148. yield HTTPBodyChunk(chunk)
  149. pos = chunkLineEnd + 2 + chunkLength + 2
  150. else:
  151. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  152. else:
  153. print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
  154. yield WARCBlockChunk(warcContent)
  155. else:
  156. yield WARCBlockChunk(warcContent)
  157. yield EndOfRecord()
  158. class ProcessMode:
  159. @classmethod
  160. def split_args(cls, args):
  161. '''Split args into arguments to be passed into __init__ and filenames'''
  162. return (), args
  163. def process_event(self, event):
  164. raise NotImplementedError
  165. class Digest:
  166. def __init__(self, digest):
  167. self._digest = digest
  168. def format(self, digest = None):
  169. raise NotImplementedError
  170. def equals(self, digest):
  171. return self._digest == digest
  172. class Base32Digest(Digest):
  173. def format(self, digest = None):
  174. return base64.b32encode(digest if digest else self._digest)
  175. class HexDigest(Digest):
  176. def format(self, digest = None):
  177. return (digest if digest else self._digest).hex()
  178. class VerifyMode(ProcessMode):
  179. def __init__(self):
  180. self._blockDigester = None
  181. self._recordedBlockDigest = None
  182. self._payloadDigester = None
  183. self._brokenPayloadDigester = None
  184. self._recordedPayloadDigest = None
  185. self._printedBrokenPayloadWarning = False
  186. def parse_digest(self, digest):
  187. if not digest.startswith(b'sha1:'):
  188. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  189. return None
  190. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  191. return Base32Digest(base64.b32decode(digest[5:]))
  192. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  193. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  194. return None
  195. def process_event(self, event):
  196. if type(event) is NewFile:
  197. self._printedBrokenPayloadWarning = False
  198. elif type(event) is BeginOfRecord:
  199. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  200. self._blockDigester = hashlib.sha1()
  201. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  202. else:
  203. self._blockDigester = None
  204. self._recordedBlockDigest = None
  205. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  206. self._payloadDigester = hashlib.sha1()
  207. self._brokenPayloadDigester = hashlib.sha1()
  208. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  209. else:
  210. self._payloadDigester = None
  211. self._brokenPayloadDigester = None
  212. self._recordedPayloadDigest = None
  213. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  214. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  215. elif type(event) is WARCBlockChunk:
  216. if self._blockDigester:
  217. self._blockDigester.update(event.data)
  218. elif type(event) is HTTPBodyChunk:
  219. if self._payloadDigester:
  220. self._payloadDigester.update(event.data)
  221. elif type(event) is RawHTTPBodyChunk:
  222. if self._brokenPayloadDigester:
  223. self._brokenPayloadDigester.update(event.data)
  224. elif type(event) is EndOfRecord:
  225. if self._blockDigester and self._recordedBlockDigest:
  226. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  227. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  228. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  229. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  230. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  231. if not self._printedBrokenPayloadWarning:
  232. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  233. self._printedBrokenPayloadWarning = True
  234. else:
  235. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  236. class DumpResponsesMode(ProcessMode):
  237. @classmethod
  238. def split_args(cls, args):
  239. if args[0] == '-m' or args[0] == '--meta':
  240. return (True,), args[1:]
  241. return (False,), args
  242. def __init__(self, withMeta):
  243. self._printEOR = False
  244. self._isResponse = False
  245. self._withMeta = withMeta
  246. if withMeta:
  247. self._recordID = None
  248. self._targetURI = None
  249. self._buffer = b''
  250. def _write(self, data):
  251. if not self._withMeta:
  252. sys.stdout.buffer.write(data)
  253. return
  254. buf = self._buffer + data
  255. lines = buf.split(b'\n')
  256. self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well
  257. for line in lines:
  258. sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, '<' + self._targetURI + '>', '')).encode('utf-8'))
  259. sys.stdout.buffer.write(line)
  260. sys.stdout.buffer.write(b'\n')
  261. def process_event(self, event):
  262. if type(event) is NewFile:
  263. self._filename = event.filename
  264. if ':' in self._filename:
  265. self._filename = '<' + self._filename + '>'
  266. elif type(event) is BeginOfRecord:
  267. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  268. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  269. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  270. self._printEOR = False
  271. if self._withMeta:
  272. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  273. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  274. self._targetURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  275. self._buffer = b''
  276. elif type(event) is HTTPBodyChunk:
  277. if self._isResponse:
  278. self._printEOR = True
  279. self._write(event.data)
  280. elif type(event) is EndOfRecord:
  281. if self._printEOR:
  282. self._write(b'\r\n')
  283. class COLOURS:
  284. RESET = b'\x1b[0m'
  285. GREEN = b'\x1b[0;32m'
  286. LIGHTGREEN = b'\x1b[1;32m'
  287. PURPLE = b'\x1b[0;35m'
  288. LIGHTPURPLE = b'\x1b[1;35m'
  289. RED = b'\x1b[0;31m'
  290. INVERTED = b'\x1b[7m'
  291. class ColourMode(ProcessMode):
  292. def __init__(self):
  293. self._hadHttpStatusLine = False
  294. def _replace_esc(self, data):
  295. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  296. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  297. if colourOnlyBeforeColon:
  298. if b':' in line:
  299. offset = line.index(b':')
  300. else:
  301. offset = 0
  302. else:
  303. offset = len(line)
  304. if offset > 0:
  305. sys.stdout.buffer.write(colour)
  306. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  307. sys.stdout.buffer.write(COLOURS.RESET)
  308. sys.stdout.buffer.write(line[offset:])
  309. if withLF:
  310. sys.stdout.buffer.write(b'\n')
  311. def _print_data(self, data, colour, colourOnlyBeforeColon):
  312. later = False
  313. for line in data.split(b'\r\n'):
  314. if later:
  315. sys.stdout.buffer.write(b'\n')
  316. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  317. later = True
  318. def process_event(self, event):
  319. if type(event) is BeginOfRecord:
  320. firstNewline = event.rawData.index(b'\r\n')
  321. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  322. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  323. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  324. self._hadHttpStatusLine = False
  325. elif type(event) is WARCBlockChunk:
  326. if event.isHttpHeader is True:
  327. if not self._hadHttpStatusLine:
  328. firstNewline = event.data.index(b'\r\n')
  329. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  330. offset = firstNewline + 2
  331. self._hadHttpStatusLine = True
  332. else:
  333. offset = 0
  334. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  335. elif event.isHttpHeader is False:
  336. self._print_data(event.data, COLOURS.RED, False)
  337. elif event.isHttpHeader is None:
  338. sys.stdout.buffer.write(self._replace_esc(event.data))
  339. elif type(event) is EndOfRecord:
  340. sys.stdout.buffer.write(b'\n\n')
  341. def main():
  342. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
  343. assert len(sys.argv) - 1 >= 2
  344. mode = sys.argv[1]
  345. assert mode in processorMap
  346. processorArgs, files = processorMap[mode].split_args(sys.argv[2:])
  347. assert files
  348. processor = processorMap[mode](*processorArgs)
  349. try:
  350. for f in files:
  351. print('Info: processing {}'.format(f), file = sys.stderr)
  352. processor.process_event(NewFile(f))
  353. for event in iter_warc(f):
  354. processor.process_event(event)
  355. except BrokenPipeError:
  356. return
  357. if __name__ == '__main__':
  358. main()