
Add warc-tiny scrape command for parsing HTTP responses using wpull and extracting links

master
JustAnotherArchivist, 3 years ago
parent commit 491a80a04b
1 changed file with 109 additions and 1 deletion
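For orientation, here is a minimal standalone sketch of the scraper wiring this commit introduces: wpull's HTML, CSS, JavaScript, and sitemap scrapers are assembled behind a DemuxDocumentScraper and fed a reconstructed request/response pair, which is what the new ScrapeMode does per WARC record below. The sketch assumes the wpull 2.x import paths (the commit itself also falls back to the 1.x ones); the URL, status line, and body bytes are made-up inputs.

import tempfile

import wpull.body
import wpull.document.htmlparse.lxml_
import wpull.protocol.http.request as wpull_http_request  # wpull 2.x layout; wpull 1.x uses wpull.http.request
import wpull.scraper.base
import wpull.scraper.css
import wpull.scraper.html
import wpull.scraper.javascript
import wpull.scraper.sitemap


def build_scraper():
	# Mirror the scraper stack that ScrapeMode.__init__ assembles below
	htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
	elementWalker = wpull.scraper.html.ElementWalker()
	cssScraper = wpull.scraper.css.CSSScraper()
	jsScraper = wpull.scraper.javascript.JavaScriptScraper()
	elementWalker.css_scraper = cssScraper
	elementWalker.javascript_scraper = jsScraper
	scrapers = [
		wpull.scraper.html.HTMLScraper(htmlParser, elementWalker),
		cssScraper,
		jsScraper,
		wpull.scraper.sitemap.SitemapScraper(htmlParser),
	]
	return wpull.scraper.base.DemuxDocumentScraper(scrapers)


def scrape_links(url, statusCode, reason, bodyBytes):
	# Reconstruct a minimal request/response pair and yield every link the scrapers find
	request = wpull_http_request.Request(url)
	response = wpull_http_request.Response(statusCode, reason)
	response.body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760))
	response.body.write(bodyBytes)
	response.body.seek(0)
	for _scraper, scrapeResult in build_scraper().scrape_info(request, response).items():
		if not scrapeResult:
			continue
		for linkContext in scrapeResult.link_contexts:
			yield linkContext.link


if __name__ == '__main__':
	for link in scrape_links('https://example.org/', 200, 'OK', b'<html><body><a href="/page">example</a></body></html>'):
		print(link)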
warc-tiny

@@ -7,14 +7,34 @@
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
# The record offset may be -1 if it is not known.
# The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
# warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces lines of filename, record offset, record ID, record URI, link type, inline and linked flags, and URL as JSONL
# With --urls, only the URL is printed.
# wpull's scrapers are used for the extraction.
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
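# (Editorial note, not part of this commit: for illustration, 'warc-tiny scrape file.warc.gz' would emit one JSON object per extracted link, roughly like
#   {"filename": "file.warc.gz", "recordOffset": null, "recordID": "<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>", "recordURI": "https://example.org/", "linkType": "html", "inline": false, "linked": true, "url": "https://example.org/page"}
# while 'warc-tiny scrape --urls file.warc.gz' would print only 'https://example.org/page'; the exact linkType values come from wpull's scrapers.)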

import base64
import gzip
import hashlib
import json
import sys
import tempfile
import zlib

try:
	import wpull.body
	import wpull.document.htmlparse.lxml_
	try:
		import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
	except ImportError:
		import wpull.http.request as wpull_protocol_http_request # wpull 1.x
	import wpull.scraper.base
	import wpull.scraper.css
	import wpull.scraper.html
	import wpull.scraper.javascript
	import wpull.scraper.sitemap
except ImportError:
	wpull = None


def GzipDecompressor():
	return zlib.decompressobj(16 + zlib.MAX_WBITS)
@@ -52,6 +72,15 @@ class BeginOfRecord(Event):
		return self._rawData


class HTTPHeaders(Event):
	def __init__(self, headers):
		self._headers = headers

	@property
	def headers(self):
		return self._headers


class _DataChunk(Event):
	def __init__(self, data):
		self._data = data
@@ -153,6 +182,7 @@ def iter_warc(f):
			gzipped = b'gzip' in transferEncodings

			yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
			yield HTTPHeaders(httpHeaderLines)
			yield WARCBlockChunk(httpBody, isHttpHeader = False)
			yield RawHTTPBodyChunk(httpBody)

@@ -404,8 +434,86 @@ class ColourMode(ProcessMode):
		elif type(event) is EndOfRecord:
			sys.stdout.buffer.write(b'\n\n')


class ScrapeMode(ProcessMode):
	@classmethod
	def split_args(cls, args):
		if args[0] == '-u' or args[0] == '--urls':
			return (True,), args[1:]
		return (False,), args

	def __init__(self, urlsOnly):
		self._urlsOnly = urlsOnly

		assert wpull is not None, 'Scrape mode requires wpull'
		htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
		elementWalker = wpull.scraper.html.ElementWalker()
		scrapers = []
		scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
		scrapers.append(wpull.scraper.css.CSSScraper())
		elementWalker.css_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
		elementWalker.javascript_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
		self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)

		self._isResponse = None
		self._body = None
		self._recordURI = None
		self._statusCode = None
		self._statusReason = None
		if not self._urlsOnly:
			self._filename = None
			self._recordID = None

	def process_event(self, event):
		if type(event) is NewFile and not self._urlsOnly:
			self._filename = event.filename
		elif type(event) is BeginOfRecord:
			warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
			warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
			self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
			if self._isResponse:
				self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
			self._printEOR = False
			if not self._urlsOnly:
				# Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
				self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
				self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
		elif type(event) is HTTPHeaders and self._isResponse:
			assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
			_, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
			self._statusCode = int(statusCode)
			self._statusReason = reason
		elif type(event) is HTTPBodyChunk and self._isResponse:
			self._body.write(event.data)
		elif type(event) is EndOfRecord and self._isResponse:
			request = wpull_protocol_http_request.Request(self._recordURI)
			response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
			response.body = self._body
			response.body.seek(0)
			for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
				if not scrapeResult:
					continue
				for linkContext in scrapeResult.link_contexts:
					if self._urlsOnly:
						print(linkContext.link)
						continue
					o = {
						'filename': self._filename,
						'recordOffset': None,
						'recordID': self._recordID,
						'recordURI': self._recordURI,
						'linkType': linkContext.link_type,
						'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
						'linked': bool(linkContext.linked),
						'url': linkContext.link,
					}
					print(json.dumps(o))


def main():
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}

	assert len(sys.argv) - 1 >= 2
	mode = sys.argv[1]
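As a usage note, the JSONL that the new scrape mode emits is easy to post-process. A small sketch, assuming the output was redirected to a hypothetical links.jsonl file and using only the keys defined in ScrapeMode above:

import json

# Group outlinks (links with inline == false) by the record URI they were found on.
# links.jsonl is assumed to come from: warc-tiny scrape file.warc.gz > links.jsonl
outlinks = {}
with open('links.jsonl', 'r') as fp:
	for line in fp:
		record = json.loads(line)
		if record['inline']:
			continue  # skip page requisites (stylesheets, scripts, images, ...)
		outlinks.setdefault(record['recordURI'], set()).add(record['url'])

for uri, urls in sorted(outlinks.items()):
	print(f'{uri}: {len(urls)} outlinks')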

