diff --git a/warc-tiny b/warc-tiny
index 796519b..f5ccf75 100755
--- a/warc-tiny
+++ b/warc-tiny
@@ -7,14 +7,34 @@
 # With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123::: foobar'
 # The record offset may be -1 if it is not known.
 # The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
+# warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces lines of filename, record offset, record ID, record URI, link type, inline and linked flags, and URL as JSONL
+# With --urls, only the URL is printed.
+# wpull's scrapers are used for the extraction.
 # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
 
 import base64
 import gzip
 import hashlib
+import json
 import sys
+import tempfile
 import zlib
 
+try:
+	import wpull.body
+	import wpull.document.htmlparse.lxml_
+	try:
+		import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
+	except ImportError:
+		import wpull.http.request as wpull_protocol_http_request # wpull 1.x
+	import wpull.scraper.base
+	import wpull.scraper.css
+	import wpull.scraper.html
+	import wpull.scraper.javascript
+	import wpull.scraper.sitemap
+except ImportError:
+	wpull = None
+
 
 def GzipDecompressor():
 	return zlib.decompressobj(16 + zlib.MAX_WBITS)
@@ -52,6 +72,15 @@ class BeginOfRecord(Event):
 		return self._rawData
 
 
+class HTTPHeaders(Event):
+	def __init__(self, headers):
+		self._headers = headers
+
+	@property
+	def headers(self):
+		return self._headers
+
+
 class _DataChunk(Event):
 	def __init__(self, data):
 		self._data = data
@@ -153,6 +182,7 @@ def iter_warc(f):
 			gzipped = b'gzip' in transferEncodings
 
 			yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
+			yield HTTPHeaders(httpHeaderLines)
 			yield WARCBlockChunk(httpBody, isHttpHeader = False)
 			yield RawHTTPBodyChunk(httpBody)
 
@@ -404,8 +434,86 @@ class ColourMode(ProcessMode):
 		elif type(event) is EndOfRecord:
 			sys.stdout.buffer.write(b'\n\n')
 
+
+class ScrapeMode(ProcessMode):
+	@classmethod
+	def split_args(cls, args):
+		if args[0] == '-u' or args[0] == '--urls':
+			return (True,), args[1:]
+		return (False,), args
+
+	def __init__(self, urlsOnly):
+		self._urlsOnly = urlsOnly
+
+		assert wpull is not None, 'Scrape mode requires wpull'
+		htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
+		elementWalker = wpull.scraper.html.ElementWalker()
+		scrapers = []
+		scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
+		scrapers.append(wpull.scraper.css.CSSScraper())
+		elementWalker.css_scraper = scrapers[-1]
+		scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
+		elementWalker.javascript_scraper = scrapers[-1]
+		scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
+		self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)
+
+		self._isResponse = None
+		self._body = None
+		self._recordURI = None
+		self._statusCode = None
+		self._statusReason = None
+		if not self._urlsOnly:
+			self._filename = None
+			self._recordID = None
+
+	def process_event(self, event):
+		if type(event) is NewFile and not self._urlsOnly:
+			self._filename = event.filename
+		elif type(event) is BeginOfRecord:
+			warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
+			warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
+			self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
+			if self._isResponse:
+				self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
+			self._printEOR = False
+			if not self._urlsOnly:
+				# Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
+				self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
+				self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
+		elif type(event) is HTTPHeaders and self._isResponse:
+			assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
+			_, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
+			self._statusCode = int(statusCode)
+			self._statusReason = reason
+		elif type(event) is HTTPBodyChunk and self._isResponse:
+			self._body.write(event.data)
+		elif type(event) is EndOfRecord and self._isResponse:
+			request = wpull_protocol_http_request.Request(self._recordURI)
+			response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
+			response.body = self._body
+			response.body.seek(0)
+			for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
+				if not scrapeResult:
+					continue
+				for linkContext in scrapeResult.link_contexts:
+					if self._urlsOnly:
+						print(linkContext.link)
+						continue
+					o = {
+						'filename': self._filename,
+						'recordOffset': None,
+						'recordID': self._recordID,
+						'recordURI': self._recordURI,
+						'linkType': linkContext.link_type,
+						'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
+						'linked': bool(linkContext.linked),
+						'url': linkContext.link,
+					}
+					print(json.dumps(o))
+
+
 
 def main():
-	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
+	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}
 	assert len(sys.argv) - 1 >= 2
 	mode = sys.argv[1]
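
Usage sketch (not part of the patch; the file name, record ID, and URL values below are made up for illustration): once the diff is applied, the new mode can be invoked as

    ./warc-tiny scrape example.warc.gz

and, for every HTTP response record, it should print one JSON object per extracted link, roughly of the form

    {"filename": "example.warc.gz", "recordOffset": null, "recordID": "<urn:uuid:...>", "recordURI": "https://example.org/", "linkType": "css", "inline": true, "linked": false, "url": "https://example.org/style.css"}

With -u/--urls, only the bare URL is printed on each line instead of the JSON object. The record offset is always null here since this code does not track offsets.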