|
|
@@ -7,14 +7,34 @@ |
|
|
|
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
# The record offset may be -1 if it is not known.
# The filename is wrapped in angle brackets if it contains a colon; the target URI is always wrapped in angle brackets (since it virtually always contains a colon).
# warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces JSONL objects with the filename, record offset, record ID, record URI, link type, inline and linked flags, and URL
# With --urls, only the URL is printed.
# wpull's scrapers are used for the extraction.
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
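# Example invocations (file names are illustrative):
#   warc-tiny scrape --urls crawl.warc.gz   # print just the discovered URLs
#   warc-tiny scrape crawl.warc.gz          # one JSON object per discovered link
#   warc-tiny verify crawl.warc.gz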
|
|
|
|
|
|
|
import base64
import gzip
import hashlib
import json
import sys
import tempfile
import zlib
try:
	import wpull.body
	import wpull.document.htmlparse.lxml_
	try:
		import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
	except ImportError:
		import wpull.http.request as wpull_protocol_http_request # wpull 1.x
	import wpull.scraper.base
	import wpull.scraper.css
	import wpull.scraper.html
	import wpull.scraper.javascript
	import wpull.scraper.sitemap
except ImportError:
	wpull = None
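# wpull is an optional dependency: it is only needed for scrape mode, which
# asserts below (in ScrapeMode.__init__) that the import succeeded.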
|
|
|
|
|
|
|
|
|
|
|
def GzipDecompressor():
	return zlib.decompressobj(16 + zlib.MAX_WBITS)
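# wbits = 16 + zlib.MAX_WBITS makes zlib expect a gzip header and trailer
# instead of a raw zlib stream. Minimal usage sketch (names are illustrative):
#   d = GzipDecompressor()
#   plainData = d.decompress(compressedChunk) + d.flush()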
|
|
@@ -52,6 +72,15 @@ class BeginOfRecord(Event): |
|
|
|
		return self._rawData
|
|
|
|
|
|
|
|
|
|
|
class HTTPHeaders(Event):
	def __init__(self, headers):
		self._headers = headers

	@property
	def headers(self):
		return self._headers


class _DataChunk(Event):
	def __init__(self, data):
		self._data = data
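# _DataChunk just wraps a payload; the concrete chunk events used further down
# (WARCBlockChunk, HTTPBodyChunk, RawHTTPBodyChunk) presumably derive from it
# and expose the payload as a .data property (ScrapeMode reads event.data).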
|
|
@@ -153,6 +182,7 @@ def iter_warc(f): |
|
|
|
gzipped = b'gzip' in transferEncodings

yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
yield HTTPHeaders(httpHeaderLines)
yield WARCBlockChunk(httpBody, isHttpHeader = False)
yield RawHTTPBodyChunk(httpBody)
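# So for an HTTP response record, consumers see the block both as raw bytes
# (WARCBlockChunk, split into header and body parts) and in parsed form
# (HTTPHeaders plus body chunks). ScrapeMode below consumes HTTPBodyChunk,
# which presumably carries the body with the transfer encoding already undone,
# whereas RawHTTPBodyChunk is the body exactly as stored in the record.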
|
|
|
|
|
|
@@ -404,8 +434,86 @@ class ColourMode(ProcessMode): |
|
|
|
		elif type(event) is EndOfRecord:
			sys.stdout.buffer.write(b'\n\n')
|
|
|
|
|
|
|
|
|
|
|
class ScrapeMode(ProcessMode):
	@classmethod
	def split_args(cls, args):
		if args[0] == '-u' or args[0] == '--urls':
			return (True,), args[1:]
		return (False,), args
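	# split_args peels mode-specific flags off the front of the argument list
	# and returns them separately from the remaining (file) arguments; the
	# flags tuple presumably ends up as the constructor argument (urlsOnly).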
|
|
|
|
|
|
|
	def __init__(self, urlsOnly):
		self._urlsOnly = urlsOnly

		assert wpull is not None, 'Scrape mode requires wpull'
		htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
		elementWalker = wpull.scraper.html.ElementWalker()
		scrapers = []
		scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
		scrapers.append(wpull.scraper.css.CSSScraper())
		elementWalker.css_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
		elementWalker.javascript_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
		self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)
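		# The demux scraper passes each response to every scraper registered
		# above and returns their results as a mapping of scraper -> result
		# (consumed in the EndOfRecord branch of process_event below).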
|
|
|
|
|
|
|
		self._isResponse = None
		self._body = None
		self._recordURI = None
		self._statusCode = None
		self._statusReason = None
		if not self._urlsOnly:
			self._filename = None
			self._recordID = None
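		# The attributes above are scratch state that process_event fills in as
		# events arrive; _filename and _recordID are only needed for the full
		# JSONL output, i.e. when --urls is not given.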
|
|
|
|
|
|
|
	def process_event(self, event):
		if type(event) is NewFile and not self._urlsOnly:
			self._filename = event.filename
		elif type(event) is BeginOfRecord:
			warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
			warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
			self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
			if self._isResponse:
				self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
			self._printEOR = False
			if not self._urlsOnly:
				# Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
				self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
				self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
		elif type(event) is HTTPHeaders and self._isResponse:
			assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
			_, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
			self._statusCode = int(statusCode)
			self._statusReason = reason
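			# This assumes a status line of the form 'HTTP/x.y CODE REASON'; a
			# status line without any reason phrase would make the split above
			# raise ValueError.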
|
|
|
		elif type(event) is HTTPBodyChunk and self._isResponse:
			self._body.write(event.data)
		elif type(event) is EndOfRecord and self._isResponse:
			request = wpull_protocol_http_request.Request(self._recordURI)
			response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
			response.body = self._body
			response.body.seek(0)
			for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
				if not scrapeResult:
					continue
				for linkContext in scrapeResult.link_contexts:
					if self._urlsOnly:
						print(linkContext.link)
						continue
					o = {
						'filename': self._filename,
						'recordOffset': None,
						'recordID': self._recordID,
						'recordURI': self._recordURI,
						'linkType': linkContext.link_type,
						'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
						'linked': bool(linkContext.linked),
						'url': linkContext.link,
					}
					print(json.dumps(o))
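# Rough sketch of how a mode instance is driven (illustrative only; opening
# files, gzip handling, and NewFile events are handled by code outside this
# hunk):
#   processor = ScrapeMode(urlsOnly = True)
#   with open('crawl.warc', 'rb') as fp:  # hypothetical, uncompressed WARC
#       for event in iter_warc(fp):
#           processor.process_event(event)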
|
|
|
|
|
|
|
|
|
|
|
def main():
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}

	assert len(sys.argv) - 1 >= 2
	mode = sys.argv[1]
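	# sys.argv[1] picks the processor class out of processorMap; everything
	# after it (flags and WARC file names) is presumably passed on to that
	# class's split_args.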
|
|
|