
Add warc-tiny scrape command for parsing HTTP responses using wpull and extracting links

master
JustAnotherArchivist, 3 years ago
parent commit 491a80a04b
1 changed file with 109 additions and 1 deletion
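For orientation, here is a minimal standalone sketch of the scraper wiring this commit introduces: wpull's HTML, CSS, JavaScript, and sitemap scrapers are assembled behind a DemuxDocumentScraper and fed a reconstructed request/response pair, which is what the new ScrapeMode does per WARC record below. The sketch assumes the wpull 2.x import paths (the commit itself also falls back to the 1.x ones); the URL, status line, and body bytes are made-up inputs.

import tempfile

import wpull.body
import wpull.document.htmlparse.lxml_
import wpull.protocol.http.request as wpull_http_request  # wpull 2.x layout; wpull 1.x uses wpull.http.request
import wpull.scraper.base
import wpull.scraper.css
import wpull.scraper.html
import wpull.scraper.javascript
import wpull.scraper.sitemap


def build_scraper():
	# Mirror the scraper stack that ScrapeMode.__init__ assembles below
	htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
	elementWalker = wpull.scraper.html.ElementWalker()
	cssScraper = wpull.scraper.css.CSSScraper()
	jsScraper = wpull.scraper.javascript.JavaScriptScraper()
	elementWalker.css_scraper = cssScraper
	elementWalker.javascript_scraper = jsScraper
	scrapers = [
		wpull.scraper.html.HTMLScraper(htmlParser, elementWalker),
		cssScraper,
		jsScraper,
		wpull.scraper.sitemap.SitemapScraper(htmlParser),
	]
	return wpull.scraper.base.DemuxDocumentScraper(scrapers)


def scrape_links(url, statusCode, reason, bodyBytes):
	# Reconstruct a minimal request/response pair and yield every link the scrapers find
	request = wpull_http_request.Request(url)
	response = wpull_http_request.Response(statusCode, reason)
	response.body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760))
	response.body.write(bodyBytes)
	response.body.seek(0)
	for _scraper, scrapeResult in build_scraper().scrape_info(request, response).items():
		if not scrapeResult:
			continue
		for linkContext in scrapeResult.link_contexts:
			yield linkContext.link


if __name__ == '__main__':
	for link in scrape_links('https://example.org/', 200, 'OK', b'<html><body><a href="/page">example</a></body></html>'):
		print(link)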
warc-tiny

@@ -7,14 +7,34 @@
# With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
# The record offset may be -1 if it is not known.
# The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
# warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces lines of filename, record offset, record ID, record URI, link type, inline and linked flags, and URL as JSONL
# With --urls, only the URL is printed.
# wpull's scrapers are used for the extraction.
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
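# (Editorial note, not part of this commit: for illustration, 'warc-tiny scrape file.warc.gz' would emit one JSON object per extracted link, roughly like
#   {"filename": "file.warc.gz", "recordOffset": null, "recordID": "<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>", "recordURI": "https://example.org/", "linkType": "html", "inline": false, "linked": true, "url": "https://example.org/page"}
# while 'warc-tiny scrape --urls file.warc.gz' would print only 'https://example.org/page'; the exact linkType values come from wpull's scrapers.)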

import base64
import gzip
import hashlib
import json
import sys
import tempfile
import zlib

try:
	import wpull.body
	import wpull.document.htmlparse.lxml_
	try:
		import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
	except ImportError:
		import wpull.http.request as wpull_protocol_http_request # wpull 1.x
	import wpull.scraper.base
	import wpull.scraper.css
	import wpull.scraper.html
	import wpull.scraper.javascript
	import wpull.scraper.sitemap
except ImportError:
	wpull = None


def GzipDecompressor():
	return zlib.decompressobj(16 + zlib.MAX_WBITS)
@@ -52,6 +72,15 @@ class BeginOfRecord(Event):
		return self._rawData


class HTTPHeaders(Event):
	def __init__(self, headers):
		self._headers = headers

	@property
	def headers(self):
		return self._headers


class _DataChunk(Event):
	def __init__(self, data):
		self._data = data
@@ -153,6 +182,7 @@ def iter_warc(f):
			gzipped = b'gzip' in transferEncodings

			yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
			yield HTTPHeaders(httpHeaderLines)
			yield WARCBlockChunk(httpBody, isHttpHeader = False)
			yield RawHTTPBodyChunk(httpBody)

@@ -404,8 +434,86 @@ class ColourMode(ProcessMode):
		elif type(event) is EndOfRecord:
			sys.stdout.buffer.write(b'\n\n')


class ScrapeMode(ProcessMode):
	@classmethod
	def split_args(cls, args):
		if args[0] == '-u' or args[0] == '--urls':
			return (True,), args[1:]
		return (False,), args

	def __init__(self, urlsOnly):
		self._urlsOnly = urlsOnly

		assert wpull is not None, 'Scrape mode requires wpull'
		htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
		elementWalker = wpull.scraper.html.ElementWalker()
		scrapers = []
		scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
		scrapers.append(wpull.scraper.css.CSSScraper())
		elementWalker.css_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
		elementWalker.javascript_scraper = scrapers[-1]
		scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
		self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)

		self._isResponse = None
		self._body = None
		self._recordURI = None
		self._statusCode = None
		self._statusReason = None
		if not self._urlsOnly:
			self._filename = None
			self._recordID = None

	def process_event(self, event):
		if type(event) is NewFile and not self._urlsOnly:
			self._filename = event.filename
		elif type(event) is BeginOfRecord:
			warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
			warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
			self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
			if self._isResponse:
				self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
			self._printEOR = False
			if not self._urlsOnly:
				# Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
				self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
				self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
		elif type(event) is HTTPHeaders and self._isResponse:
			assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
			_, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
			self._statusCode = int(statusCode)
			self._statusReason = reason
		elif type(event) is HTTPBodyChunk and self._isResponse:
			self._body.write(event.data)
		elif type(event) is EndOfRecord and self._isResponse:
			request = wpull_protocol_http_request.Request(self._recordURI)
			response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
			response.body = self._body
			response.body.seek(0)
			for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
				if not scrapeResult:
					continue
				for linkContext in scrapeResult.link_contexts:
					if self._urlsOnly:
						print(linkContext.link)
						continue
					o = {
						'filename': self._filename,
						'recordOffset': None,
						'recordID': self._recordID,
						'recordURI': self._recordURI,
						'linkType': linkContext.link_type,
						'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
						'linked': bool(linkContext.linked),
						'url': linkContext.link,
					}
					print(json.dumps(o))


def main():
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
	processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}

	assert len(sys.argv) - 1 >= 2
	mode = sys.argv[1]
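As a usage note, the JSONL that the new scrape mode emits is easy to post-process. A small sketch, assuming the output was redirected to a hypothetical links.jsonl file and using only the keys defined in ScrapeMode above:

import json

# Group outlinks (links with inline == false) by the record URI they were found on.
# links.jsonl is assumed to come from: warc-tiny scrape file.warc.gz > links.jsonl
outlinks = {}
with open('links.jsonl', 'r') as fp:
	for line in fp:
		record = json.loads(line)
		if record['inline']:
			continue  # skip page requisites (stylesheets, scripts, images, ...)
		outlinks.setdefault(record['recordURI'], set()).add(record['url'])

for uri, urls in sorted(outlinks.items()):
	print(f'{uri}: {len(urls)} outlinks')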

