From 59ae1183d2630ee708f6ec47dd959c7e8e8e8438 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 11 Jul 2020 21:26:28 +0000 Subject: [PATCH] Add fromResponse parameter for URL completion and automatic Referer header --- qwarc/__init__.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/qwarc/__init__.py b/qwarc/__init__.py index 34b2316..22e984a 100644 --- a/qwarc/__init__.py +++ b/qwarc/__init__.py @@ -49,12 +49,15 @@ class Item: else: self._baseUrl = yarl.URL(baseUrl) - def _merge_headers(self, headers): + def _merge_headers(self, headers, extraHeaders = []): d = {} # Preserves order from Python 3.7 (guaranteed) or CPython 3.6 (implementation detail) keys = {} # casefolded key -> d key for key, value in self.headers: d[key] = value keys[key.casefold()] = key + for key, value in extraHeaders: + d[key] = value + keys[key.casefold()] = key for key, value in headers: keyc = key.casefold() if value is None: @@ -75,7 +78,7 @@ class Item: out.append((key, value)) return out - async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60): + async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60, fromResponse = None): ''' HTTP GET or POST a URL @@ -88,6 +91,7 @@ class Item: If a header appears multiple times, only the last one is used. To send a header multiple times, pass a tuple of values. verify_ssl: bool, whether the SSL/TLS certificate should be validated timeout: int or float, how long the fetch may take at most in total (sending request until finishing reading the response) + fromResponse: ClientResponse or None; if provided, use fromResponse.url for the url completion (instead of self.baseUrl) and add it as a Referer header Returns response (a ClientResponse object or a qwarc.utils.DummyClientResponse object) ''' @@ -96,13 +100,16 @@ class Item: url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc. if not url.scheme or not url.host: - if not self.baseUrl: + if fromResponse is not None: + url = fromResponse.url.join(url) + elif not self.baseUrl: raise ValueError('Incomplete URL and no baseUrl to join it with') - url = self.baseUrl.join(url) + else: + url = self.baseUrl.join(url) if responseHandler is None: responseHandler = self.defaultResponseHandler assert method in ('GET', 'POST'), 'method must be GET or POST' - headers = self._merge_headers(headers) + headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else []) history = [] attempt = 0 #TODO redirectLevel