Преглед на файлове

Add fromResponse parameter for URL completion and automatic Referer header

master
JustAnotherArchivist преди 3 години
родител
ревизия
59ae1183d2
променен е 1 файл, в който са добавени 12 реда и са изтрити 5 реда
  1. qwarc/__init__.py (+12, -5)

+ 12
- 5
qwarc/__init__.py Целия файл

@@ -49,12 +49,15 @@ class Item:
else: else:
self._baseUrl = yarl.URL(baseUrl) self._baseUrl = yarl.URL(baseUrl)


def _merge_headers(self, headers):
def _merge_headers(self, headers, extraHeaders = []):
d = {} # Preserves order from Python 3.7 (guaranteed) or CPython 3.6 (implementation detail) d = {} # Preserves order from Python 3.7 (guaranteed) or CPython 3.6 (implementation detail)
keys = {} # casefolded key -> d key keys = {} # casefolded key -> d key
for key, value in self.headers: for key, value in self.headers:
d[key] = value d[key] = value
keys[key.casefold()] = key keys[key.casefold()] = key
for key, value in extraHeaders:
d[key] = value
keys[key.casefold()] = key
for key, value in headers: for key, value in headers:
keyc = key.casefold() keyc = key.casefold()
if value is None: if value is None:
@@ -75,7 +78,7 @@ class Item:
out.append((key, value)) out.append((key, value))
return out return out


async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60):
async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60, fromResponse = None):
''' '''
HTTP GET or POST a URL HTTP GET or POST a URL


@@ -88,6 +91,7 @@ class Item:
If a header appears multiple times, only the last one is used. To send a header multiple times, pass a tuple of values. If a header appears multiple times, only the last one is used. To send a header multiple times, pass a tuple of values.
verify_ssl: bool, whether the SSL/TLS certificate should be validated verify_ssl: bool, whether the SSL/TLS certificate should be validated
timeout: int or float, how long the fetch may take at most in total (sending request until finishing reading the response) timeout: int or float, how long the fetch may take at most in total (sending request until finishing reading the response)
fromResponse: ClientResponse or None; if provided, use fromResponse.url for the url completion (instead of self.baseUrl) and add it as a Referer header


Returns response (a ClientResponse object or a qwarc.utils.DummyClientResponse object) Returns response (a ClientResponse object or a qwarc.utils.DummyClientResponse object)
''' '''
@@ -96,13 +100,16 @@ class Item:


url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc. url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
if not url.scheme or not url.host: if not url.scheme or not url.host:
if not self.baseUrl:
if fromResponse is not None:
url = fromResponse.url.join(url)
elif not self.baseUrl:
raise ValueError('Incomplete URL and no baseUrl to join it with') raise ValueError('Incomplete URL and no baseUrl to join it with')
url = self.baseUrl.join(url)
else:
url = self.baseUrl.join(url)
if responseHandler is None: if responseHandler is None:
responseHandler = self.defaultResponseHandler responseHandler = self.defaultResponseHandler
assert method in ('GET', 'POST'), 'method must be GET or POST' assert method in ('GET', 'POST'), 'method must be GET or POST'
headers = self._merge_headers(headers)
headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else [])
history = [] history = []
attempt = 0 attempt = 0
#TODO redirectLevel #TODO redirectLevel


Зареждане…
Отказ
Запис