From f8f5258197a993a1369634eb7fe925e146d67cfe Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 11 Jul 2020 23:48:25 +0000 Subject: [PATCH] Track redirect depth --- qwarc/__init__.py | 7 ++++--- qwarc/utils.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/qwarc/__init__.py b/qwarc/__init__.py index 7368e7e..0b5c260 100644 --- a/qwarc/__init__.py +++ b/qwarc/__init__.py @@ -106,7 +106,7 @@ class Item: headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else []) history = [] attempt = 0 - #TODO redirectLevel + redirectLevel = 0 while True: attempt += 1 response = None @@ -138,10 +138,10 @@ class Item: self.stats['requests'] += 1 except (asyncio.TimeoutError, _aiohttp.ClientError) as e: self.logger.warning(f'Request for {url} failed: {e!r}') - action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, item = self) + action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, redirectLevel = redirectLevel, item = self) exc = e # Pass the exception outward for the history else: - action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, item = self) + action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self) if response and exc is None and writeToWarc: self.warc.write_client_response(response) history.append((response, exc)) @@ -159,6 +159,7 @@ class Item: method = 'GET' data = None attempt = 0 + redirectLevel += 1 elif action == ACTION_RETRIES_EXCEEDED: self.logger.error(f'Request for {url} failed {attempt} times') retResponse.qhistory = tuple(history) diff --git a/qwarc/utils.py b/qwarc/utils.py index 55ac36f..2db713d 100644 --- a/qwarc/utils.py +++ b/qwarc/utils.py @@ -127,7 +127,7 @@ def generate_range_items(start, stop, step): yield f'{i}-{min(i + step - 1, stop)}' -async def handle_response_default(*, url, attempt, response, exc, item): +async def handle_response_default(*, url, attempt, response, exc, redirectLevel, item): ''' The default response handler, which behaves as follows: - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds. @@ -141,14 +141,14 @@ async def handle_response_default(*, url, attempt, response, exc, item): Note that this handler does not limit the number of retries on errors. - Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), item (qwarc.Item instance) + Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), redirectLevel (int), item (qwarc.Item instance) At least one of response and exc is not None. + The redirectLevel indicates how many redirects were followed to get to this url, i.e. it starts out as zero and increases by one for every redirect. + The attempt starts from 1 for every url, i.e. it is reset on redirects. The handler is invoked at most once for each attempt. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not) The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC. ''' - #TODO: Document that `attempt` is reset on redirects - if response is None: await asyncio.sleep(5) return ACTION_RETRY, True