diff --git a/qwarc/__init__.py b/qwarc/__init__.py index 9e53a96..e15d2b0 100644 --- a/qwarc/__init__.py +++ b/qwarc/__init__.py @@ -116,10 +116,10 @@ class Item: self.stats['requests'] += 1 except (asyncio.TimeoutError, _aiohttp.ClientError) as e: self.logger.warning(f'Request for {url} failed: {e!r}') - action, writeToWarc = await responseHandler(url, attempt, response, e) + action, writeToWarc = await responseHandler(url, attempt, response, e, self) exc = e # Pass the exception outward for the history else: - action, writeToWarc = await responseHandler(url, attempt, response, None) + action, writeToWarc = await responseHandler(url, attempt, response, None, self) if response and exc is None and writeToWarc: self.warc.write_client_response(response) history.append((response, exc)) diff --git a/qwarc/utils.py b/qwarc/utils.py index d217fd6..29599f5 100644 --- a/qwarc/utils.py +++ b/qwarc/utils.py @@ -127,7 +127,7 @@ def generate_range_items(start, stop, step): yield f'{i}-{min(i + step - 1, stop)}' -async def handle_response_default(url, attempt, response, exc): +async def handle_response_default(url, attempt, response, exc, item): ''' The default response handler, which behaves as follows: - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds. @@ -141,7 +141,7 @@ async def handle_response_default(url, attempt, response, exc): Note that this handler does not limit the number of retries on errors. - Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None) + Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), item (qwarc.Item instance) At least one of response and exc is not None. Returns: (one of the qwarc.RESPONSE_* constants, bool signifying whether to write to WARC or not) The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC. @@ -166,10 +166,10 @@ async def handle_response_default(url, attempt, response, exc): return ACTION_RETRY, True -async def handle_response_ignore_redirects(url, attempt, response, exc): +async def handle_response_ignore_redirects(url, attempt, response, exc, item): '''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.''' - action, writeToWarc = await handle_response_default(url, attempt, response, exc) + action, writeToWarc = await handle_response_default(url, attempt, response, exc, item) if action == ACTION_FOLLOW_OR_SUCCESS: action = ACTION_SUCCESS return action, writeToWarc @@ -183,8 +183,8 @@ def handle_response_limit_error_retries(maxRetries, handler = handle_response_de If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time. ''' - async def _handler(url, attempt, response, exc): - action, writeToWarc = await handler(url, attempt, response, exc) + async def _handler(url, attempt, response, exc, item): + action, writeToWarc = await handler(url, attempt, response, exc, item) if action == ACTION_RETRY and attempt > maxRetries: action = ACTION_RETRIES_EXCEEDED return action, writeToWarc