@@ -106,7 +106,7 @@ class Item: | |||||
headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else []) | headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else []) | ||||
history = [] | history = [] | ||||
attempt = 0 | attempt = 0 | ||||
#TODO redirectLevel | |||||
redirectLevel = 0 | |||||
while True: | while True: | ||||
attempt += 1 | attempt += 1 | ||||
response = None | response = None | ||||
@@ -138,10 +138,10 @@ class Item: | |||||
self.stats['requests'] += 1 | self.stats['requests'] += 1 | ||||
except (asyncio.TimeoutError, _aiohttp.ClientError) as e: | except (asyncio.TimeoutError, _aiohttp.ClientError) as e: | ||||
self.logger.warning(f'Request for {url} failed: {e!r}') | self.logger.warning(f'Request for {url} failed: {e!r}') | ||||
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, item = self) | |||||
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, redirectLevel = redirectLevel, item = self) | |||||
exc = e # Pass the exception outward for the history | exc = e # Pass the exception outward for the history | ||||
else: | else: | ||||
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, item = self) | |||||
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self) | |||||
if response and exc is None and writeToWarc: | if response and exc is None and writeToWarc: | ||||
self.warc.write_client_response(response) | self.warc.write_client_response(response) | ||||
history.append((response, exc)) | history.append((response, exc)) | ||||
@@ -159,6 +159,7 @@ class Item: | |||||
method = 'GET' | method = 'GET' | ||||
data = None | data = None | ||||
attempt = 0 | attempt = 0 | ||||
redirectLevel += 1 | |||||
elif action == ACTION_RETRIES_EXCEEDED: | elif action == ACTION_RETRIES_EXCEEDED: | ||||
self.logger.error(f'Request for {url} failed {attempt} times') | self.logger.error(f'Request for {url} failed {attempt} times') | ||||
retResponse.qhistory = tuple(history) | retResponse.qhistory = tuple(history) | ||||
@@ -127,7 +127,7 @@ def generate_range_items(start, stop, step): | |||||
yield f'{i}-{min(i + step - 1, stop)}' | yield f'{i}-{min(i + step - 1, stop)}' | ||||
async def handle_response_default(*, url, attempt, response, exc, item): | |||||
async def handle_response_default(*, url, attempt, response, exc, redirectLevel, item): | |||||
''' | ''' | ||||
The default response handler, which behaves as follows: | The default response handler, which behaves as follows: | ||||
- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds. | - If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds. | ||||
@@ -141,14 +141,14 @@ async def handle_response_default(*, url, attempt, response, exc, item): | |||||
Note that this handler does not limit the number of retries on errors. | Note that this handler does not limit the number of retries on errors. | ||||
Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), item (qwarc.Item instance) | |||||
Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), redirectLevel (int), item (qwarc.Item instance) | |||||
At least one of response and exc is not None. | At least one of response and exc is not None. | ||||
The redirectLevel indicates how many redirects were followed to get to this url, i.e. it starts out as zero and increases by one for every redirect. | |||||
The attempt starts from 1 for every url, i.e. it is reset on redirects. The handler is invoked at most once for each attempt. | |||||
Returns: (one of the ACTION_* constants, bool signifying whether to write to WARC or not) | Returns: (one of the ACTION_* constants, bool signifying whether to write to WARC or not) | ||||
The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC. | The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC. | ||||
''' | ''' | ||||
#TODO: Document that `attempt` is reset on redirects | |||||
if response is None: | if response is None: | ||||
await asyncio.sleep(5) | await asyncio.sleep(5) | ||||
return ACTION_RETRY, True | return ACTION_RETRY, True | ||||