Browse Source

Don't write responses that triggered an exception to the WARC

For example, if the connection breaks while retrieving a response but after the headers have been parsed, the response body would be incomplete.
tags/v0.2.2
JustAnotherArchivist 4 years ago
parent
commit
37dbcfad21
1 changed file with 4 additions and 3 deletions
  1. +4
    -3
      qwarc/utils.py

+ 4
- 3
qwarc/utils.py View File

@@ -153,9 +153,10 @@ async def handle_response_default(url, attempt, response, exc):
return ACTION_RETRY, True
if response.status in (401, 403, 404, 405, 410):
return ACTION_IGNORE, True
- if exc is not None and isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
- await asyncio.sleep(5)
- return ACTION_RETRY, True
+ if exc is not None:
+ if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
+ await asyncio.sleep(5)
+ return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
if response.status in (200, 204, 206, 304):
return ACTION_SUCCESS, True
if response.status in (301, 302, 303, 307, 308):


Loading…
Cancel
Save