Browse Source

Handle redirect traps/loops

master
JustAnotherArchivist 3 years ago
parent
commit
15203bd991
3 changed files with 24 additions and 0 deletions
  1. +5
    -0
      qwarc/__init__.py
  2. +3
    -0
      qwarc/const.py
  3. +16
    -0
      qwarc/utils.py

+ 5
- 0
qwarc/__init__.py View File

@@ -100,6 +100,7 @@ class Item:
raise ValueError('Incomplete URL and no baseUrl to join it with')
else:
url = self.baseUrl.join(url)
originalUrl = url
if responseHandler is None:
responseHandler = self.defaultResponseHandler
assert method in ('GET', 'POST'), 'method must be GET or POST'
@@ -164,6 +165,10 @@ class Item:
self.logger.error(f'Request for {url} failed {attempt} times')
retResponse.qhistory = tuple(history)
return retResponse
elif action == ACTION_TOO_MANY_REDIRECTS:
self.logger.error(f'Request for {url} (from {originalUrl}) exceeded redirect limit')
retResponse.qhistory = tuple(history)
return retResponse
elif action == ACTION_RETRY:
# Nothing to do, just go to the next cycle
pass


+ 3
- 0
qwarc/const.py View File

@@ -26,6 +26,9 @@ ACTION_FOLLOW_OR_SUCCESS = 3
ACTION_RETRIES_EXCEEDED = 4
'''This request failed repeatedly and exceeded the retry limit.'''

ACTION_TOO_MANY_REDIRECTS = 5
'''Too many redirects were encountered.'''

DEFAULT_HEADERS = [
('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'),
('Accept', '*/*'),


+ 16
- 0
qwarc/utils.py View File

@@ -191,6 +191,22 @@ def handle_response_limit_error_retries(maxRetries, handler = handle_response_de
return _handler


def handle_response_limit_redirect_depth(maxRedirects, handler = handle_response_default):
    '''
    Build a response handler that caps the number of redirects that get followed.

    The returned coroutine function first awaits handler (handle_response_default unless another one is given) with the
    same keyword arguments and takes over its (action, writeToWarc) result. Only when that action is
    ACTION_FOLLOW_OR_SUCCESS and the 'redirectLevel' keyword argument has reached maxRedirects - 1 is the action replaced
    by ACTION_TOO_MANY_REDIRECTS; writeToWarc is always passed through untouched.

    The same details as for handle_response_limit_error_retries apply.
    '''

    async def _handler(**kwargs):
        action, writeToWarc = await handler(**kwargs)

        # Any verdict other than "follow or success" is handed back as is; redirectLevel is not even looked at then.
        if action != ACTION_FOLLOW_OR_SUCCESS:
            return action, writeToWarc

        # redirectLevel is zero-based: a value of maxRedirects - 1 means that exactly maxRedirects redirects happened.
        # NOTE(review): ACTION_FOLLOW_OR_SUCCESS presumably also covers a final, non-redirect success; confirm that
        # reporting such a response at the limit as "too many redirects" is intended.
        limitReached = kwargs['redirectLevel'] >= maxRedirects - 1
        if limitReached:
            return ACTION_TOO_MANY_REDIRECTS, writeToWarc
        return action, writeToWarc

    return _handler


def _get_dependency_versions(*pkgs):
pending = set(pkgs)
have = set(pkgs)


Loading…
Cancel
Save