From 15203bd991dc2ebe7a52f7e09abc8b9107ec9e8a Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sun, 12 Jul 2020 00:06:17 +0000
Subject: [PATCH] Handle redirect traps/loops

---
Reviewer summary (free-form notes; this section is ignored by `git am`):
 * qwarc/const.py: adds the action constant ACTION_TOO_MANY_REDIRECTS (5).
 * qwarc/utils.py: adds handle_response_limit_redirect_depth(maxRedirects,
   handler), which awaits the wrapped handler and replaces
   ACTION_FOLLOW_OR_SUCCESS with ACTION_TOO_MANY_REDIRECTS once
   kwargs['redirectLevel'] >= maxRedirects - 1; writeToWarc is passed
   through unchanged.
 * qwarc/__init__.py (class Item): stores the starting URL as originalUrl
   and, on ACTION_TOO_MANY_REDIRECTS, logs an error, sets
   retResponse.qhistory and returns retResponse.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; tab indentation and nesting depth
are assumed -- verify against the repository before applying.

 qwarc/__init__.py |  5 +++++
 qwarc/const.py    |  3 +++
 qwarc/utils.py    | 16 ++++++++++++++++
 3 files changed, 24 insertions(+)

diff --git a/qwarc/__init__.py b/qwarc/__init__.py
index 0b5c260..bd7c075 100644
--- a/qwarc/__init__.py
+++ b/qwarc/__init__.py
@@ -100,6 +100,7 @@ class Item:
 				raise ValueError('Incomplete URL and no baseUrl to join it with')
 			else:
 				url = self.baseUrl.join(url)
+		originalUrl = url
 		if responseHandler is None:
 			responseHandler = self.defaultResponseHandler
 		assert method in ('GET', 'POST'), 'method must be GET or POST'
@@ -164,6 +165,10 @@ class Item:
 				self.logger.error(f'Request for {url} failed {attempt} times')
 				retResponse.qhistory = tuple(history)
 				return retResponse
+			elif action == ACTION_TOO_MANY_REDIRECTS:
+				self.logger.error(f'Request for {url} (from {originalUrl}) exceeded redirect limit')
+				retResponse.qhistory = tuple(history)
+				return retResponse
 			elif action == ACTION_RETRY:
 				# Nothing to do, just go to the next cycle
 				pass
diff --git a/qwarc/const.py b/qwarc/const.py
index 1c0db65..f0bd75f 100644
--- a/qwarc/const.py
+++ b/qwarc/const.py
@@ -26,6 +26,9 @@ ACTION_FOLLOW_OR_SUCCESS = 3
 ACTION_RETRIES_EXCEEDED = 4
 '''This request failed repeatedly and exceeded the retry limit.'''
 
+ACTION_TOO_MANY_REDIRECTS = 5
+'''Too many redirects were encountered.'''
+
 DEFAULT_HEADERS = [
 	('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'),
 	('Accept', '*/*'),
diff --git a/qwarc/utils.py b/qwarc/utils.py
index 2db713d..5b4c531 100644
--- a/qwarc/utils.py
+++ b/qwarc/utils.py
@@ -191,6 +191,22 @@ def handle_response_limit_error_retries(maxRetries, handler = handle_response_de
 	return _handler
 
 
+def handle_response_limit_redirect_depth(maxRedirects, handler = handle_response_default):
+	'''
+	A response handler that limits how many redirects are followed. It behaves as handler otherwise, which defaults to handle_response_default.
+
+	The same details as for handle_response_limit_error_retries apply.
+	'''
+
+	async def _handler(**kwargs):
+		action, writeToWarc = await handler(**kwargs)
+		# redirectLevel starts off at 0 so if it is equal to maxRedirects - 1, there were exactly maxRedirects redirects
+		if action == ACTION_FOLLOW_OR_SUCCESS and kwargs['redirectLevel'] >= maxRedirects - 1:
+			action = ACTION_TOO_MANY_REDIRECTS
+		return action, writeToWarc
+	return _handler
+
+
 def _get_dependency_versions(*pkgs):
 	pending = set(pkgs)
 	have = set(pkgs)