Browse Source

Fix ISO-8859-1-encoded Location header handling

tags/v0.2.6
JustAnotherArchivist 2 years ago
parent
commit
a7d7852c6d
1 changed file with 5 additions and 0 deletions
  1. +5
    -0
      qwarc/__init__.py

+ 5
- 0
qwarc/__init__.py View File

@@ -16,6 +16,7 @@ import logging
import os
import random
import sqlite3
import urllib.parse
import yarl


@@ -101,6 +102,10 @@ class Item:
redirectUrl = response.headers.get('Location') or response.headers.get('URI')
if not redirectUrl:
return retResponse, tuple(history)
if any(56448 <= ord(c) <= 56575 for c in redirectUrl):
# Surrogate escape characters in the redirect URL, which usually means that the server sent non-ASCII data (e.g. ISO-8859-1).
# Revert the encoding, then percent-encode the non-ASCII bytes.
redirectUrl = urllib.parse.quote_from_bytes(redirectUrl.encode('utf8', 'surrogateescape'), safe = ''.join(chr(i) for i in range(128)))
url = url.join(yarl.URL(redirectUrl))
if response.status in (301, 302, 303) and method == 'POST':
method = 'GET'


Loading…
Cancel
Save