Browse Source

Add support for PermanentRedirect error responses

master
JustAnotherArchivist 6 months ago
parent
commit
7e458457d6
1 changed files with 33 additions and 10 deletions
  1. +33
    -10
      s3-bucket-find-direct-url

+ 33
- 10
s3-bucket-find-direct-url View File

@@ -2,27 +2,55 @@
import re
import requests
import sys
import urllib3


# Start of a bucket listing response: an XML 1.0/UTF-8 declaration, an optional
# newline, then the ListBucketResult root element in either namespace spelling.
RESPONSE_PATTERN = re.compile(r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>''' '\n?' r'''<ListBucketResult xmlns=(["'])http://(?:s3\.amazonaws\.com/doc/2006-03-01/|doc\.s3\.amazonaws\.com/2006-03-01)\3>''')
# Text content of the first <Name>, <Key> and <LastModified> elements.
NAME_PATTERN = re.compile(r'<Name>([^<]*)</Name>')
KEY_PATTERN = re.compile(r'<Key>([^<]*)</Key>')
MTIME_PATTERN = re.compile(r'<LastModified>([^<]*)</LastModified>')
# Start of a PermanentRedirect error document: the same XML declaration
# followed directly by <Error><Code>PermanentRedirect</Code>.
REDIRECT_PATTERN = re.compile(r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>''' '\n?' r'<Error><Code>PermanentRedirect</Code>')
# Redirect target fields carried inside the error document.
REDIRECT_TARGET_ENDPOINT_PATTERN = re.compile(r'<Endpoint>([^<]*)</Endpoint>')
REDIRECT_TARGET_BUCKET_PATTERN = re.compile(r'<Bucket>([^<]*)</Bucket>')
# Provider name -> candidate URL templates; {} is filled with the bucket name.
# Each key appears exactly once: a repeated key in a dict literal silently
# discards the earlier value.
PROVIDERS = {
    'amazon': ['https://s3.amazonaws.com/{}/', 'https://{}.s3.amazonaws.com/'],
    'google': ['https://storage.googleapis.com/{}/'],
    'scaleway': ['https://s3.nl-ams.scw.cloud/{}/', 'https://s3.fr-par.scw.cloud/{}/'],
    'wasabi': ['https://s3.wasabisys.com/{}/'],
}


# AWS S3 buckets whose names contain a . are broken because AWS can't be bothered to serve valid TLS certs for them.
# Requests are therefore sent with verify = False; silence the warnings urllib3 would otherwise emit for them.
urllib3.disable_warnings()


def _logged_get(url):
    """Fetch *url* once, logging the request and its status code to stderr."""
    print(f'Fetching {url}', file = sys.stderr)
    # verify = False: some bucket hostnames do not present a valid TLS
    # certificate, so certificate verification is skipped for every request.
    r = requests.get(url, verify = False, timeout = 60)
    print(f'{r.status_code} {url}', file = sys.stderr)
    return r


def fetch_with_redirect(url):
    """Fetch a bucket listing URL, following one PermanentRedirect error response.

    If the response is a 301 whose body is a PermanentRedirect error document,
    the endpoint and bucket named in that document are used to build a new URL
    (https://<endpoint>/<bucket>/), which is fetched once in its place.

    Returns a tuple (response, url, body) for the last request made, where url
    is the URL actually fetched and body is the response text.

    Raises RuntimeError if the redirect document lacks an endpoint or bucket,
    or if a 200 response does not start like a bucket listing. Responses with
    any other status code are returned unvalidated for the caller to inspect.
    """
    r = _logged_get(url)
    body = r.text
    if r.status_code == 301 and REDIRECT_PATTERN.match(body):
        m = REDIRECT_TARGET_ENDPOINT_PATTERN.search(body)
        if not m:
            raise RuntimeError('Could not get redirect endpoint')
        endpoint = m.group(1)
        m = REDIRECT_TARGET_BUCKET_PATTERN.search(body)
        if not m:
            raise RuntimeError('Could not get redirect bucket')
        bucket = m.group(1)
        # Diagnostics go to stderr like all other progress output, keeping
        # stdout free of log lines.
        print(f'Redirect to endpoint {endpoint!r} bucket {bucket!r}', file = sys.stderr)
        url = f'https://{endpoint}/{bucket}/'
        r = _logged_get(url)
        body = r.text
    if r.status_code == 200 and not RESPONSE_PATTERN.match(body):
        raise RuntimeError(f'Invalid body: {body[:200]}...')
    return r, url, body


def find(url, providers):
_, _, body = fetch_with_redirect(url)

# Get bucket name
m = NAME_PATTERN.search(body)
@@ -48,14 +76,9 @@ def find(url, providers):
for provider in providers:
for testUrlTemplate in PROVIDERS[provider]:
testUrl = testUrlTemplate.format(name)
print(f'Fetching {testUrl}', file = sys.stderr)
r = requests.get(testUrl, timeout = 60)
print(f'{r.status_code} {testUrl}', file = sys.stderr)
r, testUrl, body = fetch_with_redirect(testUrl)
if r.status_code != 200:
continue
body = r.text
if not RESPONSE_PATTERN.match(body):
raise RuntimeError(f'Invalid body: {body[:200]}...')

# Compare first object
if not firstKey:


Loading…
Cancel
Save