|
|
@@ -2,27 +2,55 @@ |
|
|
|
import re |
|
|
|
import requests |
|
|
|
import sys |
|
|
|
import urllib3 |
|
|
|
|
|
|
|
|
|
|
|
# Start of a bucket listing: an XML declaration (either quote style, optional
# newline after it) followed by a <ListBucketResult> element in one of the two
# accepted S3 namespaces.
RESPONSE_PATTERN = re.compile(
    r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>'''
    '\n?'
    r'''<ListBucketResult xmlns=(["'])http://(?:s3\.amazonaws\.com/doc/2006-03-01/|doc\.s3\.amazonaws\.com/2006-03-01)\3>'''
)

# Single-element extractors; group 1 is the element's text content.
NAME_PATTERN = re.compile(r'<Name>([^<]*)</Name>')
KEY_PATTERN = re.compile(r'<Key>([^<]*)</Key>')
MTIME_PATTERN = re.compile(r'<LastModified>([^<]*)</LastModified>')

# Start of a PermanentRedirect error document: the same XML declaration
# followed by <Error><Code>PermanentRedirect</Code>.
REDIRECT_PATTERN = re.compile(
    r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>'''
    '\n?'
    r'<Error><Code>PermanentRedirect</Code>'
)

# Extractors for the redirect target named in such an error document.
REDIRECT_TARGET_ENDPOINT_PATTERN = re.compile(r'<Endpoint>([^<]*)</Endpoint>')
REDIRECT_TARGET_BUCKET_PATTERN = re.compile(r'<Bucket>([^<]*)</Bucket>')
|
|
|
# URL templates to probe for each storage provider; '{}' is filled in with the
# bucket name via str.format. Each key must appear only once: a repeated key
# silently overrides the earlier entry.
PROVIDERS = {
    'amazon': ['https://s3.amazonaws.com/{}/', 'https://{}.s3.amazonaws.com/'],
    'google': ['https://storage.googleapis.com/{}/'],
    'scaleway': ['https://s3.nl-ams.scw.cloud/{}/', 'https://s3.fr-par.scw.cloud/{}/'],
    'wasabi': ['https://s3.wasabisys.com/{}/'],
}
|
|
|
|
|
|
|
|
|
|
|
def find(url, providers): |
|
|
|
# AWS S3 buckets whose names contain a . are broken because AWS can't be bothered to serve valid TLS certs for them.
# Requests are therefore made with certificate verification turned off, and
# urllib3's warnings are silenced here so they do not clutter the output.
urllib3.disable_warnings()
|
|
|
|
|
|
|
|
|
|
|
def fetch_with_redirect(url):
    """Fetch a bucket listing URL, following one S3 PermanentRedirect by hand.

    If the first response is a 301 whose body is a ``PermanentRedirect`` error
    document, the ``<Endpoint>`` and ``<Bucket>`` it names are used to build
    ``https://{endpoint}/{bucket}/`` and that URL is fetched instead. Only a
    single redirect is followed.

    Progress messages are written to stderr.

    Args:
        url: The URL to request.

    Returns:
        A ``(response, url, body)`` tuple: the last response received, the URL
        it was fetched from (the redirect target if one was followed), and the
        response body as text.

    Raises:
        RuntimeError: If a redirect document lacks an endpoint or bucket, or if
            the final response has status 200 but its body does not start like
            a bucket listing.
    """
    print(f'Fetching {url}', file = sys.stderr)
    # SECURITY: certificate verification is deliberately disabled for this
    # request, because buckets with a '.' in their name are not served with a
    # valid TLS certificate. The response is therefore not authenticated.
    r = requests.get(url, verify = False, timeout = 60)
    print(f'{r.status_code} {url}', file = sys.stderr)
    body = r.text

    if r.status_code == 301 and REDIRECT_PATTERN.match(body):
        endpoint_match = REDIRECT_TARGET_ENDPOINT_PATTERN.search(body)
        if not endpoint_match:
            raise RuntimeError('Could not get redirect endpoint')
        endpoint = endpoint_match.group(1)

        bucket_match = REDIRECT_TARGET_BUCKET_PATTERN.search(body)
        if not bucket_match:
            raise RuntimeError('Could not get redirect bucket')
        bucket = bucket_match.group(1)

        # Diagnostics go to stderr like every other progress message here.
        print(f'Redirect to endpoint {endpoint!r} bucket {bucket!r}', file = sys.stderr)
        url = f'https://{endpoint}/{bucket}/'
        print(f'Fetching {url}', file = sys.stderr)
        # The redirect target is requested with normal certificate verification.
        r = requests.get(url, timeout = 60)
        print(f'{r.status_code} {url}', file = sys.stderr)
        body = r.text

    # Non-200 responses are handed back to the caller to inspect; only a 200
    # with an unexpected body is treated as an error.
    if r.status_code == 200 and not RESPONSE_PATTERN.match(body):
        raise RuntimeError(f'Invalid body: {body[:200]}...')

    return r, url, body
|
|
|
|
|
|
|
|
|
|
|
def find(url, providers): |
|
|
|
_, _, body = fetch_with_redirect(url) |
|
|
|
|
|
|
|
# Get bucket name |
|
|
|
m = NAME_PATTERN.search(body) |
|
|
@@ -48,14 +76,9 @@ def find(url, providers): |
|
|
|
for provider in providers: |
|
|
|
for testUrlTemplate in PROVIDERS[provider]: |
|
|
|
testUrl = testUrlTemplate.format(name) |
|
|
|
print(f'Fetching {testUrl}', file = sys.stderr) |
|
|
|
r = requests.get(testUrl, timeout = 60) |
|
|
|
print(f'{r.status_code} {testUrl}', file = sys.stderr) |
|
|
|
r, testUrl, body = fetch_with_redirect(testUrl) |
|
|
|
if r.status_code != 200: |
|
|
|
continue |
|
|
|
body = r.text |
|
|
|
if not RESPONSE_PATTERN.match(body): |
|
|
|
raise RuntimeError(f'Invalid body: {body[:200]}...') |
|
|
|
|
|
|
|
# Compare first object |
|
|
|
if not firstKey: |
|
|
|