#!/usr/bin/env python3
"""Archive the public Docker Hub metadata of one or more profiles.

Usage: pass profile names as command-line arguments. The special argument
``-`` additionally reads profile names from standard input, one per line,
after all command-line profiles have been processed.

For every profile a directory named after the profile is created in the
current working directory and filled with the JSON documents returned by the
Docker Hub API (user info, repository list, and per-repository info,
Dockerfile, build sources and tags).
"""
import datetime
import json
import os
import re
import sys
import time

import requests

APP_VERSION_PATTERN = re.compile(r"appVersion\s*:\s*'([0-9.]+)'")

# Minimum number of seconds between the start of two requests on one session.
MIN_REQUEST_INTERVAL = 0.5


def log(msg):
    """Write *msg* to stderr, prefixed with the current UTC time in ISO format."""
    # Aware "now" with tzinfo stripped yields exactly what the deprecated
    # datetime.utcnow() produced, so the log format is unchanged.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    print(f'{now.isoformat()}Z {msg}', file=sys.stderr)


def new_session():
    """Return a ``requests.Session`` with browser-like headers.

    The session also carries the timestamp of its last request, which
    ``fetch`` uses for rate limiting.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
    })
    # The throttle state is simply stashed on the session object.  -inf means
    # "no request made yet", so the first fetch never sleeps.
    session.__JAADockerHubScriptLastReqTime = float('-inf')
    return session


def fetch(session, url, **kwargs):
    """GET *url* through *session*, rate limited, and return the response.

    Requests on one session are spaced at least ``MIN_REQUEST_INTERVAL``
    seconds apart. Extra keyword arguments are passed to ``session.get``.

    Raises:
        RuntimeError: if the response is falsy or its status code is not 200.
    """
    # Sleep only for the time still missing from the interval.  A monotonic
    # clock is used so wall-clock adjustments can neither produce a negative
    # sleep (ValueError) nor an arbitrarily long one.
    wait = session.__JAADockerHubScriptLastReqTime + MIN_REQUEST_INTERVAL - time.monotonic()
    if wait > 0:
        time.sleep(wait)
    session.__JAADockerHubScriptLastReqTime = time.monotonic()
    log(f'Fetching {url}')
    r = session.get(url, **kwargs)
    if not r or r.status_code != 200:
        raise RuntimeError(f'Failed to fetch {url}')
    return r


def _require_no_slash(message, *components):
    """Raise ``AssertionError(message)`` if any of *components* contains '/'.

    These values become path components on disk.  An explicit raise is used
    instead of ``assert`` so the check is still performed under ``python -O``;
    the exception type is the one ``assert`` would have raised.
    """
    if any('/' in component for component in components):
        raise AssertionError(message)


def _write_json(path, obj):
    """Serialise *obj* as tab-indented JSON into a new file at *path*."""
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(obj, fp, indent='\t')


def _fetch_paginated(session, url):
    """Collect the ``results`` of every page, following ``next`` from *url*."""
    results = []
    while True:
        page = fetch(session, url).json()
        results.extend(page['results'])
        url = page['next']
        if url is None:
            return results


def _retrieve_tags(session, repo_dir, namespace, name):
    """Store the tag list of one repository plus the details of each tag."""
    tags = _fetch_paginated(
        session,
        f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/?page_size=100&page=1&ordering=last_updated',
    )
    _write_json(f'{repo_dir}/tags.json', tags)

    # Get data for each tag
    os.mkdir(f'{repo_dir}/tags')
    for tag in tags:
        tagname = tag['name']
        _require_no_slash(f'tag contains slashes: {tagname!r}', tagname)
        r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/')
        _write_json(f'{repo_dir}/tags/{tagname}.info.json', r.json())
        r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/images')
        _write_json(f'{repo_dir}/tags/{tagname}.images.json', r.json())


def _retrieve_repository(session, profile, repository):
    """Store info, Dockerfile, build sources and tags of one repository."""
    namespace, name = repository['namespace'], repository['name']
    _require_no_slash(
        f'namespace and/or name contain slashes: {namespace!r}, {name!r}',
        namespace,
        name,
    )
    repo_dir = f'{profile}/repositories/{name}'
    os.mkdir(repo_dir)

    # Get general repo info (more detailed than the repository list entry)
    r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/')
    _write_json(f'{repo_dir}/info.json', r.json())

    # Get Dockerfile (if non-empty)
    r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/dockerfile/')
    contents = r.json()['contents']
    if contents:
        # Explicit UTF-8 so non-ASCII Dockerfiles do not depend on the locale.
        with open(f'{repo_dir}/Dockerfile', 'w', encoding='utf-8') as fp:
            fp.write(contents)

    # Get source info
    source_objects = []
    r = fetch(
        session,
        'https://hub.docker.com/api/build/v1/source/',
        params={'image': f'{namespace}/{name}'},
    )
    while True:
        o = r.json()
        source_objects.extend(o['objects'])
        if o['meta']['next'] is None:
            break
        #TODO: Find an example that uses this
        r = fetch(session, o['meta']['next'])
    _write_json(f'{repo_dir}/sources.json', source_objects)

    # Get tags
    _retrieve_tags(session, repo_dir, namespace, name)


def retrieve(profile):
    """Download all Docker Hub metadata of *profile* into ``./<profile>/``.

    Files written below the new directory: ``user.json``,
    ``repositories.json`` and, per repository under ``repositories/<name>/``,
    ``info.json``, ``Dockerfile`` (only if non-empty), ``sources.json``,
    ``tags.json`` and ``tags/<tag>.info.json`` / ``tags/<tag>.images.json``.

    Raises:
        AssertionError: if the profile, a namespace, a repository name or a
            tag name contains a slash.
        OSError: from ``os.mkdir`` if a target directory cannot be created,
            e.g. ``FileExistsError`` when it already exists.
        RuntimeError: if any request fails (see ``fetch``).
    """
    _require_no_slash(f'profile name contains slashes: {profile!r}', profile)
    log(f'Retrieving Docker Hub user {profile}')
    session = new_session()
    os.mkdir(profile)

    # Fetch profile page to get app version
    #r = fetch(session, f'https://hub.docker.com/u/{profile}')
    #m = APP_VERSION_PATTERN.search(r.text)
    #if not m:
    #    raise RuntimeError('Failed to extract app version')
    #session.headers.update({'X-DOCKER-API-CLIENT': f'docker-hub/{m.group(1)}'})

    ## ONLY API REQUESTS FROM THIS POINT

    # Get user or org info
    r = fetch(session, f'https://hub.docker.com/v2/users/{profile}/')
    _write_json(f'{profile}/user.json', r.json())

    # Get repository list
    repositories = _fetch_paginated(
        session,
        f'https://hub.docker.com/v2/repositories/{profile}/?page_size=100&page=1&ordering=last_updated',
    )
    _write_json(f'{profile}/repositories.json', repositories)

    os.mkdir(f'{profile}/repositories')
    for repository in repositories:
        _retrieve_repository(session, profile, repository)

    log(f'Done with {profile}')


def main():
    """Retrieve every profile named in argv, then (if ``-`` was given) on stdin."""
    read_stdin = False
    for profile in sys.argv[1:]:
        if profile == '-':
            read_stdin = True
            continue
        retrieve(profile)
    if read_stdin:
        for line in sys.stdin:
            profile = line.strip()
            # Skip blank lines instead of crashing in os.mkdir('').
            if profile:
                retrieve(profile)


if __name__ == '__main__':
    main()