Metadata for the ArchiveTeam Docker Hub repositories
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

152 lines
4.6 KiB

  1. #!/usr/bin/env python3
  2. import datetime
  3. import json
  4. import os
  5. import re
  6. import requests
  7. import sys
  8. import time
# Matches text of the form  appVersion: '1.2.3'  and captures the dotted
# version number. No active code in this file uses it; only the disabled
# (commented-out) profile-page step in retrieve() refers to it.
APP_VERSION_PATTERN = re.compile(r"appVersion\s*:\s*'([0-9.]+)'")
  10. def log(msg):
  11. print(f'{datetime.datetime.utcnow().isoformat()}Z {msg}', file = sys.stderr)
  12. def new_session():
  13. session = requests.Session()
  14. session.headers.update({
  15. 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
  16. 'Accept': 'application/json',
  17. 'Accept-Language': 'en-US,en;q=0.5',
  18. 'DNT': '1',
  19. })
  20. session.__JAADockerHubScriptLastReqTime = 0 # ¯\_(ツ)_/¯
  21. return session
  22. def fetch(session, url, **kwargs):
  23. now = time.time()
  24. if session.__JAADockerHubScriptLastReqTime > now - 0.5:
  25. time.sleep(now + 0.5 - session.__JAADockerHubScriptLastReqTime)
  26. session.__JAADockerHubScriptLastReqTime = time.time()
  27. log(f'Fetching {url}')
  28. r = session.get(url, **kwargs)
  29. if not r or r.status_code != 200:
  30. raise RuntimeError(f'Failed to fetch {url}')
  31. return r
  32. def retrieve(profile):
  33. assert '/' not in profile, f'profile name contains slashes: {profile!r}'
  34. log(f'Retrieving Docker Hub user {profile}')
  35. session = new_session()
  36. os.mkdir(profile)
  37. # Fetch profile page to get app version
  38. #r = fetch(session, f'https://hub.docker.com/u/{profile}')
  39. #m = APP_VERSION_PATTERN.search(r.text)
  40. #if not m:
  41. # raise RuntimeError('Failed to extract app version')
  42. #session.headers.update({'X-DOCKER-API-CLIENT': f'docker-hub/{m.group(1)}'})
  43. ## ONLY API REQUESTS FROM THIS POINT
  44. # Get user or org info
  45. r = fetch(session, f'https://hub.docker.com/v2/users/{profile}/')
  46. with open(f'{profile}/user.json', 'w') as fp:
  47. json.dump(r.json(), fp, indent = '\t')
  48. # Get repository list
  49. repositories = []
  50. nextUrl = f'https://hub.docker.com/v2/repositories/{profile}/?page_size=100&page=1&ordering=last_updated'
  51. while True:
  52. r = fetch(session, nextUrl)
  53. o = r.json()
  54. repositories.extend(o['results'])
  55. if o['next'] is None:
  56. break
  57. nextUrl = o['next']
  58. with open(f'{profile}/repositories.json', 'w') as fp:
  59. json.dump(repositories, fp, indent = '\t')
  60. os.mkdir(f'{profile}/repositories')
  61. for repository in repositories:
  62. namespace, name = repository['namespace'], repository['name']
  63. assert '/' not in namespace and '/' not in name, f'namespace and/or name contain slashes: {namespace!r}, {name!r}'
  64. os.mkdir(f'{profile}/repositories/{name}')
  65. # Get general repo info (more detailed than the list above)
  66. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/')
  67. with open(f'{profile}/repositories/{name}/info.json', 'w') as fp:
  68. json.dump(r.json(), fp, indent = '\t')
  69. # Get Dockerfile (if non-empty)
  70. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/dockerfile/')
  71. o = r.json()
  72. if o['contents']:
  73. with open(f'{profile}/repositories/{name}/Dockerfile', 'w') as fp:
  74. fp.write(o['contents'])
  75. # Get source info
  76. sourceObjects = []
  77. r = fetch(session, 'https://hub.docker.com/api/build/v1/source/', params = {'image': f'{namespace}/{name}'})
  78. while True:
  79. o = r.json()
  80. sourceObjects.extend(o['objects'])
  81. if o['meta']['next'] is None:
  82. break
  83. #TODO: Find an example that uses this
  84. r = fetch(session, o['meta']['next'])
  85. with open(f'{profile}/repositories/{name}/sources.json', 'w') as fp:
  86. json.dump(sourceObjects, fp, indent = '\t')
  87. # Get tags
  88. tags = []
  89. nextUrl = f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/?page_size=100&page=1&ordering=last_updated'
  90. while True:
  91. r = fetch(session, nextUrl)
  92. o = r.json()
  93. tags.extend(o['results'])
  94. if o['next'] is None:
  95. break
  96. nextUrl = o['next']
  97. with open(f'{profile}/repositories/{name}/tags.json', 'w') as fp:
  98. json.dump(tags, fp, indent = '\t')
  99. # Get data for each tag
  100. os.mkdir(f'{profile}/repositories/{name}/tags')
  101. for tag in tags:
  102. tagname = tag['name']
  103. assert '/' not in tagname, f'tag contains slashes: {tagname!r}'
  104. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/')
  105. with open(f'{profile}/repositories/{name}/tags/{tagname}.info.json', 'w') as fp:
  106. json.dump(r.json(), fp, indent = '\t')
  107. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/images')
  108. with open(f'{profile}/repositories/{name}/tags/{tagname}.images.json', 'w') as fp:
  109. json.dump(r.json(), fp, indent = '\t')
  110. log(f'Done with {profile}')
  111. def main():
  112. readStdin = False
  113. for profile in sys.argv[1:]:
  114. if profile == '-':
  115. readStdin = True
  116. continue
  117. retrieve(profile)
  118. if readStdin:
  119. for line in sys.stdin:
  120. retrieve(line.strip())
  121. if __name__ == '__main__':
  122. main()