Metadata for the ArchiveTeam Docker Hub repositories
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

152 lignes
4.6 KiB

  1. #!/usr/bin/env python3
  2. import datetime
  3. import json
  4. import os
  5. import re
  6. import requests
  7. import sys
  8. import time
# Captures the dotted numeric version from an `appVersion: '<version>'` snippet.
# Within this file it is only referenced by the commented-out profile-page
# scrape inside retrieve().
APP_VERSION_PATTERN = re.compile(r"appVersion\s*:\s*'([0-9.]+)'")
  10. def log(msg):
  11. print(f'{datetime.datetime.utcnow().isoformat()}Z {msg}', file = sys.stderr)
  12. def new_session():
  13. session = requests.Session()
  14. session.headers.update({
  15. 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
  16. 'Accept': 'application/json',
  17. 'Accept-Language': 'en-US,en;q=0.5',
  18. 'DNT': '1',
  19. })
  20. session.__JAADockerHubScriptLastReqTime = 0 # ¯\_(ツ)_/¯
  21. return session
  22. def fetch(session, url, **kwargs):
  23. now = time.time()
  24. if session.__JAADockerHubScriptLastReqTime > now - 0.5:
  25. time.sleep(now + 0.5 - session.__JAADockerHubScriptLastReqTime)
  26. session.__JAADockerHubScriptLastReqTime = time.time()
  27. log(f'Fetching {url}')
  28. r = session.get(url, **kwargs)
  29. if not r or r.status_code != 200:
  30. raise RuntimeError(f'Failed to fetch {url}')
  31. return r
  32. def retrieve(profile):
  33. assert '/' not in profile, f'profile name contains slashes: {profile!r}'
  34. log(f'Retrieving Docker Hub user {profile}')
  35. session = new_session()
  36. os.mkdir(profile)
  37. # Fetch profile page to get app version
  38. #r = fetch(session, f'https://hub.docker.com/u/{profile}')
  39. #m = APP_VERSION_PATTERN.search(r.text)
  40. #if not m:
  41. # raise RuntimeError('Failed to extract app version')
  42. #session.headers.update({'X-DOCKER-API-CLIENT': f'docker-hub/{m.group(1)}'})
  43. ## ONLY API REQUESTS FROM THIS POINT
  44. # Get user or org info
  45. r = fetch(session, f'https://hub.docker.com/v2/users/{profile}/')
  46. with open(f'{profile}/user.json', 'w') as fp:
  47. json.dump(r.json(), fp, indent = '\t')
  48. # Get repository list
  49. repositories = []
  50. nextUrl = f'https://hub.docker.com/v2/repositories/{profile}/?page_size=100&page=1&ordering=last_updated'
  51. while True:
  52. r = fetch(session, nextUrl)
  53. o = r.json()
  54. repositories.extend(o['results'])
  55. if o['next'] is None:
  56. break
  57. nextUrl = o['next']
  58. with open(f'{profile}/repositories.json', 'w') as fp:
  59. json.dump(repositories, fp, indent = '\t')
  60. os.mkdir(f'{profile}/repositories')
  61. for repository in repositories:
  62. namespace, name = repository['namespace'], repository['name']
  63. assert '/' not in namespace and '/' not in name, f'namespace and/or name contain slashes: {namespace!r}, {name!r}'
  64. os.mkdir(f'{profile}/repositories/{name}')
  65. # Get general repo info (more detailed than the list above)
  66. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/')
  67. with open(f'{profile}/repositories/{name}/info.json', 'w') as fp:
  68. json.dump(r.json(), fp, indent = '\t')
  69. # Get Dockerfile (if non-empty)
  70. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/dockerfile/')
  71. o = r.json()
  72. if o['contents']:
  73. with open(f'{profile}/repositories/{name}/Dockerfile', 'w') as fp:
  74. fp.write(o['contents'])
  75. # Get source info
  76. sourceObjects = []
  77. r = fetch(session, 'https://hub.docker.com/api/build/v1/source/', params = {'image': f'{namespace}/{name}'})
  78. while True:
  79. o = r.json()
  80. sourceObjects.extend(o['objects'])
  81. if o['meta']['next'] is None:
  82. break
  83. #TODO: Find an example that uses this
  84. r = fetch(session, o['meta']['next'])
  85. with open(f'{profile}/repositories/{name}/sources.json', 'w') as fp:
  86. json.dump(sourceObjects, fp, indent = '\t')
  87. # Get tags
  88. tags = []
  89. nextUrl = f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/?page_size=100&page=1&ordering=last_updated'
  90. while True:
  91. r = fetch(session, nextUrl)
  92. o = r.json()
  93. tags.extend(o['results'])
  94. if o['next'] is None:
  95. break
  96. nextUrl = o['next']
  97. with open(f'{profile}/repositories/{name}/tags.json', 'w') as fp:
  98. json.dump(tags, fp, indent = '\t')
  99. # Get data for each tag
  100. os.mkdir(f'{profile}/repositories/{name}/tags')
  101. for tag in tags:
  102. tagname = tag['name']
  103. assert '/' not in tagname, f'tag contains slashes: {tagname!r}'
  104. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/')
  105. with open(f'{profile}/repositories/{name}/tags/{tagname}.info.json', 'w') as fp:
  106. json.dump(r.json(), fp, indent = '\t')
  107. r = fetch(session, f'https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/{tagname}/images')
  108. with open(f'{profile}/repositories/{name}/tags/{tagname}.images.json', 'w') as fp:
  109. json.dump(r.json(), fp, indent = '\t')
  110. log(f'Done with {profile}')
  111. def main():
  112. readStdin = False
  113. for profile in sys.argv[1:]:
  114. if profile == '-':
  115. readStdin = True
  116. continue
  117. retrieve(profile)
  118. if readStdin:
  119. for line in sys.stdin:
  120. retrieve(line.strip())
  121. if __name__ == '__main__':
  122. main()