The little things give you away... A collection of various small helper stuff
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 

53 Zeilen
2.8 KiB

  1. #!/usr/bin/env python3
  2. import json
  3. import re
  4. import shlex
  5. import sys
  6. import urllib.request
  7. if len(sys.argv) not in (2, 4) or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', sys.argv[1], re.IGNORECASE):
  8. print('Usage: ia-cdx-search QUERY [PAGE NUMPAGES]', file = sys.stderr)
  9. print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
  10. print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr)
  11. print('To resume a search that failed for some reason, provide the page number and number of pages through the second argument instead.', file = sys.stderr)
  12. print('Output is produces in JSONL format with one line per CDX entry.', file = sys.stderr)
  13. print('', file = sys.stderr)
  14. print('Examples:', file = sys.stderr)
  15. print(" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
  16. print(' Note that this will only find subdomains whose homepages are in the Wayback Machine. To discover all known subdomains, remove the filter and then extract the domains from the results.', file = sys.stderr)
  17. print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
  18. print(' The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr)
  19. sys.exit(1)
  20. query = sys.argv[1]
  21. baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
  22. if sys.argv[2:]:
  23. startPage, numPages = map(int, sys.argv[2:])
  24. else:
  25. req = urllib.request.Request(f'{baseUrl}&showNumPages=true')
  26. with urllib.request.urlopen(req) as r:
  27. if r.getcode() != 200:
  28. raise RuntimeError(f'Could not fetch number of pages')
  29. numPages = int(r.read())
  30. startPage = 0
  31. print(f'{numPages} pages', file = sys.stderr)
  32. try:
  33. for page in range(startPage, numPages):
  34. url = f'{baseUrl}&output=json&page={page}'
  35. print(f'GET {url}', file = sys.stderr)
  36. req = urllib.request.Request(url)
  37. with urllib.request.urlopen(req) as r:
  38. if r.getcode() != 200:
  39. raise RuntimeError(f'Could not fetch {url}')
  40. o = json.load(r)
  41. assert o, 'got empty response'
  42. fields = o[0]
  43. assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
  44. for row in o[1:]:
  45. print(json.dumps(dict(zip(fields, row))))
  46. except (RuntimeError, json.JSONDecodeError, AssertionError):
  47. print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)} {shlex.quote(page)} {shlex.quote(numPages)}', file = sys.stderr)
  48. raise