Browse Source

Add ia-cdx-search

master
JustAnotherArchivist 2 years ago
parent
commit
303bb69c37
1 changed files with 49 additions and 0 deletions
  1. +49
    -0
      ia-cdx-search

+ 49
- 0
ia-cdx-search View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
import json
import re
import shlex
import sys
import urllib.request


# Query parameters that this script sets itself; a user-supplied QUERY must not contain them.
RESERVED_PARAMS_PATTERN = re.compile(r'(^|&)(output|limit|resumekey|showresumekey)=', re.IGNORECASE)


def print_usage():
	"""Print the command-line usage text to stderr."""
	print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
	print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
	print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
	print('Output is produced in JSONL format with one line per CDX entry.', file = sys.stderr)
	print('', file = sys.stderr)
	print('Examples:', file = sys.stderr)
	# Raw strings: the backslash in the example regex must reach the terminal verbatim.
	print(r" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
	print(r" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)


def parse_args(argv):
	"""Validate the command line and return (query, resumeKey), or None if usage should be shown.

	argv is the full argument vector including the program name. resumeKey is
	always a string; it is '' when no second argument was given.
	"""
	if not 2 <= len(argv) <= 3:
		return None
	if argv[1].lower() in ('-h', '--help') or RESERVED_PARAMS_PATTERN.search(argv[1]):
		return None
	# Index explicitly: slicing (argv[2:]) would yield a list and corrupt the URL.
	resumeKey = argv[2] if len(argv) == 3 else ''
	return argv[1], resumeKey


def build_url(query, resumeKey = ''):
	"""Return the CDX search URL for query, continuing from resumeKey if it is non-empty.

	Both values are inserted verbatim, without additional URL encoding.
	"""
	url = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
	if resumeKey:
		url += f'&resumeKey={resumeKey}'
	return url


def parse_page(o):
	"""Split one decoded JSON response into (rows, newResumeKey).

	o is a list of lists whose first entry holds the field names. If the last
	two entries are an empty list followed by a one-element list, that element
	is taken as the resume key for the next page. rows is a list of dicts
	mapping field name to value; newResumeKey is None when there is no key.

	Raises RuntimeError if the response is empty or a data row does not have
	the same number of entries as the header.
	"""
	if not o:
		raise RuntimeError('got empty response')
	hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
	fields = o[0]
	dataRows = o[1:-2] if hasResumeKey else o[1:]
	if not all(len(row) == len(fields) for row in dataRows):
		raise RuntimeError('got unexpected response format')
	newResumeKey = o[-1][0] if hasResumeKey else None
	return [dict(zip(fields, row)) for row in dataRows], newResumeKey


def main(argv):
	"""Run the search described by argv, printing one JSON object per CDX entry to stdout.

	Exits with status 1 after printing the usage text if the arguments are
	invalid. On a fetch or parse failure, prints a command line for resuming
	the search to stderr and re-raises the error.
	"""
	parsed = parse_args(argv)
	if parsed is None:
		print_usage()
		sys.exit(1)
	query, resumeKey = parsed

	try:
		while True:
			url = build_url(query, resumeKey)
			print(f'GET {url}', file = sys.stderr)
			req = urllib.request.Request(url)
			with urllib.request.urlopen(req) as r:
				if r.getcode() != 200:
					raise RuntimeError(f'Could not fetch {url}')
				o = json.load(r)
			rows, newResumeKey = parse_page(o)
			for row in rows:
				print(json.dumps(row))
			if not newResumeKey:
				break
			# Advance only after the page was printed in full, so the resume hint
			# below always points at the first page that has not been output yet.
			resumeKey = newResumeKey
	except (RuntimeError, json.JSONDecodeError, OSError):
		# OSError covers urllib's URLError/HTTPError (HTTP error statuses, network failures).
		resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
		print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
		raise


# Guarded so that the module can be imported without running a search.
if __name__ == '__main__':
	main(sys.argv)

Loading…
Cancel
Save