|
|
@@ -0,0 +1,49 @@ |
|
|
|
#!/usr/bin/env python3 |
|
|
|
import json |
|
|
|
import re |
|
|
|
import shlex |
|
|
|
import sys |
|
|
|
import urllib.request |
|
|
|
|
|
|
|
|
|
|
|
# Command-line validation: exactly one QUERY argument plus an optional
# RESUMEKEY is accepted. A help flag, or a query that already contains one of
# the parameters this script appends itself (output, limit, resumeKey,
# showResumeKey), prints the usage text to stderr and exits with status 1.
if (
    not 2 <= len(sys.argv) <= 3
    or sys.argv[1].lower() in ('-h', '--help')
    or re.search(r'(^|&)(output|limit|resumekey|showresumekey)=', sys.argv[1], re.IGNORECASE)
):
    print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
    print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
    print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
    print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
    print('Output is produced in JSONL format with one line per CDX entry.', file = sys.stderr)
    print('', file = sys.stderr)
    print('Examples:', file = sys.stderr)
    # Raw strings: the examples contain a literal backslash (regex `\.`), which
    # is an invalid escape sequence in a normal string literal.
    print(r" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
    print(r" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
|
# Raw CDX query string exactly as given on the command line; it is inserted
# into the request URL verbatim.
query = sys.argv[1]

# Optional resume key from the second argument. Index the argument directly:
# the slice sys.argv[2:] is a list, which would be formatted into the URL as
# "['KEY']" and cannot be passed to shlex.quote() in the crash handler.
resumeKey = sys.argv[2] if len(sys.argv) > 2 else ''
resumeKeyP = f'&resumeKey={resumeKey}' if resumeKey else ''

# baseUrl carries the parameters that are identical for every page; the
# resume key is appended per request.
baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
url = f'{baseUrl}{resumeKeyP}'
|
|
|
# Fetch the result pages one after another and print every data row to stdout
# as one JSON object per line. On failure, print a command line that resumes
# the search, then re-raise the original exception.
try:
    while True:
        print(f'GET {url}', file = sys.stderr)
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req) as r:
            if r.getcode() != 200:
                raise RuntimeError(f'Could not fetch {url}')
            o = json.load(r)

        # Explicit checks instead of assert statements, which are removed when
        # Python runs with -O.
        if not o:
            raise RuntimeError('got empty response')

        # A page that carries a resume key ends with an empty list followed by
        # a one-element list holding the key; the first entry of every page is
        # the list of field names.
        hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
        fields = o[0]
        endOfDataRows = -2 if hasResumeKey else None
        newResumeKey = o[-1][0] if hasResumeKey else False
        dataRows = o[1 : endOfDataRows]

        if not all(len(v) == len(fields) for v in dataRows):
            raise RuntimeError('got unexpected response format')

        for row in dataRows:
            print(json.dumps(dict(zip(fields, row))))

        if not newResumeKey:
            break

        # Remember the most recent key so the crash handler below points at
        # the page that failed rather than at the start of the search.
        resumeKey = newResumeKey
        url = f'{baseUrl}&resumeKey={newResumeKey}'
# OSError covers urllib.error.URLError / HTTPError and socket-level failures
# raised by urlopen(); AssertionError is kept for backward compatibility.
except (RuntimeError, json.JSONDecodeError, AssertionError, OSError):
    resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
    print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
    raise