From aba7a1b0b85dfd9c09567eb5975058e940b8acb0 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 11 Nov 2021 21:46:18 +0000
Subject: [PATCH] Replace resumeKey pagination with page number pagination

resumeKey pagination is horribly broken. It may return incomplete results or get stuck in an infinite loop.
---
 ia-cdx-search | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/ia-cdx-search b/ia-cdx-search
index 772b1e1..a50b7fe 100755
--- a/ia-cdx-search
+++ b/ia-cdx-search
@@ -6,26 +6,36 @@ import sys
 import urllib.request
 
 
-if not 2 <= len(sys.argv) <= 3 or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey)=', sys.argv[1], re.IGNORECASE):
-	print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
+if len(sys.argv) not in (2, 4) or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', sys.argv[1], re.IGNORECASE):
+	print('Usage: ia-cdx-search QUERY [PAGE NUMPAGES]', file = sys.stderr)
 	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
-	print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
-	print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
+	print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr)
+	print('To resume a search that failed for some reason, provide the page number and number of pages as the second and third arguments instead.', file = sys.stderr)  # Matches the usage line: QUERY [PAGE NUMPAGES]
 	print('Output is produces in JSONL format with one line per CDX entry.', file = sys.stderr)
 	print('', file = sys.stderr)
 	print('Examples:', file = sys.stderr)
 	print(" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
+	print('   Note that this will only find subdomains whose homepages are in the Wayback Machine. To discover all known subdomains, remove the filter and then extract the domains from the results.', file = sys.stderr)
 	print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
+	print('   The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr)
 	sys.exit(1)
 
 query = sys.argv[1]
-resumeKey = sys.argv[2:] or ''
-resumeKeyP = f'&resumeKey={resumeKey}' if resumeKey else ''
+baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
+if sys.argv[2:]:
+	startPage, numPages = map(int, sys.argv[2:])
+else:
+	req = urllib.request.Request(f'{baseUrl}&showNumPages=true')
+	with urllib.request.urlopen(req) as r:
+		if r.getcode() != 200:
+			raise RuntimeError(f'Could not fetch number of pages')
+		numPages = int(r.read())
+	startPage = 0
+	print(f'{numPages} pages', file = sys.stderr)
 
-baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
-url = f'{baseUrl}{resumeKeyP}'
 try:
-	while True:
+	for page in range(startPage, numPages):
+		url = f'{baseUrl}&output=json&page={page}'
 		print(f'GET {url}', file = sys.stderr)
 		req = urllib.request.Request(url)
 		with urllib.request.urlopen(req) as r:
@@ -33,17 +43,10 @@ try:
 				raise RuntimeError(f'Could not fetch {url}')
 			o = json.load(r)
 		assert o, 'got empty response'
-		hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
 		fields = o[0]
-		endOfDataRows = -2 if hasResumeKey else None
-		newResumeKey = o[-1][0] if hasResumeKey else False
-		assert all(len(v) == len(fields) for v in o[1 : endOfDataRows]), 'got unexpected response format'
-		for row in o[1 : endOfDataRows]:
+		assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
+		for row in o[1:]:
 			print(json.dumps(dict(zip(fields, row))))
-		if not newResumeKey:
-			break
-		url = f'{baseUrl}&resumeKey={newResumeKey}'
 except (RuntimeError, json.JSONDecodeError, AssertionError):
-	resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
-	print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
+	print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)} {page} {numPages}', file = sys.stderr)  # page/numPages are ints: shell-safe as-is, and shlex.quote() raises TypeError on non-str
 	raise