From 5090a8ad028f4698b3aa7917f2760e6eb2326a19 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Sun, 5 May 2019 21:32:55 +0000
Subject: [PATCH] Enumerate users on a Mastodon instance

---
 mastodon-enumerate-users | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100755 mastodon-enumerate-users

diff --git a/mastodon-enumerate-users b/mastodon-enumerate-users
new file mode 100755
index 0000000..839004f
--- /dev/null
+++ b/mastodon-enumerate-users
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+import datetime
+import http.client
+import itertools
+import json
+import sys
+import time
+
+# Probe account IDs 1, 2, 3, ... on the instance named by argv[1]: '<id> <url>' per hit on stdout, progress on stderr.
+if len(sys.argv) < 2:
+	sys.exit(f'Usage: {sys.argv[0]} DOMAIN')
+domain = sys.argv[1]
+connection = http.client.HTTPSConnection(domain)
+try:
+	consecutive404s = 0
+	for i in itertools.count(start = 1):
+		connection.request('GET', '/api/v1/accounts/{}'.format(i))
+		response = connection.getresponse()
+		data = response.read() # Read the body on every status; http.client needs it consumed before the next request.
+		if response.status == 200:
+			print(i, json.loads(data)['url'])
+		# Only an unbroken run of 404s counts towards termination; any other status resets the run.
+		consecutive404s = (consecutive404s + 1) if response.status == 404 else 0
+		# If we got enough consecutive 404s, we likely reached the end of the list.
+		# For large instances, this happens when the last 1 % of scanned IDs don't exist.
+		# For small instances, at least 100 IDs need to fail.
+		# 458.211 is the solution to 0.01 * i + 100 / i ** (1 / x) = i with i = 100 (analytical form from Wolfram|Alpha: 2*ln(10)/(2*ln(2)-2*ln(3)+2*ln(5)-ln(11)))
+		consecutive404threshold = 0.01 * i + 100 / i ** (1 / 458.211)
+		print(f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S} Account {i}: {response.status} {response.reason} [404s: {consecutive404s}/{consecutive404threshold:.2f}]', file = sys.stderr)
+		if consecutive404s >= consecutive404threshold:
+			break
+		# A response without the rate-limit header (e.g. an error page from a reverse proxy) must not abort the scan with int(None); treat it as not rate-limited.
+		if int(response.getheader('X-RateLimit-Remaining', '10')) < 10:
+			time.sleep(60) #TODO sleep until X-RateLimit-Reset
+finally:
+	connection.close()