diff --git a/archivebot-high-memory b/archivebot-high-memory
new file mode 100755
index 0000000..5b07b1c
--- /dev/null
+++ b/archivebot-high-memory
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Find high memory usage ArchiveBot jobs
+{ printf 'PID RSS JOBID\n'; ps -C wpull --format 'pid,rss,cmd' --no-headers | sed -e 's,^\s*,,' -e 's,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,'; } | column -t
diff --git a/archivebot-list-stuck-requests b/archivebot-list-stuck-requests
new file mode 100755
index 0000000..e0b9404
--- /dev/null
+++ b/archivebot-list-stuck-requests
@@ -0,0 +1,4 @@
+#!/bin/bash
+# For each ArchiveBot job running on the machine, list requests that are stuck, i.e. older than 6 hours
+# Note: the unquoted $(lsof ...) expansion passed to find is intentional word-splitting over the temp-file list.
+ps -C wpull --format 'cmd' --no-headers | sed 's,^\s*,,; s,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,' | while read -r jobid; do echo "${jobid}"; fs=$(find $(lsof -p "$(pgrep -f "${jobid}")" -F n | grep '^n.*tmp-' | sed 's,^n,,') -mmin +360 2> >(grep -v ': No such file or directory$' >&2)); if [[ "${fs}" ]]; then ls -al ${fs}; else echo 'None'; fi; echo; done
diff --git a/archivebot-monitor-job-queue b/archivebot-monitor-job-queue
new file mode 100644
index 0000000..e6eb54d
--- /dev/null
+++ b/archivebot-monitor-job-queue
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Micro-optimisation FTW
+# Sometimes, sites have ridiculous rate limits, but there are also a lot of other URLs in the job's queue.
+# This command helps figuring out when the delay needs to be adjusted so that off-site stuff can be processed quickly while the on-site things are retrieved slowly in accordance with the rate limit.
+{ echo 'DIFF POS ID ID PARENT ROOT STATUS TRY LEVEL ILEVEL TYPE PRIO POST SCODE FN ID URL'; sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE queued_urls.status = "todo"' | grep -nF '//ridiculouslyratelimitedsite.example.net/' | grep -v -e '/most' -e '\?important' -e '&ignore' -e 'patterns$' | awk -F':' '{ print ($1 - prev), $0; prev = $1 }' | sed -e 's,:, ,' -e 's,|, ,g'; } | head -1000 | column -nt | less -S
diff --git a/fos-ftp-upload b/fos-ftp-upload
new file mode 100644
index 0000000..5a85987
--- /dev/null
+++ b/fos-ftp-upload
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Upload something to FOS via FTP
+ftp -ni fos.textfiles.com <<'EOF'
+user username password
+cd directory
+mput something-00???.warc.gz
+EOF
diff --git a/get-crx4chrome-urls b/get-crx4chrome-urls
new file mode 100755
index 0000000..9673397
--- /dev/null
+++ b/get-crx4chrome-urls
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Generate a list of relevant crx4chrome.com URLs for an extension (e.g. for feeding it into ArchiveBot)
+# Call passing the URL to an extension page, e.g. https://www.crx4chrome.com/extensions/eebpioaailbjojmdbmlpomfgijnlcemk/
+url="$1"
+echo "${url}"
+historyUrl="https://www.crx4chrome.com$(curl -s "${url}" | grep -Po 'href="\K/history/[^"]+' | uniq)"
+# 'return' only works inside a function or a sourced script; a standalone script must 'exit'.
+if [[ $(wc -l <<<"${historyUrl}") -ne 1 ]]; then echo "Not exactly one history URL" >&2; exit 1; fi
+echo "${historyUrl}"
+# NOTE(review): the HTML-scoping pattern between tr -d and the /crx/ link extraction was garbled in
+# transit (it contained literal newlines, which can never match after tr -d '\n'); extracting all
+# /crx/ links from the history page instead — confirm against the page structure.
+curl -s "${historyUrl}" | tr -d '\n' | grep -Po 'href="\K/crx/[^"]+' | while read -r versionUrl; do
+  versionUrl="https://www.crx4chrome.com${versionUrl}"; echo "${versionUrl}"
+  curl -s "${versionUrl}" | grep -Po 'href="\Khttps://www.crx4chrome.com/go.php\?[^"]+' | while read -r downloadUrl; do echo "${downloadUrl}"; grep -Po '[?&]l=\K[^&]+' <<<"${downloadUrl}" | perl -pe 's/\%(\w\w)/chr hex $1/ge'; done
+done
diff --git a/ia-upload-progress b/ia-upload-progress
new file mode 100644
index 0000000..07048ae
--- /dev/null
+++ b/ia-upload-progress
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Check how much of an upload made it into the item yet (vs. how much is stuck in the S3 queue if you also watch the upload process)
+# Usage: run in the upload directory; pass the item identifier as $1 (or pre-set $identifier in the environment — the original relied solely on that, leaving it unset when run standalone).
+identifier="${identifier:-$1}"; echo "Uploaded $(ia metadata "${identifier}" | grep -Po '("source"\s*:\s*"original",[^}]*"size"\s*:\s*"\K\d+|"size"\s*:\s*"\K\d+(?="\s*,[^}]*"source"\s*:\s*"original"))' | awk '{sum+=$1} END {printf "%.2f GiB\n",sum/1024/1024/1024}') of $(du -bc *.warc.gz | tail -1 | cut -f1 | awk '{printf "%.2f GiB", $1/1024/1024/1024}')"
diff --git a/iasha1check b/iasha1check
new file mode 100755
index 0000000..dbc4c95
--- /dev/null
+++ b/iasha1check
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Fetch the SHA-1 hashes from an IA item and ensure that they match the local files (i.e. that the upload was successful)
+# NOTE(review): the XML-matching patterns were garbled in transit (angle-bracket tags stripped, leaving an empty
+# grep pattern and truncated sed expressions); reconstructed against the <file name="..."><sha1>...</sha1></file> layout of IA _files.xml — verify.
+identifier="$1"
+escapedIdentifier="$(sed 's/[.[\*^$()+?{|]/\\&/g' <<<"${identifier}")"
+sha1sum -c <(curl -sL "https://archive.org/download/${identifier}/${identifier}_files.xml" | tr -d '\n' | grep -Po '<file .*?</file>' | grep 'source="original".*<sha1>' | sed 's,^.*name=",,; s,".*<sha1>, ,; s,</sha1>.*$,,' | grep -Pv "^${escapedIdentifier}"'(\.cdx\.(gz|idx)|_meta\.(sqlite|xml)) ' | awk '{ print $2 " " $1 }')
diff --git a/killcx-all-https b/killcx-all-https
new file mode 100644
index 0000000..b769f3d
--- /dev/null
+++ b/killcx-all-https
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Call killcx for all HTTPS connections established by a process
+# Usage: pass the target PID as $1 (or pre-set $pid in the environment — $pid was never assigned in the original). Note: the pattern also matches plain :http peers, remapped to port 80.
+pid="${pid:-$1}"; lsof -np "${pid}" | grep TCP | grep -Po -- '->\K[^:]+:https?(?= )' | sort | sed 's,:https,:443,; s,:http,:80,' | while read -r con; do echo "=========== ${con}"; ./killcx "${con}"; done
diff --git a/pipelines-launch-in-tmux-windows b/pipelines-launch-in-tmux-windows
new file mode 100644
index 0000000..bd068ac
--- /dev/null
+++ b/pipelines-launch-in-tmux-windows
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Launch pipeline instances, each using a different IP and a separate directory, in windows in tmux session 'instances'
+for ip in ...; do
+  cmd="cd ~/whatever-grab.${ip}; run-pipeline3 pipeline.py --disable-web-server --concurrent 1 --context-value bind_address=${ip} YOURNICKHERE; exec bash"
+  tmux new-window -t instances "bash -c '${cmd}'"
+done
diff --git a/pipelines-stop-gracefully b/pipelines-stop-gracefully
new file mode 100755
index 0000000..b6719c8
--- /dev/null
+++ b/pipelines-stop-gracefully
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Gracefully stop every pipeline process running on this machine by sending SIGINT.
+# DO NOT USE FOR ARCHIVEBOT
+pkill -SIGINT pipeline
diff --git a/run-every-five-minutes b/run-every-five-minutes
new file mode 100755
index 0000000..9fba865
--- /dev/null
+++ b/run-every-five-minutes
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Run a command every full five minutes in a terminal window
+while :; do date; "$@"; echo; sleep "$(bc <<<"(5 - $(date '+%M') % 5) * 60 - $(date +'%S.%N')")"; done
diff --git a/tar-many-files-progress b/tar-many-files-progress
new file mode 100644
index 0000000..5df97e9
--- /dev/null
+++ b/tar-many-files-progress
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Tar a directory with many files with a progress report
+time stdbuf -oL tar -cvzf foo.tar.gz directory | awk 'BEGIN { expected = 1000000 } { if (NR % 100 == 0) { printf "\r%d of %d done", NR, expected; fflush() } } END { printf "\r%d of %d done\n", NR, expected }'
diff --git a/warc-size b/warc-size
new file mode 100755
index 0000000..8465a73
--- /dev/null
+++ b/warc-size
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Total size of all WARCs in the current directory (or subdirectories)
+# Spell out the '.' starting point: 'find' with no path operand is a GNU extension (on top of the GNU-only -printf already used).
+find . -name '*.warc.gz' -printf '%s\n' | awk 'BEGIN { units[0] = "B"; units[1] = "KiB"; units[2] = "MiB"; units[3] = "GiB"; units[4] = "TiB"; units[5] = "PiB"; } { size += $1 } END { if (size > 0) { magnitude = int(log(size) / log(1024)); if (magnitude > 5) { magnitude = 5; } } else { magnitude = 0; } if (magnitude > 0) { sizeformat = "%.2f"; } else { sizeformat = "%d"; } printf sizeformat " %s\n", size / (1024 ^ magnitude), units[magnitude]; }'
diff --git a/wget-spider-estimate-size b/wget-spider-estimate-size
new file mode 100755
index 0000000..229958b
--- /dev/null
+++ b/wget-spider-estimate-size
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Estimate size of a website through wget spider
+# Note: this will miss anything where the server doesn't advertise the size.
+wget --recursive --level inf --spider --no-directories --output-file=wget.log --no-parent --reject-regex '/\?C=[NMSD];O=[AD]$' "$1"
+grep -Po ' \Khttps?://.*$|Length: \K\d+(?= )' wget.log | sed -e 's,^\(.*https\?://.*$\),url \1,' -e 's,^\([0-9]\+\)$,length \1,' | awk '$1 == "url" { if ($2 != u) { total += l; u = $2; l = 0 } next } $1 == "length" { l = $2 } END { total += l; printf "%.0f\n", total }'
diff --git a/wpull1-parallel-progress-monitor b/wpull1-parallel-progress-monitor
new file mode 100644
index 0000000..c9baaed
--- /dev/null
+++ b/wpull1-parallel-progress-monitor
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Monitor a grab split up over multiple wpull 1.x processes (e.g. a forum where you split everything up by thread ID)
+sqlite3 < <(
+  declare -i i=0
+  for f in /paths/to/wpull.db; do
+    echo "ATTACH DATABASE '${f}' AS db${i};"
+    i+=1
+  done
+  echo -n "SELECT status, SUM(count) FROM ("
+  for (( j = 0; j < i; j++ )); do
+    if [[ ${j} -ne 0 ]]; then echo -n "UNION ALL "; fi
+    echo "SELECT status, COUNT(id) AS count FROM db${j}.urls GROUP BY status "
+  done
+  echo ") GROUP BY status;"
+)
diff --git a/wpull1-progress-monitor b/wpull1-progress-monitor
new file mode 100755
index 0000000..43c51f4
--- /dev/null
+++ b/wpull1-progress-monitor
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Show wpull 1.x progress: URL counts per status, then per HTTP status code
+# NOTE(review): assumes exactly one .db file in the current directory — with several, the sqlite3 CLI would take the second filename as the SQL argument.
+sqlite3 *.db 'SELECT status, COUNT(id) FROM urls GROUP BY status'
+sqlite3 *.db 'SELECT status_code, COUNT(id) FROM urls GROUP BY status_code'
diff --git a/wpull2-url-origin b/wpull2-url-origin
new file mode 100755
index 0000000..60821e1
--- /dev/null
+++ b/wpull2-url-origin
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Trace back where a URL was discovered, all the way back to the root
+url="$1"
+# Escape single quotes and use a proper SQL string literal; the original interpolated the URL into a
+# double-quoted SQL "string" (a SQLite misfeature) and broke on URLs containing quote characters.
+safeUrl=${url//\'/\'\'}
+curId=$(sqlite3 wpull.db "SELECT id FROM url_strings WHERE url = '${safeUrl}'")
+if [[ -z "${curId}" ]]; then echo "URL not found in wpull.db" >&2; exit 1; fi
+while :; do
+  sqlite3 wpull.db "SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = ${curId}"
+  if [[ ${curId} -eq 1 ]]; then break; fi
+  curId=$(sqlite3 wpull.db "SELECT parent_url_string_id FROM queued_urls WHERE url_string_id = ${curId}")
+  if [[ -z "${curId}" ]]; then echo "Trace broken: no parent recorded" >&2; exit 1; fi
+done