diff --git a/archivebot-high-memory b/archivebot-high-memory
new file mode 100755
index 0000000..5b07b1c
--- /dev/null
+++ b/archivebot-high-memory
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Find high memory usage ArchiveBot jobs
+{ echo "PID RSS JOBID"; ps -C wpull --format 'pid,rss,cmd' --no-headers | sed 's,^\s*,,; s,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,'; } | column -t
diff --git a/archivebot-list-stuck-requests b/archivebot-list-stuck-requests
new file mode 100755
index 0000000..e0b9404
--- /dev/null
+++ b/archivebot-list-stuck-requests
@@ -0,0 +1,3 @@
+#!/bin/bash
+# For each ArchiveBot job running on the machine, list requests that are stuck, i.e. older than 6 hours
+ps -C wpull --format 'cmd' --no-headers | sed 's,^\s*,,; s,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,' | while read -r jobid; do echo $jobid; fs=$(find $(lsof -p $(pgrep -f $jobid) -F n | grep '^n.*tmp-' | sed 's,^n,,') -mmin +360 2> >(grep -v ': No such file or directory$' >&2)); if [[ "${fs}" ]]; then ls -al ${fs}; else echo 'None'; fi; echo; done
diff --git a/archivebot-monitor-job-queue b/archivebot-monitor-job-queue
new file mode 100644
index 0000000..e6eb54d
--- /dev/null
+++ b/archivebot-monitor-job-queue
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Micro-optimisation FTW
+# Sometimes, sites have ridiculous rate limits, but there are also a lot of other URLs in the job's queue.
+# This command helps figure out when the delay needs to be adjusted so that off-site stuff can be processed quickly while the on-site things are retrieved slowly in accordance with the rate limit.
+{ echo 'DIFF POS ID ID PARENT ROOT STATUS TRY LEVEL ILEVEL TYPE PRIO POST SCODE FN ID URL'; sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE queued_urls.status = "todo"' | grep -nF '//ridiculouslyratelimitedsite.example.net/' | grep -v -e '/most' -e '\?important' -e '&ignore' -e 'patterns$' | awk -F':' 'BEGIN{prev=0} {print ($1 - prev) " " $0; prev=$1}' | sed 's,:, ,; s,|, ,g'; } | head -1000 | column -nt | less -S
diff --git a/fos-ftp-upload b/fos-ftp-upload
new file mode 100644
index 0000000..5a85987
--- /dev/null
+++ b/fos-ftp-upload
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Upload something to FOS via FTP
+{ echo 'user username password'; echo 'cd directory'; echo 'mput something-00???.warc.gz'; } | ftp -ni fos.textfiles.com
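The ArchiveBot helpers above are one-shot commands; for continuous monitoring they can be wrapped in watch. A minimal sketch (the 30-second interval is an arbitrary choice, not part of the scripts themselves):

    watch -n 30 archivebot-high-memory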
diff --git a/get-crx4chrome-urls b/get-crx4chrome-urls
new file mode 100755
index 0000000..9673397
--- /dev/null
+++ b/get-crx4chrome-urls
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Generate a list of relevant crx4chrome.com URLs for an extension (e.g. for feeding it into ArchiveBot)
+# Call passing the URL to an extension page, e.g. https://www.crx4chrome.com/extensions/eebpioaailbjojmdbmlpomfgijnlcemk/
+url="$1"; echo "${url}"; historyUrl="https://www.crx4chrome.com$(curl -s "${url}" | grep -Po 'href="\K/history/[^"]+' | uniq)"; if [[ $(wc -l <<<"${historyUrl}") -ne 1 ]]; then echo "Not exactly one history URL" >&2; exit 1; fi; echo "${historyUrl}"; curl -s "${historyUrl}" | tr -d '\n' | grep -Po '    .*?' | grep -Po 'href="\K/crx/[^"]+' | while read -r versionUrl; do versionUrl="https://www.crx4chrome.com${versionUrl}"; echo "${versionUrl}"; curl -s "${versionUrl}" | grep -Po 'href="\Khttps://www.crx4chrome.com/go.php\?[^"]+' | while read -r downloadUrl; do echo "${downloadUrl}"; grep -Po '[?&]l=\K[^&]+' <<< "${downloadUrl}" | perl -pe 's/\%(\w\w)/chr hex $1/ge'; done; done;
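A possible invocation of get-crx4chrome-urls, reusing the example extension page from its comment and collecting the output for ArchiveBot (the output filename is illustrative):

    ./get-crx4chrome-urls 'https://www.crx4chrome.com/extensions/eebpioaailbjojmdbmlpomfgijnlcemk/' | tee crx-urls.txt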
diff --git a/ia-upload-progress b/ia-upload-progress
new file mode 100644
index 0000000..07048ae
--- /dev/null
+++ b/ia-upload-progress
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Check how much of an upload made it into the item yet (vs. how much is stuck in the S3 queue if you also watch the upload process)
+echo "Uploaded $(ia metadata $identifier | grep -Po '("source"\s*:\s*"original",[^}]*"size"\s*:\s*"\K\d+|"size"\s*:\s*"\K\d+(?="\s*,[^}]*"source"\s*:\s*"original"))' | awk '{sum+=$1} END {printf "%.2f GiB\n",sum/1024/1024/1024}') of $(du -bc *.warc.gz | tail -1 | cut -f1 | awk '{printf "%.2f GiB", $1/1024/1024/1024}')"
diff --git a/iasha1check b/iasha1check
new file mode 100755
index 0000000..dbc4c95
--- /dev/null
+++ b/iasha1check
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Fetch the SHA-1 hashes from an IA item and ensure that they match the local files (i.e. that the upload was successful)
+identifier="$1"; escapedIdentifier="$(sed 's/[.[\*^$()+?{|]/\\&/g' <<<"${identifier}")"; sha1sum -c <(curl -sL "https://archive.org/download/${identifier}/${identifier}_files.xml" | tr -d '\n' | grep -Po '<file .*?</file>' | grep 'source="original".*<sha1>' | sed 's,^.*name=",,; s,".*<sha1>, ,; s,</sha1>.*$,,' | grep -Pv "^${escapedIdentifier}"'(\.cdx\.(gz|idx)|_meta\.(sqlite|xml)) ' | awk '{ print $2 " " $1 }');
diff --git a/killcx-all-https b/killcx-all-https
new file mode 100644
index 0000000..b769f3d
--- /dev/null
+++ b/killcx-all-https
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Call killcx for all HTTP(S) connections established by a process
+lsof -np $pid | grep TCP | grep -Po -- '->\K[^:]+:https?(?= )' | sort | sed 's,:https,:443,; s,:http,:80,' | while read -r con; do echo "=========== ${con}"; ./killcx "${con}"; done
diff --git a/pipelines-launch-in-tmux-windows b/pipelines-launch-in-tmux-windows
new file mode 100644
index 0000000..bd068ac
--- /dev/null
+++ b/pipelines-launch-in-tmux-windows
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Launch pipeline instances, each using a different IP and a separate directory, in windows in tmux session 'instances'
+for ip in ...; do tmux new-window -t instances "bash -c 'cd ~/whatever-grab.${ip}; run-pipeline3 pipeline.py --disable-web-server --concurrent 1 --context-value bind_address=${ip} YOURNICKHERE; exec bash'"; done
diff --git a/pipelines-stop-gracefully b/pipelines-stop-gracefully
new file mode 100755
index 0000000..b6719c8
--- /dev/null
+++ b/pipelines-stop-gracefully
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Stop all pipelines running on the current machine gracefully
+# DO NOT USE FOR ARCHIVEBOT
+pkill -INT pipeline
diff --git a/run-every-five-minutes b/run-every-five-minutes
new file mode 100755
index 0000000..9fba865
--- /dev/null
+++ b/run-every-five-minutes
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Run a command every full five minutes in a terminal window
+while :; do date; "$@"; echo; sleep $(echo "(5 - $(date '+%M') % 5) * 60 - $(date +'%S.%N')" | bc); done
diff --git a/tar-many-files-progress b/tar-many-files-progress
new file mode 100644
index 0000000..5df97e9
--- /dev/null
+++ b/tar-many-files-progress
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Tar a directory with many files, with a progress report
+time stdbuf -oL tar -cvzf foo.tar.gz directory | awk 'BEGIN{count=0;total=1000000}{count+=1;if (count % 100 == 0) { printf "\r%d of %d done", count, total; fflush(); }}END{printf "\r%d of %d done\n", count, total}'
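Some of the snippets above read their parameters from variables rather than arguments: ia-upload-progress expects identifier and killcx-all-https expects pid to be set in their environment. A usage sketch with placeholder values:

    # The item name and PID below are placeholders; killcx-all-https also assumes ./killcx exists in the working directory, as the script expects.
    identifier=my-item bash ia-upload-progress
    pid=12345 bash killcx-all-https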
diff --git a/warc-size b/warc-size
new file mode 100755
index 0000000..8465a73
--- /dev/null
+++ b/warc-size
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Total size of all WARCs in the current directory (or subdirectories)
+find -name '*.warc.gz' -printf '%s\n' | awk 'BEGIN { units[0] = "B"; units[1] = "KiB"; units[2] = "MiB"; units[3] = "GiB"; units[4] = "TiB"; units[5] = "PiB"; } { size += $1 } END { if (size > 0) { magnitude = int(log(size) / log(1024)); if (magnitude > 5) { magnitude = 5; } } else { magnitude = 0; } if (magnitude > 0) { sizeformat = "%.2f"; } else { sizeformat = "%d"; } printf sizeformat " %s\n", size / (1024 ^ magnitude), units[magnitude]; }'
diff --git a/wget-spider-estimate-size b/wget-spider-estimate-size
new file mode 100755
index 0000000..229958b
--- /dev/null
+++ b/wget-spider-estimate-size
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Estimate size of a website through wget spider
+# Note: this will miss anything where the server doesn't advertise the size.
+wget --recursive --level inf --spider --no-directories --output-file=wget.log --no-parent --reject-regex '/\?C=[NMSD];O=[AD]$' "$1"
+grep -Po ' \Khttps?://.*$|Length: \K\d+(?= )' wget.log | sed 's,^\(.*https\?://.*$\),url \1,; s,^\([0-9]\+\)$,length \1,' | awk 'BEGIN {url = ""; len = 0; totalsize = 0; } { if ($1 == "url") { if ($2 != url) { totalsize += len; url = $2; len = 0; } } else { if ($1 == "length") { len = $2; } } } END { totalsize += len; printf "%.0f\n", totalsize; }'
diff --git a/wpull1-parallel-progress-monitor b/wpull1-parallel-progress-monitor
new file mode 100644
index 0000000..c9baaed
--- /dev/null
+++ b/wpull1-parallel-progress-monitor
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Monitor a grab split up over multiple wpull 1.x processes (e.g. a forum where you split everything up by thread ID)
+sqlite3 < <(declare -i i=0; for f in /paths/to/wpull.db; do echo "ATTACH DATABASE '${f}' AS db${i};"; i+=1; done; declare -i n=i; echo -n "SELECT status, SUM(count) FROM ("; i=0; while [[ ${i} -lt ${n} ]]; do if [[ ${i} -ne 0 ]]; then echo -n "UNION ALL "; fi; echo "SELECT status, COUNT(id) AS count FROM db${i}.urls GROUP BY status "; i+=1; done; echo ") GROUP BY status;")
diff --git a/wpull1-progress-monitor b/wpull1-progress-monitor
new file mode 100755
index 0000000..43c51f4
--- /dev/null
+++ b/wpull1-progress-monitor
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Show URL counts by status and by HTTP status code for a wpull 1.x grab in the current directory
+sqlite3 *.db 'SELECT status, COUNT(id) FROM urls GROUP BY status'
+sqlite3 *.db 'SELECT status_code, COUNT(id) FROM urls GROUP BY status_code'
diff --git a/wpull2-url-origin b/wpull2-url-origin
new file mode 100755
index 0000000..60821e1
--- /dev/null
+++ b/wpull2-url-origin
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Trace back where a URL was discovered, all the way back to the root
+url="$1"; curId=$(sqlite3 wpull.db 'SELECT id FROM url_strings WHERE url = "'"${url}"'"'); while :; do sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = '$curId; if [[ $curId -eq 1 ]]; then break; fi; curId=$(sqlite3 wpull.db 'SELECT parent_url_string_id FROM queued_urls WHERE url_string_id = '$curId); done
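wpull2-url-origin queries wpull.db in the current directory, so it has to be run from the job directory. A sketch of tracing one URL, with an example job directory and URL (both are placeholders):

    cd /data/somejob && ./wpull2-url-origin 'https://example.org/some/page'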