Browse Source

First set of little things

master
JustAnotherArchivist 1 year ago
parent
commit
d10a1d3675
17 changed files with 57 additions and 0 deletions
  1. +3
    -0
      archivebot-high-memory
  2. +3
    -0
      archivebot-list-stuck-requests
  3. +5
    -0
      archivebot-monitor-job-queue
  4. +3
    -0
      fos-ftp-upload
  5. +4
    -0
      get-crx4chrome-urls
  6. +3
    -0
      ia-upload-progress
  7. +3
    -0
      iasha1check
  8. +3
    -0
      killcx-all-https
  9. +3
    -0
      pipelines-launch-in-tmux-windows
  10. +4
    -0
      pipelines-stop-gracefully
  11. +3
    -0
      run-every-five-minutes
  12. +3
    -0
      tar-many-files-progress
  13. +3
    -0
      warc-size
  14. +5
    -0
      wget-spider-estimate-size
  15. +3
    -0
      wpull1-parallel-progress-monitor
  16. +3
    -0
      wpull1-progress-monitor
  17. +3
    -0
      wpull2-url-origin

+ 3
- 0
archivebot-high-memory View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Find high memory usage ArchiveBot jobs
# Emit a header row, then rewrite each wpull process's command line into its
# ArchiveBot job ID (the [0-9a-z]+ path component before wpull.log) and align
# everything into columns.
{
  echo "PID RSS JOBID"
  ps -C wpull --format 'pid,rss,cmd' --no-headers \
    | sed 's,^\s*,,; s,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,'
} | column -t

+ 3
- 0
archivebot-list-stuck-requests View File

@@ -0,0 +1,3 @@
#!/bin/bash
# For each ArchiveBot job running on the machine, list requests that are stuck, i.e. older than 6 hours
# Pipeline overview:
#  - ps/sed: extract the job ID from each running wpull process's command line
#    (the [0-9a-z]+ path component preceding wpull.log).
#  - lsof/grep/sed: collect the job process's open 'tmp-' files; the sed strips lsof's 'n' field prefix.
#  - find -mmin +360: keep only files untouched for more than 6 hours; the process-substitution stderr
#    filter hides 'No such file or directory' races where a temp file vanished between lsof and find.
#  - ls -al on the stuck files, or 'None' if there are none.
# NOTE(review): the $(lsof ...) and ${fs} expansions are deliberately unquoted so that multiple
# paths word-split into separate arguments; paths containing whitespace would break this.
ps -C wpull --format 'cmd' --no-headers | sed 's,^\s*,,; s,/usr/bin/python3.*/data/[^/]\+/\([0-9a-z]\+\)/wpull\.log.*$,\1,' | while read -r jobid; do echo $jobid; fs=$(find $(lsof -p $(pgrep -f $jobid) -F n | grep '^n.*tmp-' | sed 's,^n,,') -mmin +360 2> >(grep -v ': No such file or directory$' >&2)); if [[ "${fs}" ]]; then ls -al ${fs}; else echo 'None'; fi; echo; done

+ 5
- 0
archivebot-monitor-job-queue View File

@@ -0,0 +1,5 @@
#!/bin/bash
# Micro-optimisation FTW
# Sometimes, sites have ridiculous rate limits, but there are also a lot of other URLs in the job's queue.
# This command helps figure out when the delay needs to be adjusted so that off-site stuff can be processed quickly while the on-site things are retrieved slowly in accordance with the rate limit.
# Template: replace the ridiculouslyratelimitedsite pattern and the grep -v exclusion patterns with job-specific values; run next to the job's wpull.db.
# Pipeline: dump all 'todo' queue rows with their URL strings joined in, number the rows and keep only
# the on-site matches (grep -nF), drop excluded patterns, then use awk to prepend DIFF — the number of
# queue rows since the previous matching row — and tabulate the first 1000 results in less.
{ echo 'DIFF POS ID ID PARENT ROOT STATUS TRY LEVEL ILEVEL TYPE PRIO POST SCODE FN ID URL'; sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE queued_urls.status = "todo"' | grep -nF '//ridiculouslyratelimitedsite.example.net/' | grep -v -e '/most' -e '\?important' -e '&ignore' -e 'patterns$' | awk -F':' 'BEGIN{prev=0} {print ($1 - prev) " " $0; prev=$1}' | sed 's,:, ,; s,|, ,g'; } | head -1000 | column -nt | less -S

+ 3
- 0
fos-ftp-upload View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Upload something to FOS via FTP
# Template: fill in the credentials, target directory, and file glob.
# The quoted here-doc feeds the command script verbatim to a non-interactive
# ftp session (-n: no auto-login, -i: no per-file prompting for mput).
ftp -ni fos.textfiles.com <<'EOF'
user username password
cd directory
mput something-00???.warc.gz
EOF

+ 4
- 0
get-crx4chrome-urls View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Generate a list of relevant crx4chrome.com URLs for an extension (e.g. for feeding it into ArchiveBot)
# Call passing the URL to an extension page, e.g. https://www.crx4chrome.com/extensions/eebpioaailbjojmdbmlpomfgijnlcemk/
url="$1"
echo "${url}"
# The extension page should link to exactly one /history/ page.
historyUrl="https://www.crx4chrome.com$(curl -s "${url}" | grep -Po 'href="\K/history/[^"]+' | uniq)"
if [[ $(wc -l <<<"${historyUrl}") -ne 1 ]]; then
  echo "Not exactly one history URL" >&2
  # Bug fix: this previously used 'return', which is invalid outside a function
  # and therefore did not abort the script.
  exit 1
fi
echo "${historyUrl}"
# Each /crx/ link in the history list is one released version of the extension.
curl -s "${historyUrl}" | tr -d '\n' | grep -Po '<ol class="history">.*?</ol>' | grep -Po 'href="\K/crx/[^"]+' | while read -r versionUrl; do
  versionUrl="https://www.crx4chrome.com${versionUrl}"
  echo "${versionUrl}"
  # Each version page links to go.php redirect URLs; print each one plus the
  # percent-decoded download target from its l= parameter.
  curl -s "${versionUrl}" | grep -Po 'href="\Khttps://www.crx4chrome.com/go.php\?[^"]+' | while read -r downloadUrl; do
    echo "${downloadUrl}"
    grep -Po '[?&]l=\K[^&]+' <<< "${downloadUrl}" | perl -pe 's/\%(\w\w)/chr hex $1/ge'
  done
done

+ 3
- 0
ia-upload-progress View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Check how much of an upload made it into the item yet (vs. how much is stuck in the S3 queue if you also watch the upload process)
# Usage: run in the directory holding the *.warc.gz files, passing the IA item identifier as $1.
# Falls back to an 'identifier' environment variable for backward compatibility and aborts with a
# message if neither is set (previously an unset $identifier silently produced a broken 'ia' call).
identifier="${1:-${identifier:?identifier not set}}"
# Left side: sum the sizes of source=original files already present in the item's metadata.
# Right side: total size of the local WARCs (du -bc grand total).
echo "Uploaded $(ia metadata "${identifier}" | grep -Po '("source"\s*:\s*"original",[^}]*"size"\s*:\s*"\K\d+|"size"\s*:\s*"\K\d+(?="\s*,[^}]*"source"\s*:\s*"original"))' | awk '{sum+=$1} END {printf "%.2f GiB\n",sum/1024/1024/1024}') of $(du -bc *.warc.gz | tail -1 | cut -f1 | awk '{printf "%.2f GiB", $1/1024/1024/1024}')"

+ 3
- 0
iasha1check View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Fetch the SHA-1 hashes from an IA item and ensure that they match the local files (i.e. that the upload was successful)
# Usage: iasha1check IDENTIFIER   (run in the directory containing the uploaded files)
#  - escapedIdentifier: the identifier with regex metacharacters backslash-escaped for safe use in the grep -Pv below.
#  - curl/tr/grep -Po: flatten the item's _files.xml and extract one <file ...>...</file> element per line.
#  - grep/sed: keep only source="original" entries that carry a <sha1>, reducing each to 'name sha1'.
#  - grep -Pv: drop IA-generated derivative files (IDENTIFIER.cdx.gz/.idx, IDENTIFIER_meta.sqlite/.xml) that don't exist locally.
#  - awk: swap the columns into the 'sha1 name' line format consumed by sha1sum -c.
identifier="$1"; escapedIdentifier="$(sed 's/[.[\*^$()+?{|]/\\&/g' <<<"${identifier}")"; sha1sum -c <(curl -sL "https://archive.org/download/${identifier}/${identifier}_files.xml" | tr -d '\n' | grep -Po '<file .*?</file>' | grep 'source="original".*<sha1>' | sed 's,^.*name=",,; s,".*<sha1>, ,; s,</sha1>.*$,,' | grep -Pv "^${escapedIdentifier}"'(\.cdx\.(gz|idx)|_meta\.(sqlite|xml)) ' | awk '{ print $2 " " $1 }');

+ 3
- 0
killcx-all-https View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Call killcx for all HTTPS connections established by a process
# Usage: killcx-all-https PID   (falls back to a 'pid' environment variable for backward
# compatibility; previously an unset $pid silently produced a broken lsof call).
pid="${1:-${pid:?pid not set}}"
# List the process's TCP peers as addr:service, translate the service names to port numbers,
# and kill each connection. The ':https' substitution must come before ':http' in the sed
# script, since ':http' would otherwise also match the prefix of ':https'.
# NOTE(review): despite the name, the 'https?' regex matches plain HTTP connections too.
lsof -np "${pid}" | grep TCP | grep -Po -- '->\K[^:]+:https?(?= )' | sort | sed 's,:https,:443,; s,:http,:80,' | while read -r con; do echo "=========== ${con}"; ./killcx "${con}"; done

+ 3
- 0
pipelines-launch-in-tmux-windows View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Launch pipeline instances, each using a different IP and a separate directory, in windows in tmux session 'instances'
# Template: replace '...' with the list of bind IPs, 'whatever-grab' with the per-instance directory
# prefix, and YOURNICKHERE with your nick; the tmux session 'instances' must already exist.
# 'exec bash' keeps each window open (with a shell) after its pipeline exits.
for ip in ...; do tmux new-window -t instances "bash -c 'cd ~/whatever-grab.${ip}; run-pipeline3 pipeline.py --disable-web-server --concurrent 1 --context-value bind_address=${ip} YOURNICKHERE; exec bash'"; done

+ 4
- 0
pipelines-stop-gracefully View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Stop all pipelines running on the current machine gracefully
# DO NOT USE FOR ARCHIVEBOT
# Deliver SIGINT (as if Ctrl-C were pressed) to every process whose name matches 'pipeline'.
pkill -SIGINT pipeline

+ 3
- 0
run-every-five-minutes View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Run a command every full five minutes in a terminal window
# Runs the command once immediately, then re-aligns to the next :00/:05/:10/... boundary.
while :; do
  date
  "$@"
  echo
  # Seconds (fractional, hence bc) remaining until the next multiple of five minutes.
  sleep $(echo "(5 - $(date '+%M') % 5) * 60 - $(date +'%S.%N')" | bc)
done

+ 3
- 0
tar-many-files-progress View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Tar a directory with many files with a progress report
# Template: adjust foo.tar.gz, directory, and awk's 'total' (the expected file count, here 1000000).
# stdbuf -oL line-buffers tar's verbose file listing so updates appear immediately; awk collapses
# the listing into an in-place '<count> of <total> done' counter refreshed every 100 entries.
time stdbuf -oL tar -cvzf foo.tar.gz directory | awk 'BEGIN{count=0;total=1000000}{count+=1;if (count % 100 == 0) { printf "\r%d of %d done", count, total; fflush(); }}END{printf "\r%d of %d done\n", count, total}'

+ 3
- 0
warc-size View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Total size of all WARCs in the current directory (or subdirectories)
# Fix: pass the start path '.' explicitly — omitting it is a GNU find extension, not POSIX.
# The awk sums the byte sizes and pretty-prints them with the largest fitting binary unit
# (capped at PiB); integer bytes are printed without decimals, larger units with two.
find . -name '*.warc.gz' -printf '%s\n' | awk 'BEGIN { units[0] = "B"; units[1] = "KiB"; units[2] = "MiB"; units[3] = "GiB"; units[4] = "TiB"; units[5] = "PiB"; } { size += $1 } END { if (size > 0) { magnitude = int(log(size) / log(1024)); if (magnitude > 5) { magnitude = 5; } } else { magnitude = 0; } if (magnitude > 0) { sizeformat = "%.2f"; } else { sizeformat = "%d"; } printf sizeformat " %s\n", size / (1024 ^ magnitude), units[magnitude]; }'

+ 5
- 0
wget-spider-estimate-size View File

@@ -0,0 +1,5 @@
#!/bin/bash
# Estimate size of a website through wget spider
# Note: this will miss anything where the server doesn't advertise the size.
# Usage: wget-spider-estimate-size URL ; prints the estimated total size in bytes.
# Step 1: recursively spider the site; wget.log records each visited URL and its 'Length:' header.
wget --recursive --level inf --spider --no-directories --output-file=wget.log --no-parent --reject-regex '/\?C=[NMSD];O=[AD]$' "$1"
# Step 2: extract URL and Length lines from the log, tag them as 'url'/'length' records, and sum the
# last Length seen for each URL; the '$2 != url' check avoids double-counting when the log repeats a URL.
grep -Po ' \Khttps?://.*$|Length: \K\d+(?= )' wget.log | sed 's,^\(.*https\?://.*$\),url \1,; s,^\([0-9]\+\)$,length \1,' | awk 'BEGIN {url = ""; len = 0; totalsize = 0; } { if ($1 == "url") { if ($2 != url) { totalsize += len; url = $2; len = 0; } } else { if ($1 == "length") { len = $2; } } } END { totalsize += len; printf "%.0f\n", totalsize; }'

+ 3
- 0
wpull1-parallel-progress-monitor View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Monitor a grab split up over multiple wpull 1.x processes (e.g. a forum where you split everything up by thread ID)
# Template: replace /paths/to/wpull.db with a glob or list of the wpull 1.x databases.
# Builds an SQL script on the fly and feeds it to sqlite3: ATTACH each database as db0, db1, ...,
# then UNION ALL the per-database 'URL count by status' queries and aggregate them into one
# combined count per status across all instances.
sqlite3 < <(declare -i i=0; for f in /paths/to/wpull.db; do echo "ATTACH DATABASE '${f}' AS db${i};"; i+=1; done; declare -i n=i; echo -n "SELECT status, SUM(count) FROM ("; i=0; while [[ ${i} -lt ${n} ]]; do if [[ ${i} -ne 0 ]]; then echo -n "UNION ALL "; fi; echo "SELECT status, COUNT(id) AS count FROM db${i}.urls GROUP BY status "; i+=1; done; echo ") GROUP BY status;")

+ 3
- 0
wpull1-progress-monitor View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Monitor a wpull 1.x grab: URL counts grouped by status, then by HTTP status code.
# NOTE(review): '*.db' assumes exactly one .db file in the current directory; with several
# matches the extra filenames would be passed as additional arguments to sqlite3 — verify.
sqlite3 *.db 'SELECT status, COUNT(id) FROM urls GROUP BY status'
sqlite3 *.db 'SELECT status_code, COUNT(id) FROM urls GROUP BY status_code'

+ 3
- 0
wpull2-url-origin View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Trace back where a URL was discovered, all the way back to the root
# Usage: wpull2-url-origin URL   (run in the directory containing wpull.db)
# Looks up the url_strings id for the URL, then repeatedly prints the queue row joined with its
# URL string and follows parent_url_string_id upwards until id 1 (the root) is reached.
# NOTE(review): ${url} is interpolated directly into the SQL and relies on SQLite's non-standard
# double-quoted string literals; a URL containing '"' would break or inject into the query.
url="$1"; curId=$(sqlite3 wpull.db 'SELECT id FROM url_strings WHERE url = "'"${url}"'"'); while :; do sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = '$curId; if [[ $curId -eq 1 ]]; then break; fi; curId=$(sqlite3 wpull.db 'SELECT parent_url_string_id FROM queued_urls WHERE url_string_id = '$curId); done

Loading…
Cancel
Save