From 53535b925a81510726a3182061162bbc0f0cbd11 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Wed, 6 Dec 2023 17:43:59 +0000
Subject: [PATCH] Add wpull2-extract-ignored-offsite and extract-urls-for-archiveteam-projects

---
 extract-urls-for-archiveteam-projects | 26 ++++++++++++++++
 wpull2-extract-ignored-offsite        | 41 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100755 extract-urls-for-archiveteam-projects
 create mode 100755 wpull2-extract-ignored-offsite

diff --git a/extract-urls-for-archiveteam-projects b/extract-urls-for-archiveteam-projects
new file mode 100755
index 0000000..8a98d5d
--- /dev/null
+++ b/extract-urls-for-archiveteam-projects
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [[ $# -ne 1 || "$1" == '--help' || "$1" == '-h' ]]; then
+	printf 'Usage: extract-urls-for-archiveteam-projects PREFIX\n' >&2
+	printf 'Reads URLs from stdin and extracts those of interest to the currently relevant AT projects into files prefixed by PREFIX\n' >&2
+	exit 1
+fi
+
+prefix="$1"
+if [[ "${prefix}" == *[*?[]* ]]; then
+	printf 'Error: prefixes containing * ? [ are not supported\n' >&2
+	exit 1
+fi
+
+if compgen -G "${prefix}*" >/dev/null; then
+	printf 'Error: there already exist files starting with %q\n' "${prefix}" >&2
+	exit 1
+fi
+
+# Fan stdin out to one grep per project; each grep writes its matching URLs to its own file.
+tee \
+	>(grep -Fai imgur >"${prefix}-imgur") \
+	>(grep -Fai -e mediafire -e mfi.re >"${prefix}-mediafire") \
+	>(grep -Fai pastebin.com >"${prefix}-pastebin") \
+	>(grep -Fai -e blogspot -e blogger >"${prefix}-blogger") \
+	>(grep -Fai -e telegram.me -e //t.me/ >"${prefix}-telegram") \
+	>/dev/null
diff --git a/wpull2-extract-ignored-offsite b/wpull2-extract-ignored-offsite
new file mode 100755
index 0000000..19c6289
--- /dev/null
+++ b/wpull2-extract-ignored-offsite
@@ -0,0 +1,41 @@
+#!/bin/bash
+if [[ "$1" == '--help' || "$1" == '-h' ]]; then
+	printf 'Usage: %q [FILENAME]\n' "$0" >&2
+	printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
+	exit
+fi
+
+if [[ $# -gt 1 ]]; then
+	printf 'Usage: %q [FILENAME]\n' "$0" >&2
+	exit 1
+fi
+filename="${1:-wpull.db}"
+if [[ ! -f "${filename}" ]]; then
+	printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
+	exit 1
+fi
+
+# Check that the number of root URLs is exactly 1; since we need to filter by the host, more than one root URL gets complicated.
+# This query is unfortunately slow because there is no index on level; one isn't needed for any other operation.
+readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
+if [[ ${#roots[@]} -ne 1 ]]; then
+	printf 'Error: found %d root URLs; exactly one is required\n' "${#roots[@]}" >&2
+	exit 1
+fi
+root="${roots[0]}"
+
+# Extract the root hostname: strip the scheme and path, then the port, then the userinfo.
+roothost="${root#*//}"
+roothost="${roothost%%/*}"
+if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
+if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi
+
+# Bail if there are glob chars in the hostname, which would break the GLOB pattern below; this shouldn't be possible for normal hostnames.
+if [[ "${roothost}" == *[*?[]* ]]; then
+	printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
+	exit 1
+fi
+# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
+
+# Go!
+sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB "*[/@]'"${roothost}"'[:/]*"'
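
A usage sketch for extract-urls-for-archiveteam-projects; urls.txt and the 'out' prefix are hypothetical, not part of the patch:

	# Reads URLs from urls.txt and writes the matches to out-imgur,
	# out-mediafire, out-pastebin, out-blogger, and out-telegram;
	# lines matching no project are discarded.
	./extract-urls-for-archiveteam-projects out <urls.txt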
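
How the hostname extraction in wpull2-extract-ignored-offsite behaves, traced on a hypothetical example URL:

	root='http://user@www.example.org:8080/path'
	roothost="${root#*//}"      # user@www.example.org:8080/path
	roothost="${roothost%%/*}"  # user@www.example.org:8080
	# the port regex matches, so the %:* strip leaves: user@www.example.org
	# the ##*@ strip then leaves:                      www.example.org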
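
A usage sketch for wpull2-extract-ignored-offsite, piping into the other new script; the 'myjob' prefix is hypothetical:

	# Print the skipped offsite URLs from wpull.db and bucket them by project.
	./wpull2-extract-ignored-offsite wpull.db | ./extract-urls-for-archiveteam-projects myjob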