From 53535b925a81510726a3182061162bbc0f0cbd11 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Wed, 6 Dec 2023 17:43:59 +0000
Subject: [PATCH] Add wpull2-extract-ignored-offsite and extract-urls-for-archiveteam-projects

---
 extract-urls-for-archiveteam-projects | 26 ++++++++++++++++
 wpull2-extract-ignored-offsite        | 41 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100755 extract-urls-for-archiveteam-projects
 create mode 100755 wpull2-extract-ignored-offsite

diff --git a/extract-urls-for-archiveteam-projects b/extract-urls-for-archiveteam-projects
new file mode 100755
index 0000000..8a98d5d
--- /dev/null
+++ b/extract-urls-for-archiveteam-projects
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [[ $# -ne 1 || "$1" == '--help' || "$1" == '-h' ]]; then
+	printf 'Usage: extract-urls-for-archiveteam-projects PREFIX\n' >&2
+	printf 'Reads URLs from stdin and extracts those of interest to the currently relevant AT projects into files prefixed by PREFIX\n' >&2
+	exit 1
+fi
+
+prefix="$1"
+if [[ "${prefix}" == *[*?[]* ]]; then
+	printf 'Error: prefixes containing * ? [ are not supported\n' >&2
+	exit 1
+fi
+
+if compgen -G "${prefix}*" >/dev/null; then
+	printf 'Error: there already exist files starting with %q\n' "${prefix}" >&2
+	exit 1
+fi
+
+# Fan stdin out to one grep per project; each grep writes its matching URLs to its own file.
+tee \
+	>(grep -Fai imgur >"${prefix}-imgur") \
+	>(grep -Fai -e mediafire -e mfi.re >"${prefix}-mediafire") \
+	>(grep -Fai pastebin.com >"${prefix}-pastebin") \
+	>(grep -Fai -e blogspot -e blogger >"${prefix}-blogger") \
+	>(grep -Fai -e telegram.me -e //t.me/ >"${prefix}-telegram") \
+	>/dev/null
diff --git a/wpull2-extract-ignored-offsite b/wpull2-extract-ignored-offsite
new file mode 100755
index 0000000..19c6289
--- /dev/null
+++ b/wpull2-extract-ignored-offsite
@@ -0,0 +1,41 @@
+#!/bin/bash
+if [[ "$1" == '--help' || "$1" == '-h' ]]; then
+	printf 'Usage: %q [FILENAME]\n' "$0" >&2
+	printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
+	exit
+fi
+
+if [[ $# -gt 1 ]]; then
+	printf 'Usage: %q [FILENAME]\n' "$0" >&2
+	exit 1
+fi
+filename="${1:-wpull.db}"
+if [[ ! -f "${filename}" ]]; then
+	printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
+	exit 1
+fi
+
+# Check that the number of root URLs is exactly 1; since we need to filter by the host, more than one root URL gets complicated.
+# This query is unfortunately slow because there is no index on level; one isn't needed for any other operation.
+readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
+if [[ ${#roots[@]} -ne 1 ]]; then
+	printf 'Error: found %d root URLs; exactly one is required\n' "${#roots[@]}" >&2
+	exit 1
+fi
+root="${roots[0]}"
+
+# Extract the root hostname: strip the scheme and path, then the port, then the userinfo.
+roothost="${root#*//}"
+roothost="${roothost%%/*}"
+if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
+if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi
+
+# Bail if there are glob chars in the hostname, which would break the GLOB pattern below; this shouldn't be possible for normal hostnames.
+if [[ "${roothost}" == *[*?[]* ]]; then
+	printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
+	exit 1
+fi
+# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
+
+# Go!
+sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB "*[/@]'"${roothost}"'[:/]*"'
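
A usage sketch for extract-urls-for-archiveteam-projects; urls.txt and the 'out' prefix are hypothetical, not part of the patch:

	# Reads URLs from urls.txt and writes the matches to out-imgur,
	# out-mediafire, out-pastebin, out-blogger, and out-telegram;
	# lines matching no project are discarded.
	./extract-urls-for-archiveteam-projects out <urls.txt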
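
How the hostname extraction in wpull2-extract-ignored-offsite behaves, traced on a hypothetical example URL:

	root='http://user@www.example.org:8080/path'
	roothost="${root#*//}"      # user@www.example.org:8080/path
	roothost="${roothost%%/*}"  # user@www.example.org:8080
	# the port regex matches, so the %:* strip leaves: user@www.example.org
	# the ##*@ strip then leaves:                      www.example.org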
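
A usage sketch for wpull2-extract-ignored-offsite, piping into the other new script; the 'myjob' prefix is hypothetical:

	# Print the skipped offsite URLs from wpull.db and bucket them by project.
	./wpull2-extract-ignored-offsite wpull.db | ./extract-urls-for-archiveteam-projects myjob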