# Git patch touching two executable (mode 100755) bash scripts:
#
#   * adds    wpull2-extract-ignored
#   * deletes wpull2-extract-ignored-offsite
#
# Both scripts share the same front matter: `--help` / `-h` prints a usage
# message to stderr and exits; a single argument is taken as FILENAME, the
# wpull 2.x SQLite DB (any other argument count falls back to `wpull.db`); a
# path that is not a regular file is reported on stderr with exit status 1.
#
# The added script then runs one sqlite3 query that prints `url` for every
# queued_urls row (joined to url_strings via url_string_id) with
# status = "skipped", status_code IS NULL, try_count = 1 and
# inline_level IS NULL.
#
# The deleted script ran the same query with one extra condition,
# `url NOT GLOB "*[/@]<roothost>[:/]*"`. To build <roothost> it read the
# level = 0 URLs, aborted unless exactly one line came back (so zero root URLs
# also hit the "more than one root URL" message), stripped everything up to
# the first `//`, the path, a trailing `:port` and any `user@` prefix, and
# refused hostnames containing `*` or `?`.
#
# NOTE(review): the hunk headers declare 20 and 43 lines, but the patch text
# below sits on two physical lines - presumably its line breaks were lost in
# storage; they would have to be restored before the patch can be applied.
# NOTE(review): the SQL spells the string as "skipped" in double quotes. SQLite
# resolves a double-quoted token as an identifier first and only falls back to
# a string literal as a compatibility quirk that a build can turn off - confirm
# the target sqlite3 accepts it (single quotes are the standard SQL form).
diff --git a/wpull2-extract-ignored b/wpull2-extract-ignored new file mode 100755 index 0000000..870eb6b --- /dev/null +++ b/wpull2-extract-ignored @@ -0,0 +1,20 @@ +#!/bin/bash +if [[ "$1" == '--help' || "$1" == '-h' ]]; then + printf 'Usage: %q [FILENAME]\n' "$0" >&2 + printf 'Prints all ignored URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2 + exit +fi + +if [[ $# -eq 1 ]] +then + filename="$1" +else + filename=wpull.db +fi +if [[ ! -f "${filename}" ]] +then + printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2 + exit 1 +fi + +sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL' diff --git a/wpull2-extract-ignored-offsite b/wpull2-extract-ignored-offsite deleted file mode 100755 index 19c6289..0000000 --- a/wpull2-extract-ignored-offsite +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -if [[ "$1" == '--help' || "$1" == '-h' ]]; then - printf 'Usage: %q [FILENAME]\n' "$0" >&2 - printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2 - exit -fi - -if [[ $# -eq 1 ]] -then - filename="$1" -else - filename=wpull.db -fi -if [[ ! -f "${filename}" ]] -then - printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2 - exit 1 -fi - -# Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated. -# This query is unfortunately slow due to a lack of index since it isn't needed for other operations. 
-readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0') -if [[ ${#roots[@]} -ne 1 ]]; then - printf 'Error: jobs with more than one root URL are not supported.\n' >&2 - exit 1 -fi -root="${roots[0]}" - -# Extract root hostname -roothost="${root#*//}" -roothost="${roothost%%/*}" -if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi -if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi - -# Bail if there are weird chars in the hostname; this shouldn't be possible. -if [[ "${roothost}" == *[*?]* ]]; then - printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2 - exit 1 -fi -# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem. - -# Go! -sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB "*[/@]'"${roothost}"'[:/]*"'