|
- #!/bin/bash
- # Prints all ignored offsite URLs from a wpull 2.x SQLite database to stdout.
- #
- # Usage: SCRIPT [FILENAME]
- #   FILENAME  Path to the wpull database (default: wpull.db). It is only used
- #             when exactly one argument is given; otherwise the default applies.
- #
- # Requires the sqlite3 command-line shell.
- # Exit status: 1 if the database file is missing, if the job does not have
- # exactly one root URL, or if the root hostname contains '*' or '?';
- # otherwise the status of the final sqlite3 invocation.
- if [[ "$1" == '--help' || "$1" == '-h' ]]; then
-   printf 'Usage: %q [FILENAME]\n' "$0" >&2
-   printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
-   exit
- fi
-
- if [[ $# -eq 1 ]]; then
-   filename="$1"
- else
-   filename=wpull.db
- fi
- if [[ ! -f "${filename}" ]]; then
-   printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
-   exit 1
- fi
-
- # Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
- # This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
- readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
- if [[ ${#roots[@]} -eq 0 ]]; then
-   # An empty result also covers sqlite3 itself failing, because the exit status
-   # of a process substitution is not visible here.
-   printf 'Error: no root URL found in %q (empty job, or the database could not be read).\n' "${filename}" >&2
-   exit 1
- elif [[ ${#roots[@]} -gt 1 ]]; then
-   printf 'Error: jobs with more than one root URL are not supported.\n' >&2
-   exit 1
- fi
- root="${roots[0]}"
-
- # Extract root hostname: drop everything up to and including '//', then the
- # path, then a trailing numeric port, then any userinfo before the last '@'.
- roothost="${root#*//}"
- roothost="${roothost%%/*}"
- if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
- if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi
-
- # Bail if there are weird chars in the hostname; this shouldn't be possible.
- if [[ "${roothost}" == *[*?]* ]]; then
-   printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
-   exit 1
- fi
- # GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
-
- # '[' opens a character class in GLOB (it can appear in a bracketed IPv6
- # literal), so match it literally via the one-character class '[[]'.
- glob_host=${roothost//\[/[[]}
- # Double any single quote so the hostname cannot terminate the SQL string literal.
- squote="'"
- sql_host=${glob_host//${squote}/${squote}${squote}}
-
- # Go!
- # SQL string literals are single-quoted: double-quoted literals are a legacy
- # SQLite fallback that newer sqlite3 shells reject by default.
- sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB '*[/@]${sql_host}[:/]*'"
|