Browse Source

Remove filtering of onsite URLs because it's unreliable

It also erroneously filters out offsite URLs that contain the root domain, and this isn't fixable without using regex, which isn't always available in the SQLite CLI before version 3.36.0.
master
JustAnotherArchivist 4 months ago
parent
commit
f3bec23348
2 changed files with 20 additions and 43 deletions
  1. +20
    -0
      wpull2-extract-ignored
  2. +0
    -43
      wpull2-extract-ignored-offsite

+ 20
- 0
wpull2-extract-ignored View File

@@ -0,0 +1,20 @@
#!/bin/bash
# Prints all ignored URLs from a wpull 2.x SQLite database to stdout, one per line.
#
# Usage: wpull2-extract-ignored [FILENAME]
#   FILENAME  Path to the database. wpull.db in the current directory is used
#             unless exactly one argument is given.
#
# Exit status: that of the last printf after --help/-h (normally 0), 1 if
# FILENAME is not a regular file, otherwise the exit status of sqlite3.
if [[ "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  printf 'Prints all ignored URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
  exit
fi

if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi

# sqlite3 would silently create an empty database for a missing path, so bail out early.
if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi

# The string literal is single-quoted on purpose: in SQL, double quotes denote
# identifiers. SQLite only falls back to treating "skipped" as a string as a
# legacy quirk, and newer sqlite3 CLI builds disable that fallback by default,
# which makes the double-quoted form fail with "no such column".
sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL"

+ 0
- 43
wpull2-extract-ignored-offsite View File

@@ -1,43 +0,0 @@
#!/bin/bash
# Prints all ignored offsite URLs from a wpull 2.x SQLite database to stdout.
#
# Usage: wpull2-extract-ignored-offsite [FILENAME]
#   FILENAME  Path to the database. wpull.db in the current directory is used
#             unless exactly one argument is given.
#
# "Offsite" is approximated by excluding URLs that contain the root URL's
# hostname preceded by '/' or '@' and followed by ':' or '/'. This is a plain
# substring match over the whole URL, so an offsite URL that merely mentions
# the root hostname (e.g. in its path) is filtered out as well.
#
# Exit status: that of the last printf after --help/-h (normally 0), 1 on any
# error detected by this script, otherwise the exit status of the final sqlite3.
if [[ "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
  exit
fi

if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi

# sqlite3 would silently create an empty database for a missing path, so bail out early.
if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi

# Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
# This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
# An empty result (no level-0 row, or sqlite3 itself failed) is a different
# problem from having several roots, so report it separately.
if [[ ${#roots[@]} -eq 0 ]]; then
  printf 'Error: no root URL found in %q.\n' "${filename}" >&2
  exit 1
fi
if [[ ${#roots[@]} -gt 1 ]]; then
  printf 'Error: jobs with more than one root URL are not supported.\n' >&2
  exit 1
fi
root="${roots[0]}"

# Extract root hostname: drop the scheme, then everything from the first slash,
# then a trailing numeric port, then any userinfo up to the last '@'.
roothost="${root#*//}"
roothost="${roothost%%/*}"
if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi

# Bail if there are weird chars in the hostname; this shouldn't be possible.
# '[' is rejected too because it opens a character class in GLOB patterns
# (a bracketed IPv6 literal would otherwise be misread as one).
if [[ "${roothost}" == *[*?]* || "${roothost}" == *'['* ]]; then
  printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
  exit 1
fi
# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.

# Double any single quote so the hostname cannot terminate the SQL string literal.
sq="'"
sql_host="${roothost//${sq}/${sq}${sq}}"

# Go!
# String literals are single-quoted on purpose: in SQL, double quotes denote
# identifiers, and newer sqlite3 CLI builds no longer fall back to treating
# double-quoted text as a string by default.
sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB '*[/@]${sql_host}[:/]*'"

Loading…
Cancel
Save