From c7151efc3eeb872ee6e178c99b26b6424786b08a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 4 Jul 2020 01:40:00 +0000 Subject: [PATCH] Add script for checking whether a file on transfer.notkiska.pw was archived correctly with AB --- transfer.notkiska.pw-check-ia | 73 +++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 transfer.notkiska.pw-check-ia diff --git a/transfer.notkiska.pw-check-ia b/transfer.notkiska.pw-check-ia new file mode 100755 index 0000000..4fef3e1 --- /dev/null +++ b/transfer.notkiska.pw-check-ia @@ -0,0 +1,73 @@ +#!/bin/bash +set -e + +if [[ $# -ne 1 || "$1" == '-h' || "$1" == '--help' ]] +then + echo 'Usage: transfer.notkiska.pw-check-ia TKPPATH' + echo 'TKPPATH is the path of a file on transfer.notkiska.pw, e.g. "123F3V/twitter-#qanon"' + echo 'Checks that the file is archived correctly on IA by downloading both copies and comparing the SHA-1.' + echo 'If the TKPCHECK_CACHE_DIR environment variable is set, it is used as a cache for the CDXs to avoid redownloading them from IA on every check.' + exit 1 +fi + +if [[ "${TKPCHECK_CACHE_DIR}" ]] +then + if [[ -e "${TKPCHECK_CACHE_DIR}" && ! -d "${TKPCHECK_CACHE_DIR}" ]] + then + echo "Error: ${TKPCHECK_CACHE_DIR} is not a directory." >&2 + exit 1 + fi +fi + +file="$1" +fileid="${file%/*}" +filename="${file#*/}" + +echo "Downloading from transfer.notkiska.pw" >&2 +tkphash="$(curl "https://transfer.notkiska.pw/${file}" | sha1sum | tee /dev/fd/2)" + +echo "Retrieving WARC list from viewer" >&2 +mapfile -t warcs < <(curl -s "http://archive.fart.website/archivebot/viewer/api/v1/search.json?q=urls-transfer.notkiska.pw-transfer.notkiska.pw-" | python3 -c 'import json,sys; [print(x["job_id"]) for x in json.loads(sys.stdin.read())["results"]]' | sed 's,^,http://archive.fart.website/archivebot/viewer/job/,' | xargs curl -s | grep -Po 'href="\Khttps://archive.org/download/[^/"]+/[^/"]+-\d\d\d\d\d\.warc\.gz(?=")') + +cdxLines=() +for warc in "${warcs[@]}" +do + cdx="${warc::-3}.os.cdx.gz" + mapfile -t -O ${#cdxLines[@]} cdxLines < <( + { + if [[ "${TKPCHECK_CACHE_DIR}" ]] + then + cdxfn="${cdx:29}" + mkdir -p "${TKPCHECK_CACHE_DIR}/${cdxfn%/*}" + if [[ ! -e "${TKPCHECK_CACHE_DIR}/${cdxfn}" ]] + then + echo "Fetching ${cdx} into local cache" >&2 + curl -L "${cdx}" >"${TKPCHECK_CACHE_DIR}/${cdxfn}" + fi + cat "${TKPCHECK_CACHE_DIR}/${cdxfn}" + else + echo "Fetching ${cdx}" >&2 + curl -L "${cdx}" + fi + } | zgrep -F "/${fileid}/" 2>/dev/null | tee /dev/fd/2 + ) +done + +if [[ ${#cdxLines[@]} -ne 1 ]] +then + echo "Not exactly one matching CDX line found, cannot continue" >&2 + exit 1 +fi + +read -r length offset iapath < <(awk '{print $9 " " $10 " " $11}' <<<"${cdxLines[0]}") +echo "Fetching ${offset}-$((${offset}+${length})) from IA ${iapath}" >&2 +iahash="$(curl -L --range "${offset}-$((${offset}+${length}))" "https://archive.org/download/${iapath}" | zcat 2>/dev/null | awk '/^\r$/ {empty+=1; next} (empty >= 2)' | sha1sum | tee /dev/fd/2)" + +if [[ "${tkphash}" == "${iahash}" ]] +then + echo OK >&2 + exit 0 +else + echo "Hash mismatch!" >&2 + exit 1 +fi