From 0f13a1fadde68de753bebfa7f9da7932547e11e6 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sun, 20 Oct 2019 18:10:21 +0000
Subject: [PATCH] Add verbosity options, and annotate stderr on wiki-recursive-extract

---
Reviewer notes (this area between the '---' marker and the first diff is
commentary and is not part of the commit):
* website-extract-social-media: the short verbosity flag is now compared
  against '-v' (it was compared against the bare string 'v'), matching the
  '-v' test used by the other two scripts changed in this patch.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Line counts agree with every hunk header and with the
  diffstat below, but tab indentation and nesting depth are assumed
  (TODO confirm against the repository); if a hunk is rejected, retry with
  `git apply --ignore-whitespace`.

 social-media-extract-profile-link | 24 +++++++++++++++++++++++
 website-extract-social-media      |  8 ++++++++
 wiki-recursive-extract            | 32 +++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/social-media-extract-profile-link b/social-media-extract-profile-link
index 1d99792..d0dea23 100755
--- a/social-media-extract-profile-link
+++ b/social-media-extract-profile-link
@@ -1,7 +1,15 @@
 #!/bin/bash
 # Given social media links on stdin or as args, this extracts the link in the profile description, if any.
 
+function verbose_echo {
+	if [[ "${verbose}" ]]
+	then
+		echo "$@"
+	fi
+}
+
 function fetch {
+	verbose_echo "Fetching $1" >&2
 	curl -sL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "$1"
 }
 
@@ -33,6 +41,22 @@ function fetch_n_extract {
 	fi
 }
 
+verbose=
+for arg in "$@"
+do
+	if [[ "${arg}" == '--verbose' || "${arg}" == '-v' ]]
+	then
+		verbose=1
+		shift
+	elif [[ "${arg}" == '--' ]]
+	then
+		shift
+	else
+		# Assume end of options
+		break
+	fi
+done
+
 {
 	for arg in "$@"
 	do
diff --git a/website-extract-social-media b/website-extract-social-media
index cb05719..96afcff 100755
--- a/website-extract-social-media
+++ b/website-extract-social-media
@@ -1,6 +1,9 @@
 #!/bin/bash
+function verbose_echo { if [[ "${verbose}" ]]; then echo "$@"; fi; }
+
 function fetch_n_extract {
 	local url="$1"
+	verbose_echo "Fetching ${url}" >&2
 	{
 		curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
 			grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
@@ -52,12 +55,17 @@ function fetch_n_extract {
 
 # Parse options
 printInputUrl=
+verbose=
 while [[ $# -gt 0 ]]
 do
 	if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]]
 	then
 		printInputUrl=true
 		shift
+	elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
+	then
+		verbose=1
+		shift
 	elif [[ "$1" == '--' ]]
 	then
 		# End of options
diff --git a/wiki-recursive-extract b/wiki-recursive-extract
index 24a7dc5..d6e0c9c 100755
--- a/wiki-recursive-extract
+++ b/wiki-recursive-extract
@@ -4,6 +4,31 @@
 # Everything else is run through website-extract-social-media.
 # This is done recursively until no new links are discovered anymore.
 
+verbose=
+while [[ $# -gt 0 ]]
+do
+	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
+	then
+		verbose='--verbose'
+	else
+		echo "Unknown option: $1" >&2
+		exit 1
+	fi
+	shift
+done
+
+function verbose_echo {
+	if [[ "${verbose}" ]]
+	then
+		echo "$@"
+	fi
+}
+
+function stderr_annotate {
+	name="${1##*/}"
+	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
+}
+
 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 declare -A sectionUrls
 while read -r line
@@ -11,6 +36,7 @@ do
 	echo "${line}"
 	if [[ "${line}" == '=='* ]]
 	then
+		verbose_echo "${line}" >&2
 		unset sectionUrls
 		declare -A sectionUrls
 	fi
@@ -31,11 +57,9 @@ do
 
 			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
 			then
-				echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
-				mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
+				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
 			else
-				echo "Calling website-extract-social-media on ${curUrl}" >&2
-				mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
+				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
 			fi
 
 			for outUrl in "${outUrls[@]}"