From 0f13a1fadde68de753bebfa7f9da7932547e11e6 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Sun, 20 Oct 2019 18:10:21 +0000
Subject: [PATCH] Add verbosity options, and annotate stderr on
 wiki-recursive-extract

---
 social-media-extract-profile-link | 24 +++++++++++++++++++++++
 website-extract-social-media      |  8 ++++++++
 wiki-recursive-extract            | 32 +++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/social-media-extract-profile-link b/social-media-extract-profile-link
index 1d99792..d0dea23 100755
--- a/social-media-extract-profile-link
+++ b/social-media-extract-profile-link
@@ -1,7 +1,15 @@
 #!/bin/bash
 # Given social media links on stdin or as args, this extracts the link in the profile description, if any.
 
+# Echo all arguments, but only when the global ${verbose} is non-empty.
+# The exit status is 0 in quiet mode and the status of echo otherwise.
+function verbose_echo {
+	# An empty ${verbose} short-circuits the echo.
+	[[ -z "${verbose}" ]] || echo "$@"
+}
+
 function fetch {
+	verbose_echo "Fetching $1" >&2
 	curl -sL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "$1"
 }
 
@@ -33,6 +41,22 @@ function fetch_n_extract {
 	fi
 }
 
+# Parse leading options. -v/--verbose enables the "Fetching ..." messages on stderr.
+# Parsing stops at '--', which is consumed, or at the first other argument, which is kept.
+verbose=
+for arg in "$@"
+do
+	if [[ "${arg}" == '--verbose' || "${arg}" == '-v' ]]
+	then
+		verbose=1
+		shift
+	else
+		# End of options: drop an explicit '--' so that a later '-v' stays a positional argument.
+		[[ "${arg}" != '--' ]] || shift
+		break
+	fi
+done
+
 {
 	for arg in "$@"
 	do
diff --git a/website-extract-social-media b/website-extract-social-media
index cb05719..96afcff 100755
--- a/website-extract-social-media
+++ b/website-extract-social-media
@@ -1,6 +1,9 @@
 #!/bin/bash
+function verbose_echo { if [[ "${verbose}" ]]; then echo "$@"; fi; }
+
 function fetch_n_extract {
 	local url="$1"
+	verbose_echo "Fetching ${url}" >&2
 	{
 		curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
 		  grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
@@ -52,12 +55,17 @@ function fetch_n_extract {
 
 # Parse options
 printInputUrl=
+verbose=
 while [[ $# -gt 0 ]]
 do
 	if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]]
 	then
 		printInputUrl=true
 		shift
+	elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
+	then
+		verbose=1  # any non-empty value makes verbose_echo print
+		shift
 	elif [[ "$1" == '--' ]]
 	then
 		# End of options
diff --git a/wiki-recursive-extract b/wiki-recursive-extract
index 24a7dc5..d6e0c9c 100755
--- a/wiki-recursive-extract
+++ b/wiki-recursive-extract
@@ -4,6 +4,31 @@
 # Everything else is run through website-extract-social-media.
 # This is done recursively until no new links are discovered anymore.
 
+# Accept only -v/--verbose. Any other argument is reported on stderr and aborts with status 1.
+verbose=
+until [[ $# -eq 0 ]]
+do
+	if [[ "$1" != '--verbose' && "$1" != '-v' ]]
+	then
+		echo "Unknown option: $1" >&2
+		exit 1
+	fi
+	verbose='--verbose'
+	shift
+done
+
+# Print the arguments with echo unless verbosity is switched off.
+function verbose_echo {
+	# An empty ${verbose} means quiet mode, which succeeds without printing anything.
+	[[ -n "${verbose}" ]] || return 0
+	echo "$@"
+}
+
+# Run "$@" and prefix every stderr line with "[name] ", name being the basename of $1.
+function stderr_annotate {
+	"$@" 2> >(while IFS= read -r line || [[ -n "${line}" ]]; do printf '[%s] %s\n' "${1##*/}" "${line}"; done >&2)
+}
+
 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 declare -A sectionUrls
 while read -r line
@@ -11,6 +36,7 @@ do
 	echo "${line}"
 	if [[ "${line}" == '=='* ]]
 	then
+		verbose_echo "${line}" >&2
 		unset sectionUrls
 		declare -A sectionUrls
 	fi
@@ -31,11 +57,9 @@ do
 
 			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
 			then
-				echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
-				mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
+				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
 			else
-				echo "Calling website-extract-social-media on ${curUrl}" >&2
-				mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
+				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
 			fi
 
 			for outUrl in "${outUrls[@]}"