@@ -1,7 +1,15 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Given social media links on stdin or as args, this extracts the link in the profile description, if any. | # Given social media links on stdin or as args, this extracts the link in the profile description, if any. | ||||
# Print the arguments as a single line, but only when verbose mode is on.
# Globals:   verbose (read) - any non-empty value enables output
# Arguments: the words of the message
# Outputs:   the message on stdout (callers redirect it where they need it)
function verbose_echo {
    if [[ "${verbose}" ]]
    then
        # printf instead of echo: echo would treat a leading -n / -e / -E
        # word as an option and drop it from the message.
        local IFS=' '
        printf '%s\n' "$*"
    fi
}
# Download a URL with curl and write the response to stdout.
# Arguments: $1 - the URL to retrieve
# Outputs:   response body on stdout; a "Fetching <url>" note on stderr
#            (emitted through verbose_echo, so only in verbose mode)
function fetch {
    local target="$1"
    # Sent as the User-Agent header via curl -A.
    local browser_ua='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

    verbose_echo "Fetching ${target}" >&2
    # -s: no progress meter, -L: follow redirects.
    curl -sL -A "${browser_ua}" "${target}"
}
@@ -33,6 +41,22 @@ function fetch_n_extract { | |||||
fi | fi | ||||
} | } | ||||
# Parse leading command-line options.
#   -v, --verbose   enable verbose_echo output
#   --              explicit end of options; nothing after it is treated
#                   as an option, even if it looks like one
# Parsing also stops at the first argument that is not a recognised option.
# Consumed options are shifted away, so "$@" afterwards holds only the
# remaining (non-option) arguments.
verbose=
while [[ $# -gt 0 ]]
do
    case "$1" in
        --verbose|-v)
            verbose=1
            shift
            ;;
        --)
            # Drop the separator and stop: what follows is not an option.
            shift
            break
            ;;
        *)
            # Assume end of options
            break
            ;;
    esac
done
{ | { | ||||
for arg in "$@" | for arg in "$@" | ||||
do | do | ||||
@@ -1,6 +1,9 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Print the arguments as a single line when verbose mode is on (the global
# "verbose" is non-empty); otherwise print nothing.
function verbose_echo {
    if [[ "${verbose}" ]]; then
        # printf instead of echo: echo would treat a leading -n / -e / -E
        # word as an option and drop it from the message.
        local IFS=' '
        printf '%s\n' "$*"
    fi
}
function fetch_n_extract { | function fetch_n_extract { | ||||
local url="$1" | local url="$1" | ||||
verbose_echo "Fetching ${url}" >&2 | |||||
{ | { | ||||
curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \ | curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \ | ||||
grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \ | grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \ | ||||
@@ -52,12 +55,17 @@ function fetch_n_extract { | |||||
# Parse options | # Parse options | ||||
printInputUrl= | printInputUrl= | ||||
verbose= | |||||
while [[ $# -gt 0 ]] | while [[ $# -gt 0 ]] | ||||
do | do | ||||
if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]] | if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]] | ||||
then | then | ||||
printInputUrl=true | printInputUrl=true | ||||
shift | shift | ||||
elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
then
    # -v is the short spelling of --verbose (it was previously compared
    # against a bare 'v', so -v was never recognised).
    verbose=1
    shift
elif [[ "$1" == '--' ]] | elif [[ "$1" == '--' ]] | ||||
then | then | ||||
# End of options | # End of options | ||||
@@ -4,6 +4,31 @@ | |||||
# Everything else is run through website-extract-social-media. | # Everything else is run through website-extract-social-media. | ||||
# This is done recursively until no new links are discovered anymore. | # This is done recursively until no new links are discovered anymore. | ||||
# Command-line handling. The only recognised flag is -v / --verbose; any
# other argument is reported on stderr and aborts the script with status 1.
# "verbose" ends up either empty or holding the literal string '--verbose'.
verbose=
while (( $# > 0 ))
do
    case "$1" in
        --verbose|-v)
            verbose='--verbose'
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
    shift
done
# Print the arguments as a single line, but only when verbose mode is on.
# Globals:   verbose (read) - any non-empty value enables output
# Arguments: the words of the message
# Outputs:   the message on stdout (callers redirect it where they need it)
function verbose_echo {
    if [[ "${verbose}" ]]
    then
        # printf instead of echo: echo would treat a leading -n / -e / -E
        # word as an option and drop it from the message.
        local IFS=' '
        printf '%s\n' "$*"
    fi
}
# Run a command and prefix every line it writes to stderr with "[<name>] ",
# where <name> is the command's first word with any directory part removed.
# Arguments: $1 - command to run; $2.. - its arguments
# Outputs:   the command's stdout unchanged; the prefixed lines on stderr
# Returns:   the exit status of the command
function stderr_annotate {
    # local, so the caller's own "name" variable is not overwritten.
    local name="${1##*/}"
    # The annotating loop runs in a process substitution, i.e. concurrently
    # with the command. IFS= keeps leading/trailing whitespace of each line;
    # the extra -n test keeps a final line that has no trailing newline.
    "$@" 2> >(
        while IFS= read -r line || [[ -n "${line}" ]]
        do
            printf '[%s] %s\n' "${name}" "${line}"
        done >&2
    )
}
scriptpath="$(cd "$(dirname "$0")"; pwd -P)" | scriptpath="$(cd "$(dirname "$0")"; pwd -P)" | ||||
declare -A sectionUrls | declare -A sectionUrls | ||||
while read -r line | while read -r line | ||||
@@ -11,6 +36,7 @@ do | |||||
echo "${line}" | echo "${line}" | ||||
if [[ "${line}" == '=='* ]] | if [[ "${line}" == '=='* ]] | ||||
then | then | ||||
verbose_echo "${line}" >&2 | |||||
unset sectionUrls | unset sectionUrls | ||||
declare -A sectionUrls | declare -A sectionUrls | ||||
fi | fi | ||||
@@ -31,11 +57,9 @@ do | |||||
if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}" | if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}" | ||||
then | then | ||||
echo "Calling social-media-extract-profile-link on ${curUrl}" >&2 | |||||
mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:)) | |||||
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:)) | |||||
else | else | ||||
echo "Calling website-extract-social-media on ${curUrl}" >&2 | |||||
mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }') | |||||
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:)) | |||||
fi | fi | ||||
for outUrl in "${outUrls[@]}" | for outUrl in "${outUrls[@]}" | ||||