Browse Source

Add verbosity options, and annotate stderr on wiki-recursive-extract

master
JustAnotherArchivist 4 years ago
parent
commit
0f13a1fadd
3 changed files with 60 additions and 4 deletions
  1. +24
    -0
      social-media-extract-profile-link
  2. +8
    -0
      website-extract-social-media
  3. +28
    -4
      wiki-recursive-extract

+ 24
- 0
social-media-extract-profile-link View File

@@ -1,7 +1,15 @@
#!/bin/bash
# Given social media links on stdin or as args, this extracts the link in the profile description, if any.

# Print the arguments with echo, but only when verbose mode is enabled,
# i.e. when the global ${verbose} is non-empty. Otherwise print nothing
# and return success.
function verbose_echo {
  [[ -n "${verbose}" ]] || return 0
  echo "$@"
}

# Download the URL given as $1 and write the response to stdout.
# curl runs silently, follows redirects and sends a desktop Firefox
# user agent. In verbose mode the URL is announced on stderr first.
function fetch {
  local target="$1"
  local agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

  verbose_echo "Fetching ${target}" >&2
  curl --silent --location --user-agent "${agent}" "${target}"
}

@@ -33,6 +41,22 @@ function fetch_n_extract {
fi
}

# Parse leading command-line options; whatever remains in "$@" afterwards
# is left for the code below to process.
#   -v, --verbose   enable verbose_echo output
#   --              explicit end of options
verbose=
while [[ $# -gt 0 ]]
do
  if [[ "$1" == '--verbose' || "$1" == '-v' ]]
  then
    verbose=1
    shift
  elif [[ "$1" == '--' ]]
  then
    # Consume the separator and stop parsing, so that anything after it
    # (even something that looks like a flag) stays a positional argument.
    shift
    break
  else
    # First non-option argument: assume end of options
    break
  fi
done

{
for arg in "$@"
do


+ 8
- 0
website-extract-social-media View File

@@ -1,6 +1,9 @@
#!/bin/bash
# Echo the arguments only if the global ${verbose} is non-empty;
# stay silent (and succeed) otherwise.
function verbose_echo {
  if [[ -z "${verbose}" ]]
  then
    return 0
  fi
  echo "$@"
}

function fetch_n_extract {
local url="$1"
verbose_echo "Fetching ${url}" >&2
{
curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
@@ -52,12 +55,17 @@ function fetch_n_extract {

# Parse options
printInputUrl=
verbose=
while [[ $# -gt 0 ]]
do
if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]]
then
printInputUrl=true
shift
elif [[ "$1" == '--verbose' || "$1" == '-v' ]]  # short form is '-v'; a bare 'v' would only match the literal argument "v"
then
verbose=1
shift
elif [[ "$1" == '--' ]]
then
# End of options


+ 28
- 4
wiki-recursive-extract View File

@@ -4,6 +4,31 @@
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore.

# Parse command-line options. Only the verbosity flag is accepted; any
# other argument aborts the script with an error on stderr.
# ${verbose} ends up either empty or holding the literal '--verbose'.
verbose=
while (( $# > 0 ))
do
  case "$1" in
    --verbose|-v)
      verbose='--verbose'
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
  shift
done

# Echo the arguments when the global ${verbose} is non-empty; otherwise
# do nothing and return success.
function verbose_echo {
  case "${verbose}" in
    '') ;;
    *) echo "$@" ;;
  esac
}

# Run a command with every line of its stderr prefixed by "[<name>] ",
# where <name> is the basename of the command.
# Arguments: $1 - command to run; remaining arguments are passed to it.
# Outputs:   the command's stdout unchanged; its stderr, annotated, on stderr.
# Returns:   the exit status of the command.
function stderr_annotate {
  # local: keep the helper from clobbering a global ${name}.
  local name="${1##*/}"
  "$@" 2> >(
    # IFS= preserves leading/trailing whitespace of each line; the || clause
    # still emits a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "${line}" ]]
    do
      printf '[%s] %s\n' "${name}" "${line}"
    done >&2
  )
}

scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls
while read -r line
@@ -11,6 +36,7 @@ do
echo "${line}"
if [[ "${line}" == '=='* ]]
then
verbose_echo "${line}" >&2
unset sectionUrls
declare -A sectionUrls
fi
@@ -31,11 +57,9 @@ do

if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
else
echo "Calling website-extract-social-media on ${curUrl}" >&2
mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
fi

for outUrl in "${outUrls[@]}"


Loading…
Cancel
Save