#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore.
# The output is further fed through url-normalise before and during processing to avoid equivalent
# but slightly different duplicates, and the output is deduplicated within each section at the end.

# Parse options; the only recognised option is --verbose/-v, which is also passed on to the helper scripts.
verbose=
while [[ $# -gt 0 ]]
do
	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
	then
		verbose='--verbose'
	else
		echo "Unknown option: $1" >&2
		exit 1
	fi
	shift
done

# Echo the arguments to stdout, but only in verbose mode.
function verbose_echo {
	if [[ "${verbose}" ]]
	then
		echo "$@"
	fi
}

# Run a command with every line of its stderr prefixed by "[name]";
# if name is empty, the basename of the command is used instead.
function stderr_annotate {
	name="$1"
	shift
	if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}

scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls

stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
	echo "${line}"

	if [[ "${line}" == '=='* ]]
	then
		# Section header: log it in verbose mode and reset the per-section URL set.
		verbose_echo "${line}" >&2
		unset sectionUrls
		declare -A sectionUrls
	fi

	if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
	then
		# Strip the '* ' bullet prefix and any ' | annotation' suffix.
		url="${line:2}"
		if [[ "${url}" == *' | '* ]]
		then
			url="${url%% | *}"
		fi

		if [[ "${sectionUrls[${url}]}" ]]
		then
			# Processed already, skip
			continue
		fi
		sectionUrls["${url}"]=1

		# Breadth-first expansion: pop URLs off the front of the queue until no new links are discovered anymore.
		toProcess=("${url}")
		while [[ ${#toProcess[@]} -gt 0 ]]
		do
			curUrl="${toProcess[0]}"
			toProcess=("${toProcess[@]:1}")

			# '< <(:)' feeds the helper an empty stdin so it cannot consume the remaining wiki page from the outer pipe.
			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
			then
				mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
			else
				mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
			fi

			for outUrl in "${outUrls[@]}"
			do
				if [[ "${sectionUrls[${outUrl}]}" ]]
				then
					# The discovered URL was processed already, skip it entirely
					continue
				else
					# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
					toProcess+=("${outUrl}")
					sectionUrls["${outUrl}"]=1
					echo "* ${outUrl}"
				fi
			done
		done
	fi
done |
	# Final per-section dedup: non-bullet lines always pass through, bullet lines are printed once per
	# section, and '==' headers reset the seen set. mawk's '-W interactive' keeps the output line-buffered
	# so results stream out immediately.
	mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'
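
# Usage sketch (the script and page file names below are hypothetical; the
# real dependencies are the helper scripts url-normalise,
# social-media-extract-profile-link and website-extract-social-media, which
# must live in the same directory as this script):
#
#   ./wiki-recursive-extract --verbose < page.txt > page-with-links.txt
#
# page.txt holds '== ... ==' section headers and bullets of the form
# '* https://example.org/ | optional annotation'; the output repeats each
# input line and appends one '* <url>' bullet per newly discovered link,
# deduplicated within its section.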