diff --git a/website-extract-social-media b/website-extract-social-media
new file mode 100755
index 0000000..f5233ef
--- /dev/null
+++ b/website-extract-social-media
@@ -0,0 +1,91 @@
+#!/bin/bash
+function fetch_n_extract {
+    local url="$1"
+    {
+        curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
+        grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
+        tee \
+            >(
+                # Facebook
+                grep -Poi 'facebook\.com/[^/ <"'"'"']+' | \
+                sed 's,^,https://www.,' | \
+                grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr\?' -e '^https://www\.facebook\.com/plugins$' | \
+                grep -Pvi '^https://www\.facebook\.com/sharer(\.php\?|\?|$)'
+            ) \
+            >(
+                # Flickr
+                grep -Poi 'flickr\.com/photos/[^/ <"'"'"']+' | \
+                sed 's,^,https://www.,'
+            ) \
+            >(
+                # Instagram
+                grep -Poi 'instagram\.com/[^/ <"'"'"']+' | \
+                sed 's,^,https://www.,'
+            ) \
+            >(
+                # Telegram
+                grep -Poi '//(www\.)?t\.me/[^/ <"'"'"']+' | \
+                sed 's,^//,,; s,^www\.,,; s,^,https://,'
+            ) \
+            >(
+                # Twitter
+                grep -Poi 'twitter\.com/[^/ <"'"'"']+' | \
+                sed 's,^,https://,' | \
+                grep -vi -e '^https://twitter\.com/home\?' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share\?' | \
+                sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,'
+            ) \
+            >(
+                # VKontakte
+                grep -Poi 'vk\.com/[^/ <"'"'"']+' | \
+                sed 's,^,https://,'
+            ) \
+            >(
+                # YouTube
+                grep -Poi '(youtube\.com/((user|channel|embed)/)?[^/ <"'"'"']+|youtu\.be/[^/ <"'"'"']+)' | \
+                awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }'
+            ) \
+            >/dev/null
+    } | awk '!seen[$0]++'
+}
+
+# Parse options
+printInputUrl=
+while [[ $# -gt 0 ]]
+do
+    if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]]
+    then
+        printInputUrl=true
+        shift
+    elif [[ "$1" == '--' ]]
+    then
+        # End of options
+        shift
+        break
+    elif [[ "$1" == '--'* ]]
+    then
+        echo "Unknown option: $1" >&2
+        exit 1
+    else
+        # Assume end of options
+        break
+    fi
+done
+
+{
+    for arg in "$@"
+    do
+        echo "${arg}"
+    done
+
+    if [ ! -t 0 ]
+    then
+        cat
+    fi
+} | while read -r url
+do
+    if [[ "${printInputUrl}" ]]
+    then
+        echo "${url}"
+    fi
+    fetch_n_extract "${url}"
+done
diff --git a/wiki-website-extract-social-media b/wiki-website-extract-social-media
new file mode 100755
index 0000000..ee7598c
--- /dev/null
+++ b/wiki-website-extract-social-media
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Reads a wiki page in the new-style viewer format from stdin, runs everything that looks like a website through website-extract-social-media, and formats the output accordingly
+
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+while read -r line
+do
+    echo "${line}"
+    if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
+    then
+        url="${line:2}"
+        if [[ "${url}" == *' | '* ]]
+        then
+            url="${url%% | *}"
+        fi
+        if ! grep -Pq '//(www\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${url}"
+        then
+            "${scriptpath}/website-extract-social-media" "${url}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }' | sed 's,^,* ,'
+        fi
+    fi
+done
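
Usage sketch (not part of the patch), assuming the two scripts above are saved under the file names shown in the diff and marked executable; example.org and example.net are placeholder domains:

    # URLs can be passed as arguments; --print-input-urls echoes each input URL before its results
    ./website-extract-social-media --print-input-urls 'https://www.example.org/'

    # ...or piped in on stdin, one URL per line
    printf '%s\n' 'https://www.example.org/' 'https://www.example.net/' | ./website-extract-social-media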