Browse Source

Add script for automatic social media discovery

master
JustAnotherArchivist 1 year ago
parent
commit
e6008eb971
2 changed files with 111 additions and 0 deletions
  1. +91
    -0
      website-extract-social-media
  2. +20
    -0
      wiki-website-extract-social-media

+ 91
- 0
website-extract-social-media View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Fetch the page at the given URL and print any social media links found
# in it (Facebook, Flickr, Instagram, Telegram, Twitter, VKontakte,
# YouTube), normalised to canonical https:// form and de-duplicated in
# order of first appearance.
# Arguments: $1 - URL of the web page to scan
# Outputs:   one normalised profile URL per line on stdout
function fetch_n_extract {
local url="$1"
{
# Fetch with a desktop-browser User-Agent, following redirects, and
# keep only lines that mention one of the target platforms; tee then
# fans those lines out to one extractor per platform via the >(...)
# process substitutions below.
curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
tee \
>(
# Facebook: prepend https://www. and drop known non-profile paths
# (tracking pixel, plugins, sharer links).
grep -Poi 'facebook\.com/[^/ <"'"'"']+' | \
sed 's,^,https://www.,' | \
grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr\?' -e '^https://www\.facebook\.com/plugins$' | \
grep -Pvi '^https://www\.facebook\.com/sharer(\.php\?|\?|$)'
) \
>(
# Flickr: photo-stream links only
grep -Poi 'flickr\.com/photos/[^/ <"'"'"']+' | \
sed 's,^,https://www.,'
) \
>(
# Instagram
grep -Poi 'instagram\.com/[^/ <"'"'"']+' | \
sed 's,^,https://www.,'
) \
>(
# Telegram: strip any scheme/www. prefix, then force https://t.me/...
grep -Poi '//(www\.)?t\.me/[^/ <"'"'"']+' | \
sed 's,^//,,; s,^www\.,,; s,^,https://,'
) \
>(
# Twitter: drop home/widget/share links, then strip ref_src tracking
# parameters (and a resulting dangling '?') from what remains.
grep -Poi 'twitter\.com/[^/ <"'"'"']+' | \
sed 's,^,https://,' | \
grep -vi -e '^https://twitter\.com/home\?' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share\?' | \
sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,'
) \
>(
# VKontakte
grep -Poi 'vk\.com/[^/ <"'"'"']+' | \
sed 's,^,https://,'
) \
>(
# YouTube: user/channel/embed paths on youtube.com plus youtu.be short
# links; youtube.com gets the www. prefix, youtu.be does not.
grep -Poi '(youtube\.com/((user|channel|embed)/)?[^/ <"'"'"']+|youtu\.be/[^/ <"'"'"']+)' | \
awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }'
) \
>/dev/null
# tee's own copy is discarded; the >(...) extractors inherit this brace
# group's stdout, which feeds awk for order-preserving de-duplication
# (first occurrence wins).
# NOTE(review): the extractors run in parallel, so relative output order
# across platforms is not deterministic and may race with the group's
# exit.
} | awk '!seen[$0]++'
}

# Parse command-line options; everything after the options is input URLs.
printInputUrl=
while (( $# > 0 ))
do
  case "$1" in
    --print-input-urls|--print-input-url)
      # Echo each input URL ahead of its extracted links
      printInputUrl=true
      shift
      ;;
    --)
      # Explicit end of options
      shift
      break
      ;;
    --*)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
    *)
      # First non-option argument: treat the rest as input
      break
      ;;
  esac
done

# Build the list of input URLs — positional arguments first, then, when
# stdin is not a terminal, whatever is piped in (one URL per line) — and
# run the extractor on each.
{
  for arg in "$@"
  do
    echo "${arg}"
  done

  if [ ! -t 0 ]
  then
    cat
  fi
} | while read -r url || [[ -n "${url}" ]]   # '||' keeps a final line that lacks a trailing newline
do
  if [[ "${printInputUrl}" ]]
  then
    # --print-input-urls: echo each input URL ahead of its results
    echo "${url}"
  fi
  fetch_n_extract "${url}"
done

+ 20
- 0
wiki-website-extract-social-media View File

@@ -0,0 +1,20 @@
#!/bin/bash
# Reads a wiki page in the new-style viewer format from stdin, runs everything that
# looks like a website through website-extract-social-media, and formats the output
# accordingly: every input line is echoed back unchanged, and discovered social
# media links are inserted after their website line as additional '* ' bullets.

# Physical directory containing this script, so the helper is found regardless
# of the caller's working directory.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

while read -r line || [[ -n "${line}" ]]   # '||' keeps a final line that lacks a trailing newline
do
  echo "${line}"
  if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
  then
    # Strip the leading '* ' bullet marker
    url="${line:2}"
    # Drop an optional ' | comment' suffix
    if [[ "${url}" == *' | '* ]]
    then
      url="${url%% | *}"
    fi
    # Skip entries that are already social media links themselves
    if ! grep -Pq '//(www\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${url}"
    then
      # '< <(:)' gives the helper an empty stdin so it cannot consume the wiki
      # page we are reading; the pipeline then sorts the discovered links by
      # hostname (scheme and www. stripped for the sort key) and re-emits them
      # as '* ' bullets.
      "${scriptpath}/website-extract-social-media" "${url}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }' | sed 's,^,* ,'
    fi
  fi
done

Loading…
Cancel
Save