From 6dc711c54ea74da17126f6f012c032055140a979 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 30 Apr 2019 15:23:15 +0000
Subject: [PATCH] Further helper scripts for snscrape: normalising usernames and extracting them from a list of URLs

---
Note: both scripts were edited by hand after this patch was exported, so the
blob ids on the "index" lines no longer match the file contents.

 snscrape-extract-usernames | 38 ++++++++++++++++++++++++++++
 snscrape-normalise         | 61 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100755 snscrape-extract-usernames
 create mode 100755 snscrape-normalise

diff --git a/snscrape-extract-usernames b/snscrape-extract-usernames
new file mode 100755
index 0000000..a81cb41
--- /dev/null
+++ b/snscrape-extract-usernames
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Extract from stdin social media usernames suitable for snscrape, grouped by service.
+#
+# Input:  text on stdin containing Facebook, Instagram, or Twitter URLs.
+# Output: one line per service on stdout, e.g. "twitter: user1 user2".
+#
+# Facebook and Instagram URLs are only matched with a "www." host and a slash
+# after the username; Twitter URLs are only matched without a "www." prefix.
+grep -Po '(https?://www\.\K(facebook|instagram)\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
+  # Turn "service.com/user" into "service user" so that awk sees two fields.
+  sed 's,\.com/, ,' |
+  # Sorting makes all entries of one service adjacent, which awk relies on.
+  sort |
+  awk '
+    BEGIN {
+      prev1 = "";
+    }
+
+    # First entry of a new service: close the previous line, start a new one.
+    ($1 != prev1) {
+      if (prev1 != "") {
+        print "";
+      }
+      printf "%s:", $1;
+      prev1 = $1;
+    }
+
+    # Always true after the rule above; appends the username to the line.
+    ($1 == prev1) {
+      printf " %s", $2;
+    }
+
+    # Terminate the last line, but emit nothing if there was no input at all.
+    END {
+      if (prev1 != "") {
+        print "";
+      }
+    }'
diff --git a/snscrape-normalise b/snscrape-normalise
new file mode 100755
index 0000000..cdb05c6
--- /dev/null
+++ b/snscrape-normalise
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Read a list of URLs from stdin, replace suitable social media URLs with correctly capitalised version.
+#
+# Output: one line per input line on stdout, in input order. Lines that match
+# none of the patterns below, or whose lookup fails, are printed unchanged.
+# URLs whose lookup failed are additionally listed on stderr at the end.
+#
+# Requires curl, grep with -P support, and snscrape.
+errorUrls=()
+# read without IFS= trims surrounding whitespace from each line; the
+# "|| [[ -n ... ]]" part also handles a last line without trailing newline.
+while read -r url || [[ -n "${url}" ]]
+do
+  if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z])\.facebook\.com/[^/]+/?$ ]]
+  then
+    # Fetch the page from the www host and cut the username out of the first
+    # path component of the link inside the "tab_home" element.
+    # NOTE(review): the markup parts of both grep patterns ("<div\s[^>",
+    # "</div>", "<a\s[^>") were restored by hand; confirm against upstream.
+    user="$(
+      curl -s \
+        -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' \
+        -H 'Accept-Language: en-US,en;q=0.5' \
+        "https://www.${url#*.}" |
+        grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' |
+        grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+'
+    )"
+    if [[ "${user}" ]]
+    then
+      printf '%s\n' "https://www.facebook.com/${user}/"
+    else
+      errorUrls+=("${url}")
+      printf '%s\n' "${url}"
+    fi
+  elif [[ "${url}" =~ ^https?://twitter\.com/[^/]+$ ]]
+  then
+    # Ask snscrape for a single result and take the username from the
+    # twitter.com URL at the start of its output line.
+    user="$(snscrape --max-results 1 twitter-user "${url##*/}" | grep -Po '^https?://twitter\.com/\K[^/]+')"
+    if [[ "${user}" ]]
+    then
+      printf '%s\n' "https://twitter.com/${user}"
+    else
+      errorUrls+=("${url}")
+      printf '%s\n' "${url}"
+    fi
+  elif [[ "${url}" =~ ^https?://www\.instagram\.com/[^/]+/$ ]]
+  then
+    # No lookup here: the URL is simply lowercased.
+    printf '%s\n' "${url,,}"
+  else
+    printf '%s\n' "${url}"
+  fi
+done
+
+if [[ ${#errorUrls[@]} -gt 0 ]]
+then
+  printf '\n' >&2
+  printf '%s\n' 'Failed to process URLs:' >&2
+  printf '%s\n' "${errorUrls[@]}" >&2
+fi