Browse Source

Further helper scripts for snscrape: normalising usernames and extracting them from a list of URLs

master
JustAnotherArchivist 4 years ago
parent
commit
6dc711c54e
2 changed files with 67 additions and 0 deletions
  1. +25
    -0
      snscrape-extract-usernames
  2. +42
    -0
      snscrape-normalise

+ 25
- 0
snscrape-extract-usernames View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Extract from stdin social media usernames suitable for snscrape, grouped by service
# Output format: one line per service, e.g. "twitter: user1 user2"
grep -Po '(https?://www\.\K(facebook|instagram)\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
sed 's,\.com/, ,' |
sort |
awk '
{
	# Starting a new service group: terminate the previous line (if any)
	# and emit the new group header.
	if ($1 != service) {
		if (service != "") {
			print "";
		}
		printf "%s:", $1;
		service = $1;
	}
	# Append this record'"'"'s username to the current group line.
	printf " %s", $2;
}

END {
	# Terminate the final group line.
	print "";
}'

+ 42
- 0
snscrape-normalise View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Read a list of URLs from stdin and write them back out with suitable social
# media profile URLs replaced by their correctly capitalised canonical form:
# - Facebook: profile page fetched via curl, canonical username scraped from it
# - Twitter: canonical capitalisation taken from snscrape's reported tweet URLs
# - Instagram: usernames are case-insensitive, so the URL is simply lowercased
# Any URL whose lookup fails is passed through unchanged and reported on stderr
# once all input has been processed.
errorUrls=()
# '|| [[ -n "${url}" ]]' also processes a final line lacking a trailing newline
while read -r url || [[ -n "${url}" ]]
do
	# Fix: dots in the host pattern are now escaped; previously '.' matched any
	# character, so e.g. 'wwwXfacebookYcom' would have been accepted too.
	if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z])\.facebook\.com/[^/]+/?$ ]]
	then
		# Fetch the profile page (forcing the www. host) and extract the
		# canonical username from the href of the "tab_home" link.
		user="$(curl -s -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.${url#*.}" | grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
		if [[ "${user}" ]]
		then
			echo "https://www.facebook.com/${user}/"
		else
			errorUrls+=("${url}")
			echo "${url}"
		fi
	elif [[ "${url}" =~ ^https?://twitter\.com/[^/]+$ ]]
	then
		# Ask snscrape for a single result; its output URL carries the
		# canonically capitalised username.
		user="$(snscrape --max-results 1 twitter-user "${url##*/}" | grep -Po '^https?://twitter\.com/\K[^/]+')"
		if [[ "${user}" ]]
		then
			echo "https://twitter.com/${user}"
		else
			errorUrls+=("${url}")
			echo "${url}"
		fi
	elif [[ "${url}" =~ ^https?://www\.instagram\.com/[^/]+/$ ]]
	then
		# Instagram usernames are case-insensitive; canonical form is lowercase.
		echo "${url,,}"
	else
		# Not a recognised profile URL; pass it through untouched.
		echo "${url}"
	fi
done

# Report failures on stderr so stdout remains a clean URL list.
if [[ ${#errorUrls[@]} -gt 0 ]]
then
	echo "" >&2
	echo "Failed to process URLs:" >&2
	for errorUrl in "${errorUrls[@]}"
	do
		echo "${errorUrl}" >&2
	done
fi

Loading…
Cancel
Save