Browse Source

Handle more Facebook URLs

master
JustAnotherArchivist 5 years ago
parent
commit
66ec0c93c4
1 changed file with 23 additions and 4 deletions
  1. +23
    -4
      snscrape-normalise

+ 23
- 4
snscrape-normalise View File

@@ -3,16 +3,35 @@
errorUrls=()
while read -r url
do
if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z]).facebook.com/[^/]+/?$ ]]
if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z]).facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|profile\.php\?id=[0-9]+(&|$)) ]]
then
user="$(curl -s -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.${url#*.}" | grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${url}" == *profile.php* ]]
then
url="${url%%&*}"
else
url="${url%%\?*}"
fi
page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.${url#*.}")"
user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${user}" ]]
then
echo "https://www.facebook.com/${user}/"
continue
else
errorUrls+=("${url}")
echo "${url}"
if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
then
# Profile page which is only visible when logged in
# Extract canonical URL
user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
if [[ "${user}" ]]
then
echo "${user}"
continue
fi
fi
fi
errorUrls+=("${url}")
echo "${url}"
elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+$ ]]
then
user="$(snscrape --max-results 1 twitter-user "${url##*/}" | grep -Po '^https?://twitter\.com/\K[^/]+')"


Loading…
Cancel
Save