Browse Source

One URL normalisation script to rule them all

Consolidate social media profile, YouTube, and (new) generic web page URL normalisation into one script
master
JustAnotherArchivist 4 years ago
parent
commit
dc4efcfbfb
3 changed files with 133 additions and 111 deletions
  1. +0
    -74
      social-media-normalise
  2. +133
    -0
      url-normalise
  3. +0
    -37
      youtube-normalise

+ 0
- 74
social-media-normalise View File

@@ -1,74 +0,0 @@
#!/bin/bash
# Read a list of URLs from stdin, replace suitable social media URLs with correctly capitalised version
# Input lines may optionally be wiki-style bullets ('* URL'); the bullet is preserved on output.
# URLs that cannot be normalised are echoed unchanged and summarised on stderr at the end.
errorUrls=()
while read -r url
do
# Peel off a leading '* ' wiki bullet so it can be re-attached to the output line.
if [[ "${url}" == '* '* ]]
then
prefix="${url::2}"
url="${url:2}"
else
prefix=""
fi

# Facebook page/profile URLs: www/m/language (xx-xx) subdomains, plain pages,
# /pages/<name>/<id>, /pg/<name>, and profile.php?id=<n>.
# NOTE(review): the dots in '(...).)?facebook.com' are unescaped and so match any
# character (e.g. 'facebookXcom') — presumably unintended; confirm before relying on it.
if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
then
# profile.php needs its id= parameter kept; all other URLs lose the query string.
if [[ "${url}" == *profile.php* ]]
then
url="${url%%&*}"
else
url="${url%%\?*}"
fi
# Fetch the page with a browser-like UA; the 'Home' tab link carries the
# canonical, correctly capitalised page name.
page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${user}" ]]
then
echo "${prefix}https://www.facebook.com/${user}/"
continue
else
if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
then
# Profile page which is only visible when logged in
# Extract canonical URL
user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
if [[ "${user}" ]]
then
echo "${prefix}${user}"
continue
fi
fi
fi
# Neither extraction worked: remember the failure and pass the URL through.
errorUrls+=("${url}")
echo "${prefix}${url}"
# Twitter profile URLs: strip query string and trailing slash, then scrape the
# profile header for the canonically capitalised screen name.
elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
then
url="${url%%\?*}"
url="${url%/}"
unnormalisedUser="${url##*/}"
user="$(curl -sL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
if [[ "${user}" ]]
then
echo "${prefix}https://twitter.com/${user}"
else
errorUrls+=("${url}")
echo "${prefix}${url}"
fi
# Instagram usernames are case-insensitive and canonically lowercase, so no
# network request is needed: lowercase locally (${user,,}) and normalise the host.
elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
then
user="${url%/}"
user="${user##*/}"
echo "${prefix}https://www.instagram.com/${user,,}/"
else
# Anything else passes through untouched.
echo "${prefix}${url}"
fi
done

# Report every URL whose normalisation failed, on stderr so stdout stays clean.
if [[ ${#errorUrls[@]} -gt 0 ]]
then
echo "" >&2
echo "Failed to process URLs:" >&2
for errorUrl in "${errorUrls[@]}"
do
echo "${errorUrl}" >&2
done
fi

+ 133
- 0
url-normalise View File

@@ -0,0 +1,133 @@
#!/bin/bash
# Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)

# Defaults: follow redirects for generic URLs, no verbose logging.
otherCurlRedirectOpt='-L'
verbose=
while [[ $# -gt 0 ]]
do
	case "$1" in
		--other-no-redirects)
			otherCurlRedirectOpt=
			;;
		--verbose|-v)
			verbose=1
			;;
		*)
			echo "Unknown option: $1" >&2
			exit 1
			;;
	esac
	shift
done

# Emit the given arguments only when --verbose was requested; always returns 0.
function verbose_echo {
	case "${verbose}" in
		'') ;;
		*) echo "$@" ;;
	esac
}

# Browser-like UA; some of the scraped sites serve different markup to curl's default UA.
userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

while read -r line
do
# Pass through anything that is not a bare URL or a '* URL' wiki bullet.
if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
then
echo "${line}"
continue
fi

# Peel off a leading '* ' wiki bullet so it can be re-attached to the output line.
if [[ "${line}" == '* '* ]]
then
prefix="${line::2}"
url="${line:2}"
else
prefix=""
url="${line}"
fi

# Facebook page/profile URLs: www/m/language (xx-xx) subdomains, plain pages,
# /pages/<name>/<id>, /pg/<name>, and profile.php?id=<n>.
# Fix: the host dots are now escaped; previously 'facebook.com' matched any
# character in place of the dots (e.g. 'facebookXcom').
if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
then
verbose_echo "Normalising Facebook URL: ${url}" >&2
# profile.php needs its id= parameter kept; all other URLs lose the query string.
if [[ "${url}" == *profile.php* ]]
then
url="${url%%&*}"
else
url="${url%%\?*}"
fi
page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
# The 'Home' tab link carries the canonical, correctly capitalised page name.
user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${user}" ]]
then
echo "${prefix}https://www.facebook.com/${user}/"
continue
elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
then
# Profile page which is only visible when logged in
# Extract canonical URL
user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
if [[ "${user}" ]]
then
echo "${prefix}${user}"
continue
fi
fi
# Neither extraction worked: report on stderr and pass the URL through.
echo "Failed to normalise Facebook URL: ${url}" >&2
echo "${prefix}${url}"
# Twitter profile URLs: strip query string and trailing slash, then scrape the
# profile header for the canonically capitalised screen name.
elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
then
verbose_echo "Normalising Twitter URL: ${url}" >&2
url="${url%%\?*}"
url="${url%/}"
unnormalisedUser="${url##*/}"
user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
if [[ "${user}" ]]
then
echo "${prefix}https://twitter.com/${user}"
else
echo "Failed to normalise Twitter URL: ${url}" >&2
echo "${prefix}${url}"
fi
# Instagram usernames are case-insensitive and canonically lowercase, so no
# network request is needed: lowercase locally (${user,,}) and normalise the host.
elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
then
verbose_echo "Normalising Instagram URL: ${url}" >&2
user="${url%/}"
user="${user##*/}"
echo "${prefix}https://www.instagram.com/${user,,}/"
# YouTube URLs: fetch with disable_polymer=1 (old layout, which exposes the
# canonical <link itemprop="url"> tags) and extract the user/ or channel/ path.
elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
then
verbose_echo "Normalising YouTube URL: ${url}" >&2
if [[ "${url}" == *'?'* ]]
then
rurl="${url}&disable_polymer=1"
else
rurl="${url}?disable_polymer=1"
fi
page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
if [[ "${canonical}" ]]
then
echo "${prefix}https://www.youtube.com/${canonical}"
else
canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
if [[ "${canonical}" ]]
then
echo "${prefix}https://www.youtube.com/${canonical}"
else
echo "Failed to normalise YouTube URL: ${url}" >&2
echo "${prefix}${url}"
fi
fi
else
# Generic URL: take curl's post-redirect effective URL (redirect following is
# controlled by --other-no-redirects; the variable is intentionally unquoted so
# an empty value expands to no argument).
verbose_echo "Normalising other URL: ${url}" >&2
canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
if [[ "${canonical}" ]]
then
echo "${prefix}${canonical}"
else
echo "Failed to normalise other URL: ${url}" >&2
echo "${prefix}${url}"
fi
fi
done

+ 0
- 37
youtube-normalise View File

@@ -1,37 +0,0 @@
#!/bin/bash
# Read URLs from stdin (optionally prefixed with a '* ' wiki bullet) and replace
# YouTube URLs with the canonical https://www.youtube.com/user/... or
# .../channel/... form; everything else passes through unchanged.
while read -r url
do
	prefix=''
	if [[ "${url}" == '* '* ]]
	then
		prefix='* '
		url="${url:2}"
	fi

	# Non-YouTube lines pass straight through.
	if ! [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
	then
		echo "${prefix}${url}"
		continue
	fi

	# Append disable_polymer=1 (old layout, which exposes the canonical
	# <link itemprop="url"> tags), using '&' if a query string already exists.
	sep='?'
	if [[ "${url}" == *'?'* ]]
	then
		sep='&'
	fi
	page="$(curl -4sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "${url}${sep}disable_polymer=1")"

	# Prefer a user/ canonical URL; fall back to channel/.
	canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
	if [[ -z "${canonical}" ]]
	then
		canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
	fi

	if [[ "${canonical}" ]]
	then
		echo "${prefix}https://www.youtube.com/${canonical}"
	else
		echo "${prefix}${url}"
	fi
done

Loading…
Cancel
Save