#!/bin/bash
# Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
#
# Options:
#   --other-no-redirects  do not pass -L to curl when resolving "other" URLs
#   --verbose, -v         report on stderr what is being done with each URL
#
# An input line is either a URL or '* ' followed by a URL, optionally followed
# by ' | ' and further text. The '* ' prefix and the ' | ...' suffix are kept
# around the normalised URL. Any other line is printed unchanged. When a URL
# cannot be normalised, a message is written to stderr and the input line is
# printed unchanged.
#
# NOTE(review): the copy of this script that was reviewed had lost several
# fragments (four grep patterns and the branches directly after three of
# them). Those places are reconstructed and each one carries its own
# NOTE(review) comment; confirm them against an undamaged copy.

otherCurlRedirectOpt='-L'
verbose=
while [[ $# -gt 0 ]]
do
  case "$1" in
    --other-no-redirects)
      otherCurlRedirectOpt=
      ;;
    --verbose | -v)
      verbose=1
      ;;
    *)
      printf 'Unknown option: %s\n' "$1" >&2
      exit 1
      ;;
  esac
  shift
done

userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

# Print the arguments (joined by spaces) on stdout, but only in verbose mode.
# Callers redirect this to stderr themselves.
function verbose_echo {
  if [[ "${verbose}" ]]
  then
    printf '%s\n' "$*"
  fi
}

# Print $1 followed by a newline.
# printf is used instead of echo so that data such as '-n' or '-e' is printed
# literally rather than being taken as an echo option.
function emit_line {
  printf '%s\n' "$1"
}

# Print URL $1 with its host part lower-cased.
# Only URLs with a '/' after the host are touched; others are printed as is.
function lowercase_domain {
  local url="$1" domain
  if [[ "${url}" =~ ^https?://.*/ ]]
  then
    domain="${url#*://}"
    domain="${domain%%/*}"
    url="${url%%://*}://${domain,,}/${url#*://*/}"
  fi
  printf '%s\n' "${url}"
}

# If $1 is a Facebook login URL whose 'next' parameter is a percent-encoded
# Facebook URL, print that decoded target; otherwise print $1 unchanged.
# URLs containing an encoded newline or NUL are left alone; both hex cases of
# the newline escape are rejected.
function unwrap_facebook_login {
  local url="$1"
  if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com%2F && "${url}" != *'%0A'* && "${url}" != *'%0a'* && "${url}" != *'%00'* ]]
  then
    url="${url##*\?next=}"
    url="${url##*&next=}"
    url="${url%%&*}"
    # Percent-decode by turning every %XX into \xXX and letting %b expand it.
    url="$(printf '%b' "${url//%/\\x}")"
  fi
  printf '%s\n' "${url}"
}

# Succeed if $1 is a Facebook URL of one of the handled shapes: a single path
# segment, pages/NAME/ID, pages/category/CAT/NAME, pg/NAME or profile.php?id=N.
# The dots are escaped so that only facebook.com and its www/m/xx-xx
# subdomains match.
function is_facebook_page_url {
  [[ "$1" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
}

# Handlers. Each takes: $1 original input line, $2 prefix, $3 URL, $4 suffix,
# and prints exactly one output line on stdout.

# Fetch the Facebook page and print the URL built from the name found in it.
function normalise_facebook {
  local line="$1" prefix="$2" url="$3" suffix="$4"
  local page user

  verbose_echo "Normalising Facebook URL: ${url}" >&2
  if [[ "${url}" == *profile.php* ]]
  then
    # Keep the id parameter, drop everything after it
    url="${url%%&*}"
  else
    url="${url%%\?*}"
  fi
  page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"

  # NOTE(review): the opening '<div\s[^>' / '<a\s[^>' and the closing '</div>'
  # of these two patterns were missing from the reviewed copy and are
  # reconstructed - confirm.
  user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  if [[ "${user}" ]]
  then
    emit_line "${prefix}https://www.facebook.com/${user}/${suffix}"
    return
  elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  then
    # Profile page which is only visible when logged in
    # Extract canonical URL
    # NOTE(review): this pattern and the success branch below it were missing
    # from the reviewed copy and are reconstructed - confirm.
    user="$(grep -Po '<link rel="canonical" href="https://www\.facebook\.com/\K[^/"]+' <<< "${page}")"
    if [[ "${user}" ]]
    then
      emit_line "${prefix}https://www.facebook.com/${user}/${suffix}"
      return
    fi
  fi

  echo "Failed to normalise Facebook URL: ${url}" >&2
  emit_line "${line}"
}

# Fetch the Twitter profile and print the URL with the name found in it.
# URLs under /i, /web, /search and /hashtag are printed unchanged.
function normalise_twitter {
  local line="$1" prefix="$2" url="$3" suffix="$4"
  local unnormalisedUser user

  if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  then
    verbose_echo "Leaving Twitter URL alone: ${url}" >&2
    emit_line "${line}"
    return
  fi

  verbose_echo "Normalising Twitter URL: ${url}" >&2
  url="${url%%\?*}"
  url="${url%/}"
  unnormalisedUser="${url##*/}"
  # NOTE(review): the grep pattern and the success branch were missing from the
  # reviewed copy and are reconstructed - confirm the pattern and the output
  # format.
  user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  if [[ "${user}" ]]
  then
    emit_line "${prefix}https://twitter.com/${user}${suffix}"
  else
    echo "Failed to normalise Twitter URL: ${url}" >&2
    emit_line "${line}"
  fi
}

# Print the Instagram profile URL with a lower-cased user name; no retrieval.
# URLs under /p/ and /explore/ are printed unchanged.
function normalise_instagram {
  local line="$1" prefix="$2" url="$3" suffix="$4"
  local user

  if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  then
    verbose_echo "Leaving Instagram URL alone: ${url}" >&2
    emit_line "${line}"
    return
  fi

  verbose_echo "Normalising Instagram URL: ${url}" >&2
  user="${url%/}"
  user="${user##*/}"
  emit_line "${prefix}https://www.instagram.com/${user,,}/${suffix}"
}

# Fetch the YouTube page (with disable_polymer=1 appended to the query) and
# print the canonical user or channel URL found in it.
function normalise_youtube {
  local line="$1" prefix="$2" url="$3" suffix="$4"
  local rurl page canonical

  verbose_echo "Normalising YouTube URL: ${url}" >&2
  if [[ "${url}" == *'?'* ]]
  then
    rurl="${url}&disable_polymer=1"
  else
    rurl="${url}?disable_polymer=1"
  fi
  page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"

  # NOTE(review): both grep patterns, the user-then-channel order and the
  # output format were missing from the reviewed copy and are reconstructed -
  # confirm.
  canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  if [[ -z "${canonical}" ]]
  then
    canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  fi
  if [[ "${canonical}" ]]
  then
    emit_line "${prefix}https://www.youtube.com/${canonical}${suffix}"
  else
    echo "Failed to normalise YouTube URL: ${url}" >&2
    emit_line "${line}"
  fi
}

# Retrieve any other URL and print the effective URL reported by curl.
function normalise_other {
  local line="$1" prefix="$2" url="$3" suffix="$4"
  local canonical

  verbose_echo "Normalising other URL: ${url}" >&2
  # The redirect option expands to nothing when --other-no-redirects was given.
  canonical="$(curl -sS ${otherCurlRedirectOpt:+"${otherCurlRedirectOpt}"} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  if [[ "${canonical}" ]]
  then
    emit_line "${prefix}${canonical}${suffix}"
  else
    echo "Failed to normalise other URL: ${url}" >&2
    emit_line "${line}"
  fi
}

# Read lines from stdin and print one output line per input line.
function main {
  local line prefix url suffix

  # The second condition keeps a final line that lacks a trailing newline.
  while read -r line || [[ -n "${line}" ]]
  do
    if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
    then
      emit_line "${line}"
      continue
    fi

    # Split off the optional '* ' prefix ...
    prefix=""
    url="${line}"
    if [[ "${line}" == '* '* ]]
    then
      prefix="${line::2}"
      url="${line:2}"
    fi

    # ... and the optional ' | ...' suffix
    suffix=""
    if [[ "${url}" == *' | '* ]]
    then
      suffix=" | ${url#* | }"
      url="${url%% | *}"
    fi

    url="$(lowercase_domain "${url}")"
    url="$(unwrap_facebook_login "${url}")"

    if is_facebook_page_url "${url}"
    then
      normalise_facebook "${line}" "${prefix}" "${url}" "${suffix}"
    elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
    then
      normalise_twitter "${line}" "${prefix}" "${url}" "${suffix}"
    elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
    then
      normalise_instagram "${line}" "${prefix}" "${url}" "${suffix}"
    elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
    then
      normalise_youtube "${line}" "${prefix}" "${url}" "${suffix}"
    else
      normalise_other "${line}" "${prefix}" "${url}" "${suffix}"
    fi
  done
}

main