Browse Source

One URL normalisation script to rule them all

Consolidate social media profile, YouTube, and (new) generic web page URL normalisation into one script
master
JustAnotherArchivist 4 years ago
parent
commit
dc4efcfbfb
3 changed files with 133 additions and 111 deletions
  1. +0
    -74
      social-media-normalise
  2. +133
    -0
      url-normalise
  3. +0
    -37
      youtube-normalise

+ 0
- 74
social-media-normalise View File

@@ -1,74 +0,0 @@
#!/bin/bash
# Read a list of URLs from stdin, replace suitable social media URLs with correctly capitalised version
# Input lines may optionally be wiki-style bullets ('* URL'); the bullet is preserved on output.
# URLs that cannot be normalised are echoed unchanged and summarised on stderr at the end.
errorUrls=()
while read -r url
do
# Peel off a leading '* ' wiki bullet so it can be re-attached to the output line.
if [[ "${url}" == '* '* ]]
then
prefix="${url::2}"
url="${url:2}"
else
prefix=""
fi

# Facebook page/profile URLs: www/m/language (xx-xx) subdomains, plain pages,
# /pages/<name>/<id>, /pg/<name>, and profile.php?id=<n>.
# NOTE(review): the dots in '(...).)?facebook.com' are unescaped and so match any
# character (e.g. 'facebookXcom') — presumably unintended; confirm before relying on it.
if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
then
# profile.php needs its id= parameter kept; all other URLs lose the query string.
if [[ "${url}" == *profile.php* ]]
then
url="${url%%&*}"
else
url="${url%%\?*}"
fi
# Fetch the page with a browser-like UA; the 'Home' tab link carries the
# canonical, correctly capitalised page name.
page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${user}" ]]
then
echo "${prefix}https://www.facebook.com/${user}/"
continue
else
if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
then
# Profile page which is only visible when logged in
# Extract canonical URL
user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
if [[ "${user}" ]]
then
echo "${prefix}${user}"
continue
fi
fi
fi
# Neither extraction worked: remember the failure and pass the URL through.
errorUrls+=("${url}")
echo "${prefix}${url}"
# Twitter profile URLs: strip query string and trailing slash, then scrape the
# profile header for the canonically capitalised screen name.
elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
then
url="${url%%\?*}"
url="${url%/}"
unnormalisedUser="${url##*/}"
user="$(curl -sL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
if [[ "${user}" ]]
then
echo "${prefix}https://twitter.com/${user}"
else
errorUrls+=("${url}")
echo "${prefix}${url}"
fi
# Instagram usernames are case-insensitive and canonically lowercase, so no
# network request is needed: lowercase locally (${user,,}) and normalise the host.
elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
then
user="${url%/}"
user="${user##*/}"
echo "${prefix}https://www.instagram.com/${user,,}/"
else
# Anything else passes through untouched.
echo "${prefix}${url}"
fi
done

# Report every URL whose normalisation failed, on stderr so stdout stays clean.
if [[ ${#errorUrls[@]} -gt 0 ]]
then
echo "" >&2
echo "Failed to process URLs:" >&2
for errorUrl in "${errorUrls[@]}"
do
echo "${errorUrl}" >&2
done
fi

+ 133
- 0
url-normalise View File

@@ -0,0 +1,133 @@
#!/bin/bash
# Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)

# Defaults: follow redirects for generic URLs, no verbose logging.
otherCurlRedirectOpt='-L'
verbose=
while [[ $# -gt 0 ]]
do
	case "$1" in
		--other-no-redirects)
			otherCurlRedirectOpt=
			;;
		--verbose|-v)
			verbose=1
			;;
		*)
			echo "Unknown option: $1" >&2
			exit 1
			;;
	esac
	shift
done

# Emit the given arguments only when --verbose was requested; always returns 0.
function verbose_echo {
	case "${verbose}" in
		'') ;;
		*) echo "$@" ;;
	esac
}

# Browser-like UA; some of the scraped sites serve different markup to curl's default UA.
userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

while read -r line
do
# Pass through anything that is not a bare URL or a '* URL' wiki bullet.
if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
then
echo "${line}"
continue
fi

# Peel off a leading '* ' wiki bullet so it can be re-attached to the output line.
if [[ "${line}" == '* '* ]]
then
prefix="${line::2}"
url="${line:2}"
else
prefix=""
url="${line}"
fi

# Facebook page/profile URLs: www/m/language (xx-xx) subdomains, plain pages,
# /pages/<name>/<id>, /pg/<name>, and profile.php?id=<n>.
# Fix: the host dots are now escaped; previously 'facebook.com' matched any
# character in place of the dots (e.g. 'facebookXcom').
if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
then
verbose_echo "Normalising Facebook URL: ${url}" >&2
# profile.php needs its id= parameter kept; all other URLs lose the query string.
if [[ "${url}" == *profile.php* ]]
then
url="${url%%&*}"
else
url="${url%%\?*}"
fi
page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
# The 'Home' tab link carries the canonical, correctly capitalised page name.
user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
if [[ "${user}" ]]
then
echo "${prefix}https://www.facebook.com/${user}/"
continue
elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
then
# Profile page which is only visible when logged in
# Extract canonical URL
user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
if [[ "${user}" ]]
then
echo "${prefix}${user}"
continue
fi
fi
# Neither extraction worked: report on stderr and pass the URL through.
echo "Failed to normalise Facebook URL: ${url}" >&2
echo "${prefix}${url}"
# Twitter profile URLs: strip query string and trailing slash, then scrape the
# profile header for the canonically capitalised screen name.
elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
then
verbose_echo "Normalising Twitter URL: ${url}" >&2
url="${url%%\?*}"
url="${url%/}"
unnormalisedUser="${url##*/}"
user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
if [[ "${user}" ]]
then
echo "${prefix}https://twitter.com/${user}"
else
echo "Failed to normalise Twitter URL: ${url}" >&2
echo "${prefix}${url}"
fi
# Instagram usernames are case-insensitive and canonically lowercase, so no
# network request is needed: lowercase locally (${user,,}) and normalise the host.
elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
then
verbose_echo "Normalising Instagram URL: ${url}" >&2
user="${url%/}"
user="${user##*/}"
echo "${prefix}https://www.instagram.com/${user,,}/"
# YouTube URLs: fetch with disable_polymer=1 (old layout, which exposes the
# canonical <link itemprop="url"> tags) and extract the user/ or channel/ path.
elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
then
verbose_echo "Normalising YouTube URL: ${url}" >&2
if [[ "${url}" == *'?'* ]]
then
rurl="${url}&disable_polymer=1"
else
rurl="${url}?disable_polymer=1"
fi
page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
if [[ "${canonical}" ]]
then
echo "${prefix}https://www.youtube.com/${canonical}"
else
canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
if [[ "${canonical}" ]]
then
echo "${prefix}https://www.youtube.com/${canonical}"
else
echo "Failed to normalise YouTube URL: ${url}" >&2
echo "${prefix}${url}"
fi
fi
else
# Generic URL: take curl's post-redirect effective URL (redirect following is
# controlled by --other-no-redirects; the variable is intentionally unquoted so
# an empty value expands to no argument).
verbose_echo "Normalising other URL: ${url}" >&2
canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
if [[ "${canonical}" ]]
then
echo "${prefix}${canonical}"
else
echo "Failed to normalise other URL: ${url}" >&2
echo "${prefix}${url}"
fi
fi
done

+ 0
- 37
youtube-normalise View File

@@ -1,37 +0,0 @@
#!/bin/bash
# Read URLs from stdin (optionally prefixed with a '* ' wiki bullet) and replace
# YouTube URLs with the canonical https://www.youtube.com/user/... or
# .../channel/... form; everything else passes through unchanged.
while read -r url
do
	prefix=''
	if [[ "${url}" == '* '* ]]
	then
		prefix='* '
		url="${url:2}"
	fi

	# Non-YouTube lines pass straight through.
	if ! [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
	then
		echo "${prefix}${url}"
		continue
	fi

	# Append disable_polymer=1 (old layout, which exposes the canonical
	# <link itemprop="url"> tags), using '&' if a query string already exists.
	sep='?'
	if [[ "${url}" == *'?'* ]]
	then
		sep='&'
	fi
	page="$(curl -4sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "${url}${sep}disable_polymer=1")"

	# Prefer a user/ canonical URL; fall back to channel/.
	canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
	if [[ -z "${canonical}" ]]
	then
		canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
	fi

	if [[ "${canonical}" ]]
	then
		echo "${prefix}https://www.youtube.com/${canonical}"
	else
		echo "${prefix}${url}"
	fi
done

Loading…
Cancel
Save