The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

169 lines
5.1 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" == *' | '* ]]
  45. then
  46. suffix=" | ${url#* | }"
  47. url="${url%% | *}"
  48. else
  49. suffix=""
  50. fi
  51. # Normalise domain
  52. if [[ "${url}" =~ ^https?://.*/ ]]
  53. then
  54. domain="${url#*://}"
  55. domain="${domain%%/*}"
  56. url="${url%%://*}://${domain,,}/${url#*://*/}"
  57. fi
  58. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  59. then
  60. url="${url##*\?next=}"
  61. url="${url##*&next=}"
  62. url="${url%%&*}"
  63. url="$(printf '%b' "${url//%/\\x}")"
  64. fi
  65. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  66. then
  67. verbose_echo "Normalising Facebook URL: ${url}" >&2
  68. if [[ "${url}" == *profile.php* ]]
  69. then
  70. url="${url%%&*}"
  71. else
  72. url="${url%%\?*}"
  73. fi
  74. page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  75. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  76. if [[ "${user}" ]]
  77. then
  78. echo "${prefix}https://www.facebook.com/${user}/${suffix}"
  79. continue
  80. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  81. then
  82. # Profile page which is only visible when logged in
  83. # Extract canonical URL
  84. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  85. if [[ "${user}" ]]
  86. then
  87. echo "${prefix}${user}${suffix}"
  88. continue
  89. fi
  90. fi
  91. echo "Failed to normalise Facebook URL: ${url}" >&2
  92. echo "${line}"
  93. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  94. then
  95. if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  96. then
  97. verbose_echo "Leaving Twitter URL alone: ${url}" >&2
  98. echo "${line}"
  99. continue
  100. fi
  101. verbose_echo "Normalising Twitter URL: ${url}" >&2
  102. url="${url%%\?*}"
  103. url="${url%/}"
  104. unnormalisedUser="${url##*/}"
  105. user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  106. if [[ "${user}" ]]
  107. then
  108. echo "${prefix}https://twitter.com/${user}${suffix}"
  109. else
  110. echo "Failed to normalise Twitter URL: ${url}" >&2
  111. echo "${line}"
  112. fi
  113. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  114. then
  115. if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  116. then
  117. verbose_echo "Leaving Instagram URL alone: ${url}" >&2
  118. echo "${line}"
  119. continue
  120. fi
  121. verbose_echo "Normalising Instagram URL: ${url}" >&2
  122. user="${url%/}"
  123. user="${user##*/}"
  124. echo "${prefix}https://www.instagram.com/${user,,}/${suffix}"
  125. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  126. then
  127. verbose_echo "Normalising YouTube URL: ${url}" >&2
  128. if [[ "${url}" == *'?'* ]]
  129. then
  130. rurl="${url}&disable_polymer=1"
  131. else
  132. rurl="${url}?disable_polymer=1"
  133. fi
  134. page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  135. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  136. if [[ "${canonical}" ]]
  137. then
  138. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  139. else
  140. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  141. if [[ "${canonical}" ]]
  142. then
  143. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  144. else
  145. echo "Failed to normalise YouTube URL: ${url}" >&2
  146. echo "${line}"
  147. fi
  148. fi
  149. else
  150. verbose_echo "Normalising other URL: ${url}" >&2
  151. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  152. if [[ "${canonical}" ]]
  153. then
  154. echo "${prefix}${canonical}${suffix}"
  155. else
  156. echo "Failed to normalise other URL: ${url}" >&2
  157. echo "${line}"
  158. fi
  159. fi
  160. done