The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

153 lines
4.8 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  45. then
  46. url="${url##*\?next=}"
  47. url="${url##*&next=}"
  48. url="${url%%&*}"
  49. url="$(printf '%b' "${url//%/\\x}")"
  50. fi
  51. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  52. then
  53. verbose_echo "Normalising Facebook URL: ${url}" >&2
  54. if [[ "${url}" == *profile.php* ]]
  55. then
  56. url="${url%%&*}"
  57. else
  58. url="${url%%\?*}"
  59. fi
  60. page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  61. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  62. if [[ "${user}" ]]
  63. then
  64. echo "${prefix}https://www.facebook.com/${user}/"
  65. continue
  66. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  67. then
  68. # Profile page which is only visible when logged in
  69. # Extract canonical URL
  70. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  71. if [[ "${user}" ]]
  72. then
  73. echo "${prefix}${user}"
  74. continue
  75. fi
  76. fi
  77. echo "Failed to normalise Facebook URL: ${url}" >&2
  78. echo "${line}"
  79. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  80. then
  81. if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  82. then
  83. verbose_echo "Leaving Twitter URL alone: ${url}" >&2
  84. echo "${line}"
  85. continue
  86. fi
  87. verbose_echo "Normalising Twitter URL: ${url}" >&2
  88. url="${url%%\?*}"
  89. url="${url%/}"
  90. unnormalisedUser="${url##*/}"
  91. user="$(curl -sL --max-time 10 -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  92. if [[ "${user}" ]]
  93. then
  94. echo "${prefix}https://twitter.com/${user}"
  95. else
  96. echo "Failed to normalise Twitter URL: ${url}" >&2
  97. echo "${line}"
  98. fi
  99. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  100. then
  101. if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  102. then
  103. verbose_echo "Leaving Instagram URL alone: ${url}" >&2
  104. echo "${line}"
  105. continue
  106. fi
  107. verbose_echo "Normalising Instagram URL: ${url}" >&2
  108. user="${url%/}"
  109. user="${user##*/}"
  110. echo "${prefix}https://www.instagram.com/${user,,}/"
  111. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  112. then
  113. verbose_echo "Normalising YouTube URL: ${url}" >&2
  114. if [[ "${url}" == *'?'* ]]
  115. then
  116. rurl="${url}&disable_polymer=1"
  117. else
  118. rurl="${url}?disable_polymer=1"
  119. fi
  120. page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  121. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  122. if [[ "${canonical}" ]]
  123. then
  124. echo "${prefix}https://www.youtube.com/${canonical}"
  125. else
  126. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  127. if [[ "${canonical}" ]]
  128. then
  129. echo "${prefix}https://www.youtube.com/${canonical}"
  130. else
  131. echo "Failed to normalise YouTube URL: ${url}" >&2
  132. echo "${line}"
  133. fi
  134. fi
  135. else
  136. verbose_echo "Normalising other URL: ${url}" >&2
  137. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  138. if [[ "${canonical}" ]]
  139. then
  140. echo "${prefix}${canonical}"
  141. else
  142. echo "Failed to normalise other URL: ${url}" >&2
  143. echo "${line}"
  144. fi
  145. fi
  146. done