The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

161 lines
5.0 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" == *' | '* ]]
  45. then
  46. suffix=" | ${url#* | }"
  47. url="${url%% | *}"
  48. else
  49. suffix=""
  50. fi
  51. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  52. then
  53. url="${url##*\?next=}"
  54. url="${url##*&next=}"
  55. url="${url%%&*}"
  56. url="$(printf '%b' "${url//%/\\x}")"
  57. fi
  58. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  59. then
  60. verbose_echo "Normalising Facebook URL: ${url}" >&2
  61. if [[ "${url}" == *profile.php* ]]
  62. then
  63. url="${url%%&*}"
  64. else
  65. url="${url%%\?*}"
  66. fi
  67. page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  68. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  69. if [[ "${user}" ]]
  70. then
  71. echo "${prefix}https://www.facebook.com/${user}/${suffix}"
  72. continue
  73. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  74. then
  75. # Profile page which is only visible when logged in
  76. # Extract canonical URL
  77. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  78. if [[ "${user}" ]]
  79. then
  80. echo "${prefix}${user}${suffix}"
  81. continue
  82. fi
  83. fi
  84. echo "Failed to normalise Facebook URL: ${url}" >&2
  85. echo "${line}"
  86. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  87. then
  88. if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  89. then
  90. verbose_echo "Leaving Twitter URL alone: ${url}" >&2
  91. echo "${line}"
  92. continue
  93. fi
  94. verbose_echo "Normalising Twitter URL: ${url}" >&2
  95. url="${url%%\?*}"
  96. url="${url%/}"
  97. unnormalisedUser="${url##*/}"
  98. user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  99. if [[ "${user}" ]]
  100. then
  101. echo "${prefix}https://twitter.com/${user}${suffix}"
  102. else
  103. echo "Failed to normalise Twitter URL: ${url}" >&2
  104. echo "${line}"
  105. fi
  106. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  107. then
  108. if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  109. then
  110. verbose_echo "Leaving Instagram URL alone: ${url}" >&2
  111. echo "${line}"
  112. continue
  113. fi
  114. verbose_echo "Normalising Instagram URL: ${url}" >&2
  115. user="${url%/}"
  116. user="${user##*/}"
  117. echo "${prefix}https://www.instagram.com/${user,,}/${suffix}"
  118. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  119. then
  120. verbose_echo "Normalising YouTube URL: ${url}" >&2
  121. if [[ "${url}" == *'?'* ]]
  122. then
  123. rurl="${url}&disable_polymer=1"
  124. else
  125. rurl="${url}?disable_polymer=1"
  126. fi
  127. page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  128. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  129. if [[ "${canonical}" ]]
  130. then
  131. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  132. else
  133. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  134. if [[ "${canonical}" ]]
  135. then
  136. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  137. else
  138. echo "Failed to normalise YouTube URL: ${url}" >&2
  139. echo "${line}"
  140. fi
  141. fi
  142. else
  143. verbose_echo "Normalising other URL: ${url}" >&2
  144. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  145. if [[ "${canonical}" ]]
  146. then
  147. echo "${prefix}${canonical}${suffix}"
  148. else
  149. echo "Failed to normalise other URL: ${url}" >&2
  150. echo "${line}"
  151. fi
  152. fi
  153. done