The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

141 lines
4.4 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  # Command-line handling.
  #   --other-no-redirects  clear the curl redirect option used for "other" URLs
  #   --verbose, -v         turn on progress messages (see verbose_echo)
  # Any other argument is reported on stderr and ends the script with status 1.
  verbose=
  otherCurlRedirectOpt='-L'
  until (( $# == 0 ))
  do
    case "$1" in
      --other-no-redirects)
        otherCurlRedirectOpt=
        ;;
      --verbose|-v)
        verbose=1
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
    shift
  done
  # Print the arguments, joined by single spaces, on one line of stdout, but
  # only when the global ${verbose} is non-empty. Callers redirect the output
  # themselves (e.g. `verbose_echo "..." >&2`).
  #
  # Globals:   verbose (read)
  # Arguments: the words of the message
  # Outputs:   the message followed by a newline, or nothing when not verbose
  # Returns:   0
  verbose_echo() {
    # Pin the join character used by "$*" regardless of the caller's IFS.
    local IFS=' '
    if [[ "${verbose}" ]]
    then
      # printf rather than echo: echo would treat a leading -n / -e / -E
      # argument as one of its own options instead of printing it.
      printf '%s\n' "$*"
    fi
  }
  # User-Agent header sent with every curl request below.
  userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

  # Main loop: read one line at a time from stdin and write one line to stdout.
  #   - Lines that are not a URL (optionally preceded by the wiki bullet "* ")
  #     are copied through.
  #   - URL lines are normalised according to the site they point at; when
  #     normalisation fails, a message goes to stderr and the URL is printed
  #     as it was.
  # IFS is left at its default for `read`, so leading and trailing whitespace
  # is trimmed from each line before it is examined. The `|| [[ -n ... ]]`
  # part makes sure a final line without a trailing newline is still processed.
  while read -r line || [[ -n "${line}" ]]
  do
    # Anything that does not start with http(s):// or "* http(s)://" is passed
    # through. printf is used so that lines such as "-n" are not taken as
    # options, which echo would do.
    if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
    then
      printf '%s\n' "${line}"
      continue
    fi

    # Separate the wiki bullet (if any) from the URL so it can be re-attached
    # to the output.
    if [[ "${line}" == '* '* ]]
    then
      prefix="${line::2}"
      url="${line:2}"
    else
      prefix=""
      url="${line}"
    fi

    # Facebook login redirect: replace the URL with the percent-decoded value
    # of its "next" parameter when that points back at Facebook. URLs holding
    # an encoded newline (%0A) or NUL (%00) are left alone, so decoding cannot
    # introduce those bytes.
    if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
    then
      url="${url##*\?next=}"
      url="${url##*&next=}"
      url="${url%%&*}"
      # Turn every %XX into \xXX and let printf's %b expand the escapes.
      url="$(printf '%b' "${url//%/\\x}")"
    fi

    if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
    then
      verbose_echo "Normalising Facebook URL: ${url}" >&2
      # profile.php URLs need their id parameter, so only later parameters are
      # dropped; for everything else the whole query string is removed.
      if [[ "${url}" == *profile.php* ]]
      then
        url="${url%%&*}"
      else
        url="${url%%\?*}"
      fi
      # Always fetch from www.facebook.com, whatever subdomain was given.
      page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
      # First path component of the link inside the element marked
      # data-key="tab_home".
      user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
      if [[ "${user}" ]]
      then
        echo "${prefix}https://www.facebook.com/${user}/"
        continue
      elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
      then
        # Profile page which is only visible when logged in
        # Extract canonical URL
        user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
        if [[ "${user}" ]]
        then
          echo "${prefix}${user}"
          continue
        fi
      fi
      echo "Failed to normalise Facebook URL: ${url}" >&2
      echo "${prefix}${url}"
    elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
    then
      verbose_echo "Normalising Twitter URL: ${url}" >&2
      # Drop the query string and a trailing slash; the last path component is
      # the user name as written in the input.
      url="${url%%\?*}"
      url="${url%/}"
      unnormalisedUser="${url##*/}"
      # Take the screen name from the profile header link of the fetched page.
      user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
      if [[ "${user}" ]]
      then
        echo "${prefix}https://twitter.com/${user}"
      else
        echo "Failed to normalise Twitter URL: ${url}" >&2
        echo "${prefix}${url}"
      fi
    elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
    then
      verbose_echo "Normalising Instagram URL: ${url}" >&2
      # No request is made: the user name is lower-cased and put into the
      # https://www.instagram.com/<user>/ form.
      user="${url%/}"
      user="${user##*/}"
      echo "${prefix}https://www.instagram.com/${user,,}/"
    elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
    then
      verbose_echo "Normalising YouTube URL: ${url}" >&2
      # Append disable_polymer=1 to the request URL (presumably to get the page
      # variant that carries the <link itemprop="url"> tag - TODO confirm).
      if [[ "${url}" == *'?'* ]]
      then
        rurl="${url}&disable_polymer=1"
      else
        rurl="${url}?disable_polymer=1"
      fi
      page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
      # Prefer a user/... canonical path; fall back to channel/... .
      canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
      if [[ "${canonical}" ]]
      then
        echo "${prefix}https://www.youtube.com/${canonical}"
      else
        canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
        if [[ "${canonical}" ]]
        then
          echo "${prefix}https://www.youtube.com/${canonical}"
        else
          echo "Failed to normalise YouTube URL: ${url}" >&2
          echo "${prefix}${url}"
        fi
      fi
    else
      verbose_echo "Normalising other URL: ${url}" >&2
      # Retrieve the URL (body discarded) and keep curl's effective URL.
      # ${otherCurlRedirectOpt} is intentionally unquoted: when it is empty it
      # must disappear instead of being passed as an empty argument.
      # shellcheck disable=SC2086
      canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
      if [[ "${canonical}" ]]
      then
        echo "${prefix}${canonical}"
      else
        echo "Failed to normalise other URL: ${url}" >&2
        echo "${prefix}${url}"
      fi
    fi
  done