The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

141 lignes
4.4 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  45. then
  46. url="${url##*\?next=}"
  47. url="${url##*&next=}"
  48. url="${url%%&*}"
  49. url="$(printf '%b' "${url//%/\\x}")"
  50. fi
  51. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  52. then
  53. verbose_echo "Normalising Facebook URL: ${url}" >&2
  54. if [[ "${url}" == *profile.php* ]]
  55. then
  56. url="${url%%&*}"
  57. else
  58. url="${url%%\?*}"
  59. fi
  60. page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  61. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  62. if [[ "${user}" ]]
  63. then
  64. echo "${prefix}https://www.facebook.com/${user}/"
  65. continue
  66. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  67. then
  68. # Profile page which is only visible when logged in
  69. # Extract canonical URL
  70. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  71. if [[ "${user}" ]]
  72. then
  73. echo "${prefix}${user}"
  74. continue
  75. fi
  76. fi
  77. echo "Failed to normalise Facebook URL: ${url}" >&2
  78. echo "${prefix}${url}"
  79. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  80. then
  81. verbose_echo "Normalising Twitter URL: ${url}" >&2
  82. url="${url%%\?*}"
  83. url="${url%/}"
  84. unnormalisedUser="${url##*/}"
  85. user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  86. if [[ "${user}" ]]
  87. then
  88. echo "${prefix}https://twitter.com/${user}"
  89. else
  90. echo "Failed to normalise Twitter URL: ${url}" >&2
  91. echo "${prefix}${url}"
  92. fi
  93. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  94. then
  95. verbose_echo "Normalising Instagram URL: ${url}" >&2
  96. user="${url%/}"
  97. user="${user##*/}"
  98. echo "${prefix}https://www.instagram.com/${user,,}/"
  99. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  100. then
  101. verbose_echo "Normalising YouTube URL: ${url}" >&2
  102. if [[ "${url}" == *'?'* ]]
  103. then
  104. rurl="${url}&disable_polymer=1"
  105. else
  106. rurl="${url}?disable_polymer=1"
  107. fi
  108. page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  109. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  110. if [[ "${canonical}" ]]
  111. then
  112. echo "${prefix}https://www.youtube.com/${canonical}"
  113. else
  114. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  115. if [[ "${canonical}" ]]
  116. then
  117. echo "${prefix}https://www.youtube.com/${canonical}"
  118. else
  119. echo "Failed to normalise YouTube URL: ${url}" >&2
  120. echo "${prefix}${url}"
  121. fi
  122. fi
  123. else
  124. verbose_echo "Normalising other URL: ${url}" >&2
  125. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  126. if [[ "${canonical}" ]]
  127. then
  128. echo "${prefix}${canonical}"
  129. else
  130. echo "Failed to normalise other URL: ${url}" >&2
  131. echo "${prefix}${url}"
  132. fi
  133. fi
  134. done