The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

153 lignes
4.8 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  # Defaults: pass -L to curl for "other" URLs (follow redirects), no verbose output.
  otherCurlRedirectOpt='-L'
  verbose=

  # Walk through every command-line argument in order. Recognised switches
  # adjust the settings above; anything else aborts the script with status 1.
  while (( $# > 0 ))
  do
    case "$1" in
      --other-no-redirects)
        # Do not hand a redirect option to curl for "other" URLs.
        otherCurlRedirectOpt=
        ;;
      --verbose|-v)
        verbose=1
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
    shift
  done
  # Print the given arguments via echo, but only when the global ${verbose}
  # is non-empty; otherwise do nothing and succeed.
  # Arguments: the words to print (forwarded to echo unchanged).
  # Outputs:   the message on stdout (callers redirect it as needed).
  verbose_echo() {
    # Quiet mode: nothing to print.
    [[ -z "${verbose}" ]] && return 0
    echo "$@"
  }
  # Browser-like User-Agent string sent with every curl request below.
  userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

  # Main loop: read lines from stdin and write the normalised (or untouched)
  # line to stdout; diagnostics go to stderr.
  # - Lines that are not http(s) URLs (optionally behind a '* ' list prefix)
  #   are passed through unchanged.
  # - Facebook, Twitter, Instagram and YouTube URLs get service-specific
  #   handling; everything else is resolved with curl.
  # The '|| [[ -n ... ]]' part keeps a final line without a trailing newline
  # from being silently dropped.
  while read -r line || [[ -n "${line}" ]]
  do
    if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
    then
      # printf instead of echo so that lines such as '-n' or '-e' survive intact.
      printf '%s\n' "${line}"
      continue
    fi

    # Split off the optional '* ' list prefix so it can be re-attached on output.
    if [[ "${line}" == '* '* ]]
    then
      prefix="${line::2}"
      url="${line:2}"
    else
      prefix=""
      url="${line}"
    fi

    # Facebook login redirect: unwrap the percent-encoded target in 'next='.
    # URLs carrying an encoded newline (either hex case) or NUL are left alone
    # so that decoding cannot inject extra output lines.
    if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com%2F && "${url}" != *'%0A'* && "${url}" != *'%0a'* && "${url}" != *'%00'* ]]
    then
      url="${url##*\?next=}"
      url="${url##*&next=}"
      url="${url%%&*}"
      # Double any literal backslashes first: printf '%b' would otherwise
      # interpret them as escape sequences. Then turn %HH into \xHH and decode.
      escaped="${url//\\/\\\\}"
      url="$(printf '%b' "${escaped//%/\\x}")"
    fi

    if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
    then
      verbose_echo "Normalising Facebook URL: ${url}" >&2
      # Drop extra parameters; profile.php URLs must keep their '?id=' part.
      if [[ "${url}" == *profile.php* ]]
      then
        url="${url%%&*}"
      else
        url="${url%%\?*}"
      fi
      page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
      # Take the user name from the link inside the "tab_home" element.
      user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
      if [[ "${user}" ]]
      then
        printf '%s\n' "${prefix}https://www.facebook.com/${user}/"
        continue
      elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
      then
        # Profile page which is only visible when logged in
        # Extract canonical URL
        user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
        if [[ "${user}" ]]
        then
          printf '%s\n' "${prefix}${user}"
          continue
        fi
      fi
      echo "Failed to normalise Facebook URL: ${url}" >&2
      printf '%s\n' "${line}"
    elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
    then
      # These first path segments are not treated as user names; leave as is.
      if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
      then
        verbose_echo "Leaving Twitter URL alone: ${url}" >&2
        printf '%s\n' "${line}"
        continue
      fi
      verbose_echo "Normalising Twitter URL: ${url}" >&2
      url="${url%%\?*}"
      url="${url%/}"
      unnormalisedUser="${url##*/}"
      # The profile header link carries the screen name as served by Twitter.
      user="$(curl -sL --max-time 10 -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
      if [[ "${user}" ]]
      then
        printf '%s\n' "${prefix}https://twitter.com/${user}"
      else
        echo "Failed to normalise Twitter URL: ${url}" >&2
        printf '%s\n' "${line}"
      fi
    elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
    then
      if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
      then
        verbose_echo "Leaving Instagram URL alone: ${url}" >&2
        printf '%s\n' "${line}"
        continue
      fi
      verbose_echo "Normalising Instagram URL: ${url}" >&2
      # No request is made: the user name is simply lower-cased.
      user="${url%/}"
      user="${user##*/}"
      printf '%s\n' "${prefix}https://www.instagram.com/${user,,}/"
    elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
    then
      verbose_echo "Normalising YouTube URL: ${url}" >&2
      # Append disable_polymer=1 with the right separator for the request URL.
      if [[ "${url}" == *'?'* ]]
      then
        rurl="${url}&disable_polymer=1"
      else
        rurl="${url}?disable_polymer=1"
      fi
      page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
      # Prefer a /user/ canonical link; fall back to /channel/.
      canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
      if [[ "${canonical}" ]]
      then
        printf '%s\n' "${prefix}https://www.youtube.com/${canonical}"
      else
        canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
        if [[ "${canonical}" ]]
        then
          printf '%s\n' "${prefix}https://www.youtube.com/${canonical}"
        else
          echo "Failed to normalise YouTube URL: ${url}" >&2
          printf '%s\n' "${line}"
        fi
      fi
    else
      verbose_echo "Normalising other URL: ${url}" >&2
      # The ':+' expansion passes the redirect option as one word when it is
      # set and as nothing at all when it is empty, without word splitting.
      canonical="$(curl -sS ${otherCurlRedirectOpt:+"${otherCurlRedirectOpt}"} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
      if [[ "${canonical}" ]]
      then
        printf '%s\n' "${prefix}${canonical}"
      else
        echo "Failed to normalise other URL: ${url}" >&2
        printf '%s\n' "${line}"
      fi
    fi
  done