The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

169 lignes
5.1 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" == *' | '* ]]
  45. then
  46. suffix=" | ${url#* | }"
  47. url="${url%% | *}"
  48. else
  49. suffix=""
  50. fi
  51. # Normalise domain
  52. if [[ "${url}" =~ ^https?://.*/ ]]
  53. then
  54. domain="${url#*://}"
  55. domain="${domain%%/*}"
  56. url="${url%%://*}://${domain,,}/${url#*://*/}"
  57. fi
  58. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  59. then
  60. url="${url##*\?next=}"
  61. url="${url##*&next=}"
  62. url="${url%%&*}"
  63. url="$(printf '%b' "${url//%/\\x}")"
  64. fi
  65. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  66. then
  67. verbose_echo "Normalising Facebook URL: ${url}" >&2
  68. if [[ "${url}" == *profile.php* ]]
  69. then
  70. url="${url%%&*}"
  71. else
  72. url="${url%%\?*}"
  73. fi
  74. page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  75. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  76. if [[ "${user}" ]]
  77. then
  78. echo "${prefix}https://www.facebook.com/${user}/${suffix}"
  79. continue
  80. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  81. then
  82. # Profile page which is only visible when logged in
  83. # Extract canonical URL
  84. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  85. if [[ "${user}" ]]
  86. then
  87. echo "${prefix}${user}${suffix}"
  88. continue
  89. fi
  90. fi
  91. echo "Failed to normalise Facebook URL: ${url}" >&2
  92. echo "${line}"
  93. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  94. then
  95. if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  96. then
  97. verbose_echo "Leaving Twitter URL alone: ${url}" >&2
  98. echo "${line}"
  99. continue
  100. fi
  101. verbose_echo "Normalising Twitter URL: ${url}" >&2
  102. url="${url%%\?*}"
  103. url="${url%/}"
  104. unnormalisedUser="${url##*/}"
  105. user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  106. if [[ "${user}" ]]
  107. then
  108. echo "${prefix}https://twitter.com/${user}${suffix}"
  109. else
  110. echo "Failed to normalise Twitter URL: ${url}" >&2
  111. echo "${line}"
  112. fi
  113. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  114. then
  115. if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  116. then
  117. verbose_echo "Leaving Instagram URL alone: ${url}" >&2
  118. echo "${line}"
  119. continue
  120. fi
  121. verbose_echo "Normalising Instagram URL: ${url}" >&2
  122. user="${url%/}"
  123. user="${user##*/}"
  124. echo "${prefix}https://www.instagram.com/${user,,}/${suffix}"
  125. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  126. then
  127. verbose_echo "Normalising YouTube URL: ${url}" >&2
  128. if [[ "${url}" == *'?'* ]]
  129. then
  130. rurl="${url}&disable_polymer=1"
  131. else
  132. rurl="${url}?disable_polymer=1"
  133. fi
  134. page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  135. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  136. if [[ "${canonical}" ]]
  137. then
  138. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  139. else
  140. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  141. if [[ "${canonical}" ]]
  142. then
  143. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  144. else
  145. echo "Failed to normalise YouTube URL: ${url}" >&2
  146. echo "${line}"
  147. fi
  148. fi
  149. else
  150. verbose_echo "Normalising other URL: ${url}" >&2
  151. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  152. if [[ "${canonical}" ]]
  153. then
  154. echo "${prefix}${canonical}${suffix}"
  155. else
  156. echo "Failed to normalise other URL: ${url}" >&2
  157. echo "${line}"
  158. fi
  159. fi
  160. done