The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

161 lignes
5.0 KiB

  1. #!/bin/bash
  2. # Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
  3. # - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
  4. # - For YouTube user or channel URLs, the canonical base URL is extracted.
  5. # - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" == *' | '* ]]
  45. then
  46. suffix=" | ${url#* | }"
  47. url="${url%% | *}"
  48. else
  49. suffix=""
  50. fi
  51. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com%2F && "${url}" != *'%0A'* && "${url}" != *'%00'* ]]
  52. then
  53. url="${url##*\?next=}"
  54. url="${url##*&next=}"
  55. url="${url%%&*}"
  56. url="$(printf '%b' "${url//%/\\x}")"
  57. fi
  58. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  59. then
  60. verbose_echo "Normalising Facebook URL: ${url}" >&2
  61. if [[ "${url}" == *profile.php* ]]
  62. then
  63. url="${url%%&*}"
  64. else
  65. url="${url%%\?*}"
  66. fi
  67. page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  68. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  69. if [[ "${user}" ]]
  70. then
  71. echo "${prefix}https://www.facebook.com/${user}/${suffix}"
  72. continue
  73. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  74. then
  75. # Profile page which is only visible when logged in
  76. # Extract canonical URL
  77. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  78. if [[ "${user}" ]]
  79. then
  80. echo "${prefix}${user}${suffix}"
  81. continue
  82. fi
  83. fi
  84. echo "Failed to normalise Facebook URL: ${url}" >&2
  85. echo "${line}"
  86. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  87. then
  88. if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
  89. then
  90. verbose_echo "Leaving Twitter URL alone: ${url}" >&2
  91. echo "${line}"
  92. continue
  93. fi
  94. verbose_echo "Normalising Twitter URL: ${url}" >&2
  95. url="${url%%\?*}"
  96. url="${url%/}"
  97. unnormalisedUser="${url##*/}"
  98. user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  99. if [[ "${user}" ]]
  100. then
  101. echo "${prefix}https://twitter.com/${user}${suffix}"
  102. else
  103. echo "Failed to normalise Twitter URL: ${url}" >&2
  104. echo "${line}"
  105. fi
  106. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  107. then
  108. if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)/ ]]
  109. then
  110. verbose_echo "Leaving Instagram URL alone: ${url}" >&2
  111. echo "${line}"
  112. continue
  113. fi
  114. verbose_echo "Normalising Instagram URL: ${url}" >&2
  115. user="${url%/}"
  116. user="${user##*/}"
  117. echo "${prefix}https://www.instagram.com/${user,,}/${suffix}"
  118. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  119. then
  120. verbose_echo "Normalising YouTube URL: ${url}" >&2
  121. if [[ "${url}" == *'?'* ]]
  122. then
  123. rurl="${url}&disable_polymer=1"
  124. else
  125. rurl="${url}?disable_polymer=1"
  126. fi
  127. page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  128. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  129. if [[ "${canonical}" ]]
  130. then
  131. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  132. else
  133. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  134. if [[ "${canonical}" ]]
  135. then
  136. echo "${prefix}https://www.youtube.com/${canonical}${suffix}"
  137. else
  138. echo "Failed to normalise YouTube URL: ${url}" >&2
  139. echo "${line}"
  140. fi
  141. fi
  142. else
  143. verbose_echo "Normalising other URL: ${url}" >&2
  144. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  145. if [[ "${canonical}" ]]
  146. then
  147. echo "${prefix}${canonical}${suffix}"
  148. else
  149. echo "Failed to normalise other URL: ${url}" >&2
  150. echo "${line}"
  151. fi
  152. fi
  153. done