The little things give you away... A collection of various small helper stuff

#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore.
# The output is also fed through url-normalise before and during processing to avoid duplicates that are equivalent but written slightly differently, and it is deduplicated within each section at the end.
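#
# A minimal input sketch (assumed here; the format is inferred from the parsing
# below: '==' section headers, '* <url>' list items, optional ' | annotation'
# suffixes):
#   == Example Person ==
#   * https://example.org/
#   * https://twitter.com/example | personal account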

verbose=
while [[ $# -gt 0 ]]
do
    if [[ "$1" == '--verbose' || "$1" == '-v' ]]
    then
        verbose='--verbose'
    else
        echo "Unknown option: $1" >&2
        exit 1
    fi
    shift
done

# Print the given arguments only when --verbose is active.
function verbose_echo {
    if [[ "${verbose}" ]]
    then
        echo "$@"
    fi
}

# Run a command with every line of its stderr prefixed by "[name] ".
# If no name is passed, the basename of the command is used instead.
function stderr_annotate {
    name="$1"
    shift
    if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
    "$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}
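
# For example (hypothetical commands): `stderr_annotate dl wget -q -O- https://example.org/`
# tags each of wget's stderr lines as "[dl] ...", while `stderr_annotate '' ./tools/frobnicate`
# tags them as "[frobnicate] ...".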

# Absolute path of the directory containing this script; the helper scripts
# (url-normalise, social-media-extract-profile-link, website-extract-social-media)
# are expected to live next to it.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

declare -A sectionUrls
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
    echo "${line}"

    # A '==' line starts a new section; reset the per-section set of seen URLs.
    if [[ "${line}" == '=='* ]]
    then
        verbose_echo "${line}" >&2
        unset sectionUrls
        declare -A sectionUrls
    fi

    if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
    then
        # Strip the leading '* ' and, if present, the ' | annotation' suffix.
        url="${line:2}"
        if [[ "${url}" == *' | '* ]]
        then
            url="${url%% | *}"
        fi

        if [[ "${sectionUrls[${url}]}" ]]
        then
            # Processed already, skip
            continue
        fi
        sectionUrls["${url}"]=1

        # Breadth-first expansion: keep processing the queue of discovered
        # URLs until no new ones turn up.
        toProcess=("${url}")
        while [[ ${#toProcess[@]} -gt 0 ]]
        do
            curUrl="${toProcess[0]}"
            toProcess=("${toProcess[@]:1}")

            if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
            then
                # Social media URL: extract the profile's website link(s).
                mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
            else
                # Anything else: extract social media links from the website.
                mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
            fi

            for outUrl in "${outUrls[@]}"
            do
                if [[ "${sectionUrls[${outUrl}]}" ]]
                then
                    # The discovered URL was processed already, skip it entirely
                    continue
                else
                    # Not-yet-known URL: add it to the queue, mark it as seen, and print it
                    toProcess+=("${outUrl}")
                    sectionUrls["${outUrl}"]=1
                    echo "* ${outUrl}"
                fi
            done
        done
    fi
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'
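
# The final mawk stage passes non-list lines through unchanged, prints each
# '*' line only on its first occurrence within a section, and clears the seen
# set at every '==' header; -W interactive keeps the output unbuffered.
#
# Example invocation (the script and page file names here are hypothetical):
#   ./wiki-recursive-extract-normalise --verbose < page.wiki > page.extended.wiki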