The little things give you away... A collection of various small helper stuff
You can't select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

84 lines
2.5 KiB

  1. #!/bin/bash
  2. # Takes a wiki page in new-style viewer format on stdin.
  3. # Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
  4. # Everything else is run through website-extract-social-media.
  5. # This is done recursively until no new links are discovered anymore.
  6. # The output is further fed through url-normalise before and during processing to avoid equivalent but slightly different duplicates, and the output is deduplicated within each section at the end.
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--verbose' || "$1" == '-v' ]]
  11. then
  12. verbose='--verbose'
  13. else
  14. echo "Unknown option: $1" >&2
  15. exit 1
  16. fi
  17. shift
  18. done
  19. function verbose_echo {
  20. if [[ "${verbose}" ]]
  21. then
  22. echo "$@"
  23. fi
  24. }
  25. function stderr_annotate {
  26. name="$1"
  27. shift
  28. if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
  29. "$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
  30. }
# Physical absolute directory of this script, so the sibling helper scripts
# can be invoked regardless of the caller's working directory.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
# Set of URLs already emitted for the current wiki section (keys are URLs).
# NOTE(review): the while loop below runs in a pipeline subshell, so this
# outer declaration is shadowed there; the loop re-declares it per section.
declare -A sectionUrls
# Canonicalise the incoming page first so the duplicate checks below compare
# equal URLs. ${verbose} is deliberately unquoted: empty means "no extra arg".
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
# Every input line is passed through unchanged; discovered links follow it.
echo "${line}"
# A '== heading ==' line starts a new section: reset the per-section URL set.
if [[ "${line}" == '=='* ]]
then
verbose_echo "${line}" >&2
unset sectionUrls
declare -A sectionUrls
fi
# List items of the form '* http(s)://…' are link entries to be expanded.
if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
then
# Strip the leading '* '; drop an optional ' | title' suffix as well.
url="${line:2}"
if [[ "${url}" == *' | '* ]]
then
url="${url%% | *}"
fi
sectionUrls["${url}"]=1
# Breadth-first work queue of URLs still to be expanded for this entry.
toProcess=("${url}")
while [[ ${#toProcess[@]} -gt 0 ]]
do
# Pop the first queue element.
curUrl="${toProcess[0]}"
toProcess=("${toProcess[@]:1}")
# Social-media URLs get profile-link extraction; everything else is scanned
# for social-media links. Output is normalised again so it matches the keys
# stored in sectionUrls. '< <(:)' supplies an empty stdin so the helper
# cannot consume the wiki page being read by the outer loop.
if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
else
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
fi
for outUrl in "${outUrls[@]}"
do
if [[ "${sectionUrls[${outUrl}]}" ]]
then
# The discovered URL was processed already, skip it entirely
continue
else
# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
toProcess+=("${outUrl}")
sectionUrls["${outUrl}"]=1
echo "* ${outUrl}"
fi
done
done
fi
# Final pass: deduplicate '*' link lines within each '==' section; mawk's
# '-W interactive' keeps output line-buffered so results stream promptly.
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'