The little things give you away... A collection of various small helper scripts
#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered.
# All URLs are run through url-normalise before and during processing to avoid duplicates that are equivalent but written slightly differently, and the output is deduplicated within each section at the end.
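
# Option parsing: the only supported option is --verbose/-v, which is also passed through to the helper scripts.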
verbose=
while [[ $# -gt 0 ]]
do
	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
	then
		verbose='--verbose'
	else
		echo "Unknown option: $1" >&2
		exit 1
	fi
	shift
done
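
# Echo the arguments only if --verbose was given.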
function verbose_echo {
	if [[ "${verbose}" ]]
	then
		echo "$@"
	fi
}
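
# Run a command, prefixing each of its stderr lines with '[name]'; an empty name defaults to the command's basename.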
function stderr_annotate {
	name="$1"
	shift
	if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}
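
# Resolve the directory containing this script so the sibling helper scripts can be called.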
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
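# URLs already processed in the current wiki section (associative array used as a set).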
declare -A sectionUrls
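
# Normalise the incoming wiki page, then process it line by line.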
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
	echo "${line}"
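	# A heading ('== ... ==') starts a new wiki section; reset the per-section URL set.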
	if [[ "${line}" == '=='* ]]
	then
		verbose_echo "${line}" >&2
		unset sectionUrls
		declare -A sectionUrls
	fi
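	# List entries look like '* URL' or '* URL | comment'; extract the bare URL.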
	if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
	then
		url="${line:2}"
		if [[ "${url}" == *' | '* ]]
		then
			url="${url%% | *}"
		fi
		if [[ "${sectionUrls[${url}]}" ]]
		then
			# Processed already, skip
			continue
		fi
		sectionUrls["${url}"]=1
		toProcess=("${url}")
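		# Breadth-first traversal: pop one URL at a time off the queue and follow newly discovered links.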
		while [[ ${#toProcess[@]} -gt 0 ]]
		do
			curUrl="${toProcess[0]}"
			toProcess=("${toProcess[@]:1}")
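			# Known social media hosts go through profile-link extraction, everything else through
			# website social-media extraction; both results are normalised again.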
			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
			then
				mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
			else
				mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
			fi
			for outUrl in "${outUrls[@]}"
			do
				if [[ "${sectionUrls[${outUrl}]}" ]]
				then
					# The discovered URL was processed already, skip it entirely
					continue
				else
					# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
					toProcess+=("${outUrl}")
					sectionUrls["${outUrl}"]=1
					echo "* ${outUrl}"
				fi
			done
		done
	fi
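# Final dedup stage: print non-list lines as-is, print each '*' line only once per section, and
# reset the seen set at every '==' heading. mawk's '-W interactive' keeps the output unbuffered.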
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'