
Add script for recursive website and social media discovery

master
JustAnotherArchivist committed 4 years ago
commit 5285c406d9
1 changed file with 56 additions and 0 deletions: wiki-recursive-extract

@@ -0,0 +1,56 @@
#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is repeated recursively until no new links are discovered.
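#
# Example invocation (the input shape is inferred from the parsing below:
# '==' section headers and '* URL' or '* URL | label' bullet lines;
# the filenames are placeholders):
#   ./wiki-recursive-extract <wiki-page.txt >wiki-page-expanded.txt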

# Resolve the directory containing this script; the helper scripts invoked
# below are expected to live next to it.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
# URLs already seen within the current wiki section
declare -A sectionUrls
while read -r line
do
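    # Every input line is echoed through unchanged; newly discovered URLs are
    # printed as extra bullets right after the bullet that produced them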
    echo "${line}"
    if [[ "${line}" == '=='* ]]
    then
        # A new wiki section starts: forget the URLs seen in the previous one
        unset sectionUrls
        declare -A sectionUrls
    fi
    if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
    then
        # Strip the leading '* ' and, if present, the ' | label' suffix
        url="${line:2}"
        if [[ "${url}" == *' | '* ]]
        then
            url="${url%% | *}"
        fi

        sectionUrls["${url}"]=1
        toProcess=("${url}")
        # Breadth-first expansion: take URLs off the front of the queue until it is empty
        while [[ ${#toProcess[@]} -gt 0 ]]
        do
            curUrl="${toProcess[0]}"
            toProcess=("${toProcess[@]:1}")

            if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
            then
                echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
                # '< <(:)' supplies an empty stdin so the helper cannot consume the wiki page this loop is reading
                mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
            else
                echo "Calling website-extract-social-media on ${curUrl}" >&2
                # The sed/sort/awk pipeline sorts the output by the scheme- and www-less form of each URL, then restores the full URL
                mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
            fi

            for outUrl in "${outUrls[@]}"
            do
                if [[ "${sectionUrls[${outUrl}]}" ]]
                then
                    # The discovered URL was processed already, skip it entirely
                    continue
                else
                    # Not-yet-known URL: add it to the queue, mark it as seen, and print it
                    toProcess+=("${outUrl}")
                    sectionUrls["${outUrl}"]=1
                    echo "* ${outUrl}"
                fi
            done
        done
    fi
done
