
Add script for recursive website and social media discovery

master
JustAnotherArchivist committed 4 years ago
commit 5285c406d9
1 changed file with 56 additions and 0 deletions: wiki-recursive-extract

@@ -0,0 +1,56 @@
#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is repeated recursively until no new links are discovered.
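#
# Example invocation (the input shape is inferred from the parsing below:
# '==' section headers and '* URL' or '* URL | label' bullet lines;
# the filenames are placeholders):
#   ./wiki-recursive-extract <wiki-page.txt >wiki-page-expanded.txt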

# Resolve the directory containing this script; the helper scripts invoked
# below are expected to live next to it.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
# URLs already seen within the current wiki section
declare -A sectionUrls
while read -r line
do
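    # Every input line is echoed through unchanged; newly discovered URLs are
    # printed as extra bullets right after the bullet that produced them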
    echo "${line}"
    if [[ "${line}" == '=='* ]]
    then
        # A new wiki section starts: forget the URLs seen in the previous one
        unset sectionUrls
        declare -A sectionUrls
    fi
    if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
    then
        # Strip the leading '* ' and, if present, the ' | label' suffix
        url="${line:2}"
        if [[ "${url}" == *' | '* ]]
        then
            url="${url%% | *}"
        fi

        sectionUrls["${url}"]=1
        toProcess=("${url}")
        # Breadth-first expansion: take URLs off the front of the queue until it is empty
        while [[ ${#toProcess[@]} -gt 0 ]]
        do
            curUrl="${toProcess[0]}"
            toProcess=("${toProcess[@]:1}")

            if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
            then
                echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
                # '< <(:)' supplies an empty stdin so the helper cannot consume the wiki page this loop is reading
                mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
            else
                echo "Calling website-extract-social-media on ${curUrl}" >&2
                # The sed/sort/awk pipeline sorts the output by the scheme- and www-less form of each URL, then restores the full URL
                mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
            fi

            for outUrl in "${outUrls[@]}"
            do
                if [[ "${sectionUrls[${outUrl}]}" ]]
                then
                    # The discovered URL was processed already, skip it entirely
                    continue
                else
                    # Not-yet-known URL: add it to the queue, mark it as seen, and print it
                    toProcess+=("${outUrl}")
                    sectionUrls["${outUrl}"]=1
                    echo "* ${outUrl}"
                fi
            done
        done
    fi
done
