
Normalise URLs everywhere to reduce duplicates

master
JustAnotherArchivist 4 years ago
commit 79f0bd4332
1 changed file with 5 additions and 4 deletions
  1. wiki-recursive-extract-normalise: +5 -4

wiki-recursive-extract → wiki-recursive-extract-normalise

@@ -3,6 +3,7 @@
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore.
+# The output is further fed through url-normalise before, during, and after processing to avoid equivalent but slightly different duplicates.

verbose=
while [[ $# -gt 0 ]]
@@ -31,7 +32,7 @@ function stderr_annotate {

scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls
-while read -r line
+stderr_annotate "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
echo "${line}"
if [[ "${line}" == '=='* ]]
@@ -57,9 +58,9 @@ do

if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
-mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
+mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
else
-mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
+mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
fi

for outUrl in "${outUrls[@]}"
@@ -77,4 +78,4 @@ do
done
done
fi
-done
+done | stderr_annotate "${scriptpath}/url-normalise" ${verbose}
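
A rough, self-contained sketch of the idea behind this commit: the duplicate check keyed on the sectionUrls array only catches exact string matches, so every URL stream is canonicalised (before, during, and after processing) to make equivalent spellings collapse into one key. The normalise() function below is a toy stand-in for the repo's url-normalise script; its rules (force https, drop "www.", strip trailing slashes) and the sample URLs are assumptions for illustration only.

#!/bin/bash
# Toy normaliser standing in for url-normalise (assumed behaviour, not the real script).
normalise() {
	sed -E 's|^http://|https://|; s|://www\.|://|; s|/+$||'
}

declare -A seenUrls    # mirrors the sectionUrls bookkeeping in the script above

printf '%s\n' \
	'https://twitter.com/example' \
	'http://www.twitter.com/example/' |
	normalise |
	while read -r url
	do
		# Only process URLs that have not been seen in their canonical form yet.
		if [[ -z "${seenUrls[${url}]:-}" ]]
		then
			seenUrls["${url}"]=1
			echo "${url}"
		fi
	done

# Without the normalise stage, both input spellings pass the seen-URL check and
# get processed separately; with it, only https://twitter.com/example is emitted.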
