diff --git a/wiki-recursive-extract b/wiki-recursive-extract-normalise
similarity index 78%
rename from wiki-recursive-extract
rename to wiki-recursive-extract-normalise
index d6e0c9c..e34bba9 100755
--- a/wiki-recursive-extract
+++ b/wiki-recursive-extract-normalise
@@ -3,6 +3,7 @@
 # Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
 # Everything else is run through website-extract-social-media.
 # This is done recursively until no new links are discovered anymore.
+# The output is further fed through url-normalise before, during, and after processing to avoid equivalent but slightly different duplicates.
 
 verbose=
 while [[ $# -gt 0 ]]
@@ -31,7 +32,7 @@ function stderr_annotate {
 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 
 declare -A sectionUrls
-while read -r line
+stderr_annotate "${scriptpath}/url-normalise" ${verbose} | while read -r line
 do
 	echo "${line}"
 	if [[ "${line}" == '=='* ]]
@@ -57,9 +58,9 @@ do
 
 				if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
 				then
-					mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
+					mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
 				else
-					mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
+					mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
 				fi
 
 				for outUrl in "${outUrls[@]}"
@@ -77,4 +78,4 @@ do
 			done
 		done
 	fi
-done
+done | stderr_annotate "${scriptpath}/url-normalise" ${verbose}
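
Note (not part of the patch): below is a minimal Bash sketch of the normalise-early, normalise-late pattern this commit introduces. normalise_url is a hypothetical stand-in for url-normalise, and seen plays the role the patched script gives to sectionUrls; the real url-normalise behaviour is not reproduced here.

#!/bin/bash
# Hypothetical stand-in for url-normalise: lowercase the host and drop a trailing slash,
# so equivalent spellings of the same URL become byte-identical.
function normalise_url {
	while read -r url
	do
		scheme="${url%%//*}"                     # e.g. 'https:'
		host="${url#*//}"; host="${host%%/*}"    # host part only
		rest="${url#*//"${host}"}"               # path and query, if any
		printf '%s//%s%s\n' "${scheme}" "${host,,}" "${rest%/}"
	done
}

declare -A seen
normalise_url | while read -r url     # normalise the input before processing
do
	# Because every URL was normalised first, equivalent duplicates collide here.
	[[ -n "${seen[${url}]}" ]] && continue
	seen[${url}]=1
	echo "${url}"                      # placeholder for the per-URL extraction work
done | normalise_url                  # normalise once more on the way out

As in the patched script, the deduplicating while loop runs in a subshell because it sits in a pipeline, so the seen array only has to stay consistent for the duration of that loop.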