
Normalise URLs everywhere to reduce duplicates

master
JustAnotherArchivist 4 years ago
commit 79f0bd4332
1 changed file with 5 additions and 4 deletions
  1. wiki-recursive-extract-normalise: +5 -4

wiki-recursive-extract → wiki-recursive-extract-normalise

@@ -3,6 +3,7 @@
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore.
+# The output is further fed through url-normalise before, during, and after processing to avoid equivalent but slightly different duplicates.

verbose=
while [[ $# -gt 0 ]]
@@ -31,7 +32,7 @@ function stderr_annotate {

scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls
-while read -r line
+stderr_annotate "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
echo "${line}"
if [[ "${line}" == '=='* ]]
@@ -57,9 +58,9 @@ do

if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
-mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
+mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
else
-mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
+mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
fi

for outUrl in "${outUrls[@]}"
@@ -77,4 +78,4 @@ do
done
done
fi
-done
+done | stderr_annotate "${scriptpath}/url-normalise" ${verbose}
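
A rough, self-contained sketch of the idea behind this commit: the duplicate check keyed on the sectionUrls array only catches exact string matches, so every URL stream is canonicalised (before, during, and after processing) to make equivalent spellings collapse into one key. The normalise() function below is a toy stand-in for the repo's url-normalise script; its rules (force https, drop "www.", strip trailing slashes) and the sample URLs are assumptions for illustration only.

#!/bin/bash
# Toy normaliser standing in for url-normalise (assumed behaviour, not the real script).
normalise() {
	sed -E 's|^http://|https://|; s|://www\.|://|; s|/+$||'
}

declare -A seenUrls    # mirrors the sectionUrls bookkeeping in the script above

printf '%s\n' \
	'https://twitter.com/example' \
	'http://www.twitter.com/example/' |
	normalise |
	while read -r url
	do
		# Only process URLs that have not been seen in their canonical form yet.
		if [[ -z "${seenUrls[${url}]:-}" ]]
		then
			seenUrls["${url}"]=1
			echo "${url}"
		fi
	done

# Without the normalise stage, both input spellings pass the seen-URL check and
# get processed separately; with it, only https://twitter.com/example is emitted.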
