|
|
@@ -3,6 +3,7 @@ |
|
|
|
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link. |
|
|
|
# Everything else is run through website-extract-social-media. |
|
|
|
# This is repeated recursively until no new links are discovered. |
|
|
|
# URLs are additionally fed through url-normalise before, during, and after processing to avoid duplicates that are equivalent but differ slightly. |
|
|
|
|
|
|
|
verbose= |
|
|
|
while [[ $# -gt 0 ]] |
|
|
@@ -31,7 +32,7 @@ function stderr_annotate { |
|
|
|
|
|
|
|
scriptpath="$(cd "$(dirname "$0")"; pwd -P)" |
|
|
|
declare -A sectionUrls |
|
|
|
while read -r line |
|
|
|
stderr_annotate "${scriptpath}/url-normalise" ${verbose} | while read -r line |
|
|
|
do |
|
|
|
echo "${line}" |
|
|
|
if [[ "${line}" == '=='* ]] |
|
|
@@ -57,9 +58,9 @@ do |
|
|
|
|
|
|
|
if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}" |
|
|
|
then |
|
|
|
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:)) |
|
|
|
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose}) |
|
|
|
else |
|
|
|
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:)) |
|
|
|
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose}) |
|
|
|
fi |
|
|
|
|
|
|
|
for outUrl in "${outUrls[@]}" |
|
|
@@ -77,4 +78,4 @@ do |
|
|
|
done |
|
|
|
done |
|
|
|
fi |
|
|
|
done |
|
|
|
done | stderr_annotate "${scriptpath}/url-normalise" ${verbose} |