Browse Source

Fix extraction on Wix sites from JSON inside a data attribute

Example: https://www.martinedocourt.ch/
master
JustAnotherArchivist 4 years ago
parent
commit
1fa57d41a3
1 changed file with 7 additions and 7 deletions
  1. website-extract-social-media (+7, -7)

+ 7
- 7
website-extract-social-media View File

@@ -10,42 +10,42 @@ function fetch_n_extract {
tee \ tee \
>( >(
# Facebook # Facebook
grep -Poi 'facebook\.com/(pages/[^/ <"'"'"']+/|groups/|pg/)?[^/ <"'"'"']+' | \
grep -Poi 'facebook\.com/(pages/((?!&quot;)[^/ <"'"'"'])+/|groups/|pg/)?((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^,https://www.,' | \ sed 's,^,https://www.,' | \
grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr\?' -e '^https://www\.facebook\.com/plugins$' -e '^https://www\.facebook\.com/l\.php\?' | \ grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr\?' -e '^https://www\.facebook\.com/plugins$' -e '^https://www\.facebook\.com/l\.php\?' | \
grep -Pvi '^https://www\.facebook\.com/share(r(\.php)?)?(\?|$)' grep -Pvi '^https://www\.facebook\.com/share(r(\.php)?)?(\?|$)'
) \ ) \
>( >(
# Flickr # Flickr
grep -Poi 'flickr\.com/photos/[^/ <"'"'"']+' | \
grep -Poi 'flickr\.com/photos/((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^,https://www.,' sed 's,^,https://www.,'
) \ ) \
>( >(
# Instagram # Instagram
grep -Poi 'instagram\.com/(p/)?[^/ <"'"'"']+' | \
grep -Poi 'instagram\.com/(p/)?((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^,https://www.,' | \ sed 's,^,https://www.,' | \
grep -Pvi -e '^https://www\.instagram\.com/v?p$' grep -Pvi -e '^https://www\.instagram\.com/v?p$'
) \ ) \
>( >(
# Telegram # Telegram
grep -Poi '//(www\.)?t\.me/[^/ <"'"'"']+' | \
grep -Poi '//(www\.)?t\.me/((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^//,,; s,^www\.,,; s,^,https://,' sed 's,^//,,; s,^www\.,,; s,^,https://,'
) \ ) \
>( >(
# Twitter # Twitter
grep -Poi 'twitter\.com/(#!/)?(hashtag/)?[^/ <"'"'"']+' | \
grep -Poi 'twitter\.com/(#!/)?(hashtag/)?((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^twitter\.com/#!/,twitter.com/,; s,^,https://,' | \ sed 's,^twitter\.com/#!/,twitter.com/,; s,^,https://,' | \
grep -vi -e '^https://twitter\.com/home\?' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share\?' -e '^https://twitter\.com/intent$' | \ grep -vi -e '^https://twitter\.com/home\?' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share\?' -e '^https://twitter\.com/intent$' | \
sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,' sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,'
) \ ) \
>( >(
# VKontakte # VKontakte
grep -Poi 'vk\.com/[^/ <"'"'"']+' | \
grep -Poi 'vk\.com/((?!&quot;)[^/ <"'"'"'])+' | \
sed 's,^,https://,' sed 's,^,https://,'
) \ ) \
>( >(
# YouTube # YouTube
grep -Poi '(youtube\.com/((user|channel|embed)/)?[^/ <"'"'"']+|youtu\.be/[^/ <"'"'"']+)' | \
grep -Poi '(youtube\.com/((user|channel|embed)/)?((?!&quot;)[^/ <"'"'"'])+|youtu\.be/((?!&quot;)[^/ <"'"'"'])+)' | \
awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }' | \ awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }' | \
grep -vi -e '^https://www\.youtube\.com/vi$' grep -vi -e '^https://www\.youtube\.com/vi$'
) \ ) \


Loading…
Cancel
Save