From 1fa57d41a32c77919252705ecc135ae76db6bbac Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 10 Feb 2020 18:23:36 +0000 Subject: [PATCH] Fix extraction on Wix sites from JSON inside a data attribute Example: https://www.martinedocourt.ch/ --- website-extract-social-media | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website-extract-social-media b/website-extract-social-media index 6d1b877..c566d62 100755 --- a/website-extract-social-media +++ b/website-extract-social-media @@ -10,42 +10,42 @@ function fetch_n_extract { tee \ >( # Facebook - grep -Poi 'facebook\.com/(pages/[^/ <"'"'"']+/|groups/|pg/)?[^/ <"'"'"']+' | \ + grep -Poi 'facebook\.com/(pages/((?!&quot;)[^/ <"'"'"'])+/|groups/|pg/)?((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^,https://www.,' | \ grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr\?' -e '^https://www\.facebook\.com/plugins$' -e '^https://www\.facebook\.com/l\.php\?' | \ grep -Pvi '^https://www\.facebook\.com/share(r(\.php)?)?(\?|$)' ) \ >( # Flickr - grep -Poi 'flickr\.com/photos/[^/ <"'"'"']+' | \ + grep -Poi 'flickr\.com/photos/((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^,https://www.,' ) \ >( # Instagram - grep -Poi 'instagram\.com/(p/)?[^/ <"'"'"']+' | \ + grep -Poi 'instagram\.com/(p/)?((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^,https://www.,' | \ grep -Pvi -e '^https://www\.instagram\.com/v?p$' ) \ >( # Telegram - grep -Poi '//(www\.)?t\.me/[^/ <"'"'"']+' | \ + grep -Poi '//(www\.)?t\.me/((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^//,,; s,^www\.,,; s,^,https://,' ) \ >( # Twitter - grep -Poi 'twitter\.com/(#!/)?(hashtag/)?[^/ <"'"'"']+' | \ + grep -Poi 'twitter\.com/(#!/)?(hashtag/)?((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^twitter\.com/#!/,twitter.com/,; s,^,https://,' | \ grep -vi -e '^https://twitter\.com/home\?' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share\?' 
-e '^https://twitter\.com/intent$' | \ sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,' ) \ >( # VKontakte - grep -Poi 'vk\.com/[^/ <"'"'"']+' | \ + grep -Poi 'vk\.com/((?!&quot;)[^/ <"'"'"'])+' | \ sed 's,^,https://,' ) \ >( # YouTube - grep -Poi '(youtube\.com/((user|channel|embed)/)?[^/ <"'"'"']+|youtu\.be/[^/ <"'"'"']+)' | \ + grep -Poi '(youtube\.com/((user|channel|embed)/)?((?!&quot;)[^/ <"'"'"'])+|youtu\.be/((?!&quot;)[^/ <"'"'"'])+)' | \ awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }' | \ grep -vi -e '^https://www\.youtube\.com/vi$' ) \