#!/bin/bash
#
# snscrape-twitter-filter
#
# Provenance: commit ede77ad14204ae2d05847dfe7b8c6400a1a10964
#   Author:  JustAnotherArchivist
#   Date:    Thu, 21 Mar 2019 22:17:41 +0000
#   Subject: Filter Twitter hashtag scrapes based on account scrapes
#
# When scraping accounts and hashtags which have some overlap, this can be
# used to filter out the accounts' tweets from the hashtag scrapes.
#
# Usage: run it in the directory holding the scrapes. Account scrapes are
# read from twitter-@* and hashtag scrapes from twitter-#*.
#
# For every hashtag scrape F two files are written next to it:
#   F-fixed     the lines of F that occur in none of the account scrapes,
#               in sort(1) order
#   F-filtered  the same lines: first those without '/status/', then the
#               '/status/' lines ordered by their 6th '/'-separated field,
#               numerically descending
#
# Files already ending in -fixed, -fixed-sorted or -filtered are treated as
# output of an earlier run and are not processed again.

# An unmatched glob must expand to nothing instead of the literal pattern.
shopt -s nullglob

#######################################
# Filter every hashtag scrape in the current directory.
# Globals:   none
# Arguments: none
# Outputs:   writes F-fixed and F-filtered for each hashtag scrape F;
#            diagnostics go to stderr
# Returns:   0 on success, 1 if filtering failed for at least one scrape
#######################################
filter_hashtag_scrapes() {
  local -a account_files=(twitter-@*)
  local -a hashtag_files=(twitter-'#'*)
  local f
  local rc=0

  # Without account scrapes there is nothing to subtract. Handing sort an
  # empty file keeps it from falling back to reading stdin.
  (( ${#account_files[@]} )) || account_files=(/dev/null)

  for f in "${hashtag_files[@]}"; do
    case "$f" in
      *-fixed | *-fixed-sorted | *-filtered) continue ;;
    esac
    [[ -f "$f" ]] || continue

    # comm -23 keeps the lines unique to the first input; both inputs have to
    # be sorted with the same collation, hence the two sort calls.
    if ! comm -23 <(sort -- "$f") <(sort -- "${account_files[@]}") \
      >"${f}-fixed"; then
      printf 'snscrape-twitter-filter: failed to filter %s\n' "$f" >&2
      rc=1
      continue
    fi

    {
      # grep exits with 1 when nothing matches; that is not an error here.
      grep -vF -- '/status/' "${f}-fixed"
      # A key with its own ordering option (the 'n') does not inherit the
      # global -r, which is why a plain "sort -k6,6n -r" leaves the key order
      # ascending. Putting 'r' on the key reverses the key comparison, and the
      # global -r reverses the last-resort whole-line comparison, so the
      # result is exactly "sort -k6,6n | tac" without the extra process.
      grep -F -- '/status/' "${f}-fixed" | sort -t '/' -r -k6,6nr
    } >"${f}-filtered"
  done

  return "$rc"
}

filter_hashtag_scrapes