From fd680551dfd1e7397d05bf7fbaba810e98047340 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sat, 23 Mar 2019 17:21:29 +0000
Subject: [PATCH] Add Bing, Reddit/Pushshift, and FoolFuuka scrapers

---
 bing-scrape             | 16 ++++++++++++++++
 foolfuuka-search        | 31 +++++++++++++++++++++++++++++++
 reddit-pushshift-search | 28 ++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 bing-scrape
 create mode 100644 foolfuuka-search
 create mode 100644 reddit-pushshift-search
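Each script writes progress and warnings to stderr and results to stdout.
Example invocations (arguments illustrative, not prescribed by the scripts;
bing-scrape takes an optional result cap, reddit-pushshift-search a mode of
"comment" or "submission", foolfuuka-search any FoolFuuka-based archive
domain such as desuarchive.org):

	./bing-scrape 'foo' 500
	./foolfuuka-search desuarchive.org foo
	./reddit-pushshift-search comment foo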
diff --git a/bing-scrape b/bing-scrape
new file mode 100644
index 0000000..c4719ba
--- /dev/null
+++ b/bing-scrape
@@ -0,0 +1,16 @@
+#!/bin/bash
+q="$1"
+declare -i max=10000
+if [[ $# -eq 2 ]]; then max=$2; fi
+{
+	declare -i first=1
+	queryStr="q=${q}"
+	while [[ ${first} -lt ${max} ]]
+	do
+		echo "http://www.bing.com/search?${queryStr}" >&2
+		curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' "http://www.bing.com/search?${queryStr}"
+		first+=10
+		queryStr="q=${q}&go=Search&qs=ds&first=${first}&FORM=PORE"
+		sleep 2
+	done
+} | grep -Po '<li class="b_algo">.*?</li>' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++'
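Note on bing-scrape: the query is interpolated into the URL verbatim, so
multi-word queries need to be percent-encoded by the caller. A minimal
wrapper sketch (hypothetical, not part of the patch):

	q=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$1")
	./bing-scrape "${q}" "${2:-10000}"

The two-second sleep and the desktop Firefox User-Agent presumably keep
Bing's rate limiting at bay; the second grep drops Bing's own
microsofttranslator.com and view.officeapps.live.com proxy links, and the
awk filter deduplicates URLs while preserving order.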
  • ' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++' diff --git a/foolfuuka-search b/foolfuuka-search new file mode 100644 index 0000000..8276d4e --- /dev/null +++ b/foolfuuka-search @@ -0,0 +1,31 @@ +#!/bin/bash +# Search 4chan archives based on FoolFuuka +# Searches each board individually to get as much content as possible due to the 5000 results limit +# Output: one post per line in HTML +domain="$1" +q="$2" +curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board +do + content=$(curl -s "https://${domain}/${board}/search/text/${q}/") + if grep -qP '

    .*Returning only' <<<"${content}" + then + echo "Warning: only 5000 results!" >&2 + fi + + declare -i page=1 + while [[ ${page} -lt 201 ]] + do + echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 + content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/") + if grep -qF '
    ' | grep -q 'No results found' + then + echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 + fi + break + fi + tr -d '\n' <<<"${content}" | grep -Po '
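Note on foolfuuka-search: the page cap of 201 matches the 5000-results
limit mentioned in the header comment (200 pages of 25 results each).
Since the output is one <article> element per line, further filtering is a
single grep away; a sketch, assuming stock FoolFuuka markup in which each
post carries a data-thread-num attribute (themes may differ):

	./foolfuuka-search desuarchive.org foo | grep -Po 'data-thread-num="\K[0-9]+' | awk '!seen[$0]++'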
    "${pipe}"; rm "${pipe}"; unset pipe +while : +do + { + if [[ "${mode}" == "comment" ]] + then + curl -s "https://api.pushshift.io/reddit/search/comment/?q=${q}&size=500&fields=author,body,created_utc,link_id,parent_id,permalink&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %r" % (d["created_utc"], d["permalink"] if "permalink" in d else d["parent_id"] + "/" + d["link_id"] + "/" + d["author"], d["body"]))' + else + curl -s "https://api.pushshift.io/reddit/search/submission/?q=${q}&size=500&fields=author,created_utc,id,is_self,permalink,selftext,url&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %s %s" % (d["created_utc"], d["permalink"], d["url"] if not d["is_self"] else "None", repr(d["selftext"]) if "selftext" in d else "None"))' + fi + } | awk 'BEGIN { timestamp = 0; } { timestamp=$1; print; } END { print timestamp >"/dev/fd/3" }' + before=$(head -1 <&3) + if [[ ${before} -eq 0 ]] # No data returned by Pushshift + then + break + fi +done