From fd680551dfd1e7397d05bf7fbaba810e98047340 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sat, 23 Mar 2019 17:21:29 +0000
Subject: [PATCH] Add Bing, Reddit/Pushshift, and FoolFuuka scrapers

---
 bing-scrape             | 16 ++++++++++++++++
 foolfuuka-search        | 31 +++++++++++++++++++++++++++++++
 reddit-pushshift-search | 28 ++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 bing-scrape
 create mode 100644 foolfuuka-search
 create mode 100644 reddit-pushshift-search
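Each script writes progress and warnings to stderr and results to stdout.
Example invocations (arguments illustrative, not prescribed by the scripts;
bing-scrape takes an optional result cap, reddit-pushshift-search a mode of
"comment" or "submission", foolfuuka-search any FoolFuuka-based archive
domain such as desuarchive.org):

	./bing-scrape 'foo' 500
	./foolfuuka-search desuarchive.org foo
	./reddit-pushshift-search comment foo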
diff --git a/bing-scrape b/bing-scrape
new file mode 100644
index 0000000..c4719ba
--- /dev/null
+++ b/bing-scrape
@@ -0,0 +1,16 @@
+#!/bin/bash
+q="$1"
+declare -i max=10000
+if [[ $# -eq 2 ]]; then max=$2; fi
+{
+	declare -i first=1
+	queryStr="q=${q}"
+	while [[ ${first} -lt ${max} ]]
+	do
+		echo "http://www.bing.com/search?${queryStr}" >&2
+		curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' "http://www.bing.com/search?${queryStr}"
+		first+=10
+		queryStr="q=${q}&go=Search&qs=ds&first=${first}&FORM=PORE"
+		sleep 2
+	done
+} | grep -Po '<li class="b_algo">.*?</li>' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++'
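Note on bing-scrape: the query is interpolated into the URL verbatim, so
multi-word queries need to be percent-encoded by the caller. A minimal
wrapper sketch (hypothetical, not part of the patch):

	q=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$1")
	./bing-scrape "${q}" "${2:-10000}"

The two-second sleep and the desktop Firefox User-Agent presumably keep
Bing's rate limiting at bay; the second grep drops Bing's own
microsofttranslator.com and view.officeapps.live.com proxy links, and the
awk filter deduplicates URLs while preserving order.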
  • ' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++' diff --git a/foolfuuka-search b/foolfuuka-search new file mode 100644 index 0000000..8276d4e --- /dev/null +++ b/foolfuuka-search @@ -0,0 +1,31 @@ +#!/bin/bash +# Search 4chan archives based on FoolFuuka +# Searches each board individually to get as much content as possible due to the 5000 results limit +# Output: one post per line in HTML +domain="$1" +q="$2" +curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board +do + content=$(curl -s "https://${domain}/${board}/search/text/${q}/") + if grep -qP '

    .*Returning only' <<<"${content}" + then + echo "Warning: only 5000 results!" >&2 + fi + + declare -i page=1 + while [[ ${page} -lt 201 ]] + do + echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 + content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/") + if grep -qF '
    ' | grep -q 'No results found' + then + echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 + fi + break + fi + tr -d '\n' <<<"${content}" | grep -Po '
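Note on foolfuuka-search: the page cap of 201 matches the 5000-results
limit mentioned in the header comment (200 pages of 25 results each).
Since the output is one <article> element per line, further filtering is a
single grep away; a sketch, assuming stock FoolFuuka markup in which each
post carries a data-thread-num attribute (themes may differ):

	./foolfuuka-search desuarchive.org foo | grep -Po 'data-thread-num="\K[0-9]+' | awk '!seen[$0]++'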
    "${pipe}"; rm "${pipe}"; unset pipe +while : +do + { + if [[ "${mode}" == "comment" ]] + then + curl -s "https://api.pushshift.io/reddit/search/comment/?q=${q}&size=500&fields=author,body,created_utc,link_id,parent_id,permalink&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %r" % (d["created_utc"], d["permalink"] if "permalink" in d else d["parent_id"] + "/" + d["link_id"] + "/" + d["author"], d["body"]))' + else + curl -s "https://api.pushshift.io/reddit/search/submission/?q=${q}&size=500&fields=author,created_utc,id,is_self,permalink,selftext,url&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %s %s" % (d["created_utc"], d["permalink"], d["url"] if not d["is_self"] else "None", repr(d["selftext"]) if "selftext" in d else "None"))' + fi + } | awk 'BEGIN { timestamp = 0; } { timestamp=$1; print; } END { print timestamp >"/dev/fd/3" }' + before=$(head -1 <&3) + if [[ ${before} -eq 0 ]] # No data returned by Pushshift + then + break + fi +done