Browse Source

Add Bing, Reddit/Pushshift, and FoolFuuka scrapers

master
JustAnotherArchivist 1 year ago
parent
commit
fd680551df
3 changed files with 75 additions and 0 deletions
  1. +16
    -0
      bing-scrape
  2. +31
    -0
      foolfuuka-search
  3. +28
    -0
      reddit-pushshift-search

+ 16
- 0
bing-scrape View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Scrape organic result URLs from Bing web search.
# Usage: $0 QUERY [MAX]
#   QUERY - search term; NOTE(review): inserted into the URL as-is, so it
#           presumably must already be URL-encoded by the caller — confirm.
#   MAX   - stop once this many results have been requested (default 10000).
# Output: one result URL per line on stdout, deduplicated in order of first
# appearance; each fetched page URL is logged to stderr for progress tracking.
if [[ $# -lt 1 || $# -gt 2 ]]; then
	echo "Usage: $0 QUERY [MAX]" >&2
	exit 1
fi
q="$1"
declare -i max=10000
if [[ $# -eq 2 ]]; then max="$2"; fi
{
	declare -i first=1
	# The first request uses the bare query; every later request carries Bing's
	# pagination parameters (&first= advances by 10 results per page).
	queryStr="q=${q}"
	while [[ ${first} -lt ${max} ]]
	do
		echo "http://www.bing.com/search?${queryStr}" >&2
		# Spoof a desktop Firefox User-Agent string for the request.
		curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' "http://www.bing.com/search?${queryStr}"
		first+=10
		queryStr="q=${q}&go=Search&qs=ds&first=${first}&FORM=PORE"
		sleep 2  # be polite between requests
	done
# Extract each organic result block (<li class="b_algo">), pull the target
# href out of it — skipping Bing's translator and Office-viewer proxy links —
# and dedupe while preserving order.
} | grep -Po '<li class="b_algo">.*?</li>' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++'

+ 31
- 0
foolfuuka-search View File

@@ -0,0 +1,31 @@
#!/bin/bash
# Search 4chan archives based on FoolFuuka
# Searches each board individually to get as much content as possible due to the 5000 results limit
# Output: one post per line in HTML
# Usage: $0 DOMAIN QUERY
#   DOMAIN - hostname of the FoolFuuka archive (e.g. archive.example.net)
#   QUERY  - search term; NOTE(review): inserted into the URL path unencoded —
#            presumably must be pre-encoded by the caller, confirm.
domain="$1"
q="$2"
# Discover board slugs from the front page: extract the first path segment of
# links pointing at this domain (relative or absolute). The inline sed escapes
# PCRE metacharacters in ${domain} so it can be embedded safely in the regex;
# awk '!seen[$0]++' dedupes board names while preserving order.
curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board
do
# Probe the unpaginated results page first: if the section title contains
# "Returning only", the archive has truncated the result set (5000 cap).
content=$(curl -s "https://${domain}/${board}/search/text/${q}/")
if grep -qP '<h3 class="section_title">.*Returning only' <<<"${content}"
then
echo "Warning: only 5000 results!" >&2
fi

# Walk result pages 1..200 for this board.
declare -i page=1
while [[ ${page} -lt 201 ]]
do
echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/")
# A <div class="alert"> marks the end of results (or an error). Only the
# "No results found" alert is the normal terminator; anything else is logged.
if grep -qF '<div class="alert"' <<<"${content}"
then
# Newlines are stripped first so the non-greedy .*? can span the whole div.
if ! tr -d '\n' <<<"${content}" | grep -Po '<div class="alert".*?</div>' | grep -q 'No results found'
then
echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
fi
break
fi
# Emit each post's <article> element on a single line (newlines removed).
tr -d '\n' <<<"${content}" | grep -Po '<article class="post.*?</article>'
page+=1
done
done

+ 28
- 0
reddit-pushshift-search View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Search all submissions or comments on Reddit for a search term
# Usage: $0 (submission|comment) QUERY
# Output:
# For submissions: post date timestamp, permalink, url or None if it's a selfpost, body in Python-repr format or None if it's a title-only selfpost
# For comments: post date timestamp, permalink, content in Python-repr format
# For comments before 2017-10-24, the Pushshift API doesn't provide a permalink, so that field is filled with "comment_id/parent_id/username" instead.
# Unfortunately, that means that it may be hard to find those comments on Reddit (unless the parent is a thread, i.e. it's a top-level comment).
mode="$1"
q="$2"
# Pagination cursor: start at INT32_MAX so the first request gets the newest items.
before=2147483647
# Back-channel for the pipeline below: create a named pipe, open it read-write
# on fd 3, then unlink the filesystem name — fd 3 survives as an anonymous
# pipe that lets the awk stage (a subshell) pass the last-seen timestamp back
# to this parent loop.
pipe=$(mktemp -u); mkfifo "${pipe}"; exec 3<>"${pipe}"; rm "${pipe}"; unset pipe
while :
do
{
if [[ "${mode}" == "comment" ]]
then
# Fetch one page of comments and format each as: timestamp permalink body-repr.
# Pre-2017-10-24 comments lack "permalink"; parent_id/link_id/author is used instead.
curl -s "https://api.pushshift.io/reddit/search/comment/?q=${q}&size=500&fields=author,body,created_utc,link_id,parent_id,permalink&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %r" % (d["created_utc"], d["permalink"] if "permalink" in d else d["parent_id"] + "/" + d["link_id"] + "/" + d["author"], d["body"]))'
else
# Fetch one page of submissions: timestamp permalink url-or-None selftext-repr-or-None.
curl -s "https://api.pushshift.io/reddit/search/submission/?q=${q}&size=500&fields=author,created_utc,id,is_self,permalink,selftext,url&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %s %s" % (d["created_utc"], d["permalink"], d["url"] if not d["is_self"] else "None", repr(d["selftext"]) if "selftext" in d else "None"))'
fi
# Pass every line through unchanged; remember the first field (timestamp) of
# the last line and, at EOF, write it to fd 3 (0 if the page was empty).
} | awk 'BEGIN { timestamp = 0; } { timestamp=$1; print; } END { print timestamp >"/dev/fd/3" }'
# Read the timestamp awk wrote; it becomes the next page's "before" cursor.
before=$(head -1 <&3)
if [[ ${before} -eq 0 ]] # No data returned by Pushshift
then
break
fi
done

Loading…
Cancel
Save