Browse Source

Add Bing, Reddit/Pushshift, and FoolFuuka scrapers

master
JustAnotherArchivist 1 year ago
parent
commit
fd680551df
3 changed files with 75 additions and 0 deletions
  1. +16
    -0
      bing-scrape
  2. +31
    -0
      foolfuuka-search
  3. +28
    -0
      reddit-pushshift-search

+ 16
- 0
bing-scrape View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Scrape organic result URLs from Bing web search.
# Usage: $0 QUERY [MAX]
#   QUERY - search term; NOTE(review): inserted into the URL as-is, so it
#           presumably must already be URL-encoded by the caller — confirm.
#   MAX   - stop once this many results have been requested (default 10000).
# Output: one result URL per line on stdout, deduplicated in order of first
# appearance; each fetched page URL is logged to stderr for progress tracking.
if [[ $# -lt 1 || $# -gt 2 ]]; then
	echo "Usage: $0 QUERY [MAX]" >&2
	exit 1
fi
q="$1"
declare -i max=10000
if [[ $# -eq 2 ]]; then max="$2"; fi
{
	declare -i first=1
	# The first request uses the bare query; every later request carries Bing's
	# pagination parameters (&first= advances by 10 results per page).
	queryStr="q=${q}"
	while [[ ${first} -lt ${max} ]]
	do
		echo "http://www.bing.com/search?${queryStr}" >&2
		# Spoof a desktop Firefox User-Agent string for the request.
		curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' "http://www.bing.com/search?${queryStr}"
		first+=10
		queryStr="q=${q}&go=Search&qs=ds&first=${first}&FORM=PORE"
		sleep 2  # be polite between requests
	done
# Extract each organic result block (<li class="b_algo">), pull the target
# href out of it — skipping Bing's translator and Office-viewer proxy links —
# and dedupe while preserving order.
} | grep -Po '<li class="b_algo">.*?</li>' | grep -Po 'href="\Khttps?://(?!www\.microsofttranslator\.com/|view\.officeapps\.live\.com/)[^"]+' | awk '!seen[$0]++'

+ 31
- 0
foolfuuka-search View File

@@ -0,0 +1,31 @@
#!/bin/bash
# Search 4chan archives based on FoolFuuka
# Searches each board individually to get as much content as possible due to the 5000 results limit
# Output: one post per line in HTML
# Usage: $0 DOMAIN QUERY
#   DOMAIN - hostname of the FoolFuuka archive (e.g. archive.example.net)
#   QUERY  - search term; NOTE(review): inserted into the URL path unencoded —
#            presumably must be pre-encoded by the caller, confirm.
domain="$1"
q="$2"
# Discover board slugs from the front page: extract the first path segment of
# links pointing at this domain (relative or absolute). The inline sed escapes
# PCRE metacharacters in ${domain} so it can be embedded safely in the regex;
# awk '!seen[$0]++' dedupes board names while preserving order.
curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board
do
# Probe the unpaginated results page first: if the section title contains
# "Returning only", the archive has truncated the result set (5000 cap).
content=$(curl -s "https://${domain}/${board}/search/text/${q}/")
if grep -qP '<h3 class="section_title">.*Returning only' <<<"${content}"
then
echo "Warning: only 5000 results!" >&2
fi

# Walk result pages 1..200 for this board.
declare -i page=1
while [[ ${page} -lt 201 ]]
do
echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/")
# A <div class="alert"> marks the end of results (or an error). Only the
# "No results found" alert is the normal terminator; anything else is logged.
if grep -qF '<div class="alert"' <<<"${content}"
then
# Newlines are stripped first so the non-greedy .*? can span the whole div.
if ! tr -d '\n' <<<"${content}" | grep -Po '<div class="alert".*?</div>' | grep -q 'No results found'
then
echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
fi
break
fi
# Emit each post's <article> element on a single line (newlines removed).
tr -d '\n' <<<"${content}" | grep -Po '<article class="post.*?</article>'
page+=1
done
done

+ 28
- 0
reddit-pushshift-search View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Search all submissions or comments on Reddit for a search term
# Usage: $0 (submission|comment) QUERY
# Output:
# For submissions: post date timestamp, permalink, url or None if it's a selfpost, body in Python-repr format or None if it's a title-only selfpost
# For comments: post date timestamp, permalink, content in Python-repr format
# For comments before 2017-10-24, the Pushshift API doesn't provide a permalink, so that field is filled with "comment_id/parent_id/username" instead.
# Unfortunately, that means that it may be hard to find those comments on Reddit (unless the parent is a thread, i.e. it's a top-level comment).
mode="$1"
q="$2"
# Pagination cursor: start at INT32_MAX so the first request gets the newest items.
before=2147483647
# Back-channel for the pipeline below: create a named pipe, open it read-write
# on fd 3, then unlink the filesystem name — fd 3 survives as an anonymous
# pipe that lets the awk stage (a subshell) pass the last-seen timestamp back
# to this parent loop.
pipe=$(mktemp -u); mkfifo "${pipe}"; exec 3<>"${pipe}"; rm "${pipe}"; unset pipe
while :
do
{
if [[ "${mode}" == "comment" ]]
then
# Fetch one page of comments and format each as: timestamp permalink body-repr.
# Pre-2017-10-24 comments lack "permalink"; parent_id/link_id/author is used instead.
curl -s "https://api.pushshift.io/reddit/search/comment/?q=${q}&size=500&fields=author,body,created_utc,link_id,parent_id,permalink&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %r" % (d["created_utc"], d["permalink"] if "permalink" in d else d["parent_id"] + "/" + d["link_id"] + "/" + d["author"], d["body"]))'
else
# Fetch one page of submissions: timestamp permalink url-or-None selftext-repr-or-None.
curl -s "https://api.pushshift.io/reddit/search/submission/?q=${q}&size=500&fields=author,created_utc,id,is_self,permalink,selftext,url&before=${before}" | python3 -c 'import json,sys'$'\n''for d in json.loads(sys.stdin.read())["data"]:'$'\n'' print("%d %s %s %s" % (d["created_utc"], d["permalink"], d["url"] if not d["is_self"] else "None", repr(d["selftext"]) if "selftext" in d else "None"))'
fi
# Pass every line through unchanged; remember the first field (timestamp) of
# the last line and, at EOF, write it to fd 3 (0 if the page was empty).
} | awk 'BEGIN { timestamp = 0; } { timestamp=$1; print; } END { print timestamp >"/dev/fd/3" }'
# Read the timestamp awk wrote; it becomes the next page's "before" cursor.
before=$(head -1 <&3)
if [[ ${before} -eq 0 ]] # No data returned by Pushshift
then
break
fi
done

Loading…
Cancel
Save