From a812cb5fc291e8e22c41a65a15bc965aa8155577 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Mon, 6 May 2019 22:48:36 +0000
Subject: [PATCH] More snscrape helper tools

---
 archivebot-jobid-calculation | 45 ++++++++++++++++++++++++++++++++++++
 snscrape-extract-usernames   |  2 +-
 snscrape-prepare-commands    | 10 ++++++++
 snscrape-tmux                | 23 ++++++++++++++++++
 snscrape-upload              | 24 +++++++++++++------
 5 files changed, 96 insertions(+), 8 deletions(-)
 create mode 100755 archivebot-jobid-calculation
 create mode 100755 snscrape-prepare-commands
 create mode 100755 snscrape-tmux

diff --git a/archivebot-jobid-calculation b/archivebot-jobid-calculation
new file mode 100755
index 0000000..09e951b
--- /dev/null
+++ b/archivebot-jobid-calculation
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+# The SHA1 UUID stuff in Ruby is actually more complicated. Everything's right until the `head -c32`, but then Ruby transforms it into an integer in a quite peculiar way: https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L480-L494
+# Ruby code: ArchiveBot lib/job.rb + https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L688-L691
+# Takes the SHA-1 hash of the namespace (as raw bytes) and the name, truncates it to 32 hex chars, creates a new UUID from it, transforms two fields, converts it to a bigint, and formats it in base-36
+
+# sed/sha1sum/head/bash-based version missing the time_hi_and_version and clock_seq_hi_and_reserved modification
+#{ echo -n '82244de1-c354-4c89-bf2b-f153ce23af43' | sed 's,-,,g' | xxd -r -p; echo -n 'https://transfer.notkiska.pw/sDu6C/marwilliamson-twitter.txt'; } | sha1sum | head -c32 | { read -r hash; BASE36=($(echo {0..9} {a..z})); for i in $(bc <<< "obase=36; ibase=16; ${hash^^}" | tr -d '\\\n'); do echo -n ${BASE36[$((10#$i))]}; done; }; echo
+
+
+import hashlib
+import sys
+import uuid
+
+job_url = sys.argv[1]  # Taken verbatim; the caller is expected to pass an already-normalised URL
+
+# SHA-1 over the 16 raw namespace UUID bytes followed by the URL bytes
+namespace = uuid.UUID('82244de1-c354-4c89-bf2b-f153ce23af43')
+hasher = hashlib.sha1(namespace.bytes)
+hasher.update(job_url.encode('ascii'))
+digest = hasher.hexdigest()
+
+# Build a UUID from the first 32 hex digits of the digest and unpack its six fields
+time_low, time_mid, time_hi_version, clock_seq_hi_variant, clock_seq_low, node = uuid.UUID(digest[:32]).fields
+# time_hi_and_version: keep the low 12 bits and write 5 into the top 4 bits
+time_hi_version = (time_hi_version & 0x0FFF) | (5 << 12)
+# clock_seq_hi_and_reserved: keep the low 6 bits and set the top two bits to 0b10
+clock_seq_hi_variant = (clock_seq_hi_variant & 0x3F) | 0x80
+
+# Turn it into an int
+#i = (time_low << 96) + (time_mid << 80) + (time_hi_version << 64) + (clock_seq_hi_variant << 56) + (clock_seq_low << 48) + node
+transformed_fields = (time_low, time_mid, time_hi_version, clock_seq_hi_variant, clock_seq_low, node)
+i = uuid.UUID(fields=transformed_fields).int
+
+# Convert to base-36 (lowercase digits, most significant first); loop adapted from https://stackoverflow.com/a/31746873
+def int_to_base36(num):
+	if num < 0:
+		raise ValueError('num must be non-negative')  # Explicit raise: an assert would be stripped under python -O
+	digits = '0123456789abcdefghijklmnopqrstuvwxyz'
+	res = ''
+	while not res or num > 0:  # "not res" forces one pass so that 0 yields '0'
+		num, digit = divmod(num, 36)
+		res = digits[digit] + res
+	return res
+print(int_to_base36(i))  # Write the base-36 form of the transformed UUID integer to stdout
diff --git a/snscrape-extract-usernames b/snscrape-extract-usernames
index a81cb41..da3fe0c 100755
--- a/snscrape-extract-usernames
+++ b/snscrape-extract-usernames
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Extract from stdin social media usernames suitable for snscrape, grouped by service
-grep -Po '(https?://www\.\K(facebook|instagram)\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
+grep -Po '(https?://www\.\Kfacebook\.com/(?!pages/)\S+(?=/)|https?://www\.\Kinstagram\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
 	sed 's,\.com/, ,' |
 	sort |
 	awk '
diff --git a/snscrape-prepare-commands b/snscrape-prepare-commands
new file mode 100755
index 0000000..640d58a
--- /dev/null
+++ b/snscrape-prepare-commands
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Reads text on stdin and prints one ready-to-run "for user in ...; do snscrape-SERVICE-user ...; done" loop per supported service.
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+"${scriptpath}/snscrape-extract-usernames" | while read -r service line  # first word -> service, rest of the line -> line
+do
+	if [[ "${service}" == "facebook:" || "${service}" == "instagram:" || "${service}" == "twitter:" ]]
+	then
+		echo "for user in ${line}; do $(printf "%q" "${scriptpath}")/snscrape-${service%:}-user "'"${user}"; done'  # %: strips the trailing colon and, unlike a negative substring length, does not need bash 4.2+
+	fi
+done
diff --git a/snscrape-tmux b/snscrape-tmux
new file mode 100755
index 0000000..904dee8
--- /dev/null
+++ b/snscrape-tmux
@@ -0,0 +1,23 @@
+#!/bin/bash
+mkdir -p /tmp/snscrape || exit 1  # Working directory for the tmux session started below
+echo 'snscrape-dev' > /tmp/snscrape/.python-version  # NOTE(review): presumably picked up by pyenv inside that directory - confirm
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+export PATH="${scriptpath}:${PATH}"  # Lets the shells in the session run the sibling snscrape-* scripts by bare name
+cd /tmp/snscrape || exit 1  # Abort instead of starting the session in whatever directory we were launched from
+tmux new -s snscrape \
+	-n "normalise" 'printf "\033]2;%s\033\\" "normalise"; bash' \; \
+		send-keys -t 'snscrape:normalise' 'pyenv_setup' Enter '# xclip -selection c -o | snscrape-normalise' Enter \; \
+	new-window -n "prepare" 'printf "\033]2;%s\033\\" "prepare"; bash' \; \
+		send-keys -t 'snscrape:prepare' '# xclip -selection c -o | snscrape-prepare-commands' Enter \; \
+	new-window -n "scrape" 'printf "\033]2;%s\033\\" "scrape-facebook"; bash' \; \
+			send-keys -t 'snscrape:scrape.1' 'pyenv_setup' Enter '# facebook' Enter \; \
+		split-window -v 'printf "\033]2;%s\033\\" "scrape-instagram"; bash' \; \
+			send-keys -t 'snscrape:scrape.2' 'pyenv_setup' Enter '# instagram' Enter \; \
+		split-window -v 'printf "\033]2;%s\033\\" "scrape-twitter"; bash' \; \
+			send-keys -t 'snscrape:scrape.3' 'pyenv_setup' Enter '# twitter' Enter \; \
+		select-layout -t 'snscrape:scrape' even-vertical \; \
+	new-window -n "upload" 'printf "\033]2;%s\033\\" "upload"; bash' \; \
+		send-keys -t 'snscrape:upload' 'pyenv_setup' Enter '# snscrape-upload' Enter \; \
+	new-window -n "merge" 'printf "\033]2;%s\033\\" "merge"; bash' \; \
+		send-keys -t 'snscrape:merge' '# snscrape-wiki-transfer-merge' Enter \; \
+	new-window -n "cleanup" 'printf "\033]2;%s\033\\" "cleanup"; bash'
diff --git a/snscrape-upload b/snscrape-upload
index 00fc63a..b05b10e 100755
--- a/snscrape-upload
+++ b/snscrape-upload
@@ -1,7 +1,7 @@
 #!/bin/bash
 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 
-insta=
+instagramUrls=()
 
 if [[ -e transfer ]]
 then
@@ -9,6 +9,12 @@ then
 	exit 1
 fi
 
+# archivebot-jobid-calculation, run below for Instagram uploads, is a python3 script
+if ! python3 --version >/dev/null 2>&1; then
+	echo "Error: python3 not found" >&2
+	exit 1
+fi
+
 # ArchiveBot
 for f in "$@"
 do
@@ -23,12 +29,21 @@ do
 	elif [[ "${f}" == instagram-* ]]
 	then
 		echo "!a < ${upurl}"
-		insta=1
+		instagramUrls+=("${upurl}")
 	else
 		echo "!ao < ${upurl}"
 	fi
 done 3>transfer
 
+# Instagram ignores: print one "!ig JOBID PATTERN" line for every Instagram URL collected above
+if (( ${#instagramUrls[@]} > 0 ))
+then
+	for url in "${instagramUrls[@]}"
+	do
+		printf '!ig %s ^https?://www.instagram.com/.*[?&]hl=\n' "$("${scriptpath}/archivebot-jobid-calculation" "${url}")"
+	done
+fi
+
 # chromebot
 for f in "$@"
 do
@@ -41,9 +56,4 @@ do
 	fi
 done | sed 's,^,chromebot: a ,'
 
-# Instagram ignore warning
-if [[ "${insta}" ]]
-then
-	echo "Don't forget to add the Instagram ignore! ^https?://www.instagram.com/.*[?&]hl=" >&2
-fi
 echo "Wrote ./transfer, you can run snscrape-wiki-transfer-merge now if ./wiki exists." >&2