From a812cb5fc291e8e22c41a65a15bc965aa8155577 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Mon, 6 May 2019 22:48:36 +0000
Subject: [PATCH] More snscrape helper tools

---
Reviewer note (this notes area is ignored when the patch is applied):
this copy was rebuilt from a version whose line breaks had been collapsed.
Line boundaries were reconstructed from the hunk headers and the diffstat
counts, which all agree. Leading whitespace inside the hunks could not be
recovered; tabs are assumed. If context lines of the two modified scripts
fail to match, apply with `git apply --ignore-whitespace` -- TODO confirm
against the original commit. The author's e-mail address is absent from
the From: header in the recovered copy.

 archivebot-jobid-calculation | 45 ++++++++++++++++++++++++++++++++++++
 snscrape-extract-usernames   |  2 +-
 snscrape-prepare-commands    | 10 ++++++++
 snscrape-tmux                | 23 ++++++++++++++++++
 snscrape-upload              | 24 +++++++++++++------
 5 files changed, 96 insertions(+), 8 deletions(-)
 create mode 100755 archivebot-jobid-calculation
 create mode 100755 snscrape-prepare-commands
 create mode 100755 snscrape-tmux

diff --git a/archivebot-jobid-calculation b/archivebot-jobid-calculation
new file mode 100755
index 0000000..09e951b
--- /dev/null
+++ b/archivebot-jobid-calculation
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+# The SHA1 UUID stuff in Ruby is actually more complicated. Everything's right until the `head -c32`, but then Ruby transforms it into an integer in a quite peculiar way: https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L480-L494
+# Ruby code: ArchiveBot lib/job.rb + https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L688-L691
+# Takes the SHA-1 hash of the namespace (as raw bytes) and the name, truncates it to 32 hex chars, creates a new UUID from it, transforms two fields, converts it to a bigint, and formats it in base-36
+
+# sed/sha1sum/head/bash-based version missing the time_hi_and_version and clock_seq_hi_and_reserved modification
+#{ echo -n '82244de1-c354-4c89-bf2b-f153ce23af43' | sed 's,-,,g' | xxd -r -p; echo -n 'https://transfer.notkiska.pw/sDu6C/marwilliamson-twitter.txt'; } | sha1sum | head -c32 | { read -r hash; BASE36=($(echo {0..9} {a..z})); for i in $(bc <<< "obase=32; ibase=16; ${hash^^}" | tr -d '\\\n'); do echo -n ${BASE36[$((10#$i))]}; done; }; echo
+
+
+import hashlib
+import sys
+import uuid
+
+url = sys.argv[1] # Assume that it's normalised already
+
+# Calculate hash
+h = hashlib.sha1()
+h.update(bytes.fromhex('82244de1-c354-4c89-bf2b-f153ce23af43'.replace('-', '')))
+h.update(url.encode('ascii'))
+h = h.hexdigest()
+
+# Create and transform UUID object
+u = uuid.UUID(h[:32])
+f = list(u.fields)
+f[2] &= 0x0FFF
+f[2] |= (5 << 12)
+f[3] &= 0x3F;
+f[3] |= 0x80;
+
+# Turn it into an int
+#i = (f[0] << 96) + (f[1] << 80) + (f[2] << 64) + (f[3] << 56) + (f[4] << 48) + f[5]
+i = uuid.UUID(fields = f).int
+
+# Convert to base-36
+def int_to_base36(num):
+	# https://stackoverflow.com/a/31746873
+	assert num >= 0
+	digits = '0123456789abcdefghijklmnopqrstuvwxyz'
+	res = ''
+	while not res or num > 0:
+		num, i = divmod(num, 36)
+		res = digits[i] + res
+	return res
+print(int_to_base36(i))
diff --git a/snscrape-extract-usernames b/snscrape-extract-usernames
index a81cb41..da3fe0c 100755
--- a/snscrape-extract-usernames
+++ b/snscrape-extract-usernames
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Extract from stdin social media usernames suitable for snscrape, grouped by service
-grep -Po '(https?://www\.\K(facebook|instagram)\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
+grep -Po '(https?://www\.\Kfacebook\.com/(?!pages/)\S+(?=/)|https?://www\.\Kinstagram\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
 	sed 's,\.com/, ,' |
 	sort |
 	awk '
diff --git a/snscrape-prepare-commands b/snscrape-prepare-commands
new file mode 100755
index 0000000..640d58a
--- /dev/null
+++ b/snscrape-prepare-commands
@@ -0,0 +1,10 @@
+#!/bin/bash
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+
+"${scriptpath}/snscrape-extract-usernames" | while read -r service line
+do
+	if [[ "${service}" == "facebook:" || "${service}" == "instagram:" || "${service}" == "twitter:" ]]
+	then
+		echo "for user in ${line}; do $(printf "%q" "${scriptpath}")/snscrape-${service:0:-1}-user "'"${user}"; done'
+	fi
+done
diff --git a/snscrape-tmux b/snscrape-tmux
new file mode 100755
index 0000000..904dee8
--- /dev/null
+++ b/snscrape-tmux
@@ -0,0 +1,23 @@
+#!/bin/bash
+mkdir -p /tmp/snscrape
+echo 'snscrape-dev' > /tmp/snscrape/.python-version
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+export PATH="${scriptpath}:${PATH}"
+cd /tmp/snscrape
+tmux new -s snscrape \
+	-n "normalise" 'printf "\033]2;%s\033\\" "normalise"; bash' \; \
+	send-keys -t 'snscrape:normalise' 'pyenv_setup' Enter '# xclip -selection c -o | snscrape-normalise' Enter \; \
+	new-window -n "prepare" 'printf "\033]2;%s\033\\" "prepare"; bash' \; \
+	send-keys -t 'snscrape:prepare' '# xclip -selection c -o | snscrape-prepare-commands' Enter \; \
+	new-window -n "scrape" 'printf "\033]2;%s\033\\" "scrape-facebook"; bash' \; \
+	send-keys -t 'snscrape:scrape.1' 'pyenv_setup' Enter '# facebook' Enter \; \
+	split-window -v 'printf "\033]2;%s\033\\" "scrape-instagram"; bash' \; \
+	send-keys -t 'snscrape:scrape.2' 'pyenv_setup' Enter '# instagram' Enter \; \
+	split-window -v 'printf "\033]2;%s\033\\" "scrape-twitter"; bash' \; \
+	send-keys -t 'snscrape:scrape.3' 'pyenv_setup' Enter '# twitter' Enter \; \
+	select-layout -t 'snscrape:scrape' even-vertical \; \
+	new-window -n "upload" 'printf "\033]2;%s\033\\" "upload"; bash' \; \
+	send-keys -t 'snscrape:upload' 'pyenv_setup' Enter '# snscrape-upload' Enter \; \
+	new-window -n "merge" 'printf "\033]2;%s\033\\" "merge"; bash' \; \
+	send-keys -t 'snscrape:merge' '# snscrape-wiki-transfer-merge' Enter \; \
+	new-window -n "cleanup" 'printf "\033]2;%s\033\\" "cleanup"; bash'
diff --git a/snscrape-upload b/snscrape-upload
index 00fc63a..b05b10e 100755
--- a/snscrape-upload
+++ b/snscrape-upload
@@ -1,7 +1,7 @@
 #!/bin/bash
 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 
-insta=
+instagramUrls=()
 
 if [[ -e transfer ]]
 then
@@ -9,6 +9,12 @@ then
 	exit 1
 fi
 
+if ! python3 --version &>/dev/null
+then
+	echo "Error: python3 not found" >&2
+	exit 1
+fi
+
 # ArchiveBot
 for f in "$@"
 do
@@ -23,12 +29,21 @@ do
 	elif [[ "${f}" == instagram-* ]]
 	then
 		echo "!a < ${upurl}"
-		insta=1
+		instagramUrls+=("${upurl}")
 	else
 		echo "!ao < ${upurl}"
 	fi
 done 3>transfer
 
+# Instagram ignores
+if [[ ${#instagramUrls[@]} -gt 0 ]]
+then
+	for url in "${instagramUrls[@]}"
+	do
+		echo "!ig $("${scriptpath}/archivebot-jobid-calculation" "${url}") ^https?://www.instagram.com/.*[?&]hl="
+	done
+fi
+
 # chromebot
 for f in "$@"
 do
@@ -41,9 +56,4 @@ do
 	fi
 done | sed 's,^,chromebot: a ,'
 
-# Instagram ignore warning
-if [[ "${insta}" ]]
-then
-	echo "Don't forget to add the Instagram ignore! ^https?://www.instagram.com/.*[?&]hl=" >&2
-fi
 echo "Wrote ./transfer, you can run snscrape-wiki-transfer-merge now if ./wiki exists." >&2