Browse Source

More snscrape helper tools

master
JustAnotherArchivist 1 year ago
parent
commit
a812cb5fc2
5 changed files with 96 additions and 8 deletions
  1. +45
    -0
      archivebot-jobid-calculation
  2. +1
    -1
      snscrape-extract-usernames
  3. +10
    -0
      snscrape-prepare-commands
  4. +23
    -0
      snscrape-tmux
  5. +17
    -7
      snscrape-upload

+ 45
- 0
archivebot-jobid-calculation View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python3

# The SHA1 UUID stuff in Ruby is actually more complicated. Everything's right until the `head -c32`, but then Ruby transforms it into an integer in a quite peculiar way: https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L480-L494
# Ruby code: ArchiveBot lib/job.rb + https://github.com/sporkmonger/uuidtools/blob/a10724236cefd922ee5cd3de7695fb6e5fd703f5/lib/uuidtools.rb#L688-L691
# Takes the SHA-1 hash of the namespace (as raw bytes) and the name, truncates it to 32 hex chars, creates a new UUID from it, transforms two fields, converts it to a bigint, and formats it in base-36

# Incomplete sed/sha1sum/head/bash-based version; it is missing the time_hi_and_version and clock_seq_hi_and_reserved modification:
#{ echo -n '82244de1-c354-4c89-bf2b-f153ce23af43' | sed 's,-,,g' | xxd -r -p; echo -n 'https://transfer.notkiska.pw/sDu6C/marwilliamson-twitter.txt'; } | sha1sum | head -c32 | { read -r hash; BASE36=($(echo {0..9} {a..z})); for i in $(bc <<< "obase=32; ibase=16; ${hash^^}" | tr -d '\\\n'); do echo -n ${BASE36[$((10#$i))]}; done; }; echo


import hashlib
import sys
import uuid

# URL whose job ID is computed, taken from the first command-line argument
url = sys.argv[1] # Assume that it's normalised already

# Calculate hash: SHA-1 over the 16 raw bytes of the namespace UUID followed by the URL bytes
h = hashlib.sha1()
h.update(bytes.fromhex('82244de1-c354-4c89-bf2b-f153ce23af43'.replace('-', '')))
h.update(url.encode('ascii')) # Raises UnicodeEncodeError if the URL contains non-ASCII characters
h = h.hexdigest() # h is rebound from the hash object to its hex string

# Create and transform UUID object
# Only the first 32 hex digits (128 bits) of the 40-digit SHA-1 digest are used
u = uuid.UUID(h[:32])
f = list(u.fields) # (time_low, time_mid, time_hi_version, clock_seq_hi_variant, clock_seq_low, node)
# Replace the top four bits of time_hi_version with the value 5
f[2] &= 0x0FFF
f[2] |= (5 << 12)
# Force the top two bits of clock_seq_hi_variant to binary 10
f[3] &= 0x3F;
f[3] |= 0x80;

# Turn it into an int
# Manual equivalent of the UUID.int conversion used below, kept for reference:
#i = (f[0] << 96) + (f[1] << 80) + (f[2] << 64) + (f[3] << 56) + (f[4] << 48) + f[5]
i = uuid.UUID(fields = f).int

# Convert to base-36
def int_to_base36(num):
    """Return the base-36 representation of a non-negative integer.

    Digits are ``0``-``9`` followed by lowercase ``a``-``z``; zero is
    rendered as ``'0'``. Adapted from https://stackoverflow.com/a/31746873

    Args:
        num: The non-negative integer to convert.

    Returns:
        The base-36 string, most significant digit first.

    Raises:
        ValueError: If ``num`` is negative.
    """
    # Raise explicitly rather than assert: asserts are stripped under
    # ``python -O``, and a negative value would then silently produce a
    # bogus result instead of failing.
    if num < 0:
        raise ValueError('num must be non-negative')
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    res = ''
    # ``not res`` forces one iteration so that 0 yields '0' rather than ''
    while not res or num > 0:
        num, rem = divmod(num, 36)
        res = digits[rem] + res
    return res
# Print the transformed UUID integer in base-36 on stdout
print(int_to_base36(i))

+ 1
- 1
snscrape-extract-usernames View File

@@ -1,6 +1,6 @@
#!/bin/bash
# Extract from stdin social media usernames suitable for snscrape, grouped by service
grep -Po '(https?://www\.\K(facebook|instagram)\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
grep -Po '(https?://www\.\Kfacebook\.com/(?!pages/)\S+(?=/)|https?://www\.\Kinstagram\.com/\S+(?=/)|https?://\Ktwitter\.com/\S+)' |
sed 's,\.com/, ,' |
sort |
awk '


+ 10
- 0
snscrape-prepare-commands View File

@@ -0,0 +1,10 @@
#!/bin/bash
# Read the output of snscrape-extract-usernames and, for every line whose first
# word is 'facebook:', 'instagram:' or 'twitter:', print a shell 'for' loop that
# runs the matching snscrape-<service>-user script for each remaining word on
# that line. Lines for any other first word are dropped.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

# Shell-quoted form of the script directory, safe to embed in the printed command
quotedpath="$(printf "%q" "${scriptpath}")"

"${scriptpath}/snscrape-extract-usernames" | while read -r service line
do
	case "${service}" in
		facebook:|instagram:|twitter:)
			# ${service%:} drops the trailing colon to form the script name
			echo "for user in ${line}; do ${quotedpath}/snscrape-${service%:}-user "'"${user}"; done'
			;;
	esac
done

+ 23
- 0
snscrape-tmux View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Start a tmux session named 'snscrape' in /tmp/snscrape with these windows:
# normalise, prepare, scrape (three panes in an even-vertical layout for
# facebook, instagram and twitter), upload, merge, and cleanup.
#
# Every window/pane first prints an OSC 2 escape sequence carrying its label
# (presumably to set the pane title -- confirm against the tmux setup in use)
# and then runs an interactive bash. send-keys then types into those shells:
# 'pyenv_setup' where listed, followed by a '#'-prefixed hint line naming the
# command or service meant for that window. The hint is typed as a shell
# comment, so it is not executed.
#
# pyenv_setup is not defined in this script; presumably it is provided by the
# user's shell startup files -- TODO confirm.
#
# NOTE(review): the targets 'snscrape:scrape.1' to 'snscrape:scrape.3' look like
# they rely on tmux's pane-base-index being set to 1 (the tmux default is 0) --
# confirm against the tmux configuration this is run with.
mkdir -p /tmp/snscrape
# Presumably read by pyenv to select the 'snscrape-dev' environment -- TODO confirm
echo 'snscrape-dev' > /tmp/snscrape/.python-version
# Resolve the directory containing this script and put it first on PATH
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
export PATH="${scriptpath}:${PATH}"
cd /tmp/snscrape
# No comments may be inserted below: the whole tmux invocation is one
# backslash-continued command.
tmux new -s snscrape \
-n "normalise" 'printf "\033]2;%s\033\\" "normalise"; bash' \; \
send-keys -t 'snscrape:normalise' 'pyenv_setup' Enter '# xclip -selection c -o | snscrape-normalise' Enter \; \
new-window -n "prepare" 'printf "\033]2;%s\033\\" "prepare"; bash' \; \
send-keys -t 'snscrape:prepare' '# xclip -selection c -o | snscrape-prepare-commands' Enter \; \
new-window -n "scrape" 'printf "\033]2;%s\033\\" "scrape-facebook"; bash' \; \
send-keys -t 'snscrape:scrape.1' 'pyenv_setup' Enter '# facebook' Enter \; \
split-window -v 'printf "\033]2;%s\033\\" "scrape-instagram"; bash' \; \
send-keys -t 'snscrape:scrape.2' 'pyenv_setup' Enter '# instagram' Enter \; \
split-window -v 'printf "\033]2;%s\033\\" "scrape-twitter"; bash' \; \
send-keys -t 'snscrape:scrape.3' 'pyenv_setup' Enter '# twitter' Enter \; \
select-layout -t 'snscrape:scrape' even-vertical \; \
new-window -n "upload" 'printf "\033]2;%s\033\\" "upload"; bash' \; \
send-keys -t 'snscrape:upload' 'pyenv_setup' Enter '# snscrape-upload' Enter \; \
new-window -n "merge" 'printf "\033]2;%s\033\\" "merge"; bash' \; \
send-keys -t 'snscrape:merge' '# snscrape-wiki-transfer-merge' Enter \; \
new-window -n "cleanup" 'printf "\033]2;%s\033\\" "cleanup"; bash'

+ 17
- 7
snscrape-upload View File

@@ -1,7 +1,7 @@
#!/bin/bash
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

insta=
instagramUrls=()

if [[ -e transfer ]]
then
@@ -9,6 +9,12 @@ then
exit 1
fi

if ! python3 --version &>/dev/null
then
echo "Error: python3 not found" >&2
exit 1
fi

# ArchiveBot
for f in "$@"
do
@@ -23,12 +29,21 @@ do
elif [[ "${f}" == instagram-* ]]
then
echo "!a < ${upurl}"
insta=1
instagramUrls+=("${upurl}")
else
echo "!ao < ${upurl}"
fi
done 3>transfer

# Instagram ignores
if [[ ${#instagramUrls[@]} -gt 0 ]]
then
for url in "${instagramUrls[@]}"
do
echo "!ig $("${scriptpath}/archivebot-jobid-calculation" "${url}") ^https?://www.instagram.com/.*[?&]hl="
done
fi

# chromebot
for f in "$@"
do
@@ -41,9 +56,4 @@ do
fi
done | sed 's,^,chromebot: a ,'

# Instagram ignore warning
if [[ "${insta}" ]]
then
echo "Don't forget to add the Instagram ignore! ^https?://www.instagram.com/.*[?&]hl=" >&2
fi
echo "Wrote ./transfer, you can run snscrape-wiki-transfer-merge now if ./wiki exists." >&2

Loading…
Cancel
Save