Browse Source

Add youtube-extract-rapid

master
JustAnotherArchivist 2 years ago
parent
commit
360c4d9371
3 changed files with 288 additions and 0 deletions
  1. +95
    -0
      .youtube-extract-rapid-test
  2. +1
    -0
      youtube-extract-rapid
  3. +192
    -0
      youtube-extract-rapid.c

+ 95
- 0
.youtube-extract-rapid-test View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import itertools
import subprocess


def test(input, lines):
    """Run the extractor binary on *input* and check its output.

    input: bytes written to the binary's stdin.
    lines: list of bytes objects, the expected stdout lines without their trailing newline.

    Raises AssertionError if the binary writes anything to stderr, if its output
    does not end with a newline (unless it is empty), or if the lines differ.
    """
    p = subprocess.Popen(
        ['./.make-and-exec-binaries/youtube-extract-rapid'],
        text = False,
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        # stderr must be piped as well; otherwise communicate() returns None for it
        # and the stderr assertion below could never fail.
        stderr = subprocess.PIPE,
    )
    stdout, stderr = p.communicate(input)
    assert not stderr, f'Unexpected stderr output {stderr!r} from {input!r}'
    # Every output line is newline-terminated, so the split leaves one trailing empty element.
    stdout = stdout.split(b'\n')
    assert stdout[-1] == b'' and stdout[:-1] == lines, f'Got {stdout!r} instead of {lines!r} from {input!r}'


def is_id_char(c):
    """Return True if the bytes object *c* is one of the characters allowed in an ID.

    Allowed are ASCII digits, ASCII letters of either case, underscore and hyphen.
    """
    if c in (b'_', b'-'):
        return True
    spans = ((b'0', b'9'), (b'a', b'z'), (b'A', b'Z'))
    return any(low <= c <= high for low, high in spans)


def bytes_range(a, b):
    """Return an iterator over every byte from *a* to *b* (both inclusive), each as a length-1 bytes object."""
    # The bounds are resolved eagerly so that invalid arguments fail at call time, not on first iteration.
    first, last = ord(a), ord(b)
    return (bytes((code,)) for code in range(first, last + 1))


# Sample ID building blocks shared by the cases below
video_id = b'0aA_-1bB_-2'  # 11 ID characters
channel_tail = b'0123456789abcdeFGHIJ_-'  # 22 ID characters
long_tail = b'0123456789abcdefghijABCDEFGHIJ_-'  # 32 ID characters

# Trivial inputs: empty, too short to be reported, and a lone video ID with and without a trailing newline
test(b'', [])
test(b'short\n', [])
test(b'01234567890', [b'v 01234567890'])
test(b'01234567890\n', [b'v 01234567890'])

# Videos: the same ID wrapped in every ordered pair of non-ID bytes, fed as one single input
non_id_bytes = [bytes([value]) for value in range(256) if not is_id_char(bytes([value]))]
chunks = [before + video_id + after for before, after in itertools.product(non_id_bytes, repeat = 2)]
test(b''.join(chunks), [b'v ' + video_id] * len(chunks))

# Channels
for channel in (channel_tail, b'UC' + channel_tail):
    test(channel, [b'c ' + channel])

# Pure playlists
playlists = [
    b'0123456789ABCDEF',
    b'PL0123456789ABCDEF',
    long_tail,
    b'PL' + long_tail,
    b'RDAMVM' + channel_tail,
    b'RDGMEM' + channel_tail,
    b'RDAO' + channel_tail,
    b'RDEM' + channel_tail,
    b'RDKM' + channel_tail,
]
for playlist in playlists:
    test(playlist, [b'p ' + playlist])

# Music playlist madness
music_prefixes = (b'RDCLAK5uy_', b'RDTMAK5uy_', b'OLAK5uy_')
for prefix, c in itertools.product(music_prefixes, bytes_range(b'k', b'n')):
    playlist = prefix + c + long_tail
    test(playlist, [b'p ' + playlist])

# Playlists with video IDs: each must be reported as a playlist followed by the embedded video
video_playlists = [
    prefix + video_id
    for prefix in (b'RD', b'UL', b'EL', b'CL', b'SL', b'LP', b'RDMM', b'RDQM', b'RDEM', b'RDLV', b'RDHC')
]
video_playlists += [
    b'RD' + a + b + video_id
    for a, b in itertools.product(bytes_range(b'0', b'4'), bytes_range(b'0', b'9'))
]
video_playlists.append(b'RDGMEM' + channel_tail + b'VM' + video_id)
for playlist in video_playlists:
    test(playlist, [b'p ' + playlist, b'v ' + video_id])

# Playlists with channel IDs: each must be reported as a playlist followed by the embedded channel
for prefix in (b'UU', b'LL', b'FL', b'PU', b'UUSH'):
    playlist = prefix + channel_tail
    test(playlist, [b'p ' + playlist, b'c ' + channel_tail])
test(b'RDCMUC' + channel_tail, [b'p RDCMUC' + channel_tail, b'c UC' + channel_tail])

# Some particular unrecognised IDs
ids = [
    b'0123456789ABCDEG',
    b'PL0123456789ABCDEG',
    b'RDCLAK5uy_j' + long_tail,
    b'RDCLAK5uy_o' + long_tail,
]
for id_ in ids:
    test(id_, [b'? ' + id_])

# Buffer rollover: place the ID so that it straddles, or starts right at, the end of the first buffer fill.
# BUFFER_SIZE mirrors the constant of the same name in youtube-extract-rapid.c.
BUFFER_SIZE = 1024 * 1024
for offset in range(-11, 1):
    test(b'?' * (BUFFER_SIZE + offset) + video_id, [b'v ' + video_id])

# Max length exceedance: runs longer than the maximum result size must be dropped without any output.
# MAX_RESULT_SIZE mirrors the constant of the same name in youtube-extract-rapid.c.
MAX_RESULT_SIZE = 1024
for excess in range(1, 15):
    test(b'0' * (MAX_RESULT_SIZE + excess), [])

+ 1
- 0
youtube-extract-rapid View File

@@ -0,0 +1 @@
.make-and-exec

+ 192
- 0
youtube-extract-rapid.c View File

@@ -0,0 +1,192 @@
// stdin: YouTube URLs or data with little noise besides that
// stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// DEBUG can be set at compile time (e.g. -DDEBUG=1) to enable tracing on stderr.
#ifndef DEBUG
#define DEBUG 0
#endif
// Prints a message prefixed with file, line and function to stderr if DEBUG is non-zero.
// The fprintf call is always compiled, so format strings are checked even in non-debug builds.
// At least one argument after fmt is required.
#define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)

// Size of the input buffer in bytes. Parenthesised so that the macro behaves as a single value in any expression (e.g. x % BUFFER_SIZE).
#define BUFFER_SIZE (1024 * 1024)
#define MAX_RESULT_SIZE 1024
// MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.

// Parser states: outside any ID, inside a run of ID characters, and discarding an overlong run until it ends.
#define STATE_NONE 0
#define STATE_ID 1
#define STATE_SKIP_UNTIL_NONID 2

// True iff c is a character that may appear in an ID: [0-9A-Za-z_-].
// The argument is evaluated multiple times, so it must not have side effects.
#define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')

// Returns true iff each of the first len bytes at c is an uppercase hexadecimal digit (0-9 or A-F).
// c does not need to be NUL-terminated. An empty range (len == 0) yields true.
// Declared static: a plain C99 `inline` definition provides no external definition,
// so a call that the compiler chooses not to inline (e.g. at -O0) would fail to link.
static inline bool is_upper_hex(const char* c, size_t len) {
    for (size_t i = 0; i < len; i++) {
        const char ch = c[i];
        if (!(('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F')))
            return false;
    }
    return true;
}

// Returns true iff the bytes at id start with the NUL-terminated string prefix.
// The caller must guarantee that id holds at least strlen(prefix) readable bytes.
static bool has_prefix(const char* id, const char* prefix) {
    return memcmp(id, prefix, strlen(prefix)) == 0;
}

// Classifies one complete run of ID characters and prints the result to stdout:
// "v <id>" for videos, "c <id>" for channels, "p <id>" for playlists (followed by a "v" or "c" line
// if the playlist ID embeds a video or channel ID), and "? <id>" for anything unrecognised.
// id points at the first character of the run and need not be NUL-terminated; len is the run length.
// main only passes runs of at least 10 characters, which is what makes the prefix checks below safe.
static void print_match(const char* id, size_t len) {
    // The precision of %.*s must be an int, hence the casts; len is bounded by MAX_RESULT_SIZE.
    if (len == 11) {
        printf("v %.*s\n", (int)len, id);
        return;
    }
    if (len == 22 || (len == 24 && has_prefix(id, "UC"))) {
        printf("c %.*s\n", (int)len, id);
        return;
    }

    // Playlist candidates, some of which contain IDs for channels or videos
    if (len >= 19 && has_prefix(id, "RDAMPL")) {
        // Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
        id += 6;
        len -= 6;
    }

    char kind = 'p';
    char embeddedKind = '\0';
    const char* embedded = NULL;  // Start of a video/channel ID contained in the playlist ID, if any
    int embeddedLen = 0;

    if (len == 32 || (len == 34 && has_prefix(id, "PL"))) {
        // Plain playlist
    } else if ((len == 16 || (len == 18 && has_prefix(id, "PL"))) && is_upper_hex(id + len - 16, 16)) {
        // Plain playlist consisting of 16 uppercase hex digits
    } else if (len == 24 && (has_prefix(id, "UU") || has_prefix(id, "LL") || has_prefix(id, "FL") || has_prefix(id, "PU"))) {
        embeddedKind = 'c';
        embedded = id + 2;
        embeddedLen = 22;
    } else if (len == 26 && has_prefix(id, "UUSH")) {
        embeddedKind = 'c';
        embedded = id + 4;
        embeddedLen = 22;
    } else if (len == 13 && (   has_prefix(id, "RD") || has_prefix(id, "UL") || has_prefix(id, "EL")
                             || has_prefix(id, "CL") || has_prefix(id, "SL") || has_prefix(id, "LP"))) {
        embeddedKind = 'v';
        embedded = id + 2;
        embeddedLen = 11;
    } else if (len == 15 && (   has_prefix(id, "RDMM") || has_prefix(id, "RDQM") || has_prefix(id, "RDEM")
                             || has_prefix(id, "RDLV") || has_prefix(id, "RDHC"))) {
        embeddedKind = 'v';
        embedded = id + 4;
        embeddedLen = 11;
    } else if (len == 15 && has_prefix(id, "RD") && '0' <= id[2] && id[2] <= '4' && '0' <= id[3] && id[3] <= '9') {
        embeddedKind = 'v';
        embedded = id + 4;
        embeddedLen = 11;
    } else if (len == 28 && (has_prefix(id, "RDAMVM") || has_prefix(id, "RDGMEM"))) {
        // Plain playlist
    } else if (len == 28 && has_prefix(id, "RDCMUC")) {
        // The embedded channel ID includes the UC prefix.
        embeddedKind = 'c';
        embedded = id + 4;
        embeddedLen = 24;
    } else if (len == 41 && has_prefix(id, "RDGMEM") && id[28] == 'V' && id[29] == 'M') {
        embeddedKind = 'v';
        embedded = id + 30;
        embeddedLen = 11;
    } else if (len == 26 && (has_prefix(id, "RDAO") || has_prefix(id, "RDEM") || has_prefix(id, "RDKM"))) {
        // Plain playlist
    } else if (len == 43 && 'k' <= id[10] && id[10] <= 'n' && (has_prefix(id, "RDCLAK5uy_") || has_prefix(id, "RDTMAK5uy_"))) {
        // Plain playlist
    } else if (len == 41 && 'k' <= id[8] && id[8] <= 'n' && has_prefix(id, "OLAK5uy_")) {
        // Plain playlist
    } else {
        kind = '?';
    }

    printf("%c %.*s\n", kind, (int)len, id);
    if (embedded != NULL) {
        printf("%c %.*s\n", embeddedKind, embeddedLen, embedded);
    }
}

// Reads stdin in chunks of up to BUFFER_SIZE bytes, finds runs of ID characters, and prints every run of
// 10 to MAX_RESULT_SIZE - 1 characters via print_match. Shorter and longer runs are dropped silently.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if the buffer cannot be allocated or reading stdin fails.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    char* inbuf = malloc(BUFFER_SIZE);
    if (inbuf == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    int state = STATE_NONE;
    size_t stateStart = 0;  // Offset in inbuf at which the current run of ID characters began (only meaningful in STATE_ID)
    size_t inbufEnd = 0;  // Number of bytes at the start of inbuf carried over from the previous iteration

    while (1) {
        debug_print("inbufEnd = %zu, state = %d, stateStart = %zu\n", inbufEnd, state, stateStart);
        debug_print("Reading %zu from stdin\n", BUFFER_SIZE - inbufEnd);
        size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
        debug_print("Got %zu bytes\n", readSize);
        if (readSize == 0) {
            if (inbufEnd == 0) {
                // Nothing read, nothing left from previous iteration. Bye.
                break;
            }
            // No more input data but still something left from the previous read.
            // Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
            // inbufEnd is at most MAX_RESULT_SIZE here, so this write is within the buffer.
            inbuf[inbufEnd] = '\0';
            readSize += 1;
        }

        const size_t dataEnd = inbufEnd + readSize;
        for (size_t p = inbufEnd; p < dataEnd; p++) {
            debug_print("p = %zu, character = %c, state = %d, stateStart = %zu\n", p, inbuf[p], state, stateStart);
            if (state == STATE_ID && p - stateStart >= MAX_RESULT_SIZE) {
                debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
                state = STATE_SKIP_UNTIL_NONID;
                stateStart = 0;
            }
            switch (state) {
                case STATE_NONE:
                    if (IS_ID_CHAR(inbuf[p])) {
                        debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
                        state = STATE_ID;
                        stateStart = p;
                    }
                    break;

                case STATE_ID:
                    if (!IS_ID_CHAR(inbuf[p])) {
                        debug_print("%c is not an ID char\n", inbuf[p]);
                        const size_t len = p - stateStart;
                        if (len >= 10) {
                            debug_print("p = %zu, stateStart = %zu, got %zu ID chars: %.*s\n", p, stateStart, len, (int)len, inbuf + stateStart);
                            print_match(inbuf + stateStart, len);
                        }
                        debug_print("%s\n", "Switching to STATE_NONE");
                        state = STATE_NONE;
                        stateStart = 0;
                    }
                    break;

                case STATE_SKIP_UNTIL_NONID:
                    if (!IS_ID_CHAR(inbuf[p])) {
                        debug_print("%s\n", "Switching to STATE_NONE");
                        state = STATE_NONE;
                    }
                    break;
            }
        }

        if (state == STATE_ID) {
            // Need to keep the trailing part of the buffer for the next iteration.
            // Because the state is left once a run reaches MAX_RESULT_SIZE, dataEnd - stateStart is at most MAX_RESULT_SIZE,
            // and because MAX_RESULT_SIZE < BUFFER_SIZE/2, the pending bytes always fit at the beginning of inbuf.
            // Source and destination can overlap (they are identical when stateStart is 0), so this has to be memmove rather than memcpy.
            const size_t pending = dataEnd - stateStart;
            debug_print("Copying %zu bytes starting from %zu to the beginning of the buffer: %.*s\n",
                        pending, stateStart, (int)pending, inbuf + stateStart);
            memmove(inbuf, inbuf + stateStart, pending);
            inbufEnd = pending;
            stateStart = 0;
        } else {
            debug_print("%s\n", "No buffer copying necessary");
            inbufEnd = 0;
        }
    }

    // fread returns a short count on both EOF and error; tell them apart so that a read failure is not mistaken for a clean end of input.
    const bool readFailed = ferror(stdin) != 0;
    if (readFailed) {
        fprintf(stderr, "Error reading from stdin\n");
    }
    free(inbuf);
    return readFailed ? EXIT_FAILURE : EXIT_SUCCESS;
}

Loading…
Cancel
Save