Add youtube-extract-rapid

2 years ago · 360c4d9371
--- a/.youtube-extract-rapid-test
+++ b/.youtube-extract-rapid-test
@@ -0,0 +1,95 @@
 #!/usr/bin/env python3
 import itertools
 import subprocess


 def test(input, lines):
 	"""Run youtube-extract-rapid on input and assert that it prints exactly the expected lines.

 	input: bytes written to the binary's stdin.
 	lines: list of bytes objects, one per expected stdout line, without the trailing newline.

 	Raises AssertionError if the binary writes anything to stderr or if its stdout differs from lines.
 	"""
 	# stderr must be piped explicitly: without it, communicate() returns None for stderr and the check below could never fail.
 	p = subprocess.Popen(
 		['./.make-and-exec-binaries/youtube-extract-rapid'],
 		text = False,
 		stdin = subprocess.PIPE,
 		stdout = subprocess.PIPE,
 		stderr = subprocess.PIPE,
 	)
 	stdout, stderr = p.communicate(input)
 	assert not stderr, f'Got unexpected stderr output {stderr!r} from {input!r}'
 	# Every output line ends with a newline, so the split must leave exactly one empty trailing element.
 	stdout = stdout.split(b'\n')
 	assert stdout[-1] == b'' and stdout[:-1] == lines, f'Got {stdout!r} instead of {lines!r} from {input!r}'


 def is_id_char(c):
 	"""Tell whether the one-byte bytes object c is an ID character: 0-9, a-z, A-Z, underscore or hyphen."""
 	spans = ((b'0', b'9'), (b'a', b'z'), (b'A', b'Z'))
 	if any(low <= c <= high for low, high in spans):
 		return True
 	return c in (b'_', b'-')


 def bytes_range(a, b):
 	"""Return a lazy iterator over every byte from a through b (both inclusive), each as a one-byte bytes object."""
 	first, last = ord(a), ord(b)
 	return (bytes((code,)) for code in range(first, last + 1))


 # Trivial inputs: nothing, a token that is too short, and a bare video ID with and without a trailing newline
 test(b'', [])
 test(b'short\n', [])
 test(b'01234567890', [b'v 01234567890'])
 test(b'01234567890\n', [b'v 01234567890'])

 # Videos: the ID must be found between every possible pair of non-ID bytes.
 # The set of non-ID bytes is computed once instead of being re-derived for every outer byte.
 non_id_bytes = [c for c in (bytes([x]) for x in range(256)) if not is_id_char(c)]
 video_inputs = [before + b'0aA_-1bB_-2' + after for before in non_id_bytes for after in non_id_bytes]
 test(b''.join(video_inputs), [b'v 0aA_-1bB_-2'] * len(video_inputs))

 # Channels
 test(b'0123456789abcdeFGHIJ_-', [b'c 0123456789abcdeFGHIJ_-'])
 test(b'UC0123456789abcdeFGHIJ_-', [b'c UC0123456789abcdeFGHIJ_-'])

 # Pure playlists
 playlists = [
 	b'0123456789ABCDEF',
 	b'PL0123456789ABCDEF',
 	b'0123456789abcdefghijABCDEFGHIJ_-',
 	b'PL0123456789abcdefghijABCDEFGHIJ_-',
 	b'RDAMVM0123456789abcdeFGHIJ_-',
 	b'RDGMEM0123456789abcdeFGHIJ_-',
 	b'RDAO0123456789abcdeFGHIJ_-',
 	b'RDEM0123456789abcdeFGHIJ_-',
 	b'RDKM0123456789abcdeFGHIJ_-',
 ]
 for playlist in playlists:
 	test(playlist, [b'p ' + playlist])

 # Music playlist madness
 for prefix in (b'RDCLAK5uy_', b'RDTMAK5uy_', b'OLAK5uy_'):
 	for c in bytes_range(b'k', b'n'):
 		playlist = prefix + c + b'0123456789abcdefghijABCDEFGHIJ_-'
 		test(playlist, [b'p ' + playlist])

 # Playlists with video IDs: both the playlist and the embedded video must be reported
 for prefix in (b'RD', b'UL', b'EL', b'CL', b'SL', b'LP', b'RDMM', b'RDQM', b'RDEM', b'RDLV', b'RDHC'):
 	playlist = prefix + b'0aA_-1bB_-2'
 	test(playlist, [b'p ' + playlist, b'v 0aA_-1bB_-2'])
 for a, b in itertools.product(bytes_range(b'0', b'4'), bytes_range(b'0', b'9')):
 	playlist = b'RD' + a + b + b'0aA_-1bB_-2'
 	test(playlist, [b'p ' + playlist, b'v 0aA_-1bB_-2'])
 playlist = b'RDGMEM' + b'0123456789abcdeFGHIJ_-' + b'VM0aA_-1bB_-2'
 test(playlist, [b'p ' + playlist, b'v 0aA_-1bB_-2'])

 # Playlists with channel IDs: both the playlist and the embedded channel must be reported
 for prefix in (b'UU', b'LL', b'FL', b'PU', b'UUSH'):
 	playlist = prefix + b'0123456789abcdeFGHIJ_-'
 	test(playlist, [b'p ' + playlist, b'c 0123456789abcdeFGHIJ_-'])
 test(b'RDCMUC0123456789abcdeFGHIJ_-', [b'p RDCMUC0123456789abcdeFGHIJ_-', b'c UC0123456789abcdeFGHIJ_-'])

 # Some particular unrecognised IDs
 ids = [
 	b'0123456789ABCDEG',
 	b'PL0123456789ABCDEG',
 	b'RDCLAK5uy_j0123456789abcdefghijABCDEFGHIJ_-',
 	b'RDCLAK5uy_o0123456789abcdefghijABCDEFGHIJ_-',
 ]
 for id_ in ids:
 	test(id_, [b'? ' + id_])

 # Buffer rollover: place the ID so that it straddles or directly follows the end of the binary's read buffer.
 # BUFFER_SIZE mirrors the constant of the same name in youtube-extract-rapid.c.
 BUFFER_SIZE = 1024 * 1024
 for offset in range(-11, 1):
 	test(b'?' * (BUFFER_SIZE + offset) + b'0aA_-1bB_-2', [b'v 0aA_-1bB_-2'])

 # Max length exceedance: runs of ID characters longer than the binary's MAX_RESULT_SIZE must produce no output at all.
 MAX_RESULT_SIZE = 1024
 for length in range(MAX_RESULT_SIZE + 1, MAX_RESULT_SIZE + 15):
 	test(b'0' * length, [])
--- a/+ 1
+++ b/+ 1
@@ -0,0 +1 @@
 .make-and-exec
--- a/youtube-extract-rapid.c
+++ b/youtube-extract-rapid.c
@@ -0,0 +1,192 @@
 // stdin: YouTube URLs or data with little noise besides that
 // stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively

 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #ifndef DEBUG
 #define DEBUG 0
 #endif
 // Writes a diagnostic prefixed with file, line and function to stderr when DEBUG is non-zero.
 // The call is always compiled (only the branch is dead when DEBUG is 0), so format strings keep being type-checked.
 // At least one argument after fmt is required.
 #define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)

 // Size of the input buffer in bytes. Parenthesised so that the expansion stays a single operand in expressions such as BUFFER_SIZE / 2 or x % BUFFER_SIZE.
 #define BUFFER_SIZE (1024 * 1024)
 #define MAX_RESULT_SIZE 1024
 // MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.

 // Parser states: outside of any candidate, inside a run of ID characters, and discarding an overlong run.
 #define STATE_NONE 0
 #define STATE_ID 1
 #define STATE_SKIP_UNTIL_NONID 2

 // True if c is one of 0-9, A-Z, a-z, '_' or '-'.
 // The argument is parenthesised so that operator expressions can be passed; it is still evaluated several times, so it must not have side effects.
 #define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')

 // Returns true if each of the first len bytes at c is an uppercase hexadecimal digit (0-9 or A-F); trivially true for len == 0.
 // Declared static: a plain C99 `inline` definition provides no external definition, so the program fails to link whenever the compiler chooses not to inline the call (e.g. at -O0).
 static inline bool is_upper_hex(const char* c, size_t len) {
 	for (size_t i = 0; i < len; i++) {
 		const char ch = c[i];
 		const bool isDigit = '0' <= ch && ch <= '9';
 		const bool isUpperAToF = 'A' <= ch && ch <= 'F';
 		if (!isDigit && !isUpperAToF)
 			return false;
 	}
 	return true;
 }

 // Classifies one complete candidate and prints the matching result line(s) to stdout.
 // id points at the first of len consecutive ID characters; the caller guarantees 10 <= len < MAX_RESULT_SIZE, so the casts to int below cannot overflow.
 // printf's `.*` precision must be an int, hence the explicit casts from size_t.
 static void print_result(char* id, size_t len) {
 	if (len == 11) {
 		printf("v %.*s\n", (int)len, id);
 		return;
 	}
 	if (len == 22 || (len == 24 && memcmp(id, "UC", 2) == 0)) {
 		printf("c %.*s\n", (int)len, id);
 		return;
 	}

 	// Playlist candidates, some of which contain IDs for channels or videos
 	if (len >= 19 && memcmp(id, "RDAMPL", 6) == 0) {
 		// Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
 		id += 6;
 		len -= 6;
 	}

 	if (len == 32 || (len == 34 && memcmp(id, "PL", 2) == 0)) {
 		printf("p %.*s\n", (int)len, id);
 	} else if (   (len == 16 || (len == 18 && memcmp(id, "PL", 2) == 0))
 	           && is_upper_hex(id + len - 16, 16)
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 	} else if (len == 24 && (   memcmp(id, "UU", 2) == 0
 	                         || memcmp(id, "LL", 2) == 0
 	                         || memcmp(id, "FL", 2) == 0
 	                         || memcmp(id, "PU", 2) == 0
 	                        )
 	          ) {
 		// Two-letter prefix followed by a channel ID
 		printf("p %.*s\n", (int)len, id);
 		printf("c %.*s\n", 22, id + 2);
 	} else if (len == 26 && memcmp(id, "UUSH", 4) == 0) {
 		printf("p %.*s\n", (int)len, id);
 		printf("c %.*s\n", 22, id + 4);
 	} else if (len == 13 && (   memcmp(id, "RD", 2) == 0
 	                         || memcmp(id, "UL", 2) == 0
 	                         || memcmp(id, "EL", 2) == 0
 	                         || memcmp(id, "CL", 2) == 0
 	                         || memcmp(id, "SL", 2) == 0
 	                         || memcmp(id, "LP", 2) == 0
 	                        )
 	          ) {
 		// Two-letter prefix followed by a video ID
 		printf("p %.*s\n", (int)len, id);
 		printf("v %.*s\n", 11, id + 2);
 	} else if (   len == 15 && memcmp(id, "RD", 2) == 0
 	           && (   memcmp(id + 2, "MM", 2) == 0
 	               || memcmp(id + 2, "QM", 2) == 0
 	               || memcmp(id + 2, "EM", 2) == 0
 	               || memcmp(id + 2, "LV", 2) == 0
 	               || memcmp(id + 2, "HC", 2) == 0
 	              )
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 		printf("v %.*s\n", 11, id + 4);
 	} else if (   len == 15 && memcmp(id, "RD", 2) == 0
 	           && '0' <= id[2] && id[2] <= '4'
 	           && '0' <= id[3] && id[3] <= '9'
 	          ) {
 		// RD, a two-digit number from 00 to 49, then a video ID
 		printf("p %.*s\n", (int)len, id);
 		printf("v %.*s\n", 11, id + 4);
 	} else if (len == 28 && (   memcmp(id, "RDAMVM", 6) == 0
 	                         || memcmp(id, "RDGMEM", 6) == 0
 	                        )
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 	} else if (len == 28 && memcmp(id, "RDCMUC", 6) == 0) {
 		// The channel ID includes the UC, so it starts at offset 4 and is 24 characters long.
 		printf("p %.*s\n", (int)len, id);
 		printf("c %.*s\n", 24, id + 4);
 	} else if (   len == 41 && memcmp(id, "RDGMEM", 6) == 0
 	           && id[28] == 'V' && id[29] == 'M'
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 		printf("v %.*s\n", 11, id + 30);
 	} else if (len == 26 && (   memcmp(id, "RDAO", 4) == 0
 	                         || memcmp(id, "RDEM", 4) == 0
 	                         || memcmp(id, "RDKM", 4) == 0
 	                        )
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 	} else if (   len == 43 && 'k' <= id[10] && id[10] <= 'n'
 	           && (   memcmp(id, "RDCLAK5uy_", 10) == 0
 	               || memcmp(id, "RDTMAK5uy_", 10) == 0
 	              )
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 	} else if (   len == 41 && 'k' <= id[8] && id[8] <= 'n'
 	           && memcmp(id, "OLAK5uy_", 8) == 0
 	          ) {
 		printf("p %.*s\n", (int)len, id);
 	} else {
 		printf("? %.*s\n", (int)len, id);
 	}
 }

 // Reads stdin in BUFFER_SIZE chunks, scans for runs of ID characters and prints one line per recognised (or unknown) ID.
 // Returns EXIT_FAILURE if the buffer cannot be allocated or reading stdin fails, 0 otherwise.
 int main(int argc, char** argv) {
 	char* inbuf = malloc(sizeof(char) * BUFFER_SIZE);
 	if (inbuf == NULL) {
 		perror("malloc");
 		return EXIT_FAILURE;
 	}
 	int state = STATE_NONE;
 	size_t stateStart = 0; // Offset in inbuf of the first character of the current candidate (only meaningful in STATE_ID)
 	size_t inbufEnd = 0; // Number of bytes at the start of inbuf carried over from the previous iteration

 	while (1) {
 		debug_print("inbufEnd = %zu, state = %d, stateStart = %zu\n", inbufEnd, state, stateStart);
 		debug_print("Reading %zu from stdin\n", BUFFER_SIZE - inbufEnd);
 		size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
 		debug_print("Got %zu bytes\n", readSize);
 		if (readSize == 0) {
 			if (inbufEnd == 0) {
 				// Nothing read, nothing left from previous iteration. Bye.
 				break;
 			} else {
 				// No more input data but still something left from the previous read.
 				// Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
 				// inbufEnd is always well below BUFFER_SIZE here, so this write stays inside the buffer.
 				inbuf[inbufEnd] = '\0';
 				readSize += 1;
 			}
 		}
 		for (size_t p = inbufEnd; p < inbufEnd + readSize; p++) {
 			debug_print("p = %zu, character = %c, state = %d, stateStart = %zu\n", p, inbuf[p], state, stateStart);
 			if (state == STATE_ID && p - stateStart >= MAX_RESULT_SIZE) {
 				debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
 				state = STATE_SKIP_UNTIL_NONID;
 				stateStart = 0;
 			}
 			switch (state) {
 				case STATE_NONE:
 					if (IS_ID_CHAR(inbuf[p])) {
 						debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
 						state = STATE_ID;
 						stateStart = p;
 					}
 					break;

 				case STATE_ID:
 					if (!IS_ID_CHAR(inbuf[p])) {
 						debug_print("%c is not an ID char\n", inbuf[p]);
 						size_t len = p - stateStart;
 						if (len >= 10) {
 							debug_print("p = %zu, stateStart = %zu, got %zu ID chars: %.*s\n", p, stateStart, len, (int)len, inbuf + stateStart);
 							print_result(inbuf + stateStart, len);
 						}
 						debug_print("%s\n", "Switching to STATE_NONE");
 						state = STATE_NONE;
 						stateStart = 0;
 					}
 					break;

 				case STATE_SKIP_UNTIL_NONID:
 					if (!IS_ID_CHAR(inbuf[p])) {
 						debug_print("%s\n", "Switching to STATE_NONE");
 						state = STATE_NONE;
 					}
 					break;
 			}
 		}
 		if (state == STATE_ID) {
 			// A candidate is still open at the end of the buffer, so keep its characters for the next iteration.
 			// The overlong-candidate check in the loop above keeps the kept part at no more than MAX_RESULT_SIZE bytes.
 			// After a short final read the candidate may start near the beginning of the buffer, in which case source and destination overlap; memmove handles that, memcpy would be undefined behaviour.
 			size_t keep = inbufEnd + readSize - stateStart;
 			debug_print("Copying %zu bytes starting from %zu to the beginning of the buffer: %.*s\n",
 			            keep, stateStart, (int)keep, inbuf + stateStart);
 			memmove(inbuf, inbuf + stateStart, keep);
 			inbufEnd = keep;
 			stateStart = 0;
 		} else {
 			debug_print("%s\n", "No buffer copying necessary");
 			inbufEnd = 0;
 		}
 	}

 	// fread returns 0 both on EOF and on error; report the latter instead of silently passing truncated output off as complete.
 	int status = 0;
 	if (ferror(stdin)) {
 		perror("fread");
 		status = EXIT_FAILURE;
 	}
 	free(inbuf);
 	return status;
 }