#!/usr/bin/env python3
# Provenance: added by commit b66260ca940a7cb82b308b70232909df4e5b9bee ("Add youtube-extract"),
# JustAnotherArchivist, Wed, 25 Nov 2020 22:07:35 +0000.
import re
import sys


# Usage handling: if any help flag appears anywhere on the command line, print the usage text
# and stop. The text is written to stderr and the process exits with status 1.
if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
    print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
    print(file = sys.stderr)
    print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.", file = sys.stderr)
    print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
    print(file = sys.stderr)
    print("In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
    sys.exit(1)


# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead.

# Channel/user URLs; the protocol and domain are stripped and replaced below.
# The first three alternatives match the explicit /c/, /user/ and /channel/UC... forms.
# The last alternative matches a single path segment directly below the domain, optionally
# followed by a trailing slash, at the end of the input. Names of other YouTube endpoints are
# excluded there; the exclusion uses '/?$' (not just '$') because the closing lookahead accepts
# a trailing slash, so e.g. '//www.youtube.com/embed/' must be rejected just like
# '//www.youtube.com/embed' instead of being reported as a channel called 'embed'.
channelPattern = re.compile('|'.join([
    r'//www\.youtube\.com/c/[^/?]+',
    r'//www\.youtube\.com/user/[^/?]+',
    r'//www\.youtube\.com/channel/UC[^/?]+',
    r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)/?$)[^/?]+(?=/?$)',
    ]))

# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
# If necessary, use lookahead assertions to match further stuff after the video ID.
+videoPattern = re.compile('|'.join([ + # Normal watch URL + r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}', + r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}', + # Embeds + r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}', + r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}', + # Shortener + r'//youtu\.be/[0-9A-Za-z_-]{11}', + # Old (Flash) embeds + r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}', + # Redirects from links in video descriptions + r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)', + # Tracking and other crap + r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}', + r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}', + r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}', + # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed + r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}', + ])) + +noisePattern = re.compile('|'.join([ + r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/', + r'^//www\.youtube\.com/s/gaming/emoji/', + r'^//www\.youtube\.com/redirect\?event=channel_banner&', + r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)', + r'^//www\.youtube\.com/yts/', + r'^//www\.youtube\.com/img/', + r'^//www\.youtube\.com/youtubei/', + r'^//www\.youtube\.com/ads/', + r'^//www\.youtube\.com/(player|iframe)_api\?', + r'^//www\.youtube\.com/error(_204)?/?\?', + r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]', + r'^//www\.youtube\.com/results/?\?', + r'^//www\.youtube\.com/premium/?\?', + r'^//www\.youtube\.com/?(\?|$)', + r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff + r'^//consent\.(youtube|google)\.com/', + ])) + + +matchers = [ + # (pattern, paramSearch, function(match) -> output str or None); returning None stops further processing + [channelPattern, True, lambda m: 'https://www.youtube.com/' + 
m[0].split('/', 3)[-1]], + [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'], + [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{m[0].rsplit("=", 1)[1][2:]}'], + [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={m[0].rsplit("=", 1)[1]}'], + [re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("=", 1)[1]}'], + [re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("/", 1)[1]}'], + [re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{m[0].rsplit("=", 1)[1]}'], + [re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[0].rsplit("=", 1)[1]}'], + [re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'], + [noisePattern, False, lambda m: None], +] + +# Compile second pattern for parameters if needed +for e in matchers: + pattern, paramSearch, f = e + if paramSearch: + p2 = pattern.pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(':', '(:|%3A)').replace(r'\?', r'(\?|%3F)') + p2 = re.sub(r'(?