diff --git a/youtube-extract b/youtube-extract index ed39ce0..d4acfe8 100755 --- a/youtube-extract +++ b/youtube-extract @@ -25,13 +25,35 @@ if mode == 'removenonyt': assert mode == 'massage' -# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? or (\\)? instead. But really, why would you have backslashes in URLs? +noisePattern = '|'.join([ + r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/', + r'^//www\.youtube\.com/s/gaming/emoji/', + r'^//www\.youtube\.com/redirect\?event=channel_banner&', + r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)', + r'^//www\.youtube\.com/yts/', + r'^//www\.youtube\.com/img/', + r'^//www\.youtube\.com/youtubei/', + r'^//www\.youtube\.com/ads(/|$)', + r'^//www\.youtube\.com/creators(/|$)', + r'^//www\.youtube\.com/(player|iframe)_api(\?|$)', + r'^//www\.youtube\.com/error(_204)?/?\?', + r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)', + r'^//www\.youtube\.com/results/?(\?|$)', + r'^//www\.youtube\.com/premium/?\?', + r'^//www\.youtube\.com/new([/?]|$)', + r'^//www\.youtube\.com/?(\?|$)', + r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff + r'^//www\.youtube\.com/service_ajax$', + r'^//www\.youtube\.com/watch(\?v=)?$', + r'^//consent\.(youtube|google)\.com/', + r'^//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$', # Miscellaneous crap + ]) channelPattern = '|'.join([ - r'//www\.youtube\.com/c/[^/?&]+', - r'//www\.youtube\.com/user/[^/?&]+', - r'//www\.youtube\.com/channel/UC[^/?&]+', - r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)(%[23]F|$))[^/?&]+(?=/?$)', + r'//www\.youtube\.com/c/[^/?&=.]+', + r'//www\.youtube\.com/user/[^/?&=.]+', + r'//www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}', + r'//www\.youtube\.com/[^/?&=.]+(?=/?$)', ]) # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K). @@ -57,29 +79,6 @@ videoPattern = '|'.join([ r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}', ]) -noisePattern = '|'.join([ - r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/', - r'^//www\.youtube\.com/s/gaming/emoji/', - r'^//www\.youtube\.com/redirect\?event=channel_banner&', - r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)', - r'^//www\.youtube\.com/yts/', - r'^//www\.youtube\.com/img/', - r'^//www\.youtube\.com/youtubei/', - r'^//www\.youtube\.com/ads(/|$)', - r'^//www\.youtube\.com/creators(/|$)', - r'^//www\.youtube\.com/(player|iframe)_api\?', - r'^//www\.youtube\.com/error(_204)?/?\?', - r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)', - r'^//www\.youtube\.com/results/?\?', - r'^//www\.youtube\.com/premium/?\?', - r'^//www\.youtube\.com/new([/?]|$)', - r'^//www\.youtube\.com/?(\?|$)', - r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff - r'^//www\.youtube\.com/service_ajax$', - r'^//www\.youtube\.com/watch(\?v=)?$', - r'^//consent\.(youtube|google)\.com/', - ]) - def percentdecode(s): return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&') @@ -89,7 +88,7 @@ matchers = [ # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function. [noisePattern, False, lambda m: None], - [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]], + [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1].rstrip('/')], [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'], [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'], [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],