Browse Source

Refine patterns

master
JustAnotherArchivist 3 years ago
parent
commit
81e2b4b999
1 changed files with 28 additions and 29 deletions
  1. +28
    -29
      youtube-extract

+ 28
- 29
youtube-extract View File

@@ -25,13 +25,35 @@ if mode == 'removenonyt':
assert mode == 'massage'


# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? or (\\)? instead. But really, why would you have backslashes in URLs?
# Alternation of anchored regexes for URLs that are treated as noise.
# The matchers table pairs this pattern with a handler that returns None, which stops further processing of the line.
noisePattern = '|'.join((
    # Static assets
    r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
    r'^//www\.youtube\.com/s/gaming/emoji/',
    # Redirects: channel banner links, and video description links that carry no v= parameter
    r'^//www\.youtube\.com/redirect\?event=channel_banner&',
    r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
    # Fixed path prefixes
    r'^//www\.youtube\.com/yts/',
    r'^//www\.youtube\.com/img/',
    r'^//www\.youtube\.com/youtubei/',
    r'^//www\.youtube\.com/ads(/|$)',
    r'^//www\.youtube\.com/creators(/|$)',
    # Endpoints and pages, with or without a query string as each pattern specifies
    r'^//www\.youtube\.com/(player|iframe)_api(\?|$)',
    r'^//www\.youtube\.com/error(_204)?/?\?',
    r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
    r'^//www\.youtube\.com/results/?(\?|$)',
    r'^//www\.youtube\.com/premium/?\?',
    r'^//www\.youtube\.com/new([/?]|$)',
    # The bare host, optionally followed by '/' and/or a query string
    r'^//www\.youtube\.com/?(\?|$)',
    # embed/ followed by a quote (raw or percent-encoded), an optional %20, then '+' or %3B
    r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)',  # JS extraction stuff
    r'^//www\.youtube\.com/service_ajax$',
    r'^//www\.youtube\.com/watch(\?v=)?$',
    r'^//consent\.(youtube|google)\.com/',
    # Path stubs with nothing after them except an optional '/', %2F or %3F
    r'^//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$',  # Miscellaneous crap
))

# Alternation of regexes that recognise channel-style URLs on www.youtube.com.
# NOTE(review): this listing comes from a commit diff view and appears to interleave the pre-change
# alternatives (the first four) with their post-change replacements (the last four). With all eight
# joined, the looser /c/, /user/ and /channel/ forms are tried first and shadow the stricter ones.
# Confirm which set is intended before relying on this block.
channelPattern = '|'.join([
# /c/<name>, /user/<name>, /channel/UC<id>: the name/id runs up to the first '/', '?' or '&'.
r'//www\.youtube\.com/c/[^/?&]+',
r'//www\.youtube\.com/user/[^/?&]+',
r'//www\.youtube\.com/channel/UC[^/?&]+',
# A single top-level path segment at the end of the string (optional trailing '/'), unless it is one of
# the listed reserved names immediately followed by %2F, %3F or the end of the string.
r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)(%[23]F|$))[^/?&]+(?=/?$)',
# Stricter variants: names additionally exclude '=' and '.'; channel IDs are exactly 22 characters
# from [0-9A-Za-z_-] after the 'UC' prefix.
r'//www\.youtube\.com/c/[^/?&=.]+',
r'//www\.youtube\.com/user/[^/?&=.]+',
r'//www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
# A single top-level path segment without '/', '?', '&', '=' or '.', at the end of the string
# (the optional trailing '/' is checked by lookahead and is not part of the match).
r'//www\.youtube\.com/[^/?&=.]+(?=/?$)',
])

# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
@@ -57,29 +79,6 @@ videoPattern = '|'.join([
r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
])

# Alternation of anchored regexes for URLs that are treated as noise.
# NOTE(review): another assignment to noisePattern is visible earlier in this listing; presumably this
# one is the pre-change copy shown by the diff view — confirm before relying on it.
noisePattern = '|'.join((
    # Static assets
    r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
    r'^//www\.youtube\.com/s/gaming/emoji/',
    # Redirects: channel banner links, and video description links that carry no v= parameter
    r'^//www\.youtube\.com/redirect\?event=channel_banner&',
    r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
    # Fixed path prefixes
    r'^//www\.youtube\.com/yts/',
    r'^//www\.youtube\.com/img/',
    r'^//www\.youtube\.com/youtubei/',
    r'^//www\.youtube\.com/ads(/|$)',
    r'^//www\.youtube\.com/creators(/|$)',
    # Endpoints and pages; the API and results entries require a query string here
    r'^//www\.youtube\.com/(player|iframe)_api\?',
    r'^//www\.youtube\.com/error(_204)?/?\?',
    r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
    r'^//www\.youtube\.com/results/?\?',
    r'^//www\.youtube\.com/premium/?\?',
    r'^//www\.youtube\.com/new([/?]|$)',
    # The bare host, optionally followed by '/' and/or a query string
    r'^//www\.youtube\.com/?(\?|$)',
    # embed/ followed by a quote (raw or percent-encoded), an optional %20, then '+' or %3B
    r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)',  # JS extraction stuff
    r'^//www\.youtube\.com/service_ajax$',
    r'^//www\.youtube\.com/watch(\?v=)?$',
    r'^//consent\.(youtube|google)\.com/',
))


def percentdecode(s):
    """Return *s* with the percent-encodings of '/', ':', '?', '=' and '&' decoded.

    Only the uppercase-hex spellings (%2F, %3A, %3F, %3D, %26) are replaced;
    every other character, including any other percent-escape, is left as is.
    """
    # None of the replacement characters is '%', so one substitution can never
    # create a new escape for a later one to pick up.
    substitutions = (
        ('%2F', '/'),
        ('%3A', ':'),
        ('%3F', '?'),
        ('%3D', '='),
        ('%26', '&'),
    )
    for escaped, literal in substitutions:
        s = s.replace(escaped, literal)
    return s
@@ -89,7 +88,7 @@ matchers = [
# (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
# If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
[noisePattern, False, lambda m: None],
[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]],
[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1].rstrip('/')],
[videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],


Loading…
Cancel
Save