The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

131 lines
6.3 KiB

  1. #!/usr/bin/env python3
  2. import itertools
  3. import os
  4. import re
  5. import sys
  6. if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
  7. print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
  8. print(file = sys.stderr)
  9. print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.", file = sys.stderr)
  10. print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
  11. print(file = sys.stderr)
  12. print("In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
  13. sys.exit(1)
  14. mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage'
  15. if mode == 'removenonyt':
  16. # Anything in here could never be as fast as grep, so just delegate to that...
  17. os.execlp('grep', 'grep', '-F', '-e', '/www.youtube.com/', '-e', '/youtu.be/', '-e', '%2Fwww.youtube.com%2F', '-e', '%2Fyoutu.be%2F')
  18. sys.exit(0)
  19. assert mode == 'massage'
  20. # For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? or (\\)? instead. But really, why would you have backslashes in URLs?
  21. channelPattern = '|'.join([
  22. r'//www\.youtube\.com/c/[^/?&]+',
  23. r'//www\.youtube\.com/user/[^/?&]+',
  24. r'//www\.youtube\.com/channel/UC[^/?&]+',
  25. r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)(%[23]F|$))[^/?&]+(?=/?$)',
  26. ])
  27. # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
  28. # If necessary, use lookahead assertions to match further stuff after the video ID.
  29. videoPattern = '|'.join([
  30. # Normal watch URL
  31. r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  32. r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
  33. # Embeds
  34. r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
  35. r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  36. # Shortener
  37. r'//youtu\.be/[0-9A-Za-z_-]{11}',
  38. # Old (Flash) embeds
  39. r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
  40. # Redirects from links in video descriptions
  41. r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
  42. # Tracking and other crap
  43. r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  44. r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
  45. r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  46. # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
  47. r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
  48. ])
  49. noisePattern = '|'.join([
  50. r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
  51. r'^//www\.youtube\.com/s/gaming/emoji/',
  52. r'^//www\.youtube\.com/redirect\?event=channel_banner&',
  53. r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
  54. r'^//www\.youtube\.com/yts/',
  55. r'^//www\.youtube\.com/img/',
  56. r'^//www\.youtube\.com/youtubei/',
  57. r'^//www\.youtube\.com/ads(/|$)',
  58. r'^//www\.youtube\.com/creators(/|$)',
  59. r'^//www\.youtube\.com/(player|iframe)_api\?',
  60. r'^//www\.youtube\.com/error(_204)?/?\?',
  61. r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
  62. r'^//www\.youtube\.com/results/?\?',
  63. r'^//www\.youtube\.com/premium/?\?',
  64. r'^//www\.youtube\.com/new([/?]|$)',
  65. r'^//www\.youtube\.com/?(\?|$)',
  66. r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
  67. r'^//www\.youtube\.com/service_ajax$',
  68. r'^//www\.youtube\.com/watch(\?v=)?$',
  69. r'^//consent\.(youtube|google)\.com/',
  70. ])
  71. def percentdecode(s):
  72. return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&')
  73. matchers = [
  74. # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
  75. # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
  76. [noisePattern, False, lambda m: None],
  77. [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]],
  78. [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
  79. [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
  80. [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
  81. [r'//www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  82. [r'//www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  83. [r'//www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  84. [r'//www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
  85. [r'//www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
  86. [r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
  87. ]
  88. # Compile pattern and generate one for parameters if desired
  89. for e in matchers:
  90. e[0] = re.compile(e[0])
  91. # Only one slash before so it still matches inside URLs when slashes were collapsed.
  92. domainPattern = re.compile(r'/(www\.)?youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)/')
  93. for origLine in sys.stdin:
  94. origLine = origLine.strip()
  95. line = re.sub(r'^https?://', '//', origLine)
  96. line = domainPattern.sub('/www.youtube.com/', line)
  97. hadMatches = False
  98. for pattern, paramSearch, f in matchers:
  99. results = set()
  100. for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(percentdecode(line))) if paramSearch else ()):
  101. hadMatches = True
  102. r = f(m)
  103. if r in results:
  104. continue
  105. results.add(r)
  106. if r is None:
  107. break
  108. print(r)
  109. if None in results:
  110. break
  111. if not hadMatches:
  112. print(origLine, file = sys.stderr)