The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

136 lines
6.8 KiB

  1. #!/usr/bin/env python3
  2. import itertools
  3. import os
  4. import re
  5. import sys
  6. if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
  7. print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
  8. print(file = sys.stderr)
  9. print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the lines on stdin and prints them on stdout.", file = sys.stderr)
  10. print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
  11. print(file = sys.stderr)
  12. print("In 'removenonyt' mode, prints all lines that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
  13. sys.exit(1)
  14. mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage'
  15. # Only one slash before so it still matches inside URLs when slashes were collapsed.
  16. domainPattern = re.compile(r'/(www\.|m\.)?(youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)|(music|gaming)\.youtube\.com|(es|uk|pl|ru|it|jp|br)\.youtube\.com|youtube-nocookie\.com)(:\d+)?/', re.IGNORECASE)
  17. if mode == 'removenonyt':
  18. # Anything in here could never be as fast as grep, so just delegate to that...
  19. os.execlp('grep', 'grep', '-Fai', '-e', 'youtube', '-e', 'youtu.be', '-e', 'ytimg.com', '-e', '?v=', '-e', '%3Fv%3D', '-e', '&v=', '-e', '%26v%3D')
  20. sys.exit(0)
  21. assert mode == 'massage'
  22. noisePattern = '|'.join([
  23. r'//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
  24. r'//www\.youtube\.com/s/gaming/emoji/',
  25. r'//www\.youtube\.com/redirect\?event=channel_banner&',
  26. r'//www\.youtube\.com/redirect\?(?=(\S*&)?event=video_description(&|$))(?!(\S*&)?v=)',
  27. r'//www\.youtube\.com/yts/',
  28. r'//www\.youtube\.com/img/',
  29. r'//www\.youtube\.com/youtubei/',
  30. r'//www\.youtube\.com/ads(/|$)',
  31. r'//www\.youtube\.com/creators(/|$)',
  32. r'//www\.youtube\.com/(player|iframe)_api(\?|$)',
  33. r'//www\.youtube\.com/error(_204)?/?\?',
  34. r'//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
  35. r'//www\.youtube\.com/results/?(\?|$)',
  36. r'//www\.youtube\.com/premium/?\?',
  37. r'//www\.youtube\.com/new([/?]|$)',
  38. r'//www\.youtube\.com/?(\?(?!(\S*&)?v=)|$)',
  39. r'//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
  40. r'//www\.youtube\.com/service_ajax$',
  41. r'//www\.youtube\.com/watch(\?v=)?$',
  42. r'//consent\.(youtube|google)\.com/',
  43. r'//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$', # Miscellaneous crap
  44. ])
  45. channelPattern = '|'.join([
  46. r'''/www\.youtube\.com/c/[^/?&=."'>\\\s]+''',
  47. r'/www\.youtube\.com/user/[A-Za-z0-9]{1,20}',
  48. r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
  49. r'''/www\.youtube\.com/[^/?&=."'>\\\s]+(?=/?(\s|\\?["'>]|$))''',
  50. ])
  51. # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
  52. # If necessary, use lookahead assertions to match further stuff after the video ID.
  53. videoPattern = '|'.join([
  54. # Normal watch URL
  55. r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  56. r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
  57. # Embeds
  58. r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
  59. r'/www\.youtube\.com/embed/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  60. # Shortener
  61. r'/(?i:youtu\.be)(:\d+)?/[0-9A-Za-z_-]{11}',
  62. # Shorts
  63. r'/www\.youtube\.com/shorts/[0-9A-Za-z_-]{11}',
  64. # Old (Flash) embeds
  65. r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
  66. # Redirects from links in video descriptions
  67. r'/www\.youtube\.com/redirect\?(\S*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
  68. # Tracking and other crap
  69. r'/www\.youtube\.com/(ptracking|set_awesome)\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
  70. r'/www\.youtube\.com/api/timedtext\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  71. r'/www\.youtube\.com/(my_videos_)?edit\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
  72. r'/www\.youtube\.com/(all_comments|attribution|cthru|get_endscreen|livestreaming/dashboard)\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  73. # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
  74. r'/watch/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  75. # Generic v parameter on anything
  76. r'[?&]v=[0-9A-Za-z_-]{11}(?=&|\s|$)',
  77. ])
  78. def percentdecode(s):
  79. return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&')
  80. matchers = [
  81. # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
  82. # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
  83. [noisePattern, False, lambda m: None],
  84. [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 2)[-1].rstrip('/')],
  85. [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
  86. [r'/www\.youtube\.com/(?:playlist|watch|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
  87. [r'/www\.youtube\.com/(?:playlist|watch|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
  88. [r'/www\.youtube\.com/embed/?\?(?=(?:\S*&)?listType=user_uploads(?:&|$))(?:\S*&)?list=([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  89. [r'/www\.youtube\.com/rss/user/([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  90. [r'/www\.youtube\.com/(?:subscription_center\?(?:\S*&)?add_user=|subscribe_widget\?(?:\S*&)?p=|profile\?(?:\S*&)?user=)([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  91. [r'/www\.youtube\.com/feeds/videos\.xml\?(?:\S*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
  92. [r'/www\.youtube\.com(?:/view_play_list\?(?:\S*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|\s|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
  93. [r'/(?i:i\.ytimg\.com|img\.youtube\.com)(?::\d+)?/vi/([0-9A-Za-z_-]{11})/', True, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
  94. ]
  95. # Compile pattern and generate one for parameters if desired
  96. for e in matchers:
  97. e[0] = re.compile(e[0])
  98. for origLine in sys.stdin.buffer:
  99. line = re.sub(r'https?://', '//', origLine.strip().decode('utf-8', 'surrogateescape'))
  100. line = domainPattern.sub('/www.youtube.com/', line)
  101. decodedLine = percentdecode(line)
  102. hadMatches = False
  103. for pattern, paramSearch, f in matchers:
  104. results = set()
  105. for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(decodedLine)) if paramSearch else ()):
  106. hadMatches = True
  107. r = f(m)
  108. if r in results:
  109. continue
  110. results.add(r)
  111. if r is None:
  112. break
  113. sys.stdout.buffer.write(r.encode('utf-8', 'surrogateescape'))
  114. sys.stdout.buffer.write(b'\n')
  115. if None in results:
  116. break
  117. if not hadMatches:
  118. sys.stderr.buffer.write(origLine)