From b66260ca940a7cb82b308b70232909df4e5b9bee Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Wed, 25 Nov 2020 22:07:35 +0000
Subject: [PATCH] Add youtube-extract

---
 youtube-extract | 124 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100755 youtube-extract

diff --git a/youtube-extract b/youtube-extract
new file mode 100755
index 0000000..01028a2
--- /dev/null
+++ b/youtube-extract
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+import re
+import sys
+
+
+if not {'--help', '-h', '-?', 'help'}.isdisjoint(sys.argv): # a help flag anywhere on the command line prints the usage text and exits
+	print('\n'.join((
+		'Usage: youtube-extract [massage|removenonyt]', '',
+		"In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.",
+		'Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', '',
+		"In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.",
+	)), file = sys.stderr)
+	sys.exit(1)
+
+
+# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead.
+
+# Channel/user URLs; the protocol and domain are stripped and replaced below (the matchers entry keeps only the path and prepends https://www.youtube.com/).
+channelPattern = re.compile('|'.join([
+	r'//www\.youtube\.com/c/[^/?]+',
+	r'//www\.youtube\.com/user/[^/?]+',
+	r'//www\.youtube\.com/channel/UC[^/?]+',
+	r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)$)[^/?]+(?=/?$)', # a single path segment ending the line (optional trailing slash), unless it is exactly one of the listed endpoint names
+  ]))
+
+# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
+# If necessary, use lookahead assertions to match further stuff after the video ID.
+videoPattern = re.compile('|'.join([
+	# Normal watch URL
+	r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
+	r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
+	# Embeds
+	r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}', # 'videoseries' is itself 11 valid ID characters, hence the explicit exclusion
+	r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
+	# Shortener
+	r'//youtu\.be/[0-9A-Za-z_-]{11}',
+	# Old (Flash) embeds
+	r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
+	# Redirects from links in video descriptions
+	r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)', # the lookahead requires the ID to be followed by '&' or the end of the line
+	# Tracking and other crap
+	r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
+	r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
+	r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
+	# Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
+	r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}', # no domain prefix, so this matches a watch path on any host
+  ]))
+
+noisePattern = re.compile('|'.join([ # URLs treated as noise: the matchers entry for this pattern returns None, which ends processing of the line without printing it anywhere
+	r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
+	r'^//www\.youtube\.com/s/gaming/emoji/',
+	r'^//www\.youtube\.com/redirect\?event=channel_banner&',
+	r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)', # video_description redirects that carry no v= parameter
+	r'^//www\.youtube\.com/yts/',
+	r'^//www\.youtube\.com/img/',
+	r'^//www\.youtube\.com/youtubei/',
+	r'^//www\.youtube\.com/ads/',
+	r'^//www\.youtube\.com/(player|iframe)_api\?',
+	r'^//www\.youtube\.com/error(_204)?/?\?',
+	r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]',
+	r'^//www\.youtube\.com/results/?\?',
+	r'^//www\.youtube\.com/premium/?\?',
+	r'^//www\.youtube\.com/?(\?|$)', # the bare domain, with or without a query string
+	r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
+	r'^//consent\.(youtube|google)\.com/',
+  ]))
+
+
+matchers = [
+	# [pattern, paramSearch, function(match) -> output str or None]; returning None stops further processing. Entries are lists (not tuples) because paramSearch is overwritten in place below with the compiled parameter pattern or None.
+	[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]], # keep only what follows the third slash, i.e. the path after the domain
+	[videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'], # the video ID is always the last 11 characters of the match
+	[re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{m[0].rsplit("=", 1)[1][2:]}'], # UU-prefixed list ID is emitted as a channel URL with the prefix swapped to UC
+	[re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={m[0].rsplit("=", 1)[1]}'], # other list prefixes are emitted unchanged as playlist URLs
+	[re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("=", 1)[1]}'], # with listType=user_uploads the list value is emitted as a /user/ name
+	[re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("/", 1)[1]}'],
+	[re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{m[0].rsplit("=", 1)[1]}'],
+	[re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[0].rsplit("=", 1)[1]}'], # bare 16-character uppercase-hex list IDs are emitted with a PL prefix
+	[re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'], # anchored at line start; the video ID is capture group 1
+	[noisePattern, False, lambda m: None], # checked last, so a line is only discarded as noise after every extractor has been tried
+]
+
+# Compile second pattern for parameters if needed: a textual rewrite of the regex source so that / : ? = & each also match their percent-encoded form.
+for e in matchers:
+	pattern, paramSearch, f = e
+	if paramSearch:
+		p2 = pattern.pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(':', '(:|%3A)').replace(r'\?', r'(\?|%3F)') # order matters: '//' first becomes '/{1,2}', whose '/' is then expanded as well, giving (/|%2F){1,2}
+		p2 = re.sub(r'(?<!\(\?)=', '(=|%3D)', p2) # the lookbehind leaves the '=' of lookahead groups '(?=' untouched
+		e[1] = re.compile(p2.replace('&', '(&|%26)'), pattern.flags) # NOTE(review): the '/' and '&' rewrites also apply inside character classes (e.g. [^/?] becomes [^(/|%2F)?]), changing what those classes exclude - confirm this is intended
+	else:
+		e[1] = None # slot now means 'no parameter pattern'; the main loop tests it for truthiness
+
+# Matches the listed YouTube domains, with or without www.; the main loop rewrites them to www.youtube.com. Only one slash before so it still matches inside URLs when slashes were collapsed.
+domainPattern = re.compile(r'/(www\.)?youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)/')
+
+# Mode defaults to 'massage' when no argument is given, as the usage text promises; reading sys.argv[1] unconditionally raised IndexError.
+mode = sys.argv[1] if len(sys.argv) > 1 else 'massage'
+for origLine in sys.stdin:
+	origLine = origLine.strip()
+	line = re.sub(r'^https?://', '//', origLine) # the patterns are written against the protocol-relative form
+	line = domainPattern.sub('/www.youtube.com/', line) # normalise the accepted domain variants
+	if mode == 'massage':
+		hadMatches = False
+		# Matchers are tried in order; each is searched in its plain form and, if present, its percent-encoded parameter form.
+		for pattern1, pattern2, f in matchers:
+			results = set() # outputs already produced by this matcher, so both forms matching doesn't print the same result twice
+			for pattern in ([pattern1, pattern2] if pattern2 else [pattern1]):
+				m = pattern.search(line)
+				if m:
+					hadMatches = True
+					r = f(m)
+					if r in results:
+						continue
+					results.add(r)
+					if r is None: # the matcher asked to stop processing this line (noise)
+						break
+					print(r)
+			if None in results:
+				break
+		if not hadMatches: # nothing recognised: report the original line on stderr
+			print(origLine, file = sys.stderr)
+	elif mode == 'removenonyt':
+		if any(x in line for x in ('/www.youtube.com/', '/youtu.be/', '%2Fwww.youtube.com%2F', '%2Fyoutu.be%2F')):
+			print(origLine)