From a07c2b2374de24be8a994f751269d93967ec5794 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 15 Sep 2022 05:18:21 +0000
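Subject: [PATCH] Fix handling of invalid UTF-8 input

Strip each input line as bytes before decoding it with the
surrogateescape error handler, and write results through
sys.stdout.buffer, re-encoding them with surrogateescape, so that
input bytes which are not valid UTF-8 are emitted unchanged instead
of going through print().

Lines without any match are now written to sys.stderr.buffer as the
raw input bytes. Unlike the previous print() call, this echoes the
line unstripped, with its original line ending.
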
---
 youtube-extract | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/youtube-extract b/youtube-extract
index 887e527..7e20075 100755
--- a/youtube-extract
+++ b/youtube-extract
@@ -113,9 +113,7 @@ for e in matchers:
 	e[0] = re.compile(e[0])
 
 for origLine in sys.stdin.buffer:
-	origLine = origLine.decode('utf-8', 'surrogateescape')
-	origLine = origLine.strip()
-	line = re.sub(r'https?://', '//', origLine)
+	line = re.sub(r'https?://', '//', origLine.strip().decode('utf-8', 'surrogateescape'))
 	line = domainPattern.sub('/www.youtube.com/', line)
 	decodedLine = percentdecode(line)
 	hadMatches = False
@@ -129,8 +127,9 @@ for origLine in sys.stdin.buffer:
 			results.add(r)
 			if r is None:
 				break
-			print(r)
+			sys.stdout.buffer.write(r.encode('utf-8', 'surrogateescape'))
+			sys.stdout.buffer.write(b'\n')
 		if None in results:
 			break
 	if not hadMatches:
-		print(origLine, file = sys.stderr)
+		sys.stderr.buffer.write(origLine)