From a07c2b2374de24be8a994f751269d93967ec5794 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 15 Sep 2022 05:18:21 +0000 Subject: [PATCH] Fix handling of invalid UTF-8 input --- youtube-extract | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube-extract b/youtube-extract index 887e527..7e20075 100755 --- a/youtube-extract +++ b/youtube-extract @@ -113,9 +113,7 @@ for e in matchers: e[0] = re.compile(e[0]) for origLine in sys.stdin.buffer: - origLine = origLine.decode('utf-8', 'surrogateescape') - origLine = origLine.strip() - line = re.sub(r'https?://', '//', origLine) + line = re.sub(r'https?://', '//', origLine.strip().decode('utf-8', 'surrogateescape')) line = domainPattern.sub('/www.youtube.com/', line) decodedLine = percentdecode(line) hadMatches = False @@ -129,8 +127,9 @@ for origLine in sys.stdin.buffer: results.add(r) if r is None: break - print(r) + sys.stdout.buffer.write(r.encode('utf-8', 'surrogateescape')) + sys.stdout.buffer.write(b'\n') if None in results: break if not hadMatches: - print(origLine, file = sys.stderr) + sys.stderr.buffer.write(origLine)