Browse Source

Fix handling of invalid UTF-8 input

master
JustAnotherArchivist 1 year ago
parent
commit
a07c2b2374
1 changed file with 4 additions and 5 deletions
  1. +4
    -5
      youtube-extract

+ 4
- 5
youtube-extract View File

@@ -113,9 +113,7 @@ for e in matchers:
 e[0] = re.compile(e[0])

 for origLine in sys.stdin.buffer:
-origLine = origLine.decode('utf-8', 'surrogateescape')
-origLine = origLine.strip()
-line = re.sub(r'https?://', '//', origLine)
+line = re.sub(r'https?://', '//', origLine.strip().decode('utf-8', 'surrogateescape'))
 line = domainPattern.sub('/www.youtube.com/', line)
 decodedLine = percentdecode(line)
 hadMatches = False
@@ -129,8 +127,9 @@ for origLine in sys.stdin.buffer:
 results.add(r)
 if r is None:
 break
-print(r)
+sys.stdout.buffer.write(r.encode('utf-8', 'surrogateescape'))
+sys.stdout.buffer.write(b'\n')
 if None in results:
 break
 if not hadMatches:
-print(origLine, file = sys.stderr)
+sys.stderr.buffer.write(origLine)

Loading…
Cancel
Save