Browse Source

Stop trying to rewrite patterns for percent encoding

master
JustAnotherArchivist 3 years ago
parent
commit
9974d4613c
1 changed file with 4 additions and 12 deletions
  1. youtube-extract (+4, -12)

+ 4
- 12
youtube-extract View File

@@ -103,15 +103,7 @@ matchers = [

 # Compile pattern and generate one for parameters if desired
 for e in matchers:
-    pattern, paramSearch, f = e
-    e[0] = re.compile(pattern)
-    if paramSearch:
-        p2 = pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(r'\?', r'(\?|%3F)')
-        p2 = re.sub(r'(?<!\(\?):', '(:|%3A)', p2)
-        p2 = re.sub(r'(?<!\(\?)=', '(=|%3D)', p2)
-        e[1] = re.compile(p2.replace('&', '(&|%26)'))
-    else:
-        e[1] = None
+    e[0] = re.compile(e[0])

 # Only one slash before so it still matches inside URLs when slashes were collapsed.
 domainPattern = re.compile(r'/(www\.)?youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)/')
@@ -121,11 +113,11 @@ for origLine in sys.stdin:
     line = re.sub(r'^https?://', '//', origLine)
     line = domainPattern.sub('/www.youtube.com/', line)
     hadMatches = False
-    for pattern1, pattern2, f in matchers:
+    for pattern, paramSearch, f in matchers:
         results = set()
-        for m, encoded in itertools.chain(((x, False) for x in pattern1.finditer(line)), ((x, True) for x in pattern2.finditer(line)) if pattern2 else ()):
+        for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(percentdecode(line))) if paramSearch else ()):
             hadMatches = True
-            r = f(m if not encoded else [percentdecode(x) if x else x for x in itertools.chain((m[0],), m.groups())])
+            r = f(m)
             if r in results:
                 continue
             results.add(r)


Loading…
Cancel
Save