From e3380e6e2a6e6927243c8e5184e4978434b0b67a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 7 Feb 2023 20:48:44 +0000 Subject: [PATCH] Fix 'binary' lines --- html-extract-stupid | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html-extract-stupid b/html-extract-stupid index a78559f..9127322 100755 --- a/html-extract-stupid +++ b/html-extract-stupid @@ -8,7 +8,7 @@ tr '\r\n' ' ' | tr '<' '\n' | # Extract tags of interest - grep -i '^\(a\|base\|img\|link\|script\)\s' | + grep -ai '^\(a\|base\|img\|link\|script\)\s' | # Fix scripty backslash nonsense perl -pe 's,\\,,g' | @@ -26,13 +26,13 @@ " | # Filter out unprocessed lines - grep '^+' | sed 's,^+,,' | + grep -a '^+' | sed 's,^+,,' | # Remove quotes from attribute values perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," | # Filter out lines without an attribute value - grep -Pv '^[a-zA-Z]+ $' | + grep -Pva '^[a-zA-Z]+ $' | # img srcset splitting python3 -c 'import re, sys'$'\n''for l in map(str.strip, sys.stdin):'$'\n'' try:'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if tag != "imgsrcset":'$'\n'' print(l); continue'$'\n'' for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n'' if url: print(f"img {url}")'$'\n'' except BrokenPipeError: break' |