From a85ffe791b5e405552b4c8dc84b0a147904f7d7f Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 6 Jun 2023 21:08:40 +0000
Subject: [PATCH] Filter out lines with invalid UTF-8

Lines containing byte sequences that are not valid UTF-8 are dropped
before the srcset-splitting step. The filter relies on grep running in
a UTF-8 locale, where '.' only matches validly encoded characters, so a
line with an encoding error cannot match '^.*$'.

The locale is forced with LC_ALL rather than LANG: LANG has the lowest
precedence of the locale variables, so an LC_ALL or LC_CTYPE value
inherited from the caller would override it and silently turn the
filter into a no-op.
---
 html-extract-stupid | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/html-extract-stupid b/html-extract-stupid
index e435b34..9f2137b 100755
--- a/html-extract-stupid
+++ b/html-extract-stupid
@@ -34,5 +34,8 @@
 	# Filter out lines without an attribute value
 	grep -Pva '^[a-zA-Z]+ $' |
 
+	# Remove lines with invalid UTF-8 (LC_ALL so an inherited LC_* cannot override the locale)
+	LC_ALL=C.UTF-8 grep -a '^.*$' |
+
 	# img srcset splitting
 	python3 -c 'import os, re, sys'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n''  tag, value = l.split(" ", 1)'$'\n''  tag = tag.lower()'$'\n''  if tag != "imgsrcset":'$'\n''   print(l); continue'$'\n''  for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n''   if url: print(f"img {url}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' |