From e3380e6e2a6e6927243c8e5184e4978434b0b67a Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Tue, 7 Feb 2023 20:48:44 +0000
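Subject: [PATCH] Fix 'binary' lines

Add -a (--text) to the three grep invocations changed below so that grep
treats its input as text. Without it, GNU grep stops printing matching
lines once it decides the input is binary (e.g. NUL bytes or invalid
encoding) and emits a "Binary file ... matches" notice instead, which
loses the remaining extracted tags.
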
---
 html-extract-stupid | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/html-extract-stupid b/html-extract-stupid
index a78559f..9127322 100755
--- a/html-extract-stupid
+++ b/html-extract-stupid
@@ -8,7 +8,7 @@
 	tr '\r\n' '  ' | tr '<' '\n' |
 
 	# Extract tags of interest
-	grep -i '^\(a\|base\|img\|link\|script\)\s' |
+	grep -ai '^\(a\|base\|img\|link\|script\)\s' |
 
 	# Fix scripty backslash nonsense
 	perl -pe 's,\\,,g' |
@@ -26,13 +26,13 @@
 	         " |
 
 	# Filter out unprocessed lines
-	grep '^+' | sed 's,^+,,' |
+	grep -a '^+' | sed 's,^+,,' |
 
 	# Remove quotes from attribute values
 	perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |
 
 	# Filter out lines without an attribute value
-	grep -Pv '^[a-zA-Z]+ $' |
+	grep -Pva '^[a-zA-Z]+ $' |
 
 	# img srcset splitting
 	python3 -c 'import re, sys'$'\n''for l in map(str.strip, sys.stdin):'$'\n'' try:'$'\n''  tag, value = l.split(" ", 1)'$'\n''  tag = tag.lower()'$'\n''  if tag != "imgsrcset":'$'\n''   print(l); continue'$'\n''  for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n''   if url: print(f"img {url}")'$'\n'' except BrokenPipeError: break' |