From 6acea5d6ebf5920a892cfecb1819d2fca6958873 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 24 Jan 2023 07:29:45 +0000
Subject: [PATCH] Add html-extract-stupid

---
 html-extract-stupid | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100755 html-extract-stupid

diff --git a/html-extract-stupid b/html-extract-stupid
new file mode 100755
index 0000000..deb288a
--- /dev/null
+++ b/html-extract-stupid
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Reads HTML from stdin, extracts links and page requisites with simple string splitting and regexes.
+# Usage cannot be recommended against enough.
+# Produces lines of 'TAG URL', e.g. 'a https://example.org/'.
+# Tag names in the output are lowercased; relative URLs are only resolved when a base tag precedes them.
+{
+	# Reformat so each line is one tag
+	# Yes, this may break attribute values if they contain CR, LF, or <, but that's rare enough.
+	tr '\r\n' ' ' | tr '<' '\n' |
+
+	# Extract tags of interest
+	grep -i '^\(a\|base\|img\|link\|script\)\s' |
+
+	# Fix scripty backslash nonsense
+	perl -pe 's,\\,,g' |
+
+	# Split img tags with src and srcset
+	# Case-insensitive like the extraction below, so <IMG SRC=... SRCSET=...> keeps both attributes.
+	perl -pe "s,^img(?=\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(?:\s|>))\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?srcset\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\s|>).*,img src=\1\nimg srcset=\2,i" |
+
+	# Extract interesting tags/attributes
+	# Matched lines get a + prefix so that everything else can be filtered out afterwards.
+	perl -pe "s,^(a|base)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
+	s,^(img|script)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
+	s,^(link)\s(?=(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?rel\s*=\s*(?:stylesheet|\"(?:[^\"]*\s)?stylesheet(?:\s[^\"]*)?\"|'(?:[^']*\s)?stylesheet(?:\s[^']*)?')(?:\$|\s|>))(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
+	s,^(img)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?(srcset)\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1\2 \3,i;
+	# Ensure that there's a LF at the end of each line since \s might match it.
+	s,\s*$,\n,;
+	" |
+
+	# Filter out unprocessed lines
+	grep '^+' | sed 's,^+,,' |
+
+	# Remove quotes from attribute values
+	perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |
+
+	# img srcset splitting
+	# Tags are matched case-insensitively, descriptors may be fractional (e.g. 1.5x), and lines without
+	# a value (empty attribute) are passed through instead of crashing the stage.
+	python3 -c 'import re, sys
+for line in map(str.strip, sys.stdin):
+    tag, _, value = line.partition(" ")
+    try:
+        if tag.lower() != "imgsrcset":
+            print(line)
+            continue
+        for url in re.split(r"\s+\d*\.?\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):
+            if url:
+                print(f"img {url}")
+    except BrokenPipeError:
+        break' |
+
+	# Decode HTML references
+	python3 -c 'import html, sys
+for line in sys.stdin:
+    try:
+        print(html.unescape(line.strip()))
+    except BrokenPipeError:
+        break' |
+
+	# Combine base and values to get absolute URLs
+	# If multiple base tags are present, they all get respected. This violates the HTML specs.
+	# Empty values resolve to the base; without a base there is no URL to print, so they are dropped.
+	python3 -c 'import sys, urllib.parse
+base = None
+for line in map(str.strip, sys.stdin):
+    tag, _, value = line.partition(" ")
+    tag = tag.lower()
+    if base:
+        value = urllib.parse.urljoin(base, value)
+    if tag == "base":
+        base = value
+        continue
+    if not value:
+        continue
+    try:
+        print(f"{tag} {value}")
+    except BrokenPipeError:
+        break'
+}