瀏覽代碼

Fix 'binary' lines

master
JustAnotherArchivist 1 年之前
父節點
當前提交
e3380e6e2a
共有 1 個檔案被更改，包括 3 行新增和 3 行刪除
  1. +3
    -3
      html-extract-stupid

+ 3
- 3
html-extract-stupid 查看文件

@@ -8,7 +8,7 @@
tr '\r\n' ' ' | tr '<' '\n' |

# Extract tags of interest
-grep -i '^\(a\|base\|img\|link\|script\)\s' |
+grep -ai '^\(a\|base\|img\|link\|script\)\s' |

# Fix scripty backslash nonsense
perl -pe 's,\\,,g' |
@@ -26,13 +26,13 @@
" |

# Filter out unprocessed lines
-grep '^+' | sed 's,^+,,' |
+grep -a '^+' | sed 's,^+,,' |

# Remove quotes from attribute values
perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |

# Filter out lines without an attribute value
-grep -Pv '^[a-zA-Z]+ $' |
+grep -Pva '^[a-zA-Z]+ $' |

# img srcset splitting
python3 -c 'import re, sys'$'\n''for l in map(str.strip, sys.stdin):'$'\n'' try:'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if tag != "imgsrcset":'$'\n'' print(l); continue'$'\n'' for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n'' if url: print(f"img {url}")'$'\n'' except BrokenPipeError: break' |


Loading…
取消
儲存