#!/bin/bash
#
# archivebot-log-extract-ignores
#
# Reads an ArchiveBot job log (or decompressed meta WARC) from stdin and
# prints every URL that was ignored using exactly PATTERN to stdout.
#
# Usage:
#   archivebot-log-extract-ignores PATTERN <job.log
#   archivebot-log-extract-ignores --test      # run the built-in self-test
#
# Provenance: added in commit 36aa2e8259aa7009898c90b198abb5b740414471
# by JustAnotherArchivist on Sun, 21 Feb 2021 04:40:41 +0000
# ("Add archivebot-log-extract-ignores").

#######################################
# Prints the usage text to stderr and exits with status 1.
# Globals:   none ($0 is read for the program name)
# Arguments: none
#######################################
usage() {
  printf '%s\n' "Usage: $0 PATTERN" >&2
  printf '%s\n' 'Reads an ArchiveBot job log (or decompressed meta WARC) from stdin, prints all URLs that were ignored using PATTERN to stdout.' >&2
  exit 1
}

#######################################
# Filters the log on stdin down to the URLs ignored using exactly PATTERN.
# Arguments: $1 - the ignore pattern, compared literally (never as a regex)
# Outputs:   one URL per line on stdout
# Returns:   exit status of the final grep (non-zero when nothing matched)
#######################################
extract_ignored_urls() {
  local pattern="$1"

  # The two fixed-string greps are cheap prefilters. The second one is only a
  # substring test, so it also lets through longer patterns that merely start
  # with PATTERN (e.g. '/barnope' for '/bar'); awk then enforces exactness.
  #
  # awk compares the *end of the line* against ' using pattern PATTERN' as a
  # string. Comparing the last field ($NF) instead would never match patterns
  # containing whitespace, and would compare numeric-looking patterns
  # numerically (treating '123' and '123.0' as equal). The pattern travels via
  # the environment because 'awk -v' would interpret backslash escapes in it.
  grep -F ' - archivebot.pipeline.wpull_plugin - INFO - Ignore ' \
    | grep -F " using pattern ${pattern}" \
    | IGNORE_PATTERN="${pattern}" awk '
        BEGIN {
          suffix = " using pattern " ENVIRON["IGNORE_PATTERN"]
          suffix_len = length(suffix)
        }
        {
          line_len = length($0)
          if (line_len >= suffix_len \
              && substr($0, line_len - suffix_len + 1) == suffix) {
            print
          }
        }
      ' \
    | grep -Po ' Ignore \K.*?(?= using pattern )'
}

#######################################
# Runs this script on the given log lines and checks its stdout.
# Arguments: $1    - pattern to extract
#            $2    - expected stdout (without trailing newline)
#            $3... - input log lines
# Returns:   0 if the output equals the expectation, 1 otherwise
#######################################
self_test_case() {
  local pattern="$1"
  local expected="$2"
  local actual
  shift 2
  actual="$(printf '%s\n' "$@" | "$0" "${pattern}")"
  [[ "${actual}" == "${expected}" ]]
}

#######################################
# Self-test: prints 'Success!' and exits 0, or prints 'Fail!' and exits 1.
#######################################
self_test() {
  local prefix='2021-02-21 04:18:06,000 - archivebot.pipeline.wpull_plugin - INFO - Ignore '
  local ok=1

  # Only the exact pattern may match, not patterns that contain or extend it.
  self_test_case '/bar' 'https://example.org/bar' \
    "${prefix}https://example.org/foo using pattern /foo\$" \
    "${prefix}https://example.org/bar using pattern /bar" \
    "${prefix}https://example.org/baz/bar using pattern /baz/bar" \
    "${prefix}https://example.org/barnope using pattern /barnope" \
    || ok=0

  # Patterns containing whitespace must be matched as a whole.
  self_test_case 'foo bar' 'https://example.org/spaced' \
    "${prefix}https://example.org/spaced using pattern foo bar" \
    "${prefix}https://example.org/other using pattern bar" \
    "${prefix}https://example.org/longer using pattern foo bar baz" \
    || ok=0

  # Numeric-looking patterns must be compared as strings, not as numbers.
  self_test_case '123' 'https://example.org/int' \
    "${prefix}https://example.org/int using pattern 123" \
    "${prefix}https://example.org/float using pattern 123.0" \
    || ok=0

  if [[ "${ok}" -eq 1 ]]; then
    echo 'Success!'
    exit 0
  fi
  echo 'Fail!'
  exit 1
}

#######################################
# Entry point: dispatches to the self-test, the usage text or the extraction.
# Arguments: the script's command-line arguments
#######################################
main() {
  if [[ "${1-}" == '--test' ]]; then
    self_test
  fi

  # Input must come from a pipe or file, and exactly one PATTERN is required.
  if [[ -t 0 || $# -ne 1 || "$1" == '--help' ]]; then
    usage
  fi

  extract_ignored_urls "$1"
}

main "$@"