Browse Source

Add archivebot-log-extract-ignores

master
JustAnotherArchivist 3 years ago
parent
commit
36aa2e8259
1 changed file with 41 additions and 0 deletions
  1. +41
    -0
      archivebot-log-extract-ignores

+ 41
- 0
archivebot-log-extract-ignores View File

@@ -0,0 +1,41 @@
#!/bin/bash

# Self-test mode (first argument is '--test'): re-run this script with the
# pattern '/bar' on a small canned log and compare what it prints against the
# single URL that should be extracted. Prints 'Success!' and exits 0 when the
# outputs are identical, otherwise prints 'Fail!' and exits 1.
if [[ "$1" == '--test' ]]
then
  # The canned log has one exact match for '/bar' plus three near misses:
  # a different pattern, a pattern merely ending in '/bar', and a pattern
  # merely starting with '/bar'.
  # First process substitution: actual output; second: expected output.
  # diff's own report is discarded; only its exit status matters.
  if diff -q <("$0" '/bar' <<-'EOF'
2021-02-21 04:18:06,000 - archivebot.pipeline.wpull_plugin - INFO - Ignore https://example.org/foo using pattern /foo$
2021-02-21 04:18:06,000 - archivebot.pipeline.wpull_plugin - INFO - Ignore https://example.org/bar using pattern /bar
2021-02-21 04:18:06,000 - archivebot.pipeline.wpull_plugin - INFO - Ignore https://example.org/baz/bar using pattern /baz/bar
2021-02-21 04:18:06,000 - archivebot.pipeline.wpull_plugin - INFO - Ignore https://example.org/barnope using pattern /barnope
EOF
  ) <(printf '%s\n' 'https://example.org/bar') >/dev/null
  then
    echo 'Success!'
    exit 0
  fi

  echo 'Fail!'
  exit 1
fi

# Print the usage text to stderr and exit 1 when help is requested, when the
# argument count is not exactly one, or when stdin is a terminal (i.e. no log
# is being piped or redirected in).
if [[ "$1" == '--help' || $# -ne 1 || -t 0 ]]
then
  {
    printf 'Usage: %s PATTERN\n' "$0"
    printf '%s\n' 'Reads an ArchiveBot job log (or decompressed meta WARC) from stdin, prints all URLs that were ignored using PATTERN to stdout.'
  } >&2
  exit 1
fi

# Print every URL from the log on stdin that was ignored using exactly the
# given ignore pattern.
#
# Arguments: $1 - the ignore pattern; compared literally, never as a regex
# Inputs:    stdin - log lines
# Outputs:   stdout - one URL per matching 'Ignore' line
# Returns:   the exit status of the final grep (0 if at least one URL was
#            printed, non-zero otherwise)
#
# Pipeline stages:
#   1. Keep only the wpull plugin's 'Ignore' log lines.
#   2. Cheap fixed-string prefilter on ' using pattern PATTERN' (this still
#      lets through longer patterns that merely start with PATTERN).
#   3. Exact check: the line must END with ' using pattern PATTERN'. The
#      comparison is a plain string suffix test, so patterns containing
#      whitespace work and numeric-looking patterns (e.g. '10' vs '1e1') are
#      not conflated by awk's numeric comparison of input fields. The suffix
#      is handed to awk through the environment, which performs no backslash
#      escape processing on the value. LC_ALL=C makes length/substr count
#      bytes, so undecodable bytes in the input cannot skew the offsets.
#   4. Cut out the URL between ' Ignore ' and the first ' using pattern '.
extract_ignored_urls() {
  local pattern="$1"
  local suffix=" using pattern ${pattern}"

  grep -F -- ' - archivebot.pipeline.wpull_plugin - INFO - Ignore ' | \
    grep -F -- "${suffix}" | \
    LC_ALL=C EXTRACT_IGNORES_SUFFIX="${suffix}" awk '
      BEGIN { suffix = ENVIRON["EXTRACT_IGNORES_SUFFIX"]; slen = length(suffix) }
      { llen = length($0) }
      llen >= slen && substr($0, llen - slen + 1) == suffix
    ' | \
    grep -Po ' Ignore \K.*?(?= using pattern )'
}

# Defensive guard: extraction only makes sense with exactly one PATTERN
# argument; without it nothing is read from stdin.
if [[ $# -eq 1 ]]
then
  extract_ignored_urls "$1"
fi

Loading…
Cancel
Save