|
|
@@ -10,8 +10,20 @@ function usage_exit { |
|
|
|
# A leading -h / --help prints the usage text and exits successfully.
case "$1" in
    -h|--help) usage_exit 0 ;;
esac

# Exactly two file arguments are required; anything else is a usage error.
(( $# == 2 )) || usage_exit 1
|
|
|
|
|
|
|
# Going by user time, Perl seems to be ~30 % faster than AWK for this, Python about twice as fast as AWK, and grep ~2-3 times faster than Perl. |
|
|
|
# AWK uses the least memory, Python about 1.3 times as much, Perl about 1.7 times as much, and grep about 2.5 times as much (all relative to AWK). |
|
|
|
# Performance and memory comparison using <(seq 1000000 2048575) <(seq 1000000 2048575) (i.e. 8 MiB of input data, all lines in both files), median of 9 runs: |
|
|
|
# |
|
|
|
# Implementation | User time | Sys time | Peak RSS |
|
|
|
# | [s] | [s] | [MiB] |
|
|
|
# ---------------|-----------|----------|--------- |
|
|
|
# AWK | 1.16 | 0.03 | 86.8 |
|
|
|
# Perl | 0.90 | 0.06 | 149.6 |
|
|
|
# Python | 0.58 | 0.06 | 112.6 |
|
|
|
# grep | 0.36 | 0.07 | 216.9 |
|
|
|
# |
|
|
|
# Exact command executed for these tests, with warmup: |
|
|
|
# { for i in {0..3}; do ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) >/dev/null; done; for i in {0..8}; do /usr/bin/time -v ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) 2> >(grep -F -e ' time ' -e 'Maximum resident' >&2) | cat >/dev/null; done; } |& sort |
|
|
|
|
|
|
|
#awk 'NR==FNR { s[$0]=1; next; } !($0 in s)' "$1" "$2" |
|
|
|
#perl -ne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2" |
|
|
|
#python3 -c 'import sys'$'\n''s={}'$'\n''with open(sys.argv[1], "r") as fp:'$'\n'' for line in fp:'$'\n'' s[line]=True'$'\n''with open(sys.argv[2], "r") as fp:'$'\n'' for line in fp:'$'\n'' if line not in s:'$'\n'' print(line, end="")' "$1" "$2" |
|
|
|
# Print every line of "$2" that does not occur as a complete line in "$1":
#   -F  treat each pattern as a fixed string, not a regular expression
#   -x  a pattern must match the whole line
#   -v  select the lines that do NOT match
#   -f  read the patterns (one per line) from "$1"
# "--" ends option parsing so that a "$2" beginning with "-" is taken as a
# file name rather than as an option.
# NOTE(review): grep exits with status 1 when it selects no lines (i.e. every
# line of "$2" also occurs in "$1"), unlike the commented-out AWK/Perl/Python
# variants above; confirm that callers expect this.
grep -F -x -v -f "$1" -- "$2"