The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

30 lines
1.7 KiB

  1. #!/bin/bash
  # Print the usage summary for dedupe on stderr and terminate the script.
  # Arguments:
  #   $1 - exit status to terminate with (optional; defaults to 0)
  # Outputs:
  #   usage text on stderr; nothing is written to stdout
  usage_exit() {
    # A single printf emits all four lines (the empty argument yields the
    # blank separator line).
    printf '%s\n' \
      'Usage: dedupe FILE1 FILE2' \
      '' \
      'Prints all lines from FILE2 that do not appear in FILE1, in the order of FILE2.' \
      "WARNING: FILE1 has to be read into memory fully, and memory use scales with about a factor 40 of FILE1's size. If your files are sorted, use comm instead." \
      >&2
    # Quoted, with an explicit default, so a missing or unusual argument can
    # neither word-split nor glob.
    exit "${1:-0}"
  }
  # An explicit help request as the first argument prints the usage text and
  # exits successfully, regardless of how many arguments follow.
  case "${1-}" in
    -h | --help) usage_exit 0 ;;
  esac
  # Anything other than exactly two operands (FILE1 FILE2) is a usage error.
  if (( $# != 2 )); then
    usage_exit 1
  fi
  # Performance and memory comparison using <(seq 1000000 2048575) <(seq 1000000 2048575) (i.e. 8 MiB of input data, all lines in both files), median of 9 runs:
  #
  # Implementation | User time | Sys time | Peak RSS
  #                |    [s]    |   [s]    |  [MiB]
  # ---------------|-----------|----------|---------
  # AWK            |      1.16 |     0.03 |     86.8
  # Perl           |      0.90 |     0.06 |    149.6
  # Python         |      0.58 |     0.06 |    112.6
  # grep           |      0.36 |     0.07 |    216.9
  #
  # Exact command executed for these tests, with warmup:
  # { for i in {0..3}; do ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) >/dev/null; done; for i in {0..8}; do /usr/bin/time -v ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) 2> >(grep -F -e ' time ' -e 'Maximum resident' >&2) | cat >/dev/null; done; } |& sort
  #
  # Alternative implementations that were benchmarked (kept for reference):
  #awk 'NR==FNR { s[$0]=1; next; } !($0 in s)' "$1" "$2"
  #perl -ne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2"
  #python3 -c 'import sys'$'\n''s={}'$'\n''with open(sys.argv[1], "r") as fp:'$'\n''  for line in fp:'$'\n''    s[line]=True'$'\n''with open(sys.argv[2], "r") as fp:'$'\n''  for line in fp:'$'\n''    if line not in s:'$'\n''      print(line, end="")' "$1" "$2"

  # Print every line of FILE2 that does not occur as a whole line in FILE1,
  # in FILE2's order.
  # Arguments:
  #   $1 - FILE1, the file whose lines are to be filtered out
  #   $2 - FILE2, the file to filter
  # Outputs:
  #   the surviving lines of FILE2 on stdout
  # Returns:
  #   0 on success (including when nothing is left to print),
  #   grep's own status (>1) if grep itself failed, e.g. on an unreadable file
  dedupe_lines() {
    local status=0
    # -F: lines of FILE1 are literal strings, not regular expressions
    # -x: a pattern must match the entire line
    # -v: print the lines that match none of the patterns
    # --: FILE2 is an operand even when its name starts with '-'
    grep -F -x -v -f "$1" -- "$2" || status=$?
    # grep exits with 1 when it selected no lines. For this tool that simply
    # means every line of FILE2 already appears in FILE1, which is not an error.
    if (( status == 1 )); then
      status=0
    fi
    return "$status"
  }

  dedupe_lines "$1" "$2"