#!/bin/bash
# Print the usage text and terminate the script.
# Arguments: $1 - exit status to terminate with (0 if omitted)
# Outputs:   the usage text; on stdout when the status is 0 (help was
#            requested, so it is the expected output and can be piped to a
#            pager), on stderr otherwise (the invocation was wrong)
function usage_exit {
	local status="${1:-0}"
	local fd=2
	if [[ "$status" == 0 ]]; then fd=1; fi
	printf '%s\n' \
		'Usage: dedupe FILE1 FILE2' \
		'' \
		'Prints all lines from FILE2 that do not appear in FILE1, in the order of FILE2.' \
		"WARNING: FILE1 has to be read into memory fully, and memory use scales with about a factor 40 of FILE1's size. If your files are sorted, use comm instead." \
		>&"$fd"
	exit "$status"
}

# An explicit help request in the first argument takes precedence over
# argument validation.
case "$1" in
	-h | --help) usage_exit 0 ;;
esac

# Anything other than exactly two operands (FILE1 and FILE2) is a usage error.
(( $# == 2 )) || usage_exit 1

# Performance and memory comparison using <(seq 1000000 2048575) <(seq 1000000 2048575)  (i.e. 8 MiB of input data, all lines in both files), median of 9 runs:
#
# Implementation | User time | Sys time | Peak RSS
#                |       [s] |      [s] |    [MiB]
# ---------------|-----------|----------|---------
# AWK            |      1.16 |     0.03 |     86.8
# Perl           |      0.90 |     0.06 |    149.6
# Python         |      0.58 |     0.06 |    112.6
# grep           |      0.36 |     0.07 |    216.9
#
# Exact command executed for these tests, with warmup:
#  { for i in {0..3}; do ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) >/dev/null; done; for i in {0..8}; do /usr/bin/time -v ./dedupe <(seq 1000000 2048575) <(seq 1000000 2048575) 2> >(grep -F -e ' time ' -e 'Maximum resident' >&2) | cat >/dev/null; done; } |& sort

#awk 'NR==FNR { s[$0]=1; next; }  !($0 in s)' "$1" "$2"
#perl -ne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2"
#python3 -c 'import sys'$'\n''s={}'$'\n''with open(sys.argv[1], "r") as fp:'$'\n'' for line in fp:'$'\n''  s[line]=True'$'\n''with open(sys.argv[2], "r") as fp:'$'\n'' for line in fp:'$'\n''  if line not in s:'$'\n''   print(line, end="")' "$1" "$2"
# Print every line of FILE2 that does not occur in FILE1, in FILE2's order.
# Arguments: $1 - FILE1, read by grep as a list of fixed-string patterns
#            $2 - FILE2, the file to filter
# Outputs:   the surviving lines of FILE2 on stdout
# Returns:   0 on success (including when nothing is left to print),
#            grep's own status (>= 2) when grep reports an error
function print_unseen_lines {
	local rc=0
	# -F: patterns are literal strings, -x: a pattern must match the whole line,
	# -v: keep the lines that match no pattern. The '--' keeps a FILE2 whose
	# name starts with '-' from being parsed as an option.
	grep -F -x -v -f "$1" -- "$2" || rc=$?
	# grep exits 1 when it selected no lines. Here that only means every line
	# of FILE2 also occurs in FILE1, which is a valid empty result and not a
	# failure.
	if (( rc == 1 )); then rc=0; fi
	return "$rc"
}

print_unseen_lines "$1" "$2"