From 4d274e64e00b29b7b582edf0c8721fa253378dc2 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Fri, 25 Sep 2020 17:30:43 +0000
Subject: [PATCH] Add dedupe

---
 dedupe | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100755 dedupe

diff --git a/dedupe b/dedupe
new file mode 100755
index 0000000..855ae5e
--- /dev/null
+++ b/dedupe
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Print the usage text to stderr, then exit with the status given as $1 (0 if omitted).
+usage_exit() {
+	printf '%s\n' 'Usage: dedupe FILE1 FILE2' '' >&2
+	printf '%s\n' 'Prints all lines from FILE2 that do not appear in FILE1, in the order of FILE2.' >&2
+	printf '%s\n' 'WARNING: FILE1 has to be read into memory fully. If your files are sorted, use comm instead.' >&2
+	exit "${1:-0}"
+}
+
+if [[ "$1" == '-h' || "$1" == '--help' ]]; then usage_exit 0; fi
+if [[ $# -ne 2 ]]; then usage_exit 1; fi
+
+# While perl reads FILE1, @ARGV still holds FILE2 (one element); once it reads FILE2, @ARGV is empty.
+# -l chomps each line and re-adds "\n" on print, so a final line without a newline still matches, like awk 'NR==FNR { s[$0]=1; next; }  !($0 in s)' "$1" "$2"; Perl seems to be ~30 % faster.
+perl -lne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2"