aboutsummaryrefslogtreecommitdiff
path: root/dups
diff options
context:
space:
mode:
authorGuillermo Ramos2021-04-13 18:17:14 +0200
committerGuillermo Ramos2021-04-13 18:17:14 +0200
commit04446011f364909096cb49f2b66e4df96d713209 (patch)
tree6c2bbb722d63487a766d91e5ecc9560f01442f44 /dups
downloadcli-04446011f364909096cb49f2b66e4df96d713209.tar.gz
Initial commit
Diffstat (limited to 'dups')
-rwxr-xr-xdups56
1 files changed, 56 insertions, 0 deletions
diff --git a/dups b/dups
new file mode 100755
index 0000000..bbf357e
--- /dev/null
+++ b/dups
@@ -0,0 +1,56 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use File::Find qw<find>;
+use IPC::System::Simple qw<capture>;
+
+# Require at least one directory to scan. The argument count is compared
+# numerically; when the check fails the usage line is printed and the
+# script dies.
+@ARGV >= 1 or die "Usage: $0 <dir1> [<dir2> ...]\n";
+
+# File size => array ref of the paths found with that size.
+my %sizes;
+# Digest string printed by md5sum => array ref of the paths with that digest.
+my %md5s;
+
+# File::Find "wanted" callback: records the size of each regular file.
+#
+# File::Find calls this once per directory entry, with $_ set to the entry
+# name and $File::Find::name set to its path. The path is appended to the
+# list stored in %sizes under the file's size in bytes.
+#
+# Only regular files are recorded. Directories, symlinks and special files
+# (FIFOs, sockets, devices) are skipped: a symlink's own size says nothing
+# about the content that would later be hashed through it, and reading a
+# FIFO could block the hashing step.
+sub fill_sizes {
+    # A single lstat replaces the external "stat" process per file, so a
+    # name starting with "-" can no longer be mistaken for an option, and
+    # symlinks are seen as themselves rather than as their target.
+    my @info = lstat $_;
+    unless (@info) {
+        warn "Cannot stat $File::Find::name: $!\n";
+        return;
+    }
+
+    # "_" reuses the result of the lstat above instead of stat'ing again.
+    return unless -f _;
+
+    my $size = $info[7];    # Element 7 of the stat list is the size in bytes
+    push @{ $sizes{$size} }, $File::Find::name;
+    return;
+}
+
+# Walk every directory given on the command line and group the files found
+# by size.
+print STDERR "Collecting file sizes...\n";
+find(\&fill_sizes, $_) foreach (@ARGV);
+
+# Hash only the files that share their size with at least one other file:
+# a file with a unique size cannot have a duplicate.
+print STDERR "Computing md5s of files with same size...\n";
+
+# Progress counters: [0] number of the current size group, [1] number of
+# the current file inside that group, [2] total number of size groups.
+my @progress = (0, 0, scalar(keys(%sizes)));
+foreach my $size (keys(%sizes)) {
+    $progress[0]++;
+    $progress[1] = 0;
+    my @same_size_files = @{ $sizes{$size} };
+    next unless @same_size_files > 1;    # Discard unique sizes
+
+    foreach my $file (@same_size_files) {
+        $progress[1]++;
+
+        # "\r" returns to column 0 so each update overwrites the last one.
+        print STDERR "$progress[0].$progress[1] / $progress[2]\r";
+        STDERR->flush();
+
+        # md5sum prints the digest followed by the file name; keep only the
+        # digest. "--" ends option parsing so that a path starting with "-"
+        # is read as a file name.
+        my ($md5) = capture("md5sum", "-z", "--", $file) =~ m/^([^ ]+)/;
+        push @{ $md5s{$md5} }, $file;
+    }
+}
+
+# Report every digest shared by more than one file. Each group goes to
+# STDOUT as a heading, one tab-indented path per line, and a blank line.
+foreach my $md5 (keys(%md5s)) {
+    my @same_md5_files = @{ $md5s{$md5} };
+    next unless @same_md5_files > 1;    # Discard unique hashes
+
+    print "Found duplicate files:\n";
+    foreach my $file (@same_md5_files) {
+        print "\t$file\n";
+    }
+    print "\n";
+}
+
+# Status messages go to STDERR so STDOUT carries only the duplicate report.
+print STDERR "Done!\n";