#!/usr/bin/env perl
#
# Find duplicate files under one or more directories and interactively delete
# all but one copy of each.
#
# Files are first grouped by size; only files that share a size are hashed
# with md5sum. For every group of identical files the user is asked which
# copy to keep; "dislike" directories lower the priority of the copies they
# contain.
use strict;
use warnings;
use v5.36;

use Cwd qw<abs_path>;
use File::Find qw<find>;
use IPC::System::Simple qw<capture>;
use List::Util qw<any>;

@ARGV >= 1 or die <<~"USAGE";
    Usage: $0 [opts] <dir> [<dir> ...]
    Opts:
        --dislike|-d <dir>  Dislike directory (reduce priority)
        --yes    |-y        Don't ask for confirmation when there's a single choice
        --debug             Print extra debug info
    USAGE

# CLI options
my $yes   = 0;
my $debug = 0;
my @preferences;

# Map from size to list of filenames
my %sizes;
# Map from md5 to list of filenames
my %md5s;

# File::Find "wanted" callback: record the size of the current entry.
#
# File::Find sets $_ to the entry's basename (the current directory is the
# entry's parent) and $File::Find::name to its full path. Directories are
# skipped. The size is taken with lstat, so a symlink is measured as the link
# itself rather than its target, matching what `stat --printf=%s` reported.
# Dies if the entry cannot be stat-ed.
sub register_file_size {
    return if -d $_;    # Skip directories

    my $filename = $File::Find::name;
    my @stat = lstat $_
        or die "Cannot stat '$filename': $!\n";

    push @{ $sizes{ $stat[7] } }, $filename;
    return;
}

# Tell whether $file lives under $dir.
#
# $file - path to test; it is resolved to an absolute path first.
# $dir  - absolute directory path ending in "/".
#
# Returns true when the resolved path starts with $dir, false otherwise
# (including when the path cannot be resolved). A plain prefix test is used
# instead of a regex so that regex metacharacters in $dir are taken literally.
sub indir($file, $dir) {
    my $abs = abs_path($file);
    return !!0 unless defined $abs;
    return index($abs, $dir) == 0;
}

# Narrow a group of identical files down using the "dislike" preferences.
#
# Returns the files that are not inside any disliked directory, in their
# original order. Returns an empty list when there is no dislike preference
# or when every file is disliked; the caller then falls back to the full
# group.
sub guess_best_choices(@files) {
    my @disliked_dirs =
        map  { $_->{"dir"} }
        grep { $_->{"type"} eq "dislike" } @preferences;
    return unless @disliked_dirs;

    if ($debug) {
        print STDERR "PREF: DISLIKE $_\n" for @disliked_dirs;
    }

    my @best_guesses;
    foreach my $file (@files) {
        if (any { indir($file, $_) } @disliked_dirs) {
            print STDERR "Discarding $file...\n" if $debug;
        } else {
            push @best_guesses, $file;
        }
    }
    print STDERR "BEST: @best_guesses\n" if $debug;

    return @best_guesses;
}

# Delete every path in @rest except $choice.
#
# Paths are compared as strings. A failed deletion is reported with a warning
# and does not stop the remaining deletions.
sub keep($choice, @rest) {
    my @delete = grep { $_ ne $choice } @rest;
    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;

    foreach my $file (@delete) {
        unlink $file or warn "Could not delete '$file': $!\n";
    }
    return;
}

print STDERR "Collecting file sizes...\n";

# NOTE(review): the same file reached through two differently spelled
# arguments (e.g. "a" and "./a") is registered under two names and would be
# offered as a duplicate of itself -- avoid overlapping directory arguments.
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    } elsif ($arg eq "--debug") {
        $debug = 1;
    } elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $dir = shift @ARGV;
        defined $dir
            or die "Option $arg requires a directory argument\n";

        my $prefix = abs_path($dir);
        defined $prefix
            or die "Cannot resolve disliked directory '$dir'\n";

        # Exactly one trailing slash, so that "/a/b" does not match "/a/bc".
        $prefix =~ s{/*\z}{/};
        push @preferences, { "type" => "dislike", "dir" => $prefix };
    } else {
        find(\&register_file_size, $arg);
    }
}

print STDERR "Computing md5s of files with same size...\n";

# (size groups visited, files hashed in the current group, total size groups)
my @progress = (0, 0, scalar(keys(%sizes)));
foreach my $size (keys(%sizes)) {
    $progress[0]++;
    $progress[1] = 0;

    my @same_size_files = @{ $sizes{$size} };
    next unless @same_size_files > 1;    # Discard unique sizes

    foreach my $file (@same_size_files) {
        $progress[1]++;
        print STDERR "$progress[0].$progress[1] / $progress[2]\r";
        STDERR->flush();

        # "--" keeps a path that starts with "-" from being read as an option.
        my ($md5) = capture("md5sum", "-z", "--", $file) =~ m/\A([^ ]+)/
            or die "Unexpected md5sum output for '$file'\n";
        push @{ $md5s{$md5} }, $file;
    }
}
print "\n";

foreach my $md5 (keys(%md5s)) {
    my @same_md5_files = @{ $md5s{$md5} };
    next unless @same_md5_files > 1;    # Discard unique hashes

    print "\nFound duplicate files:\n";
    print "\t$_\n" foreach @same_md5_files;

    my @best_choices = guess_best_choices(@same_md5_files);
    @best_choices = @same_md5_files unless @best_choices;

    if (@best_choices == 1) {
        my $best_choice = $best_choices[0];
        my $gogo = $yes;
        unless ($gogo) {
            print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
            my $resp = <STDIN> // "";    # EOF counts as "no"
            chomp $resp;
            $gogo = $resp eq "yes" || $resp eq "y";
        }
        keep($best_choice, @same_md5_files) if $gogo;
    } else {
        print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
        foreach my $i (0 .. $#best_choices) {
            print "\t [$i] $best_choices[$i]\n";
        }
        print "\t> ";

        my $index = <STDIN> // "";    # EOF counts as "skip"
        chomp $index;
        $index =~ s/\A\s+|\s+\z//g;
        next if $index eq "";

        # Only a plain non-negative integer selects a file. Anything else
        # (for instance "n") must not be coerced to 0, which would delete
        # every copy except the first.
        if ($index !~ /\A\d+\z/a) {
            print "\n!!\t Not a valid index, ignoring\n";
        } elsif ($index < @best_choices) {
            keep($best_choices[$index], @same_md5_files);
        } else {
            print "\n!!\t Index outside of range, ignoring\n";
        }
    }
}

print STDERR "Done!\n";