#!/usr/bin/env perl
#
# Find duplicate files under one or more directories and (interactively)
# delete all but one copy of each.
#
# Files are first grouped by size; only groups with more than one member are
# hashed with md5sum. Files sharing an md5 are reported as duplicates.
# Directories given with --dislike lose priority when picking the copy to keep.
use strict;
use warnings;
use v5.36;

use Cwd qw(abs_path);
use File::Find qw(find);
use IPC::System::Simple qw(capture);
use List::Util qw(first);

my $usage = <<~"USAGE";
    Usage: $0 [opts] <dir> [<dir> ...]
    Opts:
        --dislike|-d <dir>  Dislike directory (reduce priority)
        --yes    |-y        Don't ask for confirmation when there's a single choice
        --debug             Print extra debug info
    USAGE

die $usage unless @ARGV >= 1;

# CLI options
my $yes   = 0;
my $debug = 0;
my @preferences;    # list of { type => "dislike", dir => "/abs/path/" }

# Map from size to list of filenames
my %sizes;

# Spinner state shared by every progressbar() call
my $progressidx    = 0;
my $spinner_frames = '/-\|';

# 'wanted' subroutine of File::Find::find.
# Adds the current file's name to its size key in the %sizes hash.
sub register_file_size {
    # Only regular files (or symlinks to them) can be meaningful duplicates;
    # this also skips directories, FIFOs, sockets and dangling symlinks,
    # which md5sum would hang or fail on.
    return unless -f $_;

    # lstat: size of the entry itself, without following symlinks.
    my $size = (lstat $_)[7];
    unless (defined $size) {
        warn "Cannot stat '$File::Find::name': $!\n";
        return;
    }

    push @{ $sizes{$size} }, $File::Find::name;
    return;
}

# Return true iff '$file' is anywhere inside '$dir'.
# '$dir' is expected to be an absolute path ending in "/".
sub indir($file, $dir) {
    my $abs = abs_path($file);
    return !!0 unless defined $abs;

    # Literal prefix comparison: no regex, so metacharacters in '$dir' are
    # harmless and a file literally named "0" is still recognised.
    return !!(length($abs) > length($dir) && index($abs, $dir) == 0);
}

# Take @files and return the ones that are in none of the disliked
# directories. The result is empty when every file is disliked; the caller
# decides what to fall back to.
sub guess_best_choices(@files) {
    my @disliked;
    foreach my $pref (@preferences) {
        next unless $pref->{"type"} eq "dislike";
        print STDERR "PREF: DISLIKE $pref->{dir}\n" if $debug;
        push @disliked, $pref->{"dir"};
    }

    my @best_guesses;
    foreach my $file (@files) {
        my $hit = first { indir($file, $_) } @disliked;
        if (defined $hit) {
            print STDERR "Discarding $file...\n" if $debug;
        }
        else {
            push @best_guesses, $file;
        }
        print STDERR "BEST: @best_guesses\n" if $debug;
    }

    return @best_guesses;
}

# Delete every file in @rest except '$choice'. Failures are reported but do
# not stop the remaining deletions.
sub keep($choice, @rest) {
    my @delete = grep { $_ ne $choice } @rest;
    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;

    foreach my $file (@delete) {
        unlink $file or warn "Could not delete '$file': $!\n";
    }
    return;
}

# Return a one-line progress indicator ("\r[<spinner>] curr/total") and
# advance the spinner by one frame.
sub progressbar($curr, $total) {
    my $frame = substr($spinner_frames, $progressidx++ % length($spinner_frames), 1);
    return sprintf "\r[%s] %s/%s", $frame, $curr, $total;
}

# Convert a hash of sizes (size => [files]) to a hash of md5s (md5 => [files]).
# Only sizes shared by more than one file are hashed.
sub sizes_to_md5s($sizes) {
    my %md5s;

    my $total   = keys %$sizes;
    my $current = 1;

    foreach my $size (keys %$sizes) {
        my @same_size_files = @{ $sizes->{$size} };
        print STDERR progressbar($current, $total);

        # Compute only md5 of non-unique sizes
        if (@same_size_files > 1) {
            foreach my $file (@same_size_files) {
                print STDERR progressbar($current, $total);
                STDERR->flush();

                # capture() dies when md5sum fails (e.g. unreadable file);
                # skip that file instead of aborting the whole run.
                my $output = eval { capture("md5sum", "-z", "--", $file) };
                unless (defined $output) {
                    warn "\nSkipping '$file': $@";
                    next;
                }

                my ($md5) = $output =~ m/^([^ ]+)/;
                next unless defined $md5;
                push @{ $md5s{$md5} }, $file;
            }
        }
        $current++;
    }

    return \%md5s;
}

# Parse CLI args; anything that is not an option is a directory to scan.
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    }
    elsif ($arg eq "--debug") {
        $debug = 1;
    }
    elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $dir = shift @ARGV;
        die "$arg requires a directory argument\n$usage" unless defined $dir;

        my $abs = abs_path($dir);
        if (defined $abs) {
            # Exactly one trailing slash, so the prefix check in indir()
            # only matches whole path components.
            push @preferences, { "type" => "dislike", "dir" => $abs =~ s{/*\z}{/}r };
        }
        else {
            warn "Cannot resolve disliked directory '$dir', ignoring\n";
        }
    }
    else {
        print STDERR "Collecting file sizes...\n";
        find(\&register_file_size, $arg);
    }
}

print STDERR "Computing md5s of files with same size...\n";

# Map from md5 to list of filenames
my %md5s = %{ sizes_to_md5s(\%sizes) };
print STDERR "\n";    # terminate the progress line (written to STDERR)

foreach my $md5 (keys %md5s) {
    my @same_md5_files = @{ $md5s{$md5} };
    next unless @same_md5_files > 1;    # Discard unique hashes

    print "\nFound duplicate files:\n";
    print "\t$_\n" foreach @same_md5_files;

    # If every copy is disliked, all of them are equally good candidates.
    my @best_choices = guess_best_choices(@same_md5_files);
    @best_choices = @same_md5_files unless @best_choices;

    if (@best_choices == 1) {
        my $best_choice = $best_choices[0];
        my $gogo        = $yes;
        unless ($gogo) {
            print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
            my $resp = <STDIN> // "";    # EOF counts as "no"
            chomp $resp;
            $gogo = $resp eq "yes" || $resp eq "y";
        }
        keep($best_choice, @same_md5_files) if $gogo;
    }
    else {
        print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
        foreach my $i (0 .. $#best_choices) {
            print "\t [$i] $best_choices[$i]\n";
        }
        print "\t> ";
        my $index = <STDIN> // "";       # EOF counts as "skip"
        chomp $index;

        if ($index ne "") {
            # Accept only plain non-negative integers: anything else (e.g.
            # "n") must never be coerced to 0 and trigger a deletion.
            if ($index !~ /\A\s*(\d+)\s*\z/) {
                print "\n!!\t Invalid index, ignoring\n";
            }
            elsif ($1 < @best_choices) {
                keep($best_choices[$1], @same_md5_files);
            }
            else {
                print "\n!!\t Index outside of range, ignoring\n";
            }
        }
    }
}

print STDERR "Done!\n";