#!/usr/bin/env perl
#
# Interactive duplicate-file remover.
#
# Walks the given directories, groups regular files by size, hashes the
# files that share a size with MD5, and for every group of identical files
# asks which copy to keep; the remaining copies are deleted.
#
# Directories passed with --dislike lower the priority of the files they
# contain: a copy outside every disliked directory is preferred over one
# inside it.
#
# Only core modules are used.

use strict;
use warnings;

use Cwd qw(abs_path);
use Digest::MD5 ();
use File::Find qw(find);
use IO::Handle ();
use List::Util qw(max);

my $usage = <<"END_USAGE";
Usage: $0 [opts] <dir> [<dir> ...]

Opts:
  --dislike|-d <dir>  Dislike directory (reduce priority)
  --yes    |-y        Don't ask for confirmation when there's a single choice
  --debug             Print extra debug info
END_USAGE

@ARGV >= 1 or die $usage;

# CLI options
my $yes   = 0;
my $debug = 0;
my @preferences;    # list of { type => "dislike", dir => "/abs/path/" }

my %sizes;          # size in bytes => [ paths of files with that size ]
my %md5s;           # md5 hex digest => [ paths of files with that digest ]
my %seen_path;      # canonical path => number of times it was visited

# File::Find callback: record the current file under its size in %sizes.
#
# Only regular files are recorded.  Directories, FIFOs, sockets and devices
# are skipped, and so are symbolic links: a link's own size says nothing
# about the content it points at, and deleting a link target while keeping
# the link would lose data.  A file that is reached twice through different
# spellings of the same path (e.g. "dir" and "./dir") is recorded once, so it
# can never be reported -- and deleted -- as a duplicate of itself.
sub fill_sizes {
    my @stat = lstat $_;
    unless (@stat) {
        warn "Cannot stat '$File::Find::name': $!\n";
        return;
    }
    return unless -f _;    # "_" reuses the lstat result: regular files only

    my $canonical = abs_path($_);
    return if defined $canonical && $seen_path{$canonical}++;

    push @{ $sizes{ $stat[7] } }, $File::Find::name;
    return;
}

# Return the MD5 hex digest of $file, or nothing (undef in scalar context)
# when the file cannot be opened or read; a warning is printed in that case.
sub md5_of_file {
    my ($file) = @_;

    my $fh;
    unless (open $fh, '<', $file) {
        warn "Cannot open '$file': $!\n";
        return;
    }
    binmode $fh;

    # addfile() croaks on read errors; turn that into a warning so a single
    # unreadable file does not abort the whole run.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    warn "Cannot read '$file': $@" unless defined $digest;
    close $fh;

    return $digest;
}

# How deep $file lives below $dir.
#
# $dir must be an absolute path ending in "/".  Returns -1 when $file is not
# located under $dir, otherwise the number of directory levels between $dir
# and the file (0 for a file directly inside $dir).
sub closeness {
    my ($file, $dir) = @_;

    my $path = abs_path($file);
    return -1 unless defined $path;

    # Literal prefix comparison: $dir may contain regex metacharacters.
    return -1 unless index($path, $dir) == 0;

    my $relative = substr $path, length $dir;
    return ($relative =~ tr{/}{});
}

# Narrow a list of identical files down to the preferred copies.
#
# For every "dislike" preference the candidates are scored with closeness():
# files outside the disliked directory (-1) win; if every candidate is inside
# it, the ones with the highest score (deepest below the directory) win.
# Returns the surviving candidates in their original order, without
# duplicates being introduced.
sub guess_best_choices {
    my @best_guesses = @_;
    return unless @best_guesses;

    foreach my $pref (@preferences) {
        next unless $pref->{type} eq 'dislike';
        print STDERR "PREF: DISLIKE $pref->{dir}\n" if $debug;

        my @scores;
        foreach my $file (@best_guesses) {
            my $dislikability = closeness($file, $pref->{dir});
            print STDERR "CLOSENESS($file, $pref->{dir}) = $dislikability\n" if $debug;
            push @scores, $dislikability;
        }

        my $outside    = grep { $_ == -1 } @scores;
        my $best_score = $outside ? -1 : max(@scores);
        @best_guesses  = @best_guesses[ grep { $scores[$_] == $best_score } 0 .. $#best_guesses ];
        print STDERR "BEST: [$best_score] @best_guesses\n" if $debug;
    }

    return @best_guesses;
}

# Keep $choice and delete every other path in @rest.
#
# Nothing is deleted when $choice no longer exists, so the last remaining
# copy is never removed.  Failed deletions are reported but do not abort.
sub keep {
    my ($choice, @rest) = @_;

    unless (-e $choice) {
        warn "Refusing to delete duplicates: '$choice' no longer exists\n";
        return;
    }

    my @delete = grep { $_ ne $choice } @rest;
    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;
    foreach my $victim (@delete) {
        unlink $victim or warn "Could not delete '$victim': $!\n";
    }
    return;
}

# Read one answer from STDIN, without the trailing newline and surrounding
# whitespace.  End-of-input is treated as an empty answer.
sub read_answer {
    my $line = <STDIN>;
    return '' unless defined $line;
    chomp $line;
    $line =~ s/\A\s+|\s+\z//g;
    return $line;
}

print STDERR "Collecting file sizes...\n";
# "defined" so that an argument of "0" does not end option processing.
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    } elsif ($arg eq "--debug") {
        $debug = 1;
    } elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $dir = shift @ARGV;
        defined $dir or die "Option $arg requires a directory\n\n$usage";
        my $resolved = abs_path($dir);
        defined $resolved or die "Cannot resolve disliked directory '$dir': $!\n";
        # closeness() expects a trailing slash; "/" already has one.
        $resolved .= "/" unless $resolved =~ m{/\z};
        push @preferences, { "type" => "dislike", "dir" => $resolved };
    } else {
        find(\&fill_sizes, $arg);
    }
}

print STDERR "Computing md5s of files with same size...\n";
my $size_total = scalar keys %sizes;
my $size_index = 0;
foreach my $size (keys %sizes) {
    $size_index++;
    my @same_size_files = @{ $sizes{$size} };
    next unless @same_size_files > 1;    # Discard unique sizes

    my $file_index = 0;
    foreach my $file (@same_size_files) {
        $file_index++;
        print STDERR "$size_index.$file_index / $size_total\r";
        STDERR->flush();

        my $md5 = md5_of_file($file);
        next unless defined $md5;        # Unreadable file: already reported
        push @{ $md5s{$md5} }, $file;
    }
}
print STDERR "\n";                       # Terminate the progress line

foreach my $md5 (keys %md5s) {
    my @same_md5_files = @{ $md5s{$md5} };
    next unless @same_md5_files > 1;     # Discard unique hashes

    print "\nFound duplicate files:\n";
    foreach my $file (@same_md5_files) {
        print "\t$file\n";
    }

    if (my @best_choices = guess_best_choices(@same_md5_files)) {
        if (@best_choices == 1) {
            my $best_choice = $best_choices[0];
            my $gogo = $yes;
            unless ($gogo) {
                print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
                my $resp = read_answer();
                $gogo = $resp eq "yes" || $resp eq "y";
            }
            if ($gogo) {
                keep($best_choice, @same_md5_files);
            }
        } else {
            print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
            foreach my $i (0 .. $#best_choices) {
                print "\t [$i] $best_choices[$i]\n";
            }
            print "\t> ";
            my $index = read_answer();
            if ($index ne "") {
                # Only a plain non-negative integer selects a file; anything
                # else (e.g. "n") must never be coerced to index 0.
                if ($index !~ /\A\d+\z/) {
                    print "\n!!\t Not a valid index, ignoring\n";
                } elsif ($index < @best_choices) {
                    keep($best_choices[$index], @same_md5_files);
                } else {
                    print "\n!!\t Index outside of range, ignoring\n";
                }
            }
        }
    }
}

print STDERR "Done!\n";