1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
#!/usr/bin/env perl
use strict;
use warnings;
use v5.36;
use List::Util qw<min>;
use File::Find qw<find>;
use IPC::System::Simple qw<capture>;
# Refuse to run without at least one command-line argument.
# The count is compared numerically (">="); the string operator "ge" only
# happened to give the right answer. The message ends with a newline so that
# die() does not append a "at FILE line N" suffix.
@ARGV >= 1 or die "Usage: $0 [opts] <dir1> [<dir2> ...]
Opts: --dislike|-d <path> Dislike directory (reduce priority)
--yes |-y Don't ask for confirmation when there's a single choice
--debug Print extra debug info
";

# CLI options
# $yes:   set by --yes/-y; skips the confirmation prompt when exactly one
#         candidate is left to keep.
# $debug: set by --debug; enables extra diagnostics on STDERR.
my $yes = 0;
my $debug = 0;

# One hashref per --dislike/-d option, of the form
# { type => "dislike", dir => "<absolute path ending in />" }.
my @preferences;

# Map from size to list of filenames (filled while walking the directories)
my %sizes;

# Map from md5 to list of filenames (filled only for files sharing a size)
my %md5s;
# File::Find "wanted" callback: records the size of every non-directory entry.
#
# Reads $_ (the entry being visited) and $File::Find::name (its path as
# reported by File::Find) and appends that path to the list stored in
# %sizes under the entry's size in bytes.
#
# The size comes from Perl's built-in stat() rather than from an external
# `stat --printf=%s` process: the external form is GNU-specific, costs one
# fork per file, and misparses entry names that start with "-" as options.
sub register_file_size {
    return if -d $_;    # Skip directories

    # Element 7 of stat() is the size in bytes; stat() returns an empty list
    # when the entry cannot be examined (e.g. a dangling symlink).
    my $size = (stat $_)[7];
    return unless defined $size;

    push @{ $sizes{$size} }, $File::Find::name;
    return;
}
# Tells whether $file lives somewhere below the directory $dir.
#
# Parameters:
#   $file - path to test; it is resolved with Cwd::abs_path first.
#   $dir  - directory prefix to test against. Callers in this script pass an
#           absolute path with a trailing "/".
# Returns a boolean: true when the resolved path starts with $dir and has at
# least one more character after it, false otherwise (including when the path
# cannot be resolved).
#
# The comparison is a literal prefix test. Interpolating $dir into a regex
# would treat characters such as "+", "(" or "." in directory names as
# metacharacters, and testing the truthiness of the captured remainder would
# wrongly reject a file literally named "0".
sub indir($file, $dir) {
    require Cwd;    # make sure Cwd is loaded, not merely pulled in by another module
    my $resolved = Cwd::abs_path($file);
    return !!0 unless defined $resolved && defined $dir && length $dir;
    return !!(index($resolved, $dir) == 0 && length($resolved) > length($dir));
}
# Narrows a list of duplicate files down to the preferred candidates to keep.
#
# Parameters:
#   @files - paths of files that are duplicates of each other.
# Returns the files that are not located under any "dislike" directory listed
# in @preferences, in their original order. Returns an empty list when there
# are no "dislike" preferences at all, or when every file is disliked.
#
# Every file is checked against ALL disliked directories before it is
# accepted. Filtering once per preference and accumulating the survivors
# would re-add files rejected by another preference and produce duplicates
# as soon as two --dislike options are given.
sub guess_best_choices(@files) {
    my @disliked_dirs =
        map  { $_->{"dir"} }
        grep { $_->{"type"} eq "dislike" } @preferences;

    # No preference expressed: no guess.
    return () unless @disliked_dirs;

    if ($debug) {
        print STDERR "PREF: DISLIKE $_\n" foreach @disliked_dirs;
    }

    my @best_guesses;
    FILE:
    foreach my $file (@files) {
        foreach my $dir (@disliked_dirs) {
            if (indir($file, $dir)) {
                print STDERR "Discarding $file...\n" if $debug;
                next FILE;
            }
        }
        push @best_guesses, $file;
    }
    print STDERR "BEST: @best_guesses\n" if $debug;
    return @best_guesses;
}
# Keeps one file out of a group of duplicates and deletes the others.
#
# Parameters:
#   $choice - path of the file to keep.
#   @rest   - paths of all duplicates; entries equal to $choice (string
#             comparison) are left alone, every other entry is unlinked.
# Returns the number of files actually deleted.
#
# A failed unlink is reported on STDERR instead of being silently ignored, so
# the user knows which duplicates are still on disk.
sub keep($choice, @rest) {
    my @delete = grep { $_ ne $choice } @rest;
    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;

    my $deleted = 0;
    foreach my $victim (@delete) {
        if (unlink $victim) {
            $deleted++;
        } else {
            warn "Could not delete '$victim': $!\n";
        }
    }
    return $deleted;
}
# Process the command line from left to right: option flags update the
# global settings, every other argument is a directory that is walked
# immediately with File::Find to fill %sizes.
print STDERR "Collecting file sizes...\n";
require Cwd;    # abs_path is called fully qualified below; load Cwd explicitly

# Test with defined() so that an argument such as "0" does not end the loop.
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    } elsif ($arg eq "--debug") {
        $debug = 1;
    } elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $path = shift @ARGV;
        defined $path or die "Option $arg requires a directory argument\n";

        my $abs = Cwd::abs_path($path);
        if (defined $abs) {
            # Store the directory with exactly one trailing "/" so that the
            # prefix test in indir() only matches entries below it.
            $abs =~ s{/*\z}{/};
            push @preferences, {"type" => "dislike", "dir" => $abs};
        } else {
            warn "Ignoring $arg '$path': cannot resolve path\n";
        }
    } else {
        find(\&register_file_size, $arg);
    }
}
# Hash only the files that share their size with at least one other file, and
# group them by digest in %md5s.
print STDERR "Computing md5s of files with same size...\n";

# Progress counters: (current size group, file within the group, total groups)
my @progress = (0, 0, scalar(keys(%sizes)));
foreach my $size (keys(%sizes)) {
    $progress[0]++;
    $progress[1] = 0;
    my @same_size_files = @{$sizes{$size}};
    next unless @same_size_files > 1; # Discard unique sizes (numeric compare)
    foreach my $file (@same_size_files) {
        $progress[1]++;
        # "\r" rewinds the cursor so the progress indicator overwrites itself.
        print STDERR "$progress[0].$progress[1] / $progress[2]\r";
        STDERR->flush();

        # The digest is everything before the first space of md5sum's output.
        # "--" stops option parsing so a path starting with "-" is not taken
        # for a switch.
        my ($md5) = capture("md5sum", "-z", "--", $file) =~ m/^([^ ]+)/;
        unless (defined $md5) {
            warn "Could not parse md5sum output for '$file', skipping\n";
            next;
        }
        push @{$md5s{$md5}}, $file;
    }
}
print "\n";
# Walk every group of files sharing an md5 and let the user (or --yes) decide
# which copy to keep; all other copies in the group are deleted by keep().
foreach my $md5 (keys(%md5s)) {
    my @same_md5_files = @{$md5s{$md5}};
    next unless @same_md5_files > 1; # Discard unique hashes (numeric compare)

    print "\nFound duplicate files:\n";
    print "\t$_\n" foreach @same_md5_files;

    # Fall back to the whole group when the preferences give no candidate.
    my @best_choices = guess_best_choices(@same_md5_files);
    @best_choices = @same_md5_files unless @best_choices;

    if (@best_choices == 1) {
        my $best_choice = $best_choices[0];
        my $gogo = $yes;
        unless ($gogo) {
            print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
            # End of input reads as undef: treat it as "no answer".
            my $resp = <STDIN> // "";
            chomp $resp;
            $gogo = $resp eq "yes" || $resp eq "y";
        }
        keep($best_choice, @same_md5_files) if $gogo;
    } else {
        print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
        foreach my $i (0 .. $#best_choices) {
            print "\t [$i] $best_choices[$i]\n";
        }
        print "\t> ";
        # End of input reads as undef: treat it like an empty answer (skip).
        my $index = <STDIN> // "";
        chomp $index;
        if ($index ne "") {
            # Only a plain non-negative integer selects a file. Converting
            # arbitrary text with int() would turn answers such as "n" or
            # "skip" into index 0 and delete files the user never chose.
            my ($picked) = $index =~ /\A\s*(\d+)\s*\z/;
            if (!defined $picked) {
                print "\n!!\t Not a valid index, ignoring\n";
            } elsif ($picked < @best_choices) {
                keep($best_choices[$picked], @same_md5_files);
            } else {
                print "\n!!\t Index outside of range, ignoring\n";
            }
        }
    }
}
print STDERR "Done!\n";
|