1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
#!/usr/bin/env perl
use strict;
use warnings;
use v5.36;
use Cwd ();
use File::Find qw<find>;
use IPC::System::Simple qw<capture>;
use List::Util qw<min>;
# Abort with the usage text unless at least one command-line argument was
# given. The argument count is compared numerically (not as a string).
@ARGV >= 1 or die "Usage: $0 [opts] <dir1> [<dir2> ...]
Opts: --dislike|-d <path> Dislike directory (reduce priority)
--yes |-y Don't ask for confirmation when there's a single choice
--debug Print extra debug info
";
# Command-line switches: both start disabled and are turned on while parsing.
my ($yes, $debug) = (0, 0);
# Preference records collected from the command line, in the order given.
my @preferences = ();
# File size => array ref of the file names having that size.
my %sizes = ();
# 'wanted' callback for File::Find::find: record the current entry under its
# size in the %sizes hash (size => list of full file names).
#
# Reads $_ (the entry being visited) and $File::Find::name (its full name).
# Directories are skipped. The size is taken with the built-in lstat, which,
# like `stat` without -L, does not dereference symlinks; this avoids forking
# an external process per file and copes with names that start with '-'.
# Entries that cannot be stat'ed are reported on STDERR and skipped.
sub register_file_size {
    return if -d $_;    # Skip directories

    my $filename = $File::Find::name;
    my $size     = (lstat $_)[7];
    unless (defined $size) {
        warn "Cannot stat '$filename': $!\n";
        return;
    }

    push @{ $sizes{$size} }, $filename;
    return;
}
# Return true iff '$file' is anywhere inside '$dir'.
#
#   $file - path of the file to test; it is resolved with Cwd::abs_path
#   $dir  - absolute directory path, with or without a trailing '/'
#
# The check is a literal prefix comparison, so regex metacharacters in
# directory names are harmless, and files whose remaining path is "0" or
# contains a newline are still recognised. Returns false when the file
# cannot be resolved or when '$dir' is empty/undefined.
sub indir($file, $dir) {
    my $abs_file = Cwd::abs_path($file);
    return !!0 unless defined $abs_file && defined $dir && length $dir;

    # Force a trailing separator so that "/a/b" does not match "/a/bc/x"
    my $prefix = $dir =~ m{/\z} ? $dir : "$dir/";

    return length($abs_file) > length($prefix)
        && substr($abs_file, 0, length $prefix) eq $prefix;
}
# Take @files and filter it with the best results, discarding the ones from
# the disliked directories.
#
# A file is kept only if it lies inside NONE of the directories registered
# with a "dislike" preference, so several dislike preferences combine
# correctly and no file is ever returned twice.
#
# Returns the surviving files in their original order. The list is empty
# when there is no dislike preference at all, or when every file is disliked.
sub guess_best_choices(@files) {
    my @disliked_dirs =
        map  { $_->{"dir"} }
        grep { $_->{"type"} eq "dislike" } @preferences;
    return () unless @disliked_dirs;

    if ($debug) {
        print STDERR "PREF: DISLIKE $_\n" foreach @disliked_dirs;
    }

    my @best_guesses = ();
    foreach my $file (@files) {
        my $is_disliked = grep { indir($file, $_) } @disliked_dirs;
        if ($is_disliked) {
            print STDERR "Discarding $file...\n" if $debug;
        } else {
            push @best_guesses, $file;
        }
    }
    print STDERR "BEST: @best_guesses\n" if $debug;

    return @best_guesses;
}
# Delete every file of @rest except '$choice'.
#
#   $choice - file name to keep
#   @rest   - candidate file names; entries equal to $choice are left alone
#
# A failed deletion is reported on STDERR and does not stop the remaining
# deletions.
sub keep($choice, @rest) {
    my @delete = grep { $_ ne $choice } @rest;
    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;
    foreach my $file (@delete) {
        unlink $file or warn "Could not delete '$file': $!\n";
    }
    return;
}
# Parse CLI args. Options set the flags / preferences; every other argument
# is a directory that is scanned immediately with File::Find.
# 'defined' keeps the loop going for arguments such as "0".
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    } elsif ($arg eq "--debug") {
        $debug = 1;
    } elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $dir = shift @ARGV;
        die "Option $arg requires a directory argument\n" unless defined $dir;
        my $abs_dir = Cwd::abs_path($dir);
        # An unresolved path would otherwise degrade to the prefix "/"
        die "Cannot resolve disliked directory '$dir'\n" unless defined $abs_dir;
        $abs_dir =~ s{/*\z}{/};    # exactly one trailing slash, also for "/"
        push @preferences, {"type" => "dislike", "dir" => $abs_dir};
    } else {
        print STDERR "Collecting file sizes...\n";
        find(\&register_file_size, $arg);
    }
}
# Spinner state: index of the next frame and the frame characters themselves.
my $progressidx = 0;
my $progressbar = '/-\|';
# Build one progress line: a carriage return, the next spinner frame in
# brackets and "curr/total". Every call advances the spinner by one frame.
sub progressbar($curr, $total) {
    my $frame = substr $progressbar, $progressidx % length($progressbar), 1;
    $progressidx++;
    return "\r[$frame] $curr/$total";
}
# Convert a hash of sizes to a hash of md5s.
#
#   $sizes - hash ref: size => array ref of file names with that size
#
# Only files that share their size with at least one other file are hashed
# (a unique size cannot have a duplicate). Hashing is done by the external
# `md5sum -z`; a file whose checksum cannot be obtained is reported on STDERR
# and left out, so it is never considered a duplicate. A progress spinner is
# written to STDERR while working.
#
# Returns a hash ref: md5 => array ref of file names with that checksum.
sub sizes_to_md5s($sizes) {
    my %md5s;
    # progress: current size group / number of size groups
    my $total = keys %$sizes;
    my $curr  = 0;

    foreach my $size (keys %$sizes) {
        $curr++;
        my @same_size_files = @{ $sizes->{$size} };
        print STDERR progressbar($curr, $total);

        # Compute only md5 of non-unique sizes
        next unless @same_size_files > 1;

        foreach my $file (@same_size_files) {
            print STDERR progressbar($curr, $total);
            STDERR->flush();

            # "--" stops option parsing so odd file names are taken literally
            my $output = eval { capture("md5sum", "-z", "--", $file) };
            my ($md5) = defined $output ? $output =~ m/^([^ ]+)/ : ();
            unless (defined $md5) {
                warn "\nCould not compute md5 of '$file', skipping it\n";
                next;
            }
            push @{ $md5s{$md5} }, $file;
        }
    }
    return \%md5s;
}
print STDERR "Computing md5s of files with same size...\n";
# Map from md5 to list of filenames
my %md5s = %{ sizes_to_md5s(\%sizes) };
print "\n";

# Walk over every group of files sharing a checksum and let the user (or the
# dislike preferences together with --yes) decide which copy survives.
foreach my $md5 (keys %md5s) {
    my @same_md5_files = @{ $md5s{$md5} };
    next unless @same_md5_files > 1;    # Discard unique hashes

    print "\nFound duplicate files:\n";
    print "\t$_\n" foreach @same_md5_files;

    # Fall back to the whole group when the preferences leave nothing
    my @best_choices = guess_best_choices(@same_md5_files);
    @best_choices = @same_md5_files unless @best_choices;

    if (@best_choices == 1) {
        my $best_choice = $best_choices[0];
        my $gogo = $yes;
        unless ($gogo) {
            print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
            my $resp = <STDIN> // "";    # EOF counts as "no"
            chomp $resp;
            $gogo = $resp eq "yes" || $resp eq "y";
        }
        keep($best_choice, @same_md5_files) if $gogo;
    } else {
        print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
        foreach my $i (0 .. $#best_choices) {
            print "\t [$i] $best_choices[$i]\n";
        }
        print "\t> ";
        my $index = <STDIN> // "";       # EOF counts as "skip"
        chomp $index;
        $index =~ s/\A\s+|\s+\z//g;

        if ($index eq "") {
            next;
        } elsif ($index !~ /\A[0-9]+\z/) {
            # Never let a non-numeric answer silently turn into index 0:
            # that would delete files the user did not ask to delete.
            print "\n!!\t '$index' is not a valid index, ignoring\n";
        } elsif ($index < @best_choices) {
            keep($best_choices[$index], @same_md5_files);
        } else {
            print "\n!!\t Index outside of range, ignoring\n";
        }
    }
}
print STDERR "Done!\n";
|