aboutsummaryrefslogtreecommitdiff
path: root/dups
blob: bf59ee942ecc530000a6d64d000d979a5e5d8801 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env perl

use strict;
use warnings;

use Cwd ();
use List::Util qw<min>;
use File::Find qw<find>;
use IPC::System::Simple qw<capture>;

# Refuse to run without arguments and print the usage text instead.
# The argument count is compared numerically (`>=`); `ge` would compare it
# as a string.
@ARGV >= 1 || die "Usage: $0 [opts] <dir1> [<dir2> ...]

Opts:  --dislike|-d <path>    Dislike directory (reduce priority)
       --yes    |-y           Don't ask for confirmation when there's a single choice
       --debug                Print extra debug info
";

# Command-line switches; both start off and are turned on while the
# arguments are parsed.
my ($yes, $debug) = (0, 0);

# One hash per --dislike option: { type => "dislike", dir => <path> }.
my @preferences = ();

# Lookup tables filled while scanning:
#   %sizes  maps a file size to an array ref of paths having that size
#   %md5s   maps an md5 digest to an array ref of paths having that digest
my %sizes = ();
my %md5s  = ();

# File::Find callback: files every non-directory entry under its size.
#
# File::Find invokes this with $_ holding the entry's name and
# $File::Find::name holding its path; the path is appended to the list
# stored at $sizes{<size in bytes>}.
#
# The size is read with Perl's built-in lstat() instead of spawning an
# external `stat --printf=%s` per file. That removes one process per file,
# removes the dependency on the GNU-style `--printf` option, and cannot
# mistake an entry whose name starts with "-" for a command-line option.
# lstat() (rather than stat()) is used so that, like `stat` without -L, a
# symbolic link is measured itself and not through its target.
sub fill_sizes {
    return if -d $_; # Skip directories

    my $size = (lstat $_)[7];
    unless (defined $size) {
        # The entry vanished or cannot be examined: report it and carry on
        # with the rest of the tree instead of aborting the whole scan.
        warn "Cannot stat $File::Find::name: $!\n";
        return;
    }

    # Autovivification creates the array on first use.
    push @{$sizes{$size}}, $File::Find::name;
}

# Rates how deep $file sits below the directory $dir.
#
# Params:
#   $file - path of the file to rate; it is resolved with Cwd::abs_path
#   $dir  - directory prefix to test against (the caller in this file passes
#           an absolute path ending in "/")
# Returns:
#   the number of "/" separators in the part of the resolved path that
#   follows $dir (so 0 for a file directly inside $dir), or -1 when the
#   resolved path does not start with $dir, nothing follows the prefix, or
#   the path cannot be resolved.
sub closeness {
    my ($file, $dir) = @_;

    my $resolved = Cwd::abs_path($file);
    return -1 unless defined $resolved;

    # \Q..\E: $dir is a literal path, so characters such as "+", "(" or "."
    # must not be interpreted as regex syntax. /s with \A..\z keeps file
    # names containing newlines intact.
    my ($diffpath) = $resolved =~ /\A\Q$dir\E(.*)\z/s;

    # Test definedness/length rather than truth: a file literally named "0"
    # is inside the directory too.
    return -1 unless defined $diffpath && length $diffpath;

    # tr/// counts the separators and yields a number in any calling context.
    return $diffpath =~ tr{/}{};
}

# Narrows a group of duplicate files down to the ones most worth keeping.
#
# Params:
#   @_ - paths of the files that are duplicates of each other
# Returns:
#   the subset of those paths that ranks best under every "dislike"
#   preference in @preferences, applied in order (each preference filters
#   the survivors of the previous one). With no preferences the input is
#   returned unchanged.
#
# Ranking for one disliked directory, using closeness():
#   -1 (closeness() found the file not to be under the directory) beats
#   everything else; otherwise a larger value (deeper below the directory)
#   beats a smaller one. All files tied for the best rank are returned.
sub guess_best_choices {
    my @best_guesses = @_;

    foreach my $pref (@preferences) {
        next unless $pref->{"type"} eq "dislike";
        print STDERR "PREF: DISLIKE $pref->{dir}\n" if $debug;

        # Rebuild the winners from scratch for this preference. Starting
        # from the full list (with a preset best rank of 0) would push files
        # of rank 0 a second time and return every path twice.
        my @candidates = @best_guesses;
        my $best_index;
        @best_guesses = ();

        foreach my $file (@candidates) {
            my $dislikability = closeness($file, $pref->{"dir"});
            print STDERR "CLOSENESS($file, $pref->{dir}) = $dislikability\n" if $debug;

            my $is_better = !defined $best_index
                || ($best_index != -1
                    && ($dislikability == -1 || $dislikability > $best_index));

            if ($is_better) {
                @best_guesses = ($file);
                $best_index = $dislikability;
            } elsif ($dislikability == $best_index) {
                push @best_guesses, $file;
            }
            print STDERR "BEST: [$best_index] @best_guesses\n" if $debug;
        }
    }

    return @best_guesses;
}

# Deletes every file of a duplicate group except the chosen one.
#
# Params:
#   $choice - path of the file to keep
#   @rest   - paths of the whole group ($choice itself may be among them)
# Returns:
#   nothing useful; problems are reported on STDERR with warn.
#
# Safety rules:
#   * nothing is deleted when $choice does not exist;
#   * a path that resolves (Cwd::abs_path) to the same file as $choice --
#     a different spelling such as "./dir/a" vs "dir/a", or a symbolic
#     link -- is left alone, since deleting it could remove the only copy;
#   * a failed unlink is reported instead of being silently ignored.
sub keep {
    my ($choice, @rest) = @_;

    unless (-e $choice) {
        warn "Not deleting anything: '$choice' does not exist\n";
        return;
    }

    my $kept_real = Cwd::abs_path($choice);
    my @delete;
    foreach my $file (@rest) {
        next if $file eq $choice;

        my $real = Cwd::abs_path($file);
        if (defined $kept_real && defined $real && $real eq $kept_real) {
            warn "Not deleting '$file': it is the same file as '$choice'\n";
            next;
        }
        push @delete, $file;
    }

    print STDERR "\tKEEPING $choice, DELETING @delete\n" if $debug;
    foreach my $file (@delete) {
        unlink $file or warn "Could not delete '$file': $!\n";
    }
    return;
}

print STDERR "Collecting file sizes...\n";

# Walk the command line in order. Options set the flags/preferences; every
# other argument is treated as a directory and scanned immediately with
# File::Find, which fills %sizes through fill_sizes().
#
# The loop tests definedness so that an argument such as "0" (a directory
# named 0) does not end argument processing early.
while (defined(my $arg = shift @ARGV)) {
    next unless length $arg; # Nothing to scan for an empty argument

    if ($arg eq "-y" || $arg eq "--yes") {
        $yes = 1;
    } elsif ($arg eq "--debug") {
        $debug = 1;
    } elsif ($arg eq "-d" || $arg eq "--dislike") {
        my $path = shift @ARGV;
        defined $path
            or die "$arg requires a <path> argument\n";

        # An unresolvable path must not silently turn into "/", which would
        # mark every file as disliked.
        my $abs = Cwd::abs_path($path);
        defined $abs
            or die "Cannot resolve disliked directory '$path'\n";

        # Stored with exactly one trailing "/" so it only matches whole
        # path components (and "/" itself does not become "//").
        $abs .= "/" unless $abs =~ m{/\z};
        push @preferences, {"type" => "dislike", "dir" => $abs};
    } else {
        find(\&fill_sizes, $arg);
    }
}

print STDERR "Computing md5s of files with same size...\n";

# Progress counters: [0] size group being processed, [1] file within that
# group, [2] total number of size groups.
my @progress = (0, 0, scalar(keys(%sizes)));
foreach my $size (keys(%sizes)) {
    $progress[0]++;
    $progress[1] = 0;
    my @same_size_files = @{$sizes{$size}};
    next unless @same_size_files > 1; # Discard unique sizes

    foreach my $file (@same_size_files) {
        $progress[1]++;
        print STDERR "$progress[0].$progress[1] / $progress[2]\r";
        STDERR->flush();

        # capture() throws when md5sum fails (e.g. unreadable file); skip
        # that one file rather than aborting the whole run. "--" stops a
        # path starting with "-" from being read as an option.
        my $output = eval { capture("md5sum", "-z", "--", $file) };
        unless (defined $output) {
            warn "\nSkipping '$file': $@";
            next;
        }

        # The digest is everything up to the first space of the output.
        my ($md5) = $output =~ m/^([^ ]+)/;
        unless (defined $md5) {
            warn "\nSkipping '$file': unexpected md5sum output\n";
            next;
        }

        push @{$md5s{$md5}}, $file;
    }
}

print "\n";

# Go through every group of files sharing an md5 digest, show it, and let
# the user (or --yes) decide which copy survives.
foreach my $md5 (keys(%md5s)) {
    my @same_md5_files = @{$md5s{$md5}};
    next unless @same_md5_files > 1; # Discard unique hashes

    print "\nFound duplicate files:\n";
    foreach my $file (@same_md5_files) {
        print "\t$file\n";
    }

    if (my @best_choices = guess_best_choices(@same_md5_files)) {
        if (@best_choices == 1) {
            # A single candidate: delete the others after confirmation,
            # or right away when --yes was given.
            my $best_choice = $best_choices[0];
            my $gogo = $yes;
            unless ($gogo) {
                print "\n\tBest choice is '$best_choice', do you want to delete the others? (yes/y) ";
                my $resp = <STDIN>;
                $resp = "" unless defined $resp; # End of input counts as "no"
                chomp $resp;
                $gogo = $resp eq "yes" || $resp eq "y";
            }
            if ($gogo) {
                keep($best_choice, @same_md5_files);
            }
        } else {
            # Several equally good candidates: the user picks by index.
            print "\n\tPlease choose one to keep (or press 'enter' to skip):\n";
            for my $i (0 .. $#best_choices) {
                print "\t  [$i] $best_choices[$i]\n";
            }
            print "\t> ";
            my $index = <STDIN>;
            $index = "" unless defined $index; # End of input counts as "skip"
            chomp $index;
            if ($index ne "") {
                # Only a plain non-negative integer selects a file. Passing
                # the raw answer through int() would turn any other text
                # (e.g. "n") into 0 and delete files the user never chose.
                my ($picked) = $index =~ /\A\s*(\d+)\s*\z/;
                if (!defined $picked) {
                    print "\n!!\t Not a valid index, ignoring\n";
                } elsif ($picked < @best_choices) {
                    keep($best_choices[$picked], @same_md5_files);
                } else {
                    print "\n!!\t Index outside of range, ignoring\n";
                }
            }
        }
    }
}

print STDERR "Done!\n";