aboutsummaryrefslogtreecommitdiff
path: root/dups
blob: bbf357e4219f11a667e3e2820de30f2102854528 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env perl

use strict;
use warnings;

use Digest::MD5 ();
use File::Find qw<find>;
use IPC::System::Simple qw<capture>;

# Require at least one directory argument.  (The original used the string
# operator "ge" on a count, which only worked because integers stringify;
# boolean/numeric test is the correct idiom.)
die "Usage: $0 <dir1> [<dir2> ...]\n" unless @ARGV;

my %sizes; # byte size -> arrayref of paths having that size
my %md5s;  # md5 hex digest -> arrayref of paths having that digest

# File::Find callback: bucket every non-directory entry under the search
# roots into %sizes, keyed by its size in bytes.  $_ is the basename
# (cwd is the containing dir); $File::Find::name is the full path.
sub fill_sizes {
    return if -d $_; # Skip directories

    # Use Perl's built-in -s instead of spawning GNU stat(1) per file:
    # same byte count, no subprocess, and not GNU-specific.
    my $size = -s $_;
    return unless defined $size; # unreadable or vanished (e.g. broken symlink)

    # push autovivifies the arrayref; no "||= []" needed.
    push @{ $sizes{$size} }, $File::Find::name;
}

# Walk all the directories named on the command line, bucketing files
# by size via the fill_sizes callback.  find() takes the whole list of
# roots in one call.
print STDERR "Collecting file sizes...\n";
find(\&fill_sizes, @ARGV);

print STDERR "Computing md5s of files with same size...\n";
# @progress = (current size-bucket, current file within bucket, total buckets)
my @progress = (0, 0, scalar(keys(%sizes)));
foreach my $size (keys(%sizes)) {
    $progress[0]++;
    $progress[1] = 0;
    my @same_size_files = @{$sizes{$size}};
    # Numeric comparison: the original string "gt" only worked because
    # element counts stringify.
    next unless @same_size_files > 1; # Discard unique sizes

    foreach my $file (@same_size_files) {
        $progress[1]++;
        print STDERR "$progress[0].$progress[1] / $progress[2]\r";
        STDERR->flush();

        # Hash in-process with core Digest::MD5 instead of spawning an
        # md5sum(1) process for every candidate file.
        open my $fh, '<:raw', $file or die "Cannot open $file: $!\n";
        my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
        close $fh;

        push @{ $md5s{$md5} }, $file; # push autovivifies the bucket
    }
}

# Report every group of two or more files sharing an MD5 digest.
foreach my $md5 (keys(%md5s)) {
    my @same_md5_files = @{$md5s{$md5}};
    # Numeric comparison — the original string "gt" only worked by accident.
    next unless @same_md5_files > 1; # Discard unique hashes

    print "Found duplicate files:\n";
    foreach my $file (@same_md5_files) {
        print "\t$file\n";
    }
    # print, not printf: a bare string must not be used as a format.
    print "\n";
}

print STDERR "Done!\n";