#!/usr/bin/perl # Filename: find_dup # Author: David Ljung Madison # See License: http://MarginalHacks.com/License # Description: Find duplicates amongst a list of files # Version: 2.00 use strict; ################################################## # Setup the variables ################################################## my $PROGNAME = $0; $PROGNAME =~ s|.*/||; my $SUM = "sum"; # Checksum program ################################################## # Usage ################################################## sub usage { my $msg; foreach $msg (@_) { print "ERROR: $msg\n"; } print "\n"; print "Usage:\t$PROGNAME [-d] \n"; print "\tList all duplicates in a set of files\n"; print "\n"; print "\t-f\tDon't list the first duplicate on each line\n"; print "\t-rm\tRemove all duplicate files\n"; print "\t-quick\tJust use checksums, don't run 'diff'\n"; print "\t-d\tSet debug mode\n"; print "\n"; exit -1; } sub parse_args { my @files; my %opt; $opt{first} = 1; while ($#ARGV>=0) { my $arg=shift(@ARGV); if ($arg =~ /^-h$/) { usage(); } if ($arg =~ /^-d$/) { $opt{debug}=1; next; } if ($arg =~ /^-f$/) { $opt{first}=0; next; } if ($arg =~ /^-rm$/) { $opt{rm}=1; next; } if ($arg =~ /^-quick$/) { $opt{quick}=1; next; } if ($arg =~ /^-/) { usage("Unknown option: $arg"); } if (-d $arg) { my @got = `find $arg -type f`; chomp(@got); push(@files,@got); } else { push(@files,$arg); } } usage("Need at least two files") unless ($#files>0); (\%opt,@files); } sub diff { my ($opt,$a,$b) = @_; return 0 if $opt->{quick}; system("diff \Q$a\E \Q$b\E"); $? ? 1 : 0; } # natural sort by Dominus of perlmonks.com sub naturally { my @a = split /(\d+)/, $a; my @b = split /(\d+)/, $b; my $M = @a > @b ? @a : @b; my $res = 0; for (my $i = 0; $i < $M; $i++) { return -1 if ! defined $a[$i]; return 1 if ! defined $b[$i]; if ($a[$i] =~ /\d/) { $res = $a[$i] <=> $b[$i]; } else { $res = $a[$i] cmp $b[$i]; } last if $res; } $res; } ################################################## # Main code ################################################## sub sum($) { my ($f) = @_; my $sum = `$SUM \Q$f\E`; chomp($sum); $sum; } sub first($) { my ($f) = @_; my $data; open(F,"<$f") || return print STDERR "ERROR: Can't read $f\n"; read(F,$data,256); close F; $data; } sub main { my ($opt,@files) = parse_args(); # Get file sizes ######################### my %sizes; #print "Getting filesizes\n" if $opt->{debug}; print "$#files files\n" if $opt->{debug}; foreach my $f ( @files ) { my $sz = -s $f; push(@{$sizes{$sz}}, $f); } # For any matching filesizes, compare first chunk ######################### #print(($#files-(scalar keys %sizes))." still to match\n") if $opt->{debug}; foreach my $sz ( keys %sizes ) { next unless ($#{$sizes{$sz}} > 0); # First chunk my %first; #print "Getting first chunks\n" if $opt->{debug}; foreach my $f ( @{$sizes{$sz}} ) { my $chunk = first $f; #print "[$sz] $f: $chunk\n" if $opt->{debug}; push(@{$first{$chunk}}, $f); } # For any matching first chunks, compare sums ######################### foreach my $chunk ( keys %first ) { next unless ($#{$first{$chunk}} > 0); my %sums; foreach my $f ( @{$first{$chunk}} ) { my $sum = sum $f; #print "[$sz,$chunk] $f: $sum\n" if ($opt->{debug}); push(@{$sums{$sum}}, $f); } # Find matching sums my @match; foreach my $s ( keys %sums ) { next unless ($#{$sums{$s}} > 0); push(@match, $sums{$s}); } # Double-check matches with diff # It is unlikely, though possible, that they have the same sum but are different foreach my $m ( @match ) { my @check = sort naturally @$m; while (@check) { my $first = shift(@check); my (@same,@diff); foreach (@check) { system("diff \Q$first\E \Q$_\E"); diff($opt,$first,$_) ? push(@diff,$_) : push(@same,$_); } print "Same sum but different! [$first @same] vs [@diff]\n" if ($opt->{debug} && @diff); @check = @diff; # Go through any remainders that didn't match next unless @same; print $first," " if $opt->{first}; print join(" ",@same), "\n"; if ($opt->{rm}) { print STDERR "[$PROGNAME] ERROR: Couldn't remove all files [@same]\n" unless (unlink(@same) == $#same+1); } } } } } } main();