#!/usr/bin/perl
# Filename:	find_dup
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License
# Description:	Find duplicates amongst a list of files
# Version:	2.00
use strict;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

my $SUM = "sum";	# Checksum program

##################################################
# Usage
##################################################
sub usage {
  my $msg;
  foreach $msg (@_) { print "ERROR:  $msg\n"; }
  print "\n";
  print "Usage:\t$PROGNAME [-d] <files|dirs..>\n";
  print "\tList all duplicates in a set of files\n";
  print "\n";
  print "\t-f\tDon't list the first duplicate on each line\n";
  print "\t-rm\tRemove all duplicate files\n";
  print "\t-quick\tJust use checksums, don't run 'diff'\n";
  print "\t-d\tSet debug mode\n";
  print "\n";
  exit -1;
}

sub parse_args {
  my @files;
  my %opt;
  $opt{first} = 1;

  while ($#ARGV>=0) {
    my $arg=shift(@ARGV);
    if ($arg =~ /^-h$/) { usage(); }
    if ($arg =~ /^-d$/) { $opt{debug}=1; next; }
    if ($arg =~ /^-f$/) { $opt{first}=0; next; }
    if ($arg =~ /^-rm$/) { $opt{rm}=1; next; }
    if ($arg =~ /^-quick$/) { $opt{quick}=1; next; }
    if ($arg =~ /^-/) { usage("Unknown option: $arg"); }
    if (-d $arg) {
      my @got = `find $arg -type f`;
      chomp(@got);
      push(@files,@got);
    } else {
      push(@files,$arg);
    }
  }
  usage("Need at least two files") unless ($#files>0);

  (\%opt,@files);
}

sub diff {
  my ($opt,$a,$b) = @_;

  return 0 if $opt->{quick};
  system("diff -q \Q$a\E \Q$b\E > /dev/null");
  $? ? 1 : 0;
}

# natural sort by Dominus of perlmonks.com
sub naturally {
  my @a = split /(\d+)/, $a;
  my @b = split /(\d+)/, $b;
  my $M = @a > @b ? @a : @b;
  my $res = 0;
  for (my $i = 0; $i < $M; $i++) {
    return -1 if ! defined $a[$i];
    return 1 if  ! defined $b[$i];
    if ($a[$i] =~ /\d/) {
      $res = $a[$i] <=> $b[$i];
    } else {
      $res = $a[$i] cmp $b[$i];
    }
    last if $res;
  }
  $res;
}

##################################################
# Main code
##################################################
sub sum($) {
	my ($f) = @_;
	my $sum = `$SUM \Q$f\E`;
	chomp($sum);
	$sum;
}

sub first($) {
	my ($f) = @_;
	my $data;
	open(F,"<$f") || return print STDERR "ERROR: Can't read $f\n";
	read(F,$data,256);
	close F;
	$data;
}

sub main {
  my ($opt,@files) = parse_args();

	# Get file sizes
	#########################
	my %sizes;
	#print "Getting filesizes\n" if $opt->{debug};
	print "$#files files\n" if $opt->{debug};
	foreach my $f ( @files ) {
		my $sz = -s $f;
		push(@{$sizes{$sz}}, $f);
	}

	# For any matching filesizes, compare first chunk
	#########################
	#print(($#files-(scalar keys %sizes))." still to match\n") if $opt->{debug};
	foreach my $sz ( keys %sizes ) {
    next unless ($#{$sizes{$sz}} > 0);

		# First chunk
		my %first;
		#print "Getting first chunks\n" if $opt->{debug};
  	foreach my $f ( @{$sizes{$sz}} ) {
    	my $chunk = first $f;
			#print "[$sz] $f: $chunk\n" if $opt->{debug};
    	push(@{$first{$chunk}}, $f);
  	}

		# For any matching first chunks, compare sums
		#########################
		foreach my $chunk ( keys %first ) {
   	 next unless ($#{$first{$chunk}} > 0);

  		my %sums;
  		foreach my $f ( @{$first{$chunk}} ) {
    		my $sum = sum $f;
				#print "[$sz,$chunk] $f: $sum\n" if ($opt->{debug});
    		push(@{$sums{$sum}}, $f);
  		}

  		# Find matching sums
  		my @match;
  		foreach my $s ( keys %sums ) {
    		next unless ($#{$sums{$s}} > 0);
    		push(@match, $sums{$s});
  		}

  		# Double-check matches with diff
  		# It is unlikely, though possible, that they have the same sum but are different
  		foreach my $m ( @match ) {
    		my @check = sort naturally @$m;
    		while (@check) {
      		my $first = shift(@check);
      		my (@same,@diff);
      		foreach (@check) {
        		##system("diff \Q$first\E \Q$_\E");
        		diff($opt,$first,$_) ? push(@diff,$_) : push(@same,$_);
      		}
      		print "Same sum but different! [$first @same] vs [@diff]\n"
        		if ($opt->{debug} && @diff);
      		@check = @diff;	# Go through any remainders that didn't match

      		next unless @same;
      		print $first," " if $opt->{first};
      		print join(" ",@same), "\n";
      		if ($opt->{rm}) {
        		print STDERR "[$PROGNAME] ERROR: Couldn't remove all files [@same]\n"
          		unless (unlink(@same) == $#same+1);
      		}
    		}
  		}
		}
	}
}
main();
