#!/usr/bin/perl
# Filename:	findex
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License
# Description:	Fetches all contents of an index
use strict;
use Cwd 'abs_path';

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

# Pick one

# GET can specify referer
my $GET	=	"GET";
# lynx does auth
# lynx doesn't need to read file completely into memory
my $LYNX =	"lynx -source";

# Pick one
my $FETCH = $LYNX;
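# For reference, the fetch command that ends up being run looks roughly like
#   lynx -source 'http://bob.com/'
# or, when -from is given,
#   GET -H 'user-agent: Mozilla/5.0' -H 'Referer: http://bob.com/' 'http://...'
# Both lynx and lwp-request's GET are assumed to be installed and on the PATH.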

# Die cleanly on the usual termination signals
sub interrupt { die("[$PROGNAME] Interrupted\n"); }
$SIG{INT} = \&interrupt; $SIG{TERM} = \&interrupt;
$SIG{HUP} = \&interrupt; $SIG{QUIT} = \&interrupt;

my $DEFAULT_SCHEMA = "http";

##################################################
# Usage
##################################################
sub usage {
  foreach my $msg (@_) { print STDERR "ERROR:  $msg\n"; }
  print STDERR <<USAGE;

Usage:\t$PROGNAME [options] <url> [<range>]
  Fetches an index of files
  -only <re>    Only get links that match this regexp
  -exclude <re> Skip links that match this regexp
  -from <url>   Set referer
  -d            Set debug mode
  -list_hrefs   List all hrefs at a location (instead of downloading)
  -dir <dir>    Save files in specified directory
  -no_replace   Don't fetch files that already exist
  -count        Convert the last number in the URL to a %d and count up
  --count       Convert the last number in the URL to a %d and count down AND up
  -miss <num>   Can miss this many URLs before giving up
  -miss_sum <num>  'sum' checksum of a file/image to be considered a miss
  -retry <num>  Retry failed fetches up to <num> times
  -auth <user:pass>   Specify a user/pass for a website
  -external     Follow links on external domains
  -             Read URLs from stdin, one per line

  Example:  findex http://GetDave.com
            Fetches all the local links on GetDave.com

  Example:  findex -only '(\.(gif|jpe?g)\$)' http://Yahoo.com/
            Fetches only gifs and jpegs (as links, not <img> tags)

  Example:  findex http://offthemark.com/rotate/0201%0.2d.gif 9-11
            Fetches a few Off The Mark cartoons (numbers 9 through 11)

  Example:  findex http://somewhere.com/pics/%0.4d.gif 0-
            Fetch some images until no more are found

  Example:  findex -count http://somewhere.com/pics/03.gif
            Same as http://somewhere.com/pics/%0.2d.gif 3-

  You can use two -count args to loop through two numbers in the URL

USAGE
  exit -1;
}

my $STDIN = 0;
sub next_arg {
  if ($STDIN) {
    # Skip blank lines
    while (<>) { last if /\S/; }
    if (defined $_) { chomp; return $_ if $_ ne ""; }
    $STDIN = 0;
  }
  shift(@ARGV);
}
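# With "-" on the command line, URLs are read from stdin instead, e.g.
# (assuming urls.txt holds one URL per line):
#   findex -only '\.jpe?g$' - < urls.txt
# Options like -only/-exclude/-list_hrefs then stay in effect for every URL read.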

sub handleArg {
  my ($opt,$arg,$down) = @_;

  my (%p,$did);

  ($p{schema},$p{domain},$p{page}) = url_split($arg);
  $p{url} = "$p{schema}://$p{domain}/$p{page}";

  print "$p{domain}\n" unless $opt->{list_hrefs} || $down;


  if ($opt->{count}) {
    my $url = $p{url};
    $url =~ s/%/%%/g;
    usage("URL didn't have any numbers but -count specified")
      unless $url =~ /^(.*\D)(\d+)(\D*)$/;
    my ($pre,$num,$post) = ($1,$2,$3);
    $p{count} = $num+0;
    $p{count}-- if $down;
    $p{count} = $down ? "-$p{count}" : "$p{count}-";
    my $format = "%0.".(length($num))."d";
    $p{url} = $pre.$format.$post;
  } else {
    $p{count} = shift(@ARGV) if ($arg =~ /%[\d\.]*d/);
  }
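  # At this point, for -count, a URL like ".../pics/03.gif" has become the
  # sprintf format ".../pics/%0.2d.gif" with an open-ended count of "3-"
  # (or "-2" for the extra counting-down pass that --count adds).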

  my $missed = 0;
  while (1) {
    my $got = findex($opt,\%p);
    $did++;
    last unless $opt->{count}>1;
    last unless $got || ++$missed<$opt->{can_miss};
    $missed = 0 if $got;
    usage("URL didn't have two numbers, but -count specified twice")
      unless $p{url} =~ /^(.*\D)(\d+)(\D+%0\.\d+\D+)$/;
    my ($pre,$num,$post) = ($1,$2,$3);
    my $format = "%0.".(length($num))."d";
    $num = sprintf($format,$num+1);
    $p{url} = $pre.$num.$post;
  }
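  # With two -count args, each findex() pass counts up the last number in the
  # URL until it misses too often; the loop above then bumps the number before
  # it (e.g. ".../2024/%0.2d.jpg" -> ".../2025/%0.2d.jpg") and tries again.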

  $opt->{only}=undef unless $STDIN;
  $opt->{exclude}=undef unless $STDIN;
  $opt->{list_hrefs}=0 unless $STDIN;

  $did;
}

sub parse_args {
  my $opt = {};
  $opt->{can_miss} = 10;
  #$opt->{home} = abs_path();

  my $did = 0;

  while (my $arg=next_arg()) {
    if ($arg eq "-h") { usage(); }
    if ($arg eq "-d") { $MAIN::DEBUG=1; next; }
    if ($arg eq "-dir") { $opt->{local}=shift(@ARGV); next; }
    if ($arg eq "-from") { $opt->{from}=shift(@ARGV); next; }
    if ($arg eq "-only") { push(@{$opt->{only}},shift(@ARGV)); next; }
    if ($arg eq "-exclude") { push(@{$opt->{exclude}},shift(@ARGV)); next; }
    if ($arg eq "-retry") { $opt->{retry} = shift(@ARGV); next; }
    if ($arg eq "-count") { $opt->{count}++; next; }
    if ($arg eq "--count") { $opt->{count}++; $opt->{upDown}++; next; }
    if ($arg eq "-miss") { $opt->{can_miss} = shift(@ARGV); next; }
    if ($arg eq "-miss_sum") { $opt->{miss_sum} = shift(@ARGV); next; }
    if ($arg eq "-auth") { $opt->{auth} = shift(@ARGV); next; }
    if ($arg eq "-no_replace") { $opt->{no_replace} = 1; next; }
    if ($arg eq "-external") { $opt->{external} = 1; next; }
    if ($arg eq "-list_hrefs") { $opt->{list_hrefs}=1; next; }
    if ($arg eq "-") { $STDIN = 1; next; }
    if ($arg =~ /^-/) { usage("Unknown option: $arg"); }

    usage("Can't do more than two -count args for a URL (or \"-\")")
      if $opt->{count}>2;

    usage("Can't use -auth with -from, unfortunately") if $opt->{from} && $opt->{auth};
    $FETCH = "$GET -H 'user-agent: Mozilla/5.0' -H 'Referer: $opt->{from}'" if $opt->{from};
    $FETCH = $LYNX if $opt->{auth};
    
    # Do it.
    $did += handleArg($opt,$arg);
    $did += handleArg($opt,$arg,1) if $opt->{upDown};

    $opt->{count}=0 unless $STDIN;
  }
  usage("No paths defined") unless $did;
}

sub debug {
  return unless $MAIN::DEBUG;
  foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; }
}

##################################################
# URL handling
##################################################
sub url_split {
  my ($url) = @_;

  # Break it down!  <ba ba bum ba bum...>
  my $schema = ($url =~ s{^(ftp|https?)://}{}) ? $1 : $DEFAULT_SCHEMA;
  return ($schema,$1,$2) if ($url =~ m|^([^/]+)/(.*)$|);
  return ($schema,$url,"");
}
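# For example:
#   url_split("http://Bob.com/pics/index.html") -> ("http", "Bob.com", "pics/index.html")
#   url_split("bob.com")                        -> ("http", "bob.com", "")
# (the schema defaults to $DEFAULT_SCHEMA when none is given; the domain's
#  case is left alone here and lowercased later by clean_url)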

sub clean_url {
  my ($schema,$domain,$path) = @_;

  # Domain cleaning
  $domain=lc($domain);

  # Path cleaning

  # Fixup %codes in URL
  $path =~  s/%([0-9a-fA-F]{2})/chr(hex($1))/eg;

#  # Default page?
#  $path.=$DEFAULT_PAGE if ($path =~ m|/$| || $path eq "");

  # Remove any inter-page indexing (page.html#chapter1)
  #$path =~ s|#[^/]*$||;
  #Interpage indexing can include "/" - at least with Apache
  $path =~ s|#.*$||;

  # Remove all // and /./
  $path="/$path"; # Temporarily make matches easier
  while ($path =~ s|/\.?/|/|) {}
  $path =~ s|/\.$||;

  # Remove all */dir/../*
  while ($path =~ s|/[^/]+/\.\./|/|) {}
  # SECURITY:  Make sure we didn't have too many /../
  # Example:   /bob/../../../../../../etc/passwd
  while ($path =~ s|^/\.\./|/|) {}  # /../foo   -> /foo
  $path = "/" if $path eq "/..";    # /..       -> /
  $path =~ s|[^/]+/\.\.$||;         # */some/.. -> *

  # SECURITY:  Remove the temporary leading / added above so the
  # resulting path can never be absolute
  $path =~ s|^/+||;

  ($schema,$domain,$path);
}

sub resolve_url {
  my ($schema,$domain,$page,$new) = @_;

  # Absolute URL:  (ftp/http/https only)
  # http://bob.com/here.html -> bob.com/here.html
  return (clean_url($1,$2,$3)) if ($new =~ m{^(ftp|https?)://([^/]+)(.*)});

  # Can't handle other URL schemes (ftp, gopher, mailto, news, telnet)
  # (Should we do ftp someday?  Where would we save the files?)
  return undef if ($new =~ m|^[^/]+:|);

  # Absolute path, same domain:
  # bob.com/some/page1.html & /new/page2.html -> bob.com/new/page2.html
  return clean_url($schema,$domain,$new) if ($new =~ m|^/|);

  # Relative path but already at root
  # bob.com/here.html & some/page.html -> bob.com/some/page.html
  return clean_url($schema,$domain,"$new") unless ($page =~ m|/|);

  # Relative path
  # bob.com/some/page1.html & ../page2.html -> bob.com/page2.html
  # First remove the page from the end
  my $dir=$page;  $dir =~ s|/[^/]+$||;
  return clean_url($schema,$domain,"$dir/$new");
}

##################################################
# Code
##################################################
sub read_index {
  my ($opt,$p) = @_;

  debug("Reading index [$p->{url}]");

  open(I,"$FETCH \Q$p->{url}\E 2>/dev/null |") || die("Couldn't get [$p->{url}]\n");

  my %already_hrefs;

  my $line = "";
  while (<I>) {
    chomp;
    $line = $line ? $line.' '.$_ : $_;
    # Find <a href=> and <enclosure url=> (for podcasts)
    while ($line =~ /<a\s+href=([^>]+)>(.*?)<\/a(.*)/i || $line =~ /<enclosure\s+url=([^>]+)>(.*?)(.*)/i) {
      my ($f,$n);
      ($f,$n,$line) = ($1,$2,$3);
      $f = $1 if $f =~ /^"([^"]+)"/ || $f =~ /^'([^']+)'/;
      # Strip markup (other than <img> tags) from the link text; only
      # needed so the debugging output below shows a clean $n
      $n =~ s/<(?!img)[^>]+>//g;

      #debug("LINK: $n -> $f");

      # Ignore parent directory links
      next if $n =~ /parent directory/i;
      next if $f eq "..";

      # Ignore the sortby ?links in indexes
      next if $f =~ /(^|\/)\?(C|N|sortby)=/;
      # Also: ?M=A, ...
      next if $f =~ /\?[MSD]=A$/;
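      # (these are the column-sorting links that Apache's fancy directory
      #  indexes add, e.g. "?C=N;O=D" or the older "?N=D" / "?M=A" style)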

      # Resolve URL
      my ($schema,$domain,$page)
        = resolve_url($p->{schema},$p->{domain},$p->{page},$f);
      # Skip URL schemes we can't handle (mailto:, news:, javascript:, ...)
      next unless defined $schema;

      debug("LINK: $n -> $schema $domain $page");

      # Only fetch files from same schema/domain
      # (otherwise we might make a load of domain directories - maybe
      #  this should be an option?)
      next unless $schema eq $p->{schema};
      # But allow www.bob.com to match bob.com..
      my $d = $domain; $d =~ s/^www\.//;
      my $pd = $p->{domain}; $pd =~ s/^www\.//;

      if ($d ne $pd) {
        next unless $opt->{external};
        $opt->{found_external}++;
        $page = "${schema}://$domain/$page";
      }

			#debug("Same schema/domain");

      next if $opt->{only} && grep($page !~ /$_/, @{$opt->{only}});
			next if $opt->{exclude} && grep($page =~ /$_/, @{$opt->{exclude}});

			$page='/' unless $page;

      next if $already_hrefs{$page}++;

      if ($opt->{list_hrefs}) {
        print "$page\n";
        next;
      }

      push(@{$p->{list}},$page);
    }

    # Allow continuation of lines
    next if ($line =~ /<a($|\s)/i);
    # Otherwise get rid of the rest of the line
    $line="";

  }
  close I;
}

sub expand_list {
  my ($opt,$p) = @_;

  # List comes from an index page
  return read_index($opt,$p) unless ($p->{count});

  my (undef,undef,$file) = url_split($p->{url});

  # Open-ended list, don't expand now
  return ($p->{curr_file},$p->{curr_num}) = ($file,$1)
    if ($p->{count} =~ /^(\d+)-$/);
  return ($p->{curr_file},$p->{curr_num}) = ($file,$1)
    if ($p->{count} =~ /^-(\d+)$/);

  debug("Expanding $p->{url} [$p->{count}]");

  my ($lo,$hi) = (0,$p->{count});
  ($lo,$hi) = ($1,$2) if ($p->{count} =~ /^(\d+)-(\d+)$/);
  for ($lo..$hi) {
    push(@{$p->{list}},sprintf($file,$_));
  }
}
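# For example, a url of ".../pics/%0.4d.gif" with a count of "2-4" pushes
# pics/0002.gif, pics/0003.gif and pics/0004.gif onto the list, while an
# open-ended count like "2-" is left for next_file() to expand lazily.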

sub next_file {
  my ($p) = @_;
  return shift(@{$p->{list}}) unless defined $p->{curr_num};
  my $ret = sprintf($p->{curr_file},$p->{curr_num});
  $p->{curr_num} += ($p->{count} =~ /^-/) ? -1 : 1;
  $ret;
}

sub not_found {
  my ($opt,$file) = @_;
  return 1 if -z $file;
  $file =~ s#^(\s)#./$1#;
  open(FILE,'<',$file) || return 1;
  my $notfound = 0;
  my $media = ($file =~ /\.(jpg|jpeg|gif|mpg|mov)$/) ? 1 : 0;
  while(<FILE>) {
    last if $.>30;	# Only check first thirty lines or so
    #for binaries?#last if $.>2 && length($_)>200;
    $notfound=1 if /not[ ]?found/i;
    $notfound=1 if /page cannot be found/i;
    $notfound=1 if /404 not found/i;
    $notfound=1 if /all 404 errors/i;
    $notfound=1 if /500 (can.t connect|read timeout)/i;
    $notfound=1 if /^\s*<(html>|head>|meta)/i && $.<5 && $media;	# Heuristic
    last if $notfound;
  }
  close FILE;

  return $notfound if $notfound;

  if ($opt->{miss_sum}) {
    if (open(SUM,"sum \Q$file\E|")) {
      my $sum = <SUM>;
      close SUM;
      $notfound=1 if defined $sum && $sum =~ /^\s*\Q$opt->{miss_sum}\E(\s+|$)/;
    } else {
      print STDERR "[$PROGNAME] WARNING: Couldn't run 'sum':\n$!\n"
        unless $MAIN::WARNSUM++;
    }
  }

  $notfound;
}

sub mkdirR {
  my ($dir) = @_;
  my $now = "";
  foreach ( split('/',$dir) ) {
    $now .= "$_/";
    next if -d $now;
    mkdir($now,0755) || die("Couldn't mkdir [$now]\n");
  }
}
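# For example, mkdirR("bob.com/pics/2024") creates bob.com/, bob.com/pics/
# and bob.com/pics/2024/ as needed (like `mkdir -p`).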

sub clean_dirs {
  my ($dir) = @_;
  while ($dir) {
    # Should fail if not empty..
    last unless rmdir($dir);
    $dir =~ s|/[^/]+$||;
  }
}

sub get {
  my ($opt,$url,$file) = @_;

	debug("Get $url -> $file");
	debug("[already have]") if -f $file && !not_found($opt,$file);

  return 1 if $opt->{no_replace} && -f $file && !not_found($opt,$file);
  return print STDERR "    Ignoring directory: $file\n"
    if -d $file;

  my $got = 0;
  do {
    my $cmd = $FETCH;
    $cmd .= " -accept_all_cookies -auth $opt->{auth}" if $opt->{auth};
    system("$cmd \Q$url\E > \Q$file\E 2> /dev/null");
    my ($exit,$signal,$dump) = ($? >> 8, $? & 127, $? & 128);
    print STDERR "[$PROGNAME] WARNING: Error from $FETCH:\n$!\n" if $exit;
    print STDERR "[$PROGNAME] WARNING: Core dump for:\n$!\n" if $dump;
    interrupt() if $signal;

    $got = not_found($opt,$file) ? 0 : 1;
  } until (--$opt->{retry}<0) || $got;
  $got;
}
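# For example, with -auth the command run ends up roughly as:
#   lynx -source -accept_all_cookies -auth user:pass 'http://bob.com/x.gif' > 'bob.com/x.gif'
# (the URL and filename are actually shell-escaped with \Q...\E rather than quoted)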

sub findex {
  my ($opt,$path) = @_;

  my $got = 0;

  expand_list($opt,$path);

  return if $opt->{list_hrefs};

  my $missed = 0;
  my $base = "$path->{schema}://$path->{domain}";
  while (my $page = next_file($path)) {
    # Work out the URL to fetch and the local file to save it to
    my ($url,$get) = ($page =~ m|://(.+)|)
      ? ($page,$1) : ("$base/$page","$path->{domain}/$page");
    $get =~ s|.*/|$opt->{local}/| if $opt->{local};

    # Make sure we have the subdirectory
    mkdirR($1) if !$opt->{local} && $get =~ m|^(.+)/|;

    # Get the file
    my $found = get($opt,$url,$get);

    # Delete not found files.
    debug("Not found - deleting [$get]") if !$found;
    unlink $get if !$found;

    # Check if we're finished with an open-ended count
    if (defined $path->{curr_num} && !$found) {
      last if ++$missed > $opt->{can_miss};
      next;
    }
    $missed=0;

    $got++;
    print "  $page";
    print " [not found]" unless $found;
    print "\n";
  }

  #$opt->{local} || chdir $opt->{home} || die("Couldn't go back home! [$opt->{home}]\n");

  clean_dirs($path->{domain}) unless $opt->{local};

  print "  Not found\n" unless $got;
  print "  [Some external links found, consider -external]\n"
    if !$got && $opt->{found_external};

  $got;
}

parse_args();
