#!/usr/bin/perl # Filename: findex # Author: David Ljung Madison # See License: http://MarginalHacks.com/License # Description: Fetches all contents of an index use strict; use Cwd 'abs_path'; ################################################## # Setup the variables ################################################## my $PROGNAME = $0; $PROGNAME =~ s|.*/||; # Pick one # GET can specify referer my $GET = "GET"; # lynx does auth # lynx doesn't need to read file completely into memory my $LYNX = "lynx -source"; # Pick one my $FETCH = $LYNX; sub interrupt { die("[$PROGNAME] Interrupted\n"); } $SIG{INT} = \&interrupt; $SIG{TERM} = \&interrupt; $SIG{HUP} = \&interrupt; $SIG{QUIT} = \&interrupt; $SIG{EXIT} = \&interrupt; $SIG{__DIE__} = \&interrupt; my $DEFAULT_SCHEMA = "http"; ################################################## # Usage ################################################## sub usage { foreach my $msg (@_) { print STDERR "ERROR: $msg\n"; } print STDERR < Fetches an index of files -only Only get links that match this regexp -from Set referer -d Set debug mode -list_hrefs List all hrefs at a location (instead of downloading) -dir Save files in specified directory -no_replace Don't fetch files that already exist -count Convert the last number to a %d and count up --count Convert the last number to a %d and count down AND up -miss Can miss this many URLs before giving up. -miss_sum Filesum of a file/image to be considered a miss -retry Retry individual fetches if failed. -auth Specify a user/pass for a website -external Follow links on external domains Example: findex http://GetDave.com Fetches all the local links on GetDave.com Example: findex -only '(\.(gif|jpe?g)\$)' http://Yahoo.com/ Fetches only gifs and jpegs (as links, not tags) Example: findex http://offthemark.com/rotate/0201%0.2d.gif 9-11 Fetches a few off the mark cartoons Example: findex http://somewhere.com/pics/%0.4d.gif 0- Fetch some images until no more are found Example: findex -count http://somewhere.com/pics/03.gif Same as http://somewhere.com/pics/%0.2d.gif 3- You can use two -count args to loop through two numbers in the URL USAGE exit -1; } my $STDIN = 0; sub next_arg { if ($STDIN) { # Skip blank lines while (<>) { last if /\S/; } chomp; return $_ if $_; $STDIN = 0; } shift(@ARGV); } sub handleArg { my ($opt,$arg,$down) = @_; my (%p,$did); ($p{schema},$p{domain},$p{page}) = url_split($arg); $p{url} = "$p{schema}://$p{domain}/$p{page}"; print "$p{domain}\n" unless $opt->{list_hrefs} || $down; if ($opt->{count}) { my $url = $p{url}; $url =~ s/%/%%/g; usage("URL didn't have any numbers but -count specified") unless $url =~ /^(.*\D)(\d+)(\D*)$/; my ($pre,$num,$post) = ($1,$2,$3); $p{count} = $num+0; $p{count}-- if $down; $p{count} = $down ? "-$p{count}" : "$p{count}-"; my $format = "%0.".(length($num))."d"; $p{url} = $pre.$format.$post; } else { $p{count} = shift(@ARGV) if ($arg =~ /%[\d\.]*d/); } my $missed = 0; while (1) { my $got = findex($opt,\%p); $did++; last unless $opt->{count}>1; last unless $got || ++$missed<$opt->{can_miss}; $missed = 0 if $got; usage("URL didn't have two numbers, but -count specified twice") unless $p{url} =~ /^(.*\D)(\d+)(\D+%0\.\d+\D+)$/; my ($pre,$num,$post) = ($1,$2,$3); my $format = "%0.".(length($num))."d"; $num = sprintf($format,$num+1); $p{url} = $pre.$num.$post; } $opt->{only}=undef unless $STDIN; $opt->{exclude}=undef unless $STDIN; $opt->{list_hrefs}=0 unless $STDIN; $did; } sub parse_args { my $opt = {}; $opt->{can_miss} = 10; #$opt->{home} = abs_path(); my $did = 0; while (my $arg=next_arg()) { if ($arg eq "-h") { usage(); } if ($arg eq "-d") { $MAIN::DEBUG=1; next; } if ($arg eq "-dir") { $opt->{local}=shift(@ARGV); next; } if ($arg eq "-from") { $opt->{from}=shift(@ARGV); next; } if ($arg eq "-only") { push(@{$opt->{only}},shift(@ARGV)); next; } if ($arg eq "-exclude") { push(@{$opt->{exclude}},shift(@ARGV)); next; } if ($arg eq "-retry") { $opt->{retry} = shift(@ARGV); next; } if ($arg eq "-count") { $opt->{count}++; next; } if ($arg eq "--count") { $opt->{count}++; $opt->{upDown}++; next; } if ($arg eq "-miss") { $opt->{can_miss} = shift(@ARGV); next; } if ($arg eq "-miss_sum") { $opt->{miss_sum} = shift(@ARGV); next; } if ($arg eq "-auth") { $opt->{auth} = shift(@ARGV); next; } if ($arg eq "-no_replace") { $opt->{no_replace} = 1; next; } if ($arg eq "-external") { $opt->{external} = 1; next; } if ($arg eq "-list_hrefs") { $opt->{list_hrefs}=1; next; } if ($arg eq "-") { $STDIN = 1; next; } if ($arg =~ /^-/) { usage("Unknown option: $arg"); } usage("Can't do more than two -count args for a URL (or \"-\")") if $opt->{count}>2; usage("Can't use -auth with -from, unfortunately") if $opt->{from} && $opt->{auth}; $FETCH = "$GET -H 'user-agent: Mozilla/5.0' -H 'Referer: $opt->{from}'" if $opt->{from}; $FETCH = $LYNX if $opt->{auth}; # Do it. $did += handleArg($opt,$arg); $did += handleArg($opt,$arg,1) if $opt->{upDown}; $opt->{count}=0 unless $STDIN; } usage("No paths defined") unless $did; } sub debug { return unless $MAIN::DEBUG; foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; } } ################################################## # URL handling ################################################## sub url_split { my ($url) = @_; # Break it down! $url =~ s{^(ftp|https?)://}{}; my $schema = $1 || $DEFAULT_SCHEMA; return ($schema,$1,$2) if ($url =~ m|^([^/]+)/(.*)$|); return ($schema,$url,""); } sub clean_url { my ($schema,$domain,$path) = @_; # Domain cleaning $domain=lc($domain); # Path cleaning # Fixup %codes in URL $path =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/eg; # # Default page? # $path.=$DEFAULT_PAGE if ($path =~ m|/$| || $path eq ""); # Remove any inter-page indexing (page.html#chapter1) #$path =~ s|#[^/]*$||; #Interpage indexing can include "/" - at least with Apache $path =~ s|#.*$||; # Remove all // and /./ $path="/$path"; # Temporarily make matches easier while ($path =~ s|/\.?/|/|) {} $path =~ s|/\.$||; # Remove all */dir/../* while ($path =~ s|/[^/]+/\.\./|/|) {} # SECURITY: Make sure we didn't have too many /../ # Example: /bob/../../../../../../etc/passwd $path =~ s|^/\.\./|/|; # /../* -> /blah $path = "/" if $path eq "/.."; # /.. -> / $path =~ s|[^/]+/\.\.$||; # */some/.. -> * # SECURITY: Remove any leading / # (I'm not sure this can actually happen here, but...) $path =~ s|^/||g; ($schema,$domain,$path); } sub resolve_url { my ($schema,$domain,$page,$new) = @_; # Absolute URL: (ftp/http/https only) # http://bob.com/here.html -> bob.com/here.html return (clean_url($1,$2,$3)) if ($new =~ m{^(ftp|https?)://([^/]+)(.*)}); # Can't handle other URL schemes (ftp, gopher, mailto, news, telnet) # (Should we do ftp someday? Where would we save the files?) return undef if ($new =~ m|^[^/]+:|); # Absolute path, same domain: # bob.com/some/page1.html & /new/page2.html -> bob.com/new/page2.html return clean_url($schema,$domain,$new) if ($new =~ m|^/|); # Relative path but already at root # bob.com/here.html & some/page.html -> bob.com/some/page.html return clean_url($schema,$domain,"$new") unless ($page =~ m|/|); # Relative path # bob.com/some/page1.html & ../page2.html -> bob.com/page2.html # First remove the page from the end my $dir=$page; $dir =~ s|/[^/]+$||; return clean_url($schema,$domain,"$dir/$new"); } ################################################## # Code ################################################## sub read_index { my ($opt,$p) = @_; debug("Reading index [$p->{url}]"); open(I,"$FETCH \Q$p->{url}\E 2>/dev/null |") || die("Couldn't get [$p->{url}]\n"); my %already_hrefs; my $line = ""; while () { chomp; $line = $line ? $line.' '.$_ : $_; # Find and (for podcasts) while ($line =~ /]+)>(.*?)<\/a(.*)/i || $line =~ /]+)>(.*?)(.*)/i) { my ($f,$n); ($f,$n,$line) = ($1,$2,$3); $f = $1 if $f =~ /^"([^"]+)"/ || $f =~ /^'([^']+)'/; # This line is actually unnecessary except for if we setup # debugging output to show us $n $n =~ s/<(?!img)[^>]+>//g; #debug("LINK: $n -> $f"); # Ignore parent directory links next if $n =~ /parent directory/i; next if $f eq ".."; # Ignore the sortby ?links in indexes next if $f =~ /(^|\/)\?(C|N|sortby)=/; # Also: ?M=A, ... next if $f =~ /\?[MSD]=A$/; # Resolve URL my ($schema,$domain,$page) = resolve_url($p->{schema},$p->{domain},$p->{page},$f); debug("LINK: $n -> $schema $domain $page"); # Only fetch files from same schema/domain # (otherwise we might make a load of domain directories - maybe # this should be an option?) next unless $schema eq $p->{schema}; # But allow www.bob.com to match bob.com.. my $d = $domain; $d =~ s/^www\.//; my $pd = $p->{domain}; $pd =~ s/^www\.//; if ($d ne $pd) { next unless $opt->{external}; $opt->{found_external}++; $page = "${schema}://$domain/$page"; } #debug("Same schema/domain"); next if $opt->{only} && grep($page !~ /$_/, @{$opt->{only}}); next if $opt->{exclude} && grep($page =~ /$_/, @{$opt->{exclude}}); $page='/' unless $page; next if $already_hrefs{$page}++; if ($opt->{list_hrefs}) { print "$page\n"; next; } push(@{$p->{list}},$page); } # Allow continuation of lines next if ($line =~ /{count}); my (undef,undef,$file) = url_split($p->{url}); # Open-ended list, don't expand now return ($p->{curr_file},$p->{curr_num}) = ($file,$1) if ($p->{count} =~ /^(\d+)-$/); return ($p->{curr_file},$p->{curr_num}) = ($file,$1) if ($p->{count} =~ /^-(\d+)$/); debug("Expanding $p->{url} [$p->{count}]"); my ($a,$b) = (0,$p->{count}); ($a,$b) = ($1,$2) if ($p->{count} =~ /^(\d+)-(\d+)$/); for ($a..$b) { push(@{$p->{list}},sprintf($file,$_)); } } sub next_file { my ($p) = @_; return shift(@{$p->{list}}) unless defined $p->{curr_num}; my $ret = sprintf($p->{curr_file},$p->{curr_num}); $p->{curr_num} += ($p->{count} =~ /^-/) ? -1 : 1; $ret; } sub not_found { my ($opt,$file) = @_; return 1 if -z $file; $file =~ s#^(\s)#./$1#; open(FILE,'<',$file) || return 1; my $notfound = 0; my $media = ($file =~ /\.(jpg|jpeg|gif|mpg|mov)$/) ? 1 : 0; while() { last if $.>30; # Only check first thirty lines or so #for binaries?#last if $.>2 && length($_)>200; $notfound=1 if /not[ ]?found/i; $notfound=1 if /page cannot be found/i; $notfound=1 if /404 not found/i; $notfound=1 if /all 404 errors/i; $notfound=1 if /500 (can.t connect|read timeout)/i; $notfound=1 if /^\s*<(html>|head>|meta)/i && $.<5 && $media; # Heuristic last if $notfound; } close FILE; return $notfound if $notfound; if ($opt->{miss_sum}) { if (open(SUM,"sum \Q$file\E|")) { my ($sum) = scalar ; close SUM; $notfound=1 if $sum =~ /^\s*$opt->{miss_sum}(\s+|$)/; } else { print STDERR "[$PROGNAME] WARNING: Couldn't run 'sum':\n$!\n" unless $MAIN::WARNSUM++; } } $notfound; } sub mkdirR { my ($dir) = @_; my $now; foreach ( split('/',$dir) ) { $now .= "$_/"; next if -d $now; mkdir($now,0755) || die("Couldn't mkdir [$now]\n"); } } sub clean_dirs { my ($dir) = @_; while ($dir) { # Should fail if not empty.. last unless rmdir($dir); $dir =~ s|/[^/]+$||; } } sub get { my ($opt,$url,$file) = @_; debug("Get $url -> $file"); debug("[already have]") if -f $file && !not_found($opt,$file); return 1 if $opt->{no_replace} && -f $file && !not_found($opt,$file); return print STDERR " Ignoring directory:\n" if -d $file; my $got = 0; do { my $cmd = $FETCH; $cmd .= " -accept_all_cookies -auth $opt->{auth}" if $opt->{auth}; system("$cmd \Q$url\E > \Q$file\E 2> /dev/null"); my ($exit,$signal,$dump) = ($? >> 8, $? & 127, $? & 128); print STDERR "[$PROGNAME] WARNING: Error from $FETCH:\n$!\n" if $exit; print STDERR "[$PROGNAME] WARNING: Core dump for:\n$!\n" if $dump; interrupt() if $signal; $got = not_found($opt,$file) ? 0 : 1; } until (--$opt->{retry}<0) || $got; $got; } sub findex { my ($opt,$path) = @_; my $got = 0; expand_list($opt,$path); return if $opt->{list_hrefs}; my $missed = 0; my $base = "$path->{schema}://$path->{domain}"; while (my $page = next_file($path)) { # Make sure we have the subdirectory $page =~ m|(.+)/| ? mkdirR("$path->{domain}/$1") : mkdirR($path->{domain}) unless $opt->{local}; # Get the file my ($url,$get) = ($page =~ m|://(.+)|) ? ($page,$1) : ("$base/$page","$path->{domain}/$page"); $get =~ s|.*/|$opt->{local}/| if $opt->{local}; my $found = get($opt,$url,$get); # Delete not found files. debug("Not found - deleting [$get]") if !$found; unlink $get if !$found; # Check if we're finished with an open-ended count if ($path->{curr_num} && !$found) { last if ++$missed > $opt->{can_miss}; next; } $missed=0; $got++; print " $page"; print " [not found]" unless $found; print "\n"; } #$opt->{local} || chdir $opt->{home} || die("Couldn't go back home! [$opt->{home}]\n"); clean_dirs($path->{dir}); print " Not found\n" unless $got; print " [Some external links found, consider -external]\n" if !$got && $opt->{found_external}; $got; } parse_args();