#!/usr/bin/perl
# Filename:    bew
# Author:      David Ljung Madison
# See License: http://MarginalHacks.com/License
# Version:     0.90
# Description: This is a web-mirroring program ('bew,' get it?)
#              Recursively gets an entire web.
#              Also conveniently tells you about broken links.
#
# Forget HEAD, use If-Modified-Since:
#   Send something like:
#     GET / HTTP/1.0
#     If-Modified-Since: Wed, 28 Jun 2000 00:09:48 GMT; length=759
#     Connection: Keep-Alive
#     User-Agent: Mozilla/4.7 [en] (X11; U; Linux 2.2.18 i686)
#     Pragma: no-cache
#     Host: localhost:5000
#     Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
#     Accept-Encoding: gzip
#     Accept-Language: en
#     Accept-Charset: iso-8859-1,*,utf-8
#   Receive something like:
#     HTTP/1.1 304 Not Modified
#     Date: Sat, 30 Dec 2000 16:38:23 GMT
#     Server: Apache/1.3.6 (Unix) (Red Hat/Linux)
#     Connection: Keep-Alive
#     Keep-Alive: timeout=15, max=100
#     ETag: "9065-2f7-3959424c"
#
# TODO: FRAME SUPPORT??
#       NOT FOUND: isn't cached???
#
#       Remove files in the hierarchy that don't exist anymore?
#       Maybe -rm option that scans whole directory for add_url()s
#
#       Doesn't handle subdomains very well.  bob.steve.com goes into steve.com
#
# Bugs - the following files won't be fetched:
#   1) Any files that aren't linked to
#   2) CGI scripts (unless the source is linked somewhere, see -cgi)
#   3) Any files only accessed by CGI scripts
#   4) Any files only loaded by java/javascript (such as by mouseover, etc..)
#   5) Schemes other than http://  (such as ftp:/ gopher:/)
#   6) Weird stuff: style sheets, ...
#   7) Misses 'README' and 'HEADER' files in directory indexes, and anything
#      else the server doesn't want us to see (IndexIgnore in Apache:srm.conf)
#   8) bob.com:80/page and bob.com/page will be treated as different pages
#   9) Requires that the server implements the HEAD mechanism
#  10) Trimming /www./ isn't good for -www because it means we falsely
#      see external links as NOT FOUND (i.e.: www.yahoo.com)
#
# Apache doesn't give 'Last-Modified' information for .shtml files, so
# we have to load them every time.
#
# Ignores robots.txt - while it *is* a web crawler, the first step must be
# manual and is assumed to be intentional.
#
# Also:
#   Assumes that http://domain.com/dir/ == http://domain.com/dir/index.html
#   Doesn't ignore (probably for the best?)
#   Ignores the <BASE HREF> setting
#   Doesn't know about symbolic links on the server, may fetch multiple
#     copies of files, or even get stuck in infinite loops
#
# Weird problem - say you want everything inside of bob.com/a/b but the
# start page is bob.com/a/b/b.html and bob.com/a/b is unreadable:
#   1) bew bob.com/a/b                     <- won't work, can't read start page
#   2) bew bob.com/a/b/b.html              <- won't work, only gets b.html page
#   3) bew bob.com/a/b bob.com/a/b/b.html  <- works
#
# Web Robots home page:
#   http://info.webcrawler.com/mak/projects/robots/robots.html

use strict;
use HTTP::Date;
use FileHandle;
use IPC::Open2;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;  $PROGNAME =~ s|.*/||;
my $UMASK = 0755;
my @GET_URL = qw(lynx -source -accept_all_cookies);
my $MV = "/bin/mv";
my $DEFAULT_PAGE = "index.html";   # Kludge:  Assumption
my $DEFAULT_PORT = 80;
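
##################################################
# Illustrative sketch only - not called anywhere below.  This shows the
# If-Modified-Since exchange described in the header comments, written
# with the core IO::Socket::INET module; the host/path/date arguments
# are placeholders, not values bew itself uses.
##################################################
use IO::Socket::INET;
sub conditional_get_sketch {
  my ($host,$path,$since) = @_;   # e.g. ("localhost","/","Wed, 28 Jun 2000 00:09:48 GMT")
  my $sock = IO::Socket::INET->new(PeerAddr=>$host, PeerPort=>80, Proto=>'tcp')
    || return undef;
  # Ask for the page only if it changed since the copy we already have
  print $sock "GET $path HTTP/1.0\r\n";
  print $sock "If-Modified-Since: $since\r\n";
  print $sock "Host: $host\r\n";
  print $sock "Connection: close\r\n\r\n";
  my $status = <$sock> || "";
  close($sock);
  # A "304 Not Modified" status line means the local copy is still current
  return ($status =~ /\b304\b/) ? 0 : 1;
}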
##################################################
# Globals
##################################################
# Bad to use, but better than passing these around to every damn routine
my @SEARCH;      # What parts of the web do we keep?  [<url>,<dir>] pairs
my %ADD_CACHE;   # Cache of urls we've already chosen to fetch
my %HEAD_CACHE;  # Cache of head() results
my @FETCH_LIST;  # URLs to fetch

##################################################
# Usage
##################################################
sub usage {
  my $msg;
  foreach $msg (@_) { print "ERROR: $msg\n"; }
  print "\n";
  print "Usage:\t$PROGNAME [options] <url> ..\n";
  print "\tFetch the web!\n";
  print "\n";
  print "URLs can either be a full URL or just a domain name ('http://' not required)\n";
  print "  Files under that URL will be saved in a directory of the same name\n";
  print "  A different directory can be specified with:  <url>=<dir>\n";
  print "\n";
  print "Examples:\n";
  print "  davesource.com/Marginal          => saves files in 'davesource.com/Marginal'\n";
  print "  fringe.davesource.com/Fringe=db  => saves files in 'db/Fringe'\n";
  print "\n";
  print "Options:\n";
  #print "  -v           \tVerbose mode\n";
  print "  -log <file>  \tWhere to log output (default STDOUT)\n";
  print "  -sleep <secs>\tNumber of seconds to sleep between fetches (default 0)\n";
  print "  -www         \tTreat www.domain.com the same as domain.com\n";
  print "  -cgi <key>   \tSave \"some.cgi?key=blah\" files\n";
  print "  -aggressive  \tTest all subdirectories in the tree for contents\n";
  print "  -check_links \tTest for broken links in external URLs as well\n";
  print "\n";
  close LOG;
  exit -1;
}

sub parse_args {
  # Defaults:
  my ($log,$sleep) = ("-",0);
  $MAIN::WWW="";
  $MAIN::VERBOSE=1;
  #print "VERBOSE ON\n" if ($MAIN::VERBOSE);

  my $arg;
  while ($#ARGV>=0) {
    $arg=shift(@ARGV);
    if ($arg =~ /^-h$/)              { usage(); }
    if ($arg =~ /^-l(og)?(=(.+))?$/) { $log=$3 ? $3 : shift(@ARGV); next; }
    if ($arg =~ /^-sleep(=(.+))?$/)  { $sleep=$2 ? $2 : shift(@ARGV); next; }
    if ($arg =~ /^-v$/)              { $MAIN::VERBOSE=1; next; }
    if ($arg =~ /^-www$/)            { $MAIN::WWW='(?:www\.|)'; next; }
    if ($arg =~ /^-cgi$/)            { push(@MAIN::CGI, shift @ARGV); next; }
    # My own kludge - only save one CGI key (by order specified in ARGV)
    if ($arg =~ /^-cgi1$/)           { $MAIN::CGI_ONE = 1; next; }
    if ($arg =~ /^-ag(g(r(essive)?)?)?$/) { $MAIN::AGGRESSIVE=1; next; }
    if ($arg =~ /^-check_?links$/)   { $MAIN::CHECK_LINKS=1; next; }
    if ($arg =~ /^-/)                { usage("Unknown option: $arg"); }

    my ($url,$dir) = ($arg =~ /^([^=]+)=(.+)$/) ? ($1, $2) : ($arg, undef);
    my ($domain,$page) = clean_url( url_split($url) );
    $dir=$dir || $domain;

    # Add it to the search
    push(@SEARCH, [ "$domain/$page", $dir ] );

    # Add it to the list of pages to grab
    add_url($domain,$page,$dir,"command option");
  }

  open(LOG,">$log") || usage("Couldn't open logfile [",($log eq "-" ? "STDOUT" : $log),"]");
  select((select(LOG), $| = 1)[0]);   # Unbuffer log output

  usage("No starting point defined") unless urls_left();

  $sleep;
}

##################################################
# Interrupts
##################################################
$SIG{'INT'}='interrupt';
$SIG{'TERM'}='interrupt';
$SIG{'HUP'}='interrupt';
$SIG{'SUSP'}='interrupt';
$SIG{'QUIT'}='interrupt';

sub interrupt {
  my ($msg) = @_;
  alarm(0);   # Turn off alarms
  $msg="" unless ($msg);
  die("[$PROGNAME] Interrupt: $msg\n");
}

#########################
# Run a system command, catch interrupts
#########################
sub my_system {
  my ($save,@cmd) = @_;

  # Do it and get the exit value
  my $pid;
  unless ($pid=fork) {
    close(STDOUT);
    open(STDOUT,">$save") || die("[$PROGNAME] ERROR: Couldn't write [$save]\n");
    #open(STDERR,">&LOG");   # Dump problems to log
    exec @cmd;
    die("[$PROGNAME] ERROR: Can't exec [@cmd]\n");
  }
  my $wait=waitpid($pid,0);
  my ($exit,$signal,$dump) = ($? >> 8, $? & 127, $? & 128);
  print STDERR "\n[$PROGNAME] Interrupt [@cmd] $signal\n" if ($signal);
  die("\n[$PROGNAME] Core dump: @cmd\n") if ($dump);
  kill $signal, $$ if ($signal);   # Propagate the signal to this script
  $exit;   # Return exit value
}
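
# Illustrative call (the filename and URL are placeholders): capture a page
# into a local file with lynx and get the command's exit value back:
#   my $err = my_system("davesource.com/index.html",
#                       @GET_URL, "http://davesource.com/");
#   print LOG "Fetch failed\n" if $err;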
#########################
# Setup a directory for a path  (mkdir -p <path>)
#########################
sub setup_dir {
  my ($path) = @_;
  $path =~ s|[^/]+$||;   # Remove file component

  my $so_far=".";
  my $dir;
  foreach $dir ( split(/\//, $path) ) {
    next unless ($dir);
    $so_far.="/$dir";
    if (! -d $so_far) {
      # Danger, danger!
      die("[$PROGNAME] Couldn't remove file (to mkdir) [$so_far]\n")
        if (-f $so_far && !unlink($so_far));
      mkdir($so_far,$UMASK) || die("[$PROGNAME] Couldn't create directory [$so_far]\n");
    }
  }
}

#########################
# Clean out //, /./ and /../ from the path
#########################
sub url_split {
  my ($url) = @_;
  # Break it down!
  $url =~ s|^https?://||;
  return ($1,$2) if ($url =~ m|^([^/]+)/(.*)$|);
  return ($url,"");
}

sub clean_url {
  my ($domain,$path) = @_;

  # Domain cleaning
  $domain=lc($domain);
#  $domain =~ s/^www\.// if ($MAIN::WWW);

  # Path cleaning
  #
  # Default page?
#  $path.=$DEFAULT_PAGE if ($path =~ m|/$| || $path eq "");

  # Remove any inter-page indexing (page.html#chapter1)
  #$path =~ s|#[^/]*$||;   # Interpage indexing can include "/" - at least with Apache
  $path =~ s|#.*$||;

  # Remove all // and /./
  $path="/$path";   # Temporarily make matches easier
  while ($path =~ s|/\.?/|/|) {}
  $path =~ s|/\.$||;

  # Remove all */dir/../*
  while ($path =~ s|/[^/]+/\.\./|/|) {}

  # SECURITY:  Make sure we didn't have too many /../
  #   Example:  /bob/../../../../../../etc/passwd
  $path =~ s|^/\.\./|/|;          # /../* -> /blah
  $path = "/" if $path eq "/..";  # /..   -> /
  $path =~ s|[^/]+/\.\.$||;       # */some/.. -> *

  # SECURITY:  Remove any leading /
  #   Only for call from parse_args, really - Example:  http://bob.com//etc/passwd
  $path =~ s|^/||g;

  ($domain,$path);
}

#########################
# Figure out what a new URL resolves to (minus the http://)
#########################
sub resolve_url {
  my ($domain,$page,$new) = @_;

  # Absolute URL:  (http/https only)
  #   http://bob.com/here.html -> bob.com/here.html
  return (clean_url($1,$2)) if ($new =~ m|^https?://([^/]+)(.*)|);

  # Can't handle other URL schemes (ftp, gopher, mailto, news, telnet)
  # (Should we do ftp someday?  Where would we save the files?)
  return undef if ($new =~ m|^[^/]+:|);

  # Absolute path, same domain:
  #   bob.com/some/page1.html & /new/page2.html -> bob.com/new/page2.html
  return clean_url($domain,$new) if ($new =~ m|^/|);

  # Relative path but already at root
  #   bob.com/here.html & some/page.html -> bob.com/some/page.html
  return clean_url($domain,"$new") unless ($page =~ m|/|);

  # Relative path
  #   bob.com/some/page1.html & ../page2.html -> bob.com/page2.html
  # First remove the page from the end
  my $dir=$page;  $dir =~ s|/[^/]+$||;
  return clean_url($domain,"$dir/$new");
}
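
# Worked examples of the above (illustrative values), assuming the current
# page is bob.com/some/page1.html:
#   resolve_url("bob.com","some/page1.html","pics/a.gif")
#     -> ("bob.com","some/pics/a.gif")      relative path
#   resolve_url("bob.com","some/page1.html","../top.html")
#     -> ("bob.com","top.html")             /../ cleaned up by clean_url
#   resolve_url("bob.com","some/page1.html","http://other.com/x.html")
#     -> ("other.com","x.html")             absolute URL
#   resolve_url("bob.com","some/page1.html","mailto:bob@bob.com")
#     -> undef                              non-http scheme, skipped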
##################################################
# Get the head information on a URL
#   found:     Does the page exist?
#   location:  Has it been moved?
#   last_mod:  What's the last mod time?
#   type:      Content type
##################################################
# Cache the answers, loop on location
sub head {
  my ($url) = @_;
  my ($found,$dir,$loc,$last_mod,$type);
  $loc=$url;
  while ($loc) {
    $url=$loc;
    ($found,$loc,$last_mod,$type) = real_head($url);
  }
  # Default page?  (Directory indexes don't have Last-Modified)
  $url.=$DEFAULT_PAGE if ($url =~ m|/$| && $last_mod);
  return ($found,$url,$last_mod,$type);
}

my %NO_HEAD;   # Domains that won't give head.

sub real_head {
  my ($url) = @_;
  my $cache=$HEAD_CACHE{$url};
  return @$cache if ($cache);

  my ($domain,$page) = url_split($url);
  my ($found,$type,$moved,$loc,$last_mod)=(0,0,0,undef,undef);
  my $port = $DEFAULT_PORT;
  ($domain,$port)=($`,$') if ($domain =~ /:/);

  if ($NO_HEAD{$domain}) {
    $found=1;
  } else {
    my $pid = open2( \*READER, \*WRITER, "telnet $domain $port 2>/dev/null" );
    WRITER->autoflush();   # default here, actually
    # KLUDGE: ASSUME http: method (https?)
    print WRITER "HEAD http://$domain/$page HTTP/1.0\n\n";
    while (<READER>) {
      $found=1 if /HTTP.*[23]\d\d/;
      if (/HEAD.*Not Implemented/i || /Internal Server Error/i) {
        print STDERR "[$PROGNAME] Warning: [$domain] doesn't implement HEAD\n"
          unless $NO_HEAD{$domain}++;
        $found=1;
      }
      $moved=1 if /Moved\s+Permanently/i;
      $loc=$1 if /Location:\s+(\S+)/;
      $last_mod=str2time($1) if /Last-Modified: (.*)/i;
      $type=$1 if /Content-Type:\s+(\S+)/i;
      #print "YO: $_";
    }
    close(READER);  close(WRITER);
    my $wait=waitpid($pid,0);   # So we don't collect defunct processes
  }

  # How do we know it's a directory listing?
  # Directory listings don't have a last mod.  Neither do .shtml?  And NO_HEAD?
#  $type = "dir_index/html"
#    unless ($last_mod || $NO_HEAD{$domain} || $page =~ /\.shtml$/);
  # And neither does index.php, but we don't know that it's PHP.

  $loc=undef unless $moved;
  $type = "text/html" if !$type && $page =~ /\.s?html?$/;

  $HEAD_CACHE{$url} = [$found,$loc,$last_mod,$type];
  ($found,$loc,$last_mod,$type);
}

##################################################
# Add a url to our list to fetch
##################################################
sub add_url {
  my ($domain,$page,$dir,$from) = @_;

  my $save = saveCGI("$dir/$page");
  return unless $save;

  # Does it exist?  Is it in the right place?  Info...
  my ($found,$loc,$last_mod,$type) = head("$domain/$page");
  if (!$found && $from) {
    if ($from eq "command option") {
      # Dumb kludge, LOG isn't open yet
      print "NOT FOUND: $domain/$page [$from]\n\n";
    } elsif ($MAIN::VERBOSE || $MAIN::CHECK_LINKS) {
      print LOG "NOT FOUND: $domain/$page [$from]\n";
    }
    return;
  }

  # Use relocation information if given
  ($domain,$page) = clean_url( url_split($loc) ) if $loc;

  # Directory listings need a filename.  Also, NO_HEAD can screw things up:
  $save.=$DEFAULT_PAGE if ($type =~ /dir_index/ || $save =~ m|/$|);

  return if ($ADD_CACHE{$save}++);
  return push(@FETCH_LIST,[$domain,$page,$dir,$save,$last_mod,$type]);
}

# We save CGI scripts if they have keys we care about
sub saveCGI {
  my ($page) = @_;

  # KLUDGE: Ignore (most) CGIs for now
  #return ($page =~ /cgi/ || $page =~ m|\?[^/]+$|) ? undef : $page;

  # Is it even a .cgi?request
  return $page unless $page =~ m|(.+)\?([^/]+)$|;

  # Use -cgi to allow some CGI saving
  return unless @MAIN::CGI;

  my ($query,@new);
  ($page, $query) = ($1,$2);
  foreach my $q ( split(/&/, $query) ) {
    next unless grep($q =~ /^$_(=|$)/, @MAIN::CGI);
    push(@new,$q);
    last if $MAIN::CGI_ONE;
  }
  return undef unless @new;   # Didn't find anything to save, don't keep it.
  $page.'?'.join('&', @new);
}
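
# Worked examples of the above (illustrative values):
#   With '-cgi id' on the command line:
#     saveCGI("db/view.cgi?id=7&sort=name")  ->  "db/view.cgi?id=7"
#   With no -cgi keys given, query pages are dropped:
#     saveCGI("db/view.cgi?id=7")            ->  undef
#   Plain pages always pass through:
#     saveCGI("db/index.html")               ->  "db/index.html"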
sub check_url {
  my ($domain,$page,$from) = @_;
  return unless defined $domain;

  my $search;
  foreach $search ( @SEARCH ) {
    my ($regexp,$dir) = @$search;
    return add_url($domain,$page,$dir,$from)
      if ("$domain/$page" =~ /^$MAIN::WWW$regexp/);
  }

  # It's 'external' - check if it's a broken link?
  if ($MAIN::CHECK_LINKS) {
    my ($found,$loc,$last_mod,$type) = head("$domain/$page");
    #
    # Gross kludge - we probably only meant to strip 'www.' off internal sites
    #($found,$loc,$last_mod,$type) = head("www.$domain/$page")
    #  if (!$found && $domain !~ /^www\./ && $MAIN::WWW);
    print LOG "NOT FOUND: $domain/$page [$from]\n" unless ($found);
  }
  return;
}

sub urls_left {
  @FETCH_LIST ? 1 : 0;
}

sub next_url {
  # Pop would probably be faster,
  # but it might be more confusing in a recursive search
  my $tmp = shift(@FETCH_LIST);
  return undef unless $tmp;
  @$tmp;
}

##################################################
# Scan a page for urls to fetch
##################################################
sub scan_urls {
  my ($domain,$page,$file) = @_;

  # Collect new urls
  open(PAGE,$file) || return print STDERR "[$PROGNAME] Couldn't read $file\n";
  my $line="";
  while (<PAGE>) {
    # Heuristic to avoid garbage URLs in shell scripts.
    # If we fetch a script, it may have a bunch of '<a href=..' looking
    # strings that aren't real links.
    $line.=$_;
    while ($line =~ /<(?:a|link) [^>]*href="([^"]+)"/i ||
           $line =~ /<(?:a|link) [^>]*href='([^']+)'/i ||
           $line =~ /<(?:a|link) [^>]*href=([^\s>]+)/i ||
           $line =~ /<body [^>]*background="([^"]+)"/i ||
           $line =~ /<body [^>]*background='([^']+)'/i ||
           $line =~ /<body [^>]*background=([^\s>]+)/i ||
           $line =~ /<(?:img|embed) [^>]*src="([^"]+)"/i ||
           $line =~ /<(?:img|embed) [^>]*src='([^']+)'/i ||
           $line =~ /<(?:img|embed) [^>]*src=([^\s>]+)/i) {
      my $link=$1;
      # Add the url after we resolve its path based on the current url
      # Ignore 'href=#name' right here, otherwise we get the parent dir
      #   Bug: Will ignore . Big deal.
      # Ignore 'href=""' right here, otherwise we get this file again
      check_url( resolve_url($domain,$page,$link), "$domain/$page" )
        if ($link !~ /^#/ && $link ne "''" && $link ne '""');
      # Next portion of the line
      $line=$';
    }
    # Allow continuation of lines
    next if ($line =~ /<(?:a|link) [^>]+$/i ||
             $line =~ /<body [^>]+$/i ||
             $line =~ /<(?:img|embed) [^>]+$/i);
    # Otherwise get rid of the rest of the line
    $line="";
  }
  close(PAGE);
}

##################################################
# Get a web page, recursively get its links
#