#!/usr/bin/perl
# Filename:	bew
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License
# Version:	0.90
# Description:	This is a web-mirroring program ('bew,' get it?)
#		Recursively gets an entire web.
#		Also conveniently tells you about broken links.
#
# TODO:  Instead of a separate HEAD request, send a conditional GET with
# If-Modified-Since (an unused sketch follows the 'use' lines below).
# Send something like:
#   GET / HTTP/1.0
#   If-Modified-Since: Wed, 28 Jun 2000 00:09:48 GMT; length=759
#   Connection: Keep-Alive
#   User-Agent: Mozilla/4.7 [en] (X11; U; Linux 2.2.18 i686)
#   Pragma: no-cache
#   Host: localhost:5000
#   Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
#   Accept-Encoding: gzip
#   Accept-Language: en
#   Accept-Charset: iso-8859-1,*,utf-8
# Receive something like:
#   HTTP/1.1 304 Not Modified
#   Date: Sat, 30 Dec 2000 16:38:23 GMT
#   Server: Apache/1.3.6 (Unix)  (Red Hat/Linux)
#   Connection: Keep-Alive
#   Keep-Alive: timeout=15, max=100
#   ETag: "9065-2f7-3959424c"
#
#
# TODO:  FRAME SUPPORT??
# NOT FOUND:  isn't cached???
#
#
# Remove files in the hierarchy that don't exist anymore?
#   Maybe -rm option that scans whole directory for add_url()s
#
# Doesn't handle subdomains very well.  bob.steve.com goes into steve.com
#
# Bugs - the following files won't be fetched:
# 1) Any files that aren't linked to
# 2) CGI scripts (unless the source is linked somewhere, see -cgi)
# 3) Any files only accessed by CGI scripts
# 4) Any files only loaded by java/javascript (such as by mouseover, etc..)
# 5) Schemes other than http:// (such as ftp://, gopher://)
# 6) Weird stuff:  <img dynsrc=..>, style sheets, ...
# 7) Misses 'README' and 'HEADER' files in directory indexes, and anything
#    else the server doesn't want us to see (IndexIgnore in Apache:srm.conf)
# 8) bob.com:80/page and bob.com/page will be treated as different pages
#    (a possible normalization is sketched after clean_url() below)
# 9) Requires that the server implements the HEAD mechanism
# 10) Trimming /www./ isn't good for -www because it means we falsely
#     report external links as NOT FOUND (e.g. www.yahoo.com)
#
# Apache doesn't give 'Last-Modified' information for .shtml files, so
# we have to load them every time.
#
# Ignores robots.txt; while this *is* a web crawler, the first fetch is
# manual and assumed to be intentional.
#
# Also:
#   Assumes that http://domain.com/dir/ == http://domain.com/dir/index.html
#   Doesn't ignore <!-- html comments -->    (probably for the best?)
#   Ignores <base href=..> settings
#   Doesn't know about symbolic links on the server, may fetch multiple
#     copies of files, or even get stuck in infinite loops
#
# Weird problem - say you want everything inside of bob.com/a/b but the
# start page is bob.com/a/b/b.html and bob.com/a/b is unreadable:
# 1)  bew bob.com/a/b		<- won't work, can't read start page
# 2)  bew bob.com/a/b/b.html	<- won't work, only gets b.html page
# 3)  bew bob.com/a/b bob.com/a/b/b.html	<- works

# Web Robots home page:
#   http://info.webcrawler.com/mak/projects/robots/robots.html

use strict;
use HTTP::Date;
use FileHandle;
use IPC::Open2;
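
#########################
# Illustrative sketch only (never called): the conditional-GET idea from the
# header notes above.  Assumes an already-connected, bidirectional filehandle
# (such as the telnet pipe used in real_head below); the sub name and its
# arguments are hypothetical, not part of the original design.
#########################
sub conditional_get_sketch {
  my ($fh,$domain,$page,$save) = @_;
  my $mtime = (-f $save) ? (stat($save))[9] : undef;
  print $fh "GET /$page HTTP/1.0\r\n";
  print $fh "Host: $domain\r\n";
  print $fh "If-Modified-Since: " . time2str($mtime) . "\r\n" if $mtime;
  print $fh "\r\n";
  my $status = <$fh> || "";
  # "HTTP/1.x 304 Not Modified" means the saved copy is still current
  return ($status =~ /\s304\s/) ? 0 : 1;
}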

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

my $UMASK = 0755;	# Directory creation mode (despite the name, not a umask)

my @GET_URL = qw(lynx -source -accept_all_cookies);
my $MV = "/bin/mv";

my $DEFAULT_PAGE = "index.html";	# Kludge:  Assumption
my $DEFAULT_PORT = 80;

##################################################
# Globals
##################################################
# Bad to use, but better than passing these around to every damn routine
my @SEARCH;	# What parts of the web do we keep?  [<regexp>,<dir>]
my %ADD_CACHE;	# Cache of urls we've already chosen to fetch
my %HEAD_CACHE;	# Cache of head() results
my @FETCH_LIST;	# URLs to fetch

##################################################
# Usage
##################################################
sub usage {
  my $msg;
  foreach $msg (@_) { print "ERROR:  $msg\n"; }
  print "\n";
  print "Usage:\t$PROGNAME <URL> ..\n";
  print "\tFetch the web!\n";
  print "\n";
  print "URLs can either be a full URL or just a domain name ('http://' not required)\n";
  print "  Files under that URL will be saved in a directory of the same name\n";
  print "  A different directory can be specified with: <URL>=<dir>\n";
  print "\n";
  print "Examples:\n";
  print "  davesource.com/Marginal         => saves files in 'davesource.com/Marginal'\n";
  print "  fringe.davesource.com/Fringe=db => saves files in 'db/Fringe'\n";
  print "\n";
  print "Options:\n";
  #print "  -v          \tVerbose mode\n";
  print "  -log        \tWhere to log output (default STDOUT)\n";
  print "  -sleep      \tNumber of seconds to sleep between fetches (default 0)\n";
  print "  -www        \tTreat www.domain.com the same as domain.com\n";
  print "  -cgi <key>  \tSave \"some.cgi?key=blah\" files\n";
  print "  -aggressive \tTest all subdirectories in the tree for contents\n";
  print "  -check_links\tTest for broken links in external URLs as well\n";
  print "\n";
  close LOG;
  exit -1;
}

sub parse_args {
  # Defaults:
  my ($log,$sleep) = ("-",0);
  $MAIN::WWW="";
  $MAIN::VERBOSE=1;

#print "VERBOSE ON\n" if ($MAIN::VERBOSE);

  my $arg;
  while ($#ARGV>=0) {
    $arg=shift(@ARGV);
    if ($arg =~ /^-h$/) { usage(); }
    if ($arg =~ /^-l(og)?(=(.+))?$/) { $log=$3 ? $3 : shift(@ARGV); next; }
    if ($arg =~ /^-sleep(=(.+))?$/) { $sleep=$2 ? $2 : shift(@ARGV); next; }
    if ($arg =~ /^-v$/) { $MAIN::VERBOSE=1; next; }
    if ($arg =~ /^-www$/) { $MAIN::WWW="(?:www\.|)"; next; }
    if ($arg =~ /^-cgi$/) { push(@MAIN::CGI, shift @ARGV); next; }
    # My own kludge - only save one CGI key (by order specified in ARGV)
    if ($arg =~ /^-cgi1$/) { $MAIN::CGI_ONE = 1; next; }
    if ($arg =~ /^-ag(g(r(essive)?)?)?$/) { $MAIN::AGGRESSIVE=1; next; }
    if ($arg =~ /^-check_?links$/) { $MAIN::CHECK_LINKS=1; next; }
    if ($arg =~ /^-/) { usage("Unknown option: $arg"); }

    my ($url,$dir) = ($arg =~ /^([^=]+)=(.+)$/) ? ($1, $2) : ($arg, undef);
    my ($domain,$page) = clean_url( url_split($url) );
    $dir=$dir || $domain;

    # Add it to the search
    push(@SEARCH, [ "$domain/$page", $dir ] );

    # Add it to the list of pages to grab
    add_url($domain,$page,$dir,"command option");
  }

  open(LOG,">$log") ||
    usage("Couldn't open logfile [",($log eq "-" ? "STDOUT" : $log),"]");
  select((select(LOG), $| = 1) [0]);	# Unbuffer log output

  usage("No starting point defined") unless urls_left();

  $sleep;
}
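
# Example invocation (illustrative):
#   bew -log=bew.log -sleep 1 -www davesource.com/Marginal=mirror
# mirrors everything under davesource.com/Marginal into the 'mirror' directory,
# logging to bew.log, sleeping 1 second between fetches, and treating
# www.davesource.com the same as davesource.com.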

##################################################
# Interrupts
##################################################
$SIG{'INT'}='interrupt';
$SIG{'TERM'}='interrupt';
$SIG{'HUP'}='interrupt';
$SIG{'TSTP'}='interrupt';
$SIG{'QUIT'}='interrupt';
sub interrupt {
  my ($msg) = @_;
  alarm(0);	# Turn off alarms
  $msg="<unknown interrupt>" unless ($msg);
  die("[$PROGNAME] Interrupt: $msg\n");
}

#########################
# Run a system, catch interrupts
#########################
sub my_system {
  my ($save,@cmd) = @_;

  # Do it and get the exit value
  my $pid = fork;
  die("[$PROGNAME] ERROR:  Couldn't fork\n") unless defined $pid;
  unless ($pid) {
    close(STDOUT);
    open(STDOUT,">$save") || die("[$PROGNAME] ERROR: Couldn't write [$save]\n");
    #open(STDERR,">&LOG");	# Dump problems to log
    exec @cmd;
    die("[$PROGNAME] ERROR:  Can't exec [@cmd]\n");
  }
  my $wait=waitpid($pid,0);
  my ($exit,$signal,$dump) = ($? >> 8, $? & 127, $? & 128);

  print STDERR "\n[$PROGNAME]  Interrupt [@cmd] $signal\n" if ($signal);
  die("\n[$PROGNAME]  Core dump: @cmd\n") if ($dump);
  kill $signal, $$ if ($signal);        # Propagate the signal to this script

  $exit;	# Return exit value
}

#########################
# Setup a directory for a path (mkdir -p <path>)
#########################
sub setup_dir {
  my ($path) = @_;

  $path =~ s|[^/]+$||;		# Remove file component

  my $so_far=".";
  my $dir;
  foreach $dir ( split(/\//, $path )) {
    next unless ($dir);
    $so_far.="/$dir";
    if (! -d $so_far) {
      # Danger, danger!
      die("[$PROGNAME] Couldn't remove file (to mkdir) [$so_far]\n")
        if (-f $so_far && !unlink($so_far));
      mkdir($so_far,$UMASK) || die("[$PROGNAME] Couldn't create directory [$so_far]\n");
    }
  }
}

#########################
# Split a URL into (domain, path), then clean //, /./ and /../ out of the path
#########################
sub url_split {
  my ($url) = @_;

  # Break it down!  <ba ba bum ba bum...>
  $url =~ s|^https?://||;
  return ($1,$2) if ($url =~ m|^([^/]+)/(.*)$|);
  return ($url,"");
}

sub clean_url {
  my ($domain,$path) = @_;

  # Domain cleaning
  $domain=lc($domain);
#  $domain =~ s/^www\.// if ($MAIN::WWW);

  # Path cleaning

#  # Default page?
#  $path.=$DEFAULT_PAGE if ($path =~ m|/$| || $path eq "");

  # Remove any inter-page indexing (page.html#chapter1)
  #$path =~ s|#[^/]*$||;
  #Interpage indexing can include "/" - at least with Apache
  $path =~ s|#.*$||;

  # Remove all // and /./
  $path="/$path";	# Temporarily make matches easier
  while ($path =~ s|/\.?/|/|) {}
  $path =~ s|/\.$||;

  # Remove all */dir/../*
  while ($path =~ s|/[^/]+/\.\./|/|) {}
  # SECURITY:  Make sure we didn't have too many /../
  # Example:   /bob/../../../../../../etc/passwd
  $path =~ s|^/\.\./|/|;		# /../*     -> /blah
  $path = "/" if $path eq "/..";	# /..       -> /
  $path =~ s|[^/]+/\.\.$||;		# */some/.. -> *

  # SECURITY:  Remove any leading /
  # Only for call from parse_args, really - Example:  http://bob.com//etc/passwd
  $path =~ s|^/||g;

  ($domain,$path);
}
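
# Worked example of the cleaning above (illustrative):
#   clean_url("Bob.COM", "a//b/../c.html#x")  =>  ("bob.com", "a/c.html")

#########################
# Unused sketch: one way to address bug (8) from the header (bob.com:80/page
# vs bob.com/page) - drop an explicit default port before using the domain
# as a key.  Hypothetical helper, not called anywhere in this script.
#########################
sub strip_default_port_sketch {
  my ($domain) = @_;
  $domain =~ s/:$DEFAULT_PORT$//;	# "bob.com:80" -> "bob.com"
  $domain;
}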

#########################
# Figure out what a new URL resolves to (minus the http://)
#########################
sub resolve_url {
  my ($domain,$page,$new) = @_;

  # Absolute URL:  (http/https only)
  # http://bob.com/here.html -> bob.com/here.html
  return (clean_url($1,$2)) if ($new =~ m|^https?://([^/]+)(.*)|);

  # Can't handle other URL schemes (ftp, gopher, mailto, news, telnet)
  # (Should we do ftp someday?  Where would we save the files?)
  return undef if ($new =~ m|^[^/]+:|);

  # Absolute path, same domain:
  # bob.com/some/page1.html & /new/page2.html -> bob.com/new/page2.html
  return clean_url($domain,$new) if ($new =~ m|^/|);

  # Relative path but already at root
  # bob.com/here.html & some/page.html -> bob.com/some/page.html
  return clean_url($domain,"$new") unless ($page =~ m|/|);

  # Relative path
  # bob.com/some/page1.html & ../page2.html -> bob.com/page2.html
  # First remove the page from the end
  my $dir=$page;  $dir =~ s|/[^/]+$||;
  return clean_url($domain,"$dir/$new");
}

##################################################
# Get the head information on a URL
# found:	Does the page exist?
# location:	Has it been moved?
# last_mod:	What's the last mod time?
# type:		Content type
##################################################
# Cache the answers, loop on location
sub head {
  my ($url) = @_;
  my ($found,$dir,$loc,$last_mod,$type);

  $loc=$url;
  while ($loc) {
    $url=$loc;
    ($found,$loc,$last_mod,$type) = real_head($url);
  }

  # Default page?  (Directory indexes don't have Last-Modified)
  $url.=$DEFAULT_PAGE if ($url =~ m|/$| && $last_mod);

  return ($found,$url,$last_mod,$type);
}
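
# Usage (illustrative): for a directory URL the server maps to a real index
# file (so a Last-Modified header is present), head() folds in the default
# page, e.g. head("bob.com/pics/") might return
#   (1, "bob.com/pics/index.html", <mtime>, "text/html")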

my %NO_HEAD;	# Domains that won't give head.
sub real_head {
  my ($url) = @_;

  my $cache=$HEAD_CACHE{$url};
  return @$cache if ($cache);

  my ($domain,$page) = url_split($url);

  my ($found,$type,$moved,$loc,$last_mod)=(0,0,0,0,undef);

  my $port = $DEFAULT_PORT;
  ($domain,$port)=($`,$') if ($domain =~ /:/);

  if ($NO_HEAD{$domain}) {
    $found=1;
  } else {
    my $pid = open2( \*READER, \*WRITER, "telnet $domain $port 2>/dev/null" );
    WRITER->autoflush(); # default here, actually
#   KLUDGE: ASSUME http: scheme (a plain telnet pipe can't do https)
    my $host = ($port == $DEFAULT_PORT) ? $domain : "$domain:$port";
    # HTTP wants CRLF line endings; send Host: for name-based virtual hosts
    print WRITER "HEAD http://$host/$page HTTP/1.0\r\nHost: $host\r\n\r\n";
    while (<READER>) {
      $found=1 if /HTTP.*[23]\d\d/;
      if (/HEAD.*Not Implemented/i || /Internal Server Error/i) {
        print STDERR "[$PROGNAME] Warning:  [$domain] doesn't implement HEAD\n"
          unless $NO_HEAD{$domain}++;
        $found=1;
      }
      $moved=1 if /Moved\s+Permanently/i;
      $loc=$1 if /Location:\s+(\S+)/;
      $last_mod=str2time($1) if /Last-Modified: (.*)/i;
      $type=$1 if /Content-Type:\s+(\S+)/i;
      #print "YO: $_";
    }
    close(READER); close(WRITER);
    my $wait=waitpid($pid,0);	# So we don't collect defunct processes
  }

  # How do we know it's a directory listing?
  # Directory listings don't have a last mod.  Neither do .shtml?  And NO_HEAD?
#  $type = "dir_index/html"
#    unless ($last_mod || $NO_HEAD{$domain} || $page =~ /\.shtml$/);
# And neither does index.php, but we don't know that it's PHP.

  $loc=undef unless $moved;

  $type = "text/html" if !$type && $page =~ /\.s?html?$/;

  $HEAD_CACHE{$url} = [$found,$loc,$last_mod,$type];

  ($found,$loc,$last_mod,$type);
}

##################################################
# Add a url to our list to fetch
##################################################
sub add_url {
  my ($domain,$page,$dir,$from) = @_;

  my $save = saveCGI("$dir/$page");
  return unless $save;

  # Does it exist?  Is it in the right place?  Info...
  my ($found,$loc,$last_mod,$type) = head("$domain/$page");
  if (!$found && $from) {
    if ($from eq "command option") {	# Dumb kludge, LOG isn't open yet
      print "NOT FOUND: $domain/$page [$from]\n\n";
    } elsif ($MAIN::VERBOSE || $MAIN::CHECK_LINKS) {
      print LOG "NOT FOUND: $domain/$page [$from]\n";
    }
    return;
  }
  # Use the resolved URL from head() (handles relocation and default pages)
  ($domain,$page) = clean_url( url_split($loc) ) if $loc;

  # directory listings need a filename.  Also, NO_HEAD can screw things up:
  $save.=$DEFAULT_PAGE if ($type =~ /dir_index/ || $save =~ m|/$|);

  return if ($ADD_CACHE{$save}++);
  return push(@FETCH_LIST,[$domain,$page,$dir,$save,$last_mod,$type]);
}

# We save CGI scripts if they have keys we care about
sub saveCGI {
  my ($page) = @_;

  # KLUDGE:  Ignore (most) CGIs for now
  #return ($page =~ /cgi/ || $page =~ m|\?[^/]+$|) ? undef : $page;

  # Is it even a CGI-style "?query" request?
  return $page unless $page =~ m|(.+)\?([^/]+)$|;

  # Use -cgi to allow some CGI saving
  return unless @MAIN::CGI;
  my ($query,@new);
  ($page, $query) = ($1,$2);
  foreach my $q ( split(/&/, $query) ) {
    next unless grep($q =~ /^$_(=|$)/, @MAIN::CGI);
    push(@new,$q);
    last if $MAIN::CGI_ONE;
  }
  return undef unless @new;	# Didn't find anything to save, don't keep it.
  $page.'?'.join('&', @new);
}
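
# Worked example (illustrative): with "-cgi key" on the command line,
#   saveCGI("db/some.cgi?key=blah&x=1")  =>  "db/some.cgi?key=blah"
# With -cgi1 as well, only the first matching key=value pair is kept.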

sub check_url {
  my ($domain,$page,$from) = @_;

  return unless defined $domain;

  my $search;
  foreach $search ( @SEARCH ) {
    my ($regexp,$dir) = @$search;
    return add_url($domain,$page,$dir,$from)
      if ("$domain/$page" =~ /^$MAIN::WWW$regexp/);
  }

  # It's 'external' - check if it's a broken link?
  if ($MAIN::CHECK_LINKS) {
    my ($found,$loc,$last_mod,$type) = head("$domain/$page");
#    # Gross kludge - we probably only meant to strip 'www.' off internal sites
#    ($found,$loc,$last_mod,$type) = head("www.$domain/$page")
#      if (!$found && $domain !~ /^www\./ && $MAIN::WWW);
    print LOG "NOT FOUND: $domain/$page [$from]\n" unless ($found);
  }
  return;
}

sub urls_left {
  @FETCH_LIST ? 1 : 0;
}

sub next_url {
  # Pop would probably be faster,
  # but it might be more confusing in a recursive search
  my $tmp = shift(@FETCH_LIST);
  return undef unless $tmp;
  @$tmp;
}

##################################################
# Scan a page for urls to fetch
##################################################
sub scan_urls {
  my ($domain,$page,$file) = @_;

  # Collect new urls
  open(PAGE,$file) || return print STDERR "[$PROGNAME]  Couldn't read $file\n";
  my $line="";
  while(<PAGE>) {
    # Heuristic to avoid garbage URLs in shell scripts.
    # If we fetch a script, it may have a bunch of '<a href..' in it.
    #last if ($.==1 && /^#!/);	# Ignore #!bang scripts

    $line.=$_;
    while ($line =~ /<(?:a|link) [^>]*href="([^"]+)"/i ||
           $line =~ /<(?:a|link) [^>]*href='([^']+)'/i ||
           $line =~ /<(?:a|link) [^>]*href=([^\s>]+)/i ||
           $line =~ /<body [^>]*background="([^"]+)"/i ||
           $line =~ /<body [^>]*background='([^']+)'/i ||
           $line =~ /<body [^>]*background=([^\s>]+)/i ||
           $line =~ /<(?:img|embed) [^>]*src="([^"]+)"/i ||
           $line =~ /<(?:img|embed) [^>]*src='([^']+)'/i ||
           $line =~ /<(?:img|embed) [^>]*src=([^\s>]+)/i) {
      my $link=$1;

      # Add the url after we resolve its path based on the current url
      #   Ignore 'href=#name' right here, otherwise we get the parent dir
      #   Bug:  Will ignore <img src=#dumb_name.jpg>.  Big deal.
      #   Ignore 'href=""' right here, otherwise we get this file again
      check_url( resolve_url($domain,$page,$link), "$domain/$page" )
        if ($link !~ /^#/ && $link ne "''" && $link ne '""');

      # Next portion of the line
      $line=$';
    }

    # Allow continuation of lines
    next if ($line =~ /<(?:a|link) [^>]+$/i ||
             $line =~ /<(?:img|embed) [^>]+$/i ||
             $line =~ /<body [^>]+$/i);

    # Otherwise get rid of the rest of the line
    $line="";
  }
  close(PAGE);
}

##################################################
# Get a web page, recursively get its links
#   <a/link .* href=...
#   <img/embed .* src=....
#   <body .* background=...
##################################################
sub get {
  my ($domain,$page,$dir,$save,$last_mod,$type) = next_url();

  # Get the file (if it's changed)

  if (! -f $save || !$last_mod || (stat($save))[9] < $last_mod) {
    print LOG "",(-f $save ? "update:    " : "load:      "),"$domain/$page\n" if ($MAIN::VERBOSE);
    setup_dir($save);
    return print STDERR "[$PROGNAME]  Couldn't fetch http://$domain/$page\n"
      if (my_system($save,@GET_URL,"http://$domain/$page"));
    utime $last_mod, $last_mod, $save if ($last_mod);
  } else {
    print LOG "unchanged: $domain/$page\n" if ($MAIN::VERBOSE);
  }

  scan_urls($domain,$page,$save) if ($type =~ /html/);

  # Directory listings don't have a Last-Modified time, so erase them
  unlink($save) if ($type =~ /dir_index/);

  # Be aggressive - look for indexes of subdirectories
  if ($MAIN::AGGRESSIVE) {
    my $aggro_page = $page;
    while ($aggro_page =~ s|/?[^/]+$||) {
      check_url($domain,$aggro_page,0);
    }
  }
}

##################################################
# Main code
##################################################
sub main {
  my ($sleep) = parse_args();
  while (urls_left()) {
    get();
    sleep $sleep if ($sleep);
  }
  close LOG;
}
main();
