#!/usr/bin/perl
# Filename:	debed
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License/
# Description:	Unembeds quicktime movies
#
# Thanks to a slashdot posting by Pathwalker:
#   http://slashdot.org/comments.pl?sid=61803&cid=5798166
#
# Sorenson now available on linux!
#   http://slashdot.org/articles/02/06/20/2350241.shtml?tid=106
#
######################################################################
# When you look at the page, you want to find the EMBED tag that holds 
# the movie, so try the following command:
# curl http://www.apple.com/trailers/columbia/charliesangelsfullthrottle/large_trailer.html | grep EMBED
# Looking at the embed tag, you will see that it references two
# QuickTime movies, one as a SRC argument, and one as a HREF argument.  If
# both are given, you want the HREF; otherwise, you want the SRC.  Here:
# http://a772.g.akamai.net/5/772/51/52d5e54c5d8bec/1a1a1aaa2198c627970773d80669d84574a8d80d3cb12453c02589f25382f668c9329e0375e8177ae955ca3799026392f55dbe309c056add3eac1378e457bd329e0a7ee658c73baf1b80/ca2_fullthrottle-tlr_481.mov
# is the link from the HREF option.  It is the reference file for the movie.
# 
# Now, download it and run strings on it. You will see several 
# instances of "url" on one line, and a url on the next line.  These 
# are the movies that the reference movie points to.  Almost always,
# the first several exist only to tell users that their version of
# QuickTime is too old (they usually have a QuickTime version embedded
# in the name).  Skip over them.
# 
# At the end of the file, you will see this:
# ca2_fullthrottle-tlr_m480.mov
# That is the relative path to the real movie file.
# 
# Take the URL for the reference file, and replace 
# ca2_fullthrottle-tlr_481.mov with ca2_fullthrottle-tlr_m480.mov
# and you will have
# 'http://a772.g.akamai.net/5/772/51/52d5e54c5d8bec/1a1a1aaa2198c627970773d80669d84574a8d80d3cb12453c02589f25382f668c9329e0375e8177ae955ca3799026392f55dbe309c056add3eac1378e457bd329e0a7ee658c73baf1b80/ca2_fullthrottle-tlr_m480.mov'
# the url for the actual movie data file.
# 
# This technique should work, even if Apple changes their convention 
# of just adding a m before the size of the movie, to indicate the real file.
# 
# This also only works if the movie is being served over http.  If it 
# is over rtsp, you will need some extra tools: http://www.live.com/
######################################################################
# [Dave] Stuff I've learned:
# .url or +url or )url is a relative path
######################################################################
 


use strict;

##################################################
# Setup the variables
##################################################
# Name used in messages: $0 with any leading directory part removed.
(my $PROGNAME = $0) =~ s|.*/||;

# Command that fetches a URL and writes it to stdout - keep exactly one active.
my $GET = 'GET';
#my $GET = 'curl';
#my $GET = 'lynx -source';

# Command that extracts printable strings from binary data
my $STRINGS = 'strings';

##################################################
# Usage
##################################################
# Print each message on STDERR, tagged with the program name and "ERROR:",
# then terminate the program with exit(-1).
sub fatal {
  print STDERR "[$PROGNAME] ERROR:  $_\n" for @_;
  exit(-1);
}

# Print the given error messages (if any) followed by the usage text on
# STDERR, then terminate the program with exit(-1).
#
# The option list below mirrors what parse_args() accepts, including the
# -h and -d switches and the -f shorthand for -fetch.
sub usage {
  print STDERR "ERROR:  $_\n" for @_;
  print STDERR <<USAGE;
Usage:\t$PROGNAME [-h] [-d] [-qt] [-fetch] <url>

Finds streaming movie files in <embed> tags for a given URL.
Extracts all movies listed in the stream and prints/downloads them.

	-h      Show this usage message
	-d      Print debugging messages
	-qt     Show quicktime version URLs as well
	-fetch  Download the movies instead of printing (or: -f)

(Currently has no way to distinguish between streaming .movs and
 normal .movs - if anyone knows how to do this, please let me know)

Examples:
% $PROGNAME -fetch http://some.trailer.site/embedded_trailer_page.html
% xine `$PROGNAME http://some.trailer.site/embedded_trailer_page.html`

Author:  David Ljung Madison
Site:    http://MarginalHacks.com/
USAGE
  exit(-1);
}

# Parse the command line, consuming @ARGV.
#
# Switches set package globals:
#   -d           $MAIN::DEBUG
#   -qt          $MAIN::KEEP_QTS
#   -f, -fetch   $MAIN::FETCH
# -h, an unknown switch, a second url, or a missing url are all reported
# through usage().
#
# Returns the single url given on the command line.
sub parse_args {
  my $url;
  # Loop on definedness rather than truth: an argument such as '' or '0'
  # must not end the loop early and hide the arguments that follow it.
  while (defined(my $arg = shift(@ARGV))) {
    usage() if $arg =~ /^-h$/;
    if ($arg =~ /^-d$/)        { $MAIN::DEBUG    = 1; next; }
    if ($arg =~ /^-qt$/)       { $MAIN::KEEP_QTS = 1; next; }
    if ($arg =~ /^-f(etch)?$/) { $MAIN::FETCH    = 1; next; }
    usage("Unknown option: $arg") if $arg =~ /^-/;
    usage("Too many urls specified [$arg and $url]") if defined $url;
    $url = $arg;
  }
  usage("No url defined") unless defined($url) && length($url);

  return $url;
}

# Print each message on STDERR, tagged with the program name.
# Silent unless $MAIN::DEBUG is true (parse_args sets it for -d).
sub debug {
  $MAIN::DEBUG or return;
  for my $line (@_) {
    print STDERR "[$PROGNAME] $line\n";
  }
}

# Print each message on STDERR, tagged with the program name and "ERROR:",
# then terminate the program with exit(-1).
#
# NOTE(review): this repeats the definition of fatal() in the "Usage"
# section near the top of the file.  Being the later of the two, this is
# the one Perl installs; one of the two copies could be dropped.
sub fatal {
  my @report = map { "[$PROGNAME] ERROR:  $_\n" } @_;
  print STDERR $_ foreach @report;
  exit(-1);
}

#print "CAT HACK!\n";
#$GET = 'cat';	# for testing
##################################################
# Code
##################################################
# Extract the value of attribute $tag from the tag text $str.
#
# The attribute name is matched case-insensitively and must be preceded by
# a non-word character (so the text should start with a space or similar).
# Three value forms are tried in this order: 'single quoted',
# "double quoted", and unquoted.
#
# Returns the value without its quotes, or undef if the attribute is absent.
sub get_tag {
  my ($tag,$str) = @_;
  return $1 if $str =~ /\W$tag='([^']+)'/i;
  return $1 if $str =~ /\W$tag="([^"]+)"/i;
  # An unquoted value runs up to the next whitespace, quote or '>'.  It is
  # not limited to word characters: urls and file names such as clip.mov
  # contain '.', '/' and ':' and must not be cut short.
  return $1 if $str =~ /\W$tag=([^\s"'>]+)/i;
  return undef;
}

# Fetch the page at $url and collect the movie referenced by each <embed> tag.
#
# A tag may span several input lines, and one line may hold several tags.
# For every tag the HREF attribute is preferred over SRC, and only values
# ending in .mov or .moov are kept.
#
# Returns the list of movie urls.  Calls fatal() if the fetch command could
# not be started, or if it failed and nothing was found; dies if the page
# simply holds no usable <embed> tag.
sub find_embed {
  my ($url) = @_;

  # \Q..\E backslash-escapes every non-word character so the shell passes
  # the url to the fetch command as one literal word.  The handle is
  # lexical so this pipe cannot collide with any other open handle.
  open(my $page, '-|', "$GET \Q$url\E") || fatal("Couldn't get url: $url");

  my @found;
  while (defined(my $line = <$page>)) {
    # Each pass handles one tag and leaves the text that follows it in $line.
    while (defined($line) && $line =~ /<embed(.*)/i) {
      $line = $1;

      # Gather the tag text, reading further lines until one holds '>'
      # (or the input runs out).
      my $embed = '';
      while (1) {
        chomp($line);
        $embed .= " $line";
        last if $line =~ />/;
        $line = <$page>;
        last unless defined $line;
      }
      $line =~ s/[^>]*>// if defined $line;  # text after this tag, for the next pass
      $embed =~ s/>.*//;                     # attributes of this tag only

      my $src  = get_tag('src',  $embed);
      my $href = get_tag('href', $embed);
      undef $src  unless defined($src)  && $src  =~ /\.moo?v$/;
      undef $href unless defined($href) && $href =~ /\.moo?v$/;

      # Href is preferred
      if    ($href) { push(@found, $href); }
      elsif ($src)  { push(@found, $src);  }
    }
  }

  # Closing a pipe returns false when the command exited with a non-zero
  # status; use that to tell a failed fetch apart from a page without movies.
  my $fetched = close($page);
  unless (@found) {
    fatal("Couldn't get url: $url (fetch command failed, wait status $?)")
      unless $fetched;
    die("No <embed> tags found in url:\n  $url\n\nMake sure you're using the URL for the page that shows the actual movie.\n");
  }
  @found;
}

# Test the current $_ (no arguments are used) against the name pattern
# /qt<digit>gateQT<digit>...mov, case-insensitively.  Returns 1 on a match
# and 0 otherwise.  Presumably this spots the "your QuickTime is too old"
# movies described in the header comment.
sub is_qtversion {
  return 1 if m|/qt\dgateQT\d[^/]*moo?v[^/]*|i;
  return 0;
}

# Fetch each reference movie in @embed, run it through the strings command
# and collect the urls it points to.
#
# In the strings output, a line consisting of just "url" announces that the
# next line is a url.  Entries matched by is_qtversion() are skipped unless
# $MAIN::KEEP_QTS is set.  An entry starting with '.', '+' or ')' is taken
# as a path relative to the directory of the reference movie.
#
# Returns the distinct urls in the order they were first seen.
sub parse_embed {
  my (@embed) = @_;

  my (%seen, @urls);
  foreach my $embed ( @embed ) {
    # Directory part of the reference url, used to resolve relative entries.
    (my $path = $embed) =~ s|/+[^/]+$||;
    debug("Parsing embed file: $embed");

    # \Q..\E backslash-escapes the url so the shell passes it as one word.
    open(my $strings, '-|', "$GET \Q$embed\E | $STRINGS")
      || fatal("Couldn't pipe embedded url through strings: $embed");

    my $saw_url = 0;
    # Reads into $_ on purpose: is_qtversion() inspects $_.
    while (<$strings>) {
      chomp;
      if ($saw_url && ($MAIN::KEEP_QTS || !is_qtversion())) {
        my $url = $_;
        # .url or +url or )url is a relative path
        $url = "$path/$1" if $url =~ /^[.+)](.+)$/;
        unless ($seen{$url}++) {
          debug("Saw url: $url\n");
          push(@urls, $url);
        }
      }
      $saw_url = /^\s*url\s*$/i ? 1 : 0;
    }

    # Close the pipe here so the child commands are reaped before the next
    # reference movie is fetched, rather than being left open.
    close($strings);
  }

  @urls;
}

# Deal with one movie url: print it, or, when $MAIN::FETCH is set, download
# it into the current directory and print the name of the file written.
#
# The local file name is the last path component of the url with any query
# string removed, falling back to "index.html" when nothing is left.
# A failed download is reported with an "Error [...]" line instead.
sub handle {
  my ($mov) = @_;
  return print "$mov\n" unless $MAIN::FETCH;

  (my $f = $mov) =~ s|.*/||;
  # Cut at the FIRST '?' or '&' so that no part of a multi-parameter query
  # string (clip.mov?a=1&b=2) ends up in the file name.
  $f =~ s/[&?].*//s;
  $f = "index.html" unless length $f;

  debug("GET: $mov > $f");
  # \Q..\E backslash-escapes both words for the shell.
  system("$GET \Q$mov\E > \Q$f\E");
  return print "Error [$?]: $!\n" if $?;
  print "$f\n";
}

##################################################
# Main code
##################################################
# Program entry point: read the page url from the command line, locate the
# reference movies embedded in that page, expand them into the urls they
# point to, and hand each one to handle() for printing or download.
sub main {
  my $page_url = parse_args();

  my @movies = parse_embed(find_embed($page_url));

  # For now, just print or download them.
  # I suppose I could launch a movie player, but I'll let them do that...
  handle($_) for @movies;
}
main();
