#!/usr/bin/perl
# Filename: site_index
# Author:   David Ljung Madison
# License:  http://MarginalHacks.com/License/
# Description: Multi-site HTML site index generator
# Input:  Reads file(s) (or STDIN) of lines:  domain<TAB>path[<TAB>importance]
#
# NOTE(review): this copy of the script had been mangled by an HTML-tag
# stripper (all "<...>" spans, heredoc markers and readline operators were
# lost).  The Perl logic below is reconstructed from what survived; the exact
# HTML markup emitted by the heredocs/strings is a plausible reconstruction --
# confirm against a pristine copy if one is available.
use strict;
use POSIX;
use IO::File;

my $VERSION = 2.02;

umask 022;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

# Single-quoted so the backslash survives: the pattern is \.s?html?$
# (the original double-quoted "\.s?html?\$" lost the backslash and matched
# ANY character before "s?html?")
my $DEFAULT_INDEX          = '\.s?html?$';
my $DEFAULT_DEFAULT        = "index.html";
my $DEFAULT_OUT            = "Site_Index";   # Directory.  Will overwrite!
my $DEFAULT_DEPTH          = 0;              # Max depth (0 = unlimited)
my $DEFAULT_LINKS_PER_PAGE = 95;             # Break up long site indexes

my $NO_INDEX    = ".no_index";     # Don't index these directories
my $NO_CONTENTS = ".no_contents";  # Index the directory but not its contents
my $NO_SHOW     = ".no_show";      # Index contents but hide the dir link

#########################
# Usage
#########################

# Print error message(s) tagged with the program name and abort.
sub fatal {
  foreach my $msg (@_) { print STDERR "[$PROGNAME] ERROR: $msg\n"; }
  exit(-1);
}

# Print debug message(s) when -d was given.
sub debug {
  return unless $MAIN::DEBUG;
  foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; }
}

# Read domain info into $data.
# Each line is:  domain<TAB>path[<TAB>importance]
# A first field containing '/' is a bare path, only useful with importance 1
# (it becomes a "top link" on every index page).
sub get_sites {
  my ($data,$file) = @_;
  my $fh;
  if ($file eq "-") {
    # Three-arg open treats "-" as a literal filename, so special-case STDIN
    $fh = \*STDIN;
  } else {
    open($fh, '<', $file) or usage("Can't read input [$file]");
  }
  while (<$fh>) {
    chomp;
    s/^\s+//;  s/\s+$//;
    next if /^$/ || /^#/;                     # skip blanks and comments
    my ($dom,$path,$importance) = split(/\t/,$_,3);
    $importance = 0 unless defined $importance;
    if ($dom =~ m|/|) {
      # Not a domain - only meaningful as an importance-1 top link
      fatal("Non-domain entries [$dom] are useless without importance '1'")
        unless $importance==1;
    } else {
      # Domain (first path seen wins; keep the lowest importance seen)
      push(@{$data->{domains}},$dom) unless $data->{domain}{$dom};
      $data->{domain}{$dom}{path} = $path
        unless $data->{domain}{$dom}{path};
      $data->{domain}{$dom}{importance} = $importance
        unless $data->{domain}{$dom}{importance}
            && $data->{domain}{$dom}{importance} < $importance;
    }
    push(@{$data->{top}}, out_top_links($data,$dom,$path))
      if $importance==1;
  }
  close $fh unless $file eq "-";
  $data;
}

# Print usage (with optional error messages) and exit.
sub usage {
  foreach my $msg (@_) { print STDERR "ERROR: $msg\n"; }
  # $DEFAULT_INDEX already carries its backslash, display it as-is
  my $def_index = $DEFAULT_INDEX;
  print STDERR <<END_USAGE;
Usage: $PROGNAME [options] [file [file ..]]
  Builds site indexes for multiple domains

  Reads file(s) (or '-' for STDIN) for domain info, one entry per line:
    domain<TAB>path[<TAB>importance]

  Options:
    -ignore <regexp>  Ignore these paths/files
    -index <regexp>   Index these types of files  [default '$def_index']
    -default <file>   Default directory page      [default '$DEFAULT_DEFAULT']
    -out <dir>        Where to put the site index [default '$DEFAULT_OUT']
    -lpp <num>        Number of links per site index page.
    -depth <num>      Maximum depth (0 ignored)   [default '$DEFAULT_DEPTH']
    -noclean          Don't remove old index files

  Examples:
    -ignore '/images\$'              Ignore any directories named: "images"
    -ignore '/(images|thumbnails)\$' Multiple ignores
    -ignore '/\\.'                    Ignore dot directories
    -index '\\.(s?html?|txt)\$'       Index .shtm, .shtml, .htm, .html, .txt

  Domains can have an optional "importance" value from 1-4:
    1) List root link at the top of all site indexes (and treat as 2)
    2) List in every site index first.
    3) List in every site index.
    4) Only a link to the top page appears in other indexes.
    5) Doesn't appear in other indexes at all.
END_USAGE
  exit -1;
}

# Parse @ARGV into the option/data hash, load all input files via
# get_sites(), and return the data hashref.
sub parse_args {
  my (%data,@files);

  # Defaults
  $data{opt}{index}          = $DEFAULT_INDEX;
  $data{opt}{default}        = $DEFAULT_DEFAULT;
  $data{opt}{out}            = $DEFAULT_OUT;
  $data{opt}{links_per_page} = $DEFAULT_LINKS_PER_PAGE;
  $data{opt}{depth}          = $DEFAULT_DEPTH;

  while (my $arg=shift(@ARGV)) {
    if ($arg =~ /^-h$/) { usage(); }
    if ($arg =~ /^-d$/) { $MAIN::DEBUG=1; next; }
    # Value options accept either "-opt=value" or "-opt value".
    # (The original tested "$2 ? $3" - $3 never exists for these patterns,
    # so the "-opt=value" form was broken.)
    if ($arg =~ /^-ignore(=(.+))?$/) {
      $data{opt}{ignore} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-index(=(.+))?$/) {
      $data{opt}{index} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-default(=(.+))?$/) {
      $data{opt}{default} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-out(=(.+))?$/) {
      $data{opt}{out} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-lpp(=(.+))?$/) {
      $data{opt}{links_per_page} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-depth(=(.+))?$/) {
      $data{opt}{depth} = defined $2 ? $2 : shift @ARGV;  next; }
    if ($arg =~ /^-no_?clean$/) { $data{opt}{noclean}=1; next; }
    if ($arg =~ /^-./) { usage("Unknown option: $arg"); }
    push(@files,$arg);
  }
  push(@files,"-") unless @files;
  get_sites(\%data,$_) foreach @files;   # (was map in void context)

  # Header above the importance-1 links, if we collected any
  unshift(@{$data{top}},
          "<p>\n<hr>\n<h3>Main links:</h3>\n<p>\n") if $data{top};
  \%data;
}

#########################
# Build a site index
#########################

# Contents sorter: the site-index directory itself sorts last.
sub index_last {
  my ($data,$a,$b) = @_;
  return  1 if $a eq $data->{opt}{out};
  return -1 if $b eq $data->{opt}{out};
  return $a cmp $b;
}

# Pull the <title> out of an HTML file (first 30 lines only).
# Directories are resolved to their default page.  Results are cached in
# $data->{titles}.  Returns undef when the file can't be read or has no title.
sub get_title {
  my ($data,$path,$file) = @_;
  my $title = undef;
  # (was "my $file = ... if ..." - 'my' with a statement modifier is
  # undefined behavior in Perl and shadowed the parameter)
  $file = "$path/$file" if $path;
  $file .= "/$data->{opt}{default}" if -d $file;
  return $data->{titles}{$file} if $data->{titles}{$file};
  return $title unless open(my $fh, '<', $file);
  my ($in_title,$done_title) = (0,0);
  while (<$fh>) {
    $in_title   = 1 if s/.*<title[^>]*>\s*//i;
    $done_title = 1 if s/<\/title.*//i;
    $title .= $_ if $in_title;
    last if $done_title;
    last if $.>30;           # Only read the first 30 lines
    last if m|</head>|i;     # NOTE(review): reconstructed guard - confirm
  }
  close $fh;                 # (was leaked in the original)
  $title =~ s/\s+$// if defined $title;   # trim the trailing newline
  return $data->{titles}{$file}=$title;
}

# Turn a filename into a display name: underscores become spaces and the
# index-file extension is dropped.
sub get_name {
  my ($data,$file) = @_;
  my $name = $file;
  $name =~ s/_/ /g;                    # Underbar = space
  $name =~ s|$data->{opt}{index}||g;   # Drop .html etc
  $name;
}

# Build the importance-1 link shown at the top of every index page.
sub out_top_links {
  my ($data,$dom,$fullpath) = @_;
  $fullpath = $fullpath || $dom;
  my ($path,$file) = ($fullpath =~ m|(.*)/([^/]+)$|)
    ? ($1, $2) : ("",$fullpath);
  my $name = get_name($data,$file);
  $dom = "http://$dom" unless $dom =~ m|://|;
  out_link($dom, 1, $name, get_title($data,$path,$file));
}

# One line of index HTML: an indented link, optionally followed by the page
# title when it differs from the display name.
sub out_link {
  my ($url, $lvl, $name, $title) = @_;
  my $out = " "x(4*$lvl);
  $url =~ s/'/%27/g;                   # we single-quote the href
  $out .= "<a href='$url'>$name</a>";
  $out .= " <small>$title</small>"
    if defined $title && $name ne $title;
  $out .= "<br>\n";
  $out;
}

# Section header for a domain, joining any aliases that share its path.
sub domain_header {
  my ($data, $domain) = @_;
  # Find any domain aliases
  my $path = $data->{domain}{$domain}{path};
  my @domains = grep($path && $data->{domain}{$_}{path} eq $path,
                     @{$data->{domains}});
  push(@domains,$domain) unless $path;
  my $str = join(" / ", map("<a href='http://$_/'>$_</a>", @domains));
  "<p>\n<hr>\n<h3>$str</h3>\n<p>\n";
}

# Should this directory be skipped entirely?
sub no_index {
  my ($data,$dir,$path) = @_;
  # Explicitly marked?
  return 1 if -f "$path/$NO_INDEX";
  # Skip it if it's the path of one of our subdomains - let that index it
  foreach my $dom ( @{$data->{domains}} ) {
    my $dom_path = $data->{domain}{$dom}{path};
    next unless $dom_path;
    # $dom is a subdomain of $dir (and not, for example, the other way around)
    # (\Q...\E so path characters aren't treated as regex metachars)
    next unless $dom_path =~ /^\Q$dir\E./;
    # And this is in the path of the subdomain
    next unless $path =~ /^\Q$dom_path\E/;
    return 1;
  }
  return 0;
}

# Recursively collect index link lines (see out_link) for one directory.
sub get_directory {
  my ($data, $domain, $dir, $path, $lvl) = @_;
  return unless $dir;
  return if $data->{opt}{ignore} && $path =~ /$data->{opt}{ignore}/;
  return if $data->{opt}{depth} && $data->{opt}{depth} < $lvl;
  my @links;

  # If we're level 0, then put in the domain header
  unless ($lvl) {
    push(@links, domain_header($data,$domain));
    $lvl++;
  }

  # Read the directory: subdirectories plus indexable (non-default) files
  opendir(my $dh, $path) || fatal("Couldn't read directory [$path]\n");
  my @dir = grep(-d "$path/$_" ||
                 /$data->{opt}{index}/ && !/$data->{opt}{default}/,
                 grep(!/^\.{1,2}$/, readdir($dh)) );
  closedir($dh);
  @dir = grep( "$path/$_" !~ /$data->{opt}{ignore}/, @dir)
    if $data->{opt}{ignore};

  # Handle directories and html
  return unless (@dir);
  my $url_path = $path;
  $url_path =~ s|^\Q$dir\E|http://$domain|;

  foreach my $file ( sort { index_last($data,$a,$b) } @dir ) {
    # Heuristic: Ignore symbolic links that point to other files in this dir
    next if -l "$path/$file" && readlink("$path/$file") !~ m|/|;
    my $name  = get_name($data,$file);
    my $title = get_title($data,$path,$file);
    my $url   = "$url_path/$file";
    unless (-d "$path/$file") {
      push(@links, out_link($url, $lvl, $name, $title));
    } elsif (!no_index($data,$dir,"$path/$file")) {
      # (was "my @dir_links = ... unless ..." - undefined behavior)
      my @dir_links;
      @dir_links = get_directory($data, $domain, $dir, "$path/$file", $lvl+1)
        unless -f "$path/$file/$NO_CONTENTS" || $file eq $data->{opt}{out};
      next unless @dir_links || -f "$path/$file/$data->{opt}{default}";
      push(@links, out_link("$url/", $lvl, $name, $title))
        unless -f "$path/$file/$NO_SHOW";
      push(@links, @dir_links);
    }
  }
  @links;
}

#########################
# Output
#########################

# Filename for index page number $page.
sub page_index {
  my ($page) = @_;
  ($page==1) ? "index.html" : "index.$page.html";
}

# Open index page $page for $domain on the supplied handle, print the HTML
# header (plus page-navigation links when there are multiple pages) and
# return the path of the file being written.
sub start_index {
  my ($data, $out, $domain, $page, $pages) = @_;
  my $path = $data->{domain}{$domain}{path};
  my $file = "$path/$data->{opt}{out}/";
  mkdir($file, 0755) unless -d $file;
  $file .= page_index($page);
  open($out,'>',$file) || fatal("Can't write site index [$file]\n");
  # NOTE(review): markup below is reconstructed from the stripped original
  print $out <<END_OF_HEADER;
<html>
<head>
<title>Hierarchy: $domain</title>
<meta name=description content="Hierarchy for local domains, generated by the MarginalHacks tool $PROGNAME">
</head>
<body bgcolor=white>
END_OF_HEADER
  return $file unless $pages>1;
  # Page navigation links
  print $out "<center>\n";
  print $out "<table border=0>\n";
  print $out "<tr>\n";
  foreach my $p ( 1..$pages ) {
    my $url = page_index($p);
    print $out ($page==$p)
      ? "  <td>Page $p</td>\n"
      : "  <td><a href='$url'>Page $p</a></td>\n";
  }
  print $out "</tr>\n";
  $file;
}

# Close out an index page: page-table footer, credits, closing HTML.
sub end_index {
  my ($data, $out, $file, $pages) = @_;
  print $out <<END_PAGES_TABLE if $pages>1;
</table>
</center>
END_PAGES_TABLE
  my $date = localtime;
  print $out <<END_OF_FOOTER;
<p>
<hr>
<small>Generated on $date by
<a href='http://MarginalHacks.com/'>$PROGNAME</a></small>
</body>
</html>
END_OF_FOOTER
  close($out);
  print "Wrote $file\n";
}

# Remove stale index pages numbered beyond the $pages we just wrote.
sub clean_index {
  my ($data,$domain,$pages) = @_;
  my $path = $data->{domain}{$domain}{path};
  my $f;
  while (++$pages
         && ($f="$path/$data->{opt}{out}/".page_index($pages))
         && -f $f) {
    unlink $f;
    print "Removed: $f\n";
  }
}

# Write all index pages for one domain from the collected link lines.
sub output {
  my ($data, $domain, $show) = @_;

  # How many pages?
  my $lpp = $data->{opt}{links_per_page};
  my $num = $#$show+1;
  my $pages = POSIX::ceil($num/$lpp);
  # The page links themselves count.. (and so do the two credit links)
  while ($pages*$lpp < $num+($pages*($pages-1))+2) {
    if ($lpp<$pages) {
      # Trouble
      print STDERR "Not enough links-per-page, increasing\n";
      $lpp*=1.25;
      $pages=int($pages/1.25);
    } else {
      $pages++;
    }
  }

  my $out = IO::File->new;     # (was indirect "new IO::File")
  my $link = 0;
  for my $p (1..$pages) {
    my $cnt = $pages-1;        # navigation links count against the page
    my $file = start_index($data, $out, $domain, $p, $pages);
    for( ; $link<=$#$show && $cnt<$lpp; $link++) {
      print $out $show->[$link];
      $cnt++;
    }
    end_index($data, $out, $file, $pages);
  }
  clean_index($data, $domain, $pages) unless $data->{opt}{noclean};
}

##################################################
# Main code
##################################################
srand(time^$$);

# Domain sorter for cross-listing: by importance (1 is treated as 2,
# unrated as 5), then by number of dotted components, then randomly.
sub domains {
  my ($data,$a,$b) = @_;
  # Count dotted components ("() =" forces a countable list: scalar-context
  # split's old behavior of writing to @_ was removed in perl 5.12)
  my $a_num = () = split(/\./,$a);
  my $b_num = () = split(/\./,$b);
  # Rate by importance for 2-5
  my $a_imp = $data->{domain}{$a}{importance} || 5;
  my $b_imp = $data->{domain}{$b}{importance} || 5;
  $a_imp = 2 if $a_imp<2;
  $b_imp = 2 if $b_imp<2;
  return $a_imp <=> $b_imp unless $a_imp==$b_imp;
  # Number of subdomains
  return $a_num <=> $b_num unless $a_num==$b_num;
  # Random otherwise
  int(rand(2))*2-1;
}

# Entry point: load input, build each domain's link list, then write every
# domain's index (cross-listing other domains by importance).
sub main {
  my $data = parse_args();

  # Get the site index for each site
  my %did;
  foreach my $domain ( @{$data->{domains}} ) {
    my $path = $data->{domain}{$domain}{path};
    next unless $path;
    next if $did{$path}++;    # Skip if it's an alias
    print "Fetching index info: $domain\n";
    @{$data->{links}{$domain}} =
      get_directory($data, $domain, $path, $path, 0);
    delete $data->{links}{$domain} unless @{$data->{links}{$domain}};
  }

  # Write the site index for each site
  foreach my $domain ( @{$data->{domains}} ) {
    next unless $data->{domain}{$domain}{path};
    next unless $data->{links}{$domain};
    # (guard the deref - there may be no importance-1 top links at all)
    my @show = $data->{top} ? @{$data->{top}} : ();
    push(@show, @{$data->{links}{$domain}});
    foreach my $show_domain ( sort { domains($data,$a,$b); }
                              @{$data->{domains}} ) {
      my $show_importance = $data->{domain}{$show_domain}{importance};
      next if $show_domain eq $domain;
      next if $show_importance>=5;    # Only on their own index
      push(@show, domain_header($data,$show_domain))
        if $show_importance==4 || !$data->{links}{$show_domain};
      next if $show_importance==4;
      push(@show, @{$data->{links}{$show_domain}})
        if $data->{links}{$show_domain};
    }
    output($data, $domain, \@show);
  }
}
main();