#!/usr/bin/perl
# Filename:	xml_parser
# Author:	David Ljung Madison
# See License:	http://MarginalHacks.com/License
# Description:	parses xml

use strict;
use warnings;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

# Debug flag, switched on by the -d option.
$MAIN::DEBUG = 0;

##################################################
# Usage
##################################################

# Print any error messages followed by the usage text, then exit
# with a failure status.  Never returns.
sub usage {
    foreach my $msg (@_) {
        print "ERROR: $msg\n";
    }
    print "\n";
    print "Usage:\t$PROGNAME [-d] <file>\n";
    print "\tDoes something to the given file\n";
    print "\t-d\tSet debug mode\n";
    print "\n";
    exit -1;
}

# Parse @ARGV (consuming it) and return the single file name given.
# Calls usage() (which exits) on -h, on unknown options, when more
# than one file is given and when no file is given.
sub parse_args {
    my $file;
    while (@ARGV) {
        my $arg = shift(@ARGV);
        if ($arg =~ /^-h$/) { usage(); }
        if ($arg =~ /^-d$/) { $MAIN::DEBUG = 1; next; }
        if ($arg =~ /^-/)   { usage("Unknown option: $arg"); }
        usage("Too many files specified [$arg and $file]") if (defined($file));
        $file = $arg;
    }
    usage("No file defined") if (!defined($file));
    return $file;
}

##################################################
# Utilities
##################################################

# Return the number of newlines in $str.
sub count_nl {
    my ($str) = @_;
    # tr with an empty replacement list counts without modifying
    return ($str =~ tr/\n//);
}

# Split the inside of a tag ("name key=value key2='value two' ...")
# into its name and attributes.
#   $line - current line number, only used in error messages
#   $str  - the text between '<' and '>'
# Returns ($tag_name, \%attributes).
# Dies if the tag name is empty or an argument is not key=value.
sub parse_tag_attr {
    my ($line, $str) = @_;

    $str =~ s/^\s+//;
    $str =~ s/\s+$//;

    my ($tag, $rest) = $str =~ /\A(\S+)(.*)\z/s;
    die("[line $line] Empty tag name\n") unless defined $tag;

    my %attr;

    # Each token is either key="..." / key='...' (the quoted value may
    # contain whitespace) or a plain run of non-space characters.
    while ($rest =~ /\G\s+([^\s=]+=(?:"[^"]*"|'[^']*')(?!\S)|\S+)/gc) {
        my $ta = $1;
        my ($k, $v);
        if ($ta =~ /\A([^\s=]+)=("[^"]*"|'[^']*')\z/s) {
            # Fully quoted value: '=' and whitespace are allowed inside
            ($k, $v) = ($1, $2);
        }
        elsif ($ta =~ /^([^=]+)=([^=]+)$/) {
            ($k, $v) = ($1, $2);
        }
        else {
            die("[line $line] Tag argument [$ta] should be of the form [key=value]\n");
        }
        # Strip one layer of surrounding quotes of each kind
        $v = $1 if $v =~ /\A'(.*)'\z/s;
        $v = $1 if $v =~ /\A"(.*)"\z/s;
        $attr{$k} = $v;
    }

    return ($tag, \%attr);
}

# Find the closing tag that matches an already-consumed opening <$tag>.
#   $str - the text following the opening tag
#   $tag - the tag name
# Nested tags with the same name are balanced and comments are skipped.
# Returns ($inner, $rest): the text before the matching </$tag> and the
# text after it.  Returns the empty list if there is no matching end.
sub split_at_close_tag {
    my ($str, $tag) = @_;

    # The tag name comes from the input; never let it act as a regex
    my $name  = quotemeta($tag);
    my $token = qr{<!--.*?-->|(</$name>)|(<\s*$name(?=[\s>])[^>]*>)}s;

    my $depth = 1;
    while ($str =~ /$token/g) {
        if (defined $1) {
            next if --$depth;
            return (substr($str, 0, $-[0]), substr($str, $+[0]));
        }
        $depth++ if defined $2;
        # otherwise it was a comment: ignore whatever it contains
    }
    return;
}

##################################################
# Parse xml where data and tags are mixed (data can
# occur in non-leaf tags)
##################################################

# Push $data onto @$arr with surrounding whitespace removed.
# Whitespace-only data is dropped.
sub add_data {
    my ($arr, $data) = @_;
    return if $data =~ /^\s*$/;
    $data =~ s/^\s+//;
    $data =~ s/\s+$//;
    push(@$arr, $data);
}

# Line number reached in the input, used in error messages.
# Shared by the recursive calls of parse_xml().
my $line = 1;

# Parse xml into a data structure.
#   $str - the xml text
# Returns an array ref of items in document order; each item is
# either a string of data or a hash ref for a tag.
# Each <tag>...</tag> gets pushed onto the array as a hash.
# Example: <image name=bob size=big>.....</image>
#   $tag{tag}  = "image";
#   $tag{attr} = HASH: ( name => bob, size => big );
#   $tag{data} = array of all tags/data inside the tags
# Comments (<!-- ... -->) are discarded.
# Dies on an unterminated comment, an unclosed tag or a bad attribute.
sub parse_xml {
    my ($str) = @_;
    my @xml;

    # Test the length, not the truth: the string "0" is valid data
    while (defined $str && length $str) {
        if ($str =~ /^([^<]*)<!--/) {
            # Deal with comments
            my $before = $1;

            # Push any data on from before the comment
            add_data(\@xml, $before);
            $line += count_nl($before);

            die("[line $line] Couldn't find end of comment\n")
                unless ($str =~ s/^[^<]*<!--(.*?)-->//s);
            $line += count_nl($1);
        }
        elsif ($str =~ /\A([^<]*)<([^>]+)>(.*)\z/s) {
            # Deal with tags
            my ($data, $tag_str, $rest) = ($1, $2, $3);

            # Push any data on from before the tag
            add_data(\@xml, $data);

            # Line number
            $line += count_nl($data) + count_nl($tag_str);

            # Handle tag attr
            my %tag;
            ($tag{tag}, $tag{attr}) = parse_tag_attr($line, $tag_str);

            # Find the end tag and parse everything in between
            my @parts = split_at_close_tag($rest, $tag{tag});
            die("[line $line] Couldn't find end of tag <$tag{tag}>\n")
                unless @parts;
            my $inner;
            ($inner, $str) = @parts;

            $tag{data} = parse_xml($inner);
            push(@xml, \%tag);
        }
        else {
            # No tags left, just push data
            add_data(\@xml, $str);
            $line += count_nl($str);
            $str = '';
        }
    }

    return \@xml;
}

##################################################
# Display (for debug)
##################################################

# Print a parse_xml() structure to the selected output handle.
#   $xml - array ref as returned by parse_xml()
#   $lvl - nesting depth, one space of indent per level (default 0)
sub display_xml {
    my ($xml, $lvl) = @_;
    $lvl = 0 unless defined $lvl;
    my $indent = " " x $lvl;

    foreach my $data (@$xml) {
        if (ref($data) eq "HASH") {
            print $indent, "<$data->{tag}>\n";
            if ($data->{attr} && %{ $data->{attr} }) {
                # Sorted so the output is stable from run to run
                my @pairs = map { "${_}->$data->{attr}{$_}" }
                            sort keys %{ $data->{attr} };
                print $indent . " ATTR: " . join(",", @pairs) . "\n";
            }
            display_xml($data->{data}, $lvl + 1);
            print $indent, "</$data->{tag}>\n";
        }
        else {
            print $indent, "[$data]\n";
        }
    }
}

##################################################
# Main code
##################################################

# Read the file named on the command line, parse it and print the
# resulting structure.
sub main {
    my $file = parse_args();

    open(my $fh, '<', $file) or usage("Couldn't open xml: $file ($!)");
    my $xml = do { local $/; <$fh> };
    close($fh);

    # An empty file slurps as undef
    $xml = '' unless defined $xml;

    display_xml(parse_xml($xml));
}

# Only run when executed as a script, so the subs above can be
# loaded (require/do) without triggering the command line handling.
main() unless caller;

1;