# package HTML::WebMake::HTMLCleaner; use Carp; use strict; use HTML::Parser; use HTML::WebMake::Main; use vars qw{ @ISA $VERSION @ALLFEATURES $INLINE_TAGS $KEEP_FORMAT_TAGS $EMPTY_ELEMENT_TAGS $BOOL_ATTR_VALUE }; @ISA = qw(HTML::Parser); $VERSION = 'bogus'; # avoid CPAN picking up html::parser ref @ALLFEATURES = qw{ pack nocomments addimgsizes addxmlslashes fixcolors cleanattrs indent fixhrefs }; $KEEP_FORMAT_TAGS = qr{(?:xmp|listing|pre|plaintext)}; $INLINE_TAGS = qr{(?:a|b|i|em|q|strong|h\d|code|abbr|acronym|address|big|cite|del|ins|s|small|strike|sub|sup|u|samp|kbd|var|img|span|textarea)}; $EMPTY_ELEMENT_TAGS = qr{(?:area|base|basefont|bgsound|br|col|embed|frame|hr|img|input|isindex|keygen|link|meta|param|spacer|wbr)}; ########################################################################### sub new { my $class = shift; $class = ref($class) || $class; my ($main) = @_; my $self = $class->SUPER::new ( api_version => 2 ); $self->{main} = $main; $self->clear_features(); $BOOL_ATTR_VALUE = undef; # this parameter is not supported in earlier versions if ($HTML::Parser::VERSION >= 3.00) { my $val = '==BOOL_TRUE=='; eval { $self->boolean_attribute_value ($val); $BOOL_ATTR_VALUE = $val; }; # make sure we get the tokenpos: this is a hack so we can tell # if we're parsing real XML instead of HTML $self->handler ( end => "end", "self, tagname, text, tokenpos"); } bless ($self, $class); $self; } ########################################################################### sub select_features { my ($self, $feats) = @_; $self->clear_features(); if ($feats =~ /\ball\b/i) { foreach my $feat (@ALLFEATURES) { $self->{$feat} = 1; } } foreach my $feat (split (' ', $feats)) { if ($feat =~ s/^\-//) { $self->{$feat} = 0; # turned off } else { $self->{$feat} = 1; # turned on } } } sub clear_features { my ($self) = @_; foreach my $feat (@ALLFEATURES) { $self->{$feat} = 0; } } ########################################################################### sub clean { my ($self, $txt, $fname) = @_; $self->{out} = [ ]; $self->{in_pre} = 0; $self->{indent_level} = 0; $self->{indent_str} = ''; $self->{indent_depth} = 2; $self->{last_was_noninline_close_tag} = 0; $self->{last_text_was_whitespace} = 0; if ($fname !~ /\..?htm?/i) { # use XML mode if possible if ($self->can ("xml_mode")) { $self->xml_mode (1); } # preserve case: important for RDF if ($self->can ("case_sensitive")) { $self->case_sensitive (1); } } $self->parse ($$txt); $self->eof(); if ($self->{indent_level} > 0) { warn "HTML cleaner: unbalanced tags found in $fname\n"; } join ('', @{$self->{out}}); } ########################################################################### sub start { my($self, $tagname, $attr, $attrseq, $origtext) = @_; if ($tagname =~ /^${KEEP_FORMAT_TAGS}$/) { $self->{in_pre}++; } my $is_inline_tag; if ($tagname =~ /^${INLINE_TAGS}$/) { $is_inline_tag = 1; if ($self->{last_text_was_whitespace}) { $self->add_text (" "); } } else { $is_inline_tag = 0; } if (!$self->{cleanattrs}) { $self->add_text ($origtext); } else { $self->clean_attrs_at_start ($tagname, $attr, $attrseq, $origtext); } if (!$is_inline_tag && !$self->{in_pre}) { $self->add_text ("\n"); if (!$self->{in_pre}) { if ($tagname !~ /^${EMPTY_ELEMENT_TAGS}$/ && $origtext !~ /\/>$/) { $self->open_indent(); } else { $self->add_current_indent(); } } } $self->{last_was_noninline_close_tag} = 0; $self->{last_text_was_whitespace} = 0; } sub clean_attrs_at_start { my ($self, $tagname, $attr, $attrseq, $origtext) = @_; my $attrs = ''; my $imgsrc = ''; foreach my $name (@{$attrseq}) { my $val = $attr->{$name}; if ($self->{fixcolors} && $name =~ /colou?r$/) { if ($val =~ /^[\da-f]{6}$/i) { $val = "#".$val; # color=004000 -> color="#004000" } } if ($tagname eq 'img' && $name eq 'src') { $imgsrc = $val; } if ($self->{fixhrefs} && ($name eq 'src' || $name eq 'href')) { if ($val !~ /^[a-z0-9A-Z]+:/) { $val = HTML::WebMake::Main::canon_path ($val); $val =~ s,\\,/,g; } } if (defined $BOOL_ATTR_VALUE && $val eq $BOOL_ATTR_VALUE) { $attrs .= " ".$name; } elsif (!defined $BOOL_ATTR_VALUE && $val eq $name) { $attrs .= " ".$name; } elsif ($val =~ /\"/) { $val =~ s/\'/'/g; $attrs .= " ".$name ."=\'".$val."\'"; } else { $attrs .= " ".$name ."=\"".$val."\""; } } my $tagend = ''; if ($attrs !~ /\/\s*$/ && (($self->{addxmlslashes} && $tagname =~ /^${EMPTY_ELEMENT_TAGS}$/) || $origtext =~ /\s\/>$/)) { $tagend = " />"; # HTML-4, XHTML, XML style } else { $tagend = ">"; } if ($self->{last_was_noninline_close_tag}) { $self->add_current_indent(); } if ($tagname eq 'img' && $self->{addimgsizes} && $attrs !~ /(height|width)/i && $imgsrc !~ /^(?:[a-z0-9A-Z]+:|\/)/) { $self->add_text ($self->{main}->fileless_subst ("(html-cleaner)", "<$tagname".$attrs.' ${IMGSIZE}>')); } else { $self->add_text ("<".$tagname, $attrs, $tagend); } } # -------------------------------------------------------------------------- sub end { my($self, $tagname, $origtext, $tokenpos) = @_; # XML tags if (!defined ($tokenpos)) { return; } my $exiting_pre = ($tagname =~ /^${KEEP_FORMAT_TAGS}$/); if ($exiting_pre) { $self->{in_pre}--; } $self->{last_text_was_whitespace} = 0; if ($self->{in_pre}) { $self->add_text ($origtext); } elsif ($tagname !~ /^${INLINE_TAGS}$/ && !$exiting_pre) { if (!$self->{last_was_noninline_close_tag}) { $self->add_text ("\n"); } $self->close_indent(); $self->add_text ("\n"); $self->{last_was_noninline_close_tag} = 1; } else { $self->add_text (""); $self->{last_was_noninline_close_tag} = 0; } } # -------------------------------------------------------------------------- sub text { my($self, $origtext, $is_cdata) = @_; if ($self->{in_pre} > 0) { $self->{last_was_noninline_close_tag} = 0; $self->{last_text_was_whitespace} = 0; $self->add_text ($origtext); return; } elsif ($origtext =~ /^\s*$/s) { $self->{last_text_was_whitespace} = 1; return; } $self->{last_was_noninline_close_tag} = 0; $self->{last_text_was_whitespace} = 0; $self->pack_text (\$origtext); # or, to tidy up whitespace: $self->add_text ($origtext); } # -------------------------------------------------------------------------- sub process { my ($self, $origtext) = @_; $self->add_text ("\n"); $self->add_current_indent(); } # -------------------------------------------------------------------------- sub comment { my ($self, $origtext) = @_; if (!$self->{nocomments}) { $self->pack_text (\$origtext); $self->add_text ("\n"); $self->add_current_indent(); } } # -------------------------------------------------------------------------- sub declaration { my ($self, $origtext) = @_; $self->add_text ("\n"); $self->add_current_indent(); } ########################################################################### sub pack_text { my($self, $txt) = @_; if ($self->{pack} && !($self->{in_pre} > 0)) { $$txt =~ s/\n\n+/\n/gm; $$txt =~ s/[ \t]+/ /gm; $$txt =~ s/^ / /gm; $$txt =~ s/ $/ /gm; my $indent = $self->get_current_indent(); $$txt =~ s/\n/\n${indent}/gs; } } ########################################################################### sub add_text { my $self = shift; push (@{$self->{out}}, @_); # $self->{last_was_indent} = 0; } sub open_indent { my ($self) = @_; if (!$self->{indent}) { return; } $self->{indent_level} += $self->{indent_depth}; $self->{indent_str} = (' ' x $self->{indent_level}); # return if ($self->{last_was_indent}); push (@{$self->{out}}, $self->{indent_str}); # $self->{last_was_indent} = 1; } sub close_indent { my ($self) = @_; if (!$self->{indent}) { return; } $self->{indent_level} -= $self->{indent_depth}; if ($self->{indent_level} < 0) { $self->{indent_level} = 0; } $self->{indent_str} = (' ' x $self->{indent_level}); # return if ($self->{last_was_indent}); push (@{$self->{out}}, $self->{indent_str}); # $self->{last_was_indent} = 1; } sub add_current_indent { my ($self) = @_; if (!$self->{indent}) { return; } # return if ($self->{last_was_indent}); push (@{$self->{out}}, $self->{indent_str}); # $self->{last_was_indent} = 1; } sub get_current_indent { my ($self) = @_; $self->{indent_str}; } ########################################################################### 1;