package HTML::WebMake::MSHTMLtoHTML; # ------------------------------------------------------------------------- sub convert { my ($self, $contobj, $txt) = @_; # use John Walker's demoroniser first off $txt = demoronise($txt); # first, trim off and tails. Set a metadatum from the title # if possible. if ($txt =~ s/^(.*?)(]*>)//is) { my $head = $1; if ($head =~ /(.*?)<\/title>/i) { $txt = "<wmmeta name='Title'>". $1 . "</wmmeta>" . $txt; } } $txt =~ s{</body>.*?$}{}gis; # <![if ...]>...<![endif]> $txt =~ s{<!\[if\s.+?\]>}{}gs; $txt =~ s{<!\[endif\]>}{}gs; # remove Mso classes on HTML tags $txt =~ s{<(\S+)\s+class=\"Mso\S+\"}{<$1}gis; # Office-namespace tags $txt =~ s{</?o:\S+(?:\s[^>]+|)>}{}gis; # Word-namespace tags $txt =~ s{</?w:\S+(?:\s[^>]+|)>}{}gis; $txt =~ s{<span\s+style="([^"]+)">(.*?)</span>}{ _mshtml_fix_styles('span', $1, $2); }gies; $txt =~ s{<span\s+style='([^']+)'>(.*?)</span>}{ _mshtml_fix_styles('span', $1, $2); }gies; # nbsp's at the end of a paragraph $txt =~ s{(?:\ )+(<\/p>)}{$1}gis; # empty tags foreach my $tag (qw(b i u em font small big strong code div ul ol blockquote h1 h2 h3 h4 h5 h6 pre table)) { $txt =~ s{ <${tag}(?:\s+[^>]*|\s*)> (?:\s+|<\s*br\s*/?>|\ )* <\/${tag}> }{}gisx; } $txt; } sub _mshtml_fix_styles { my ($tag, $style, $text) = @_; $style =~ s/\s+/ /gs; my @styles = split (/\s*;\s*/s, $style); my @newstyles = (); foreach my $st (@styles) { if ($st !~ /^mso-/) { push (@newstyles, $st); } } if (scalar @newstyles == 0) { return $text; } else { return "<$tag style=\"". join (';', @newstyles). "\">$text</$tag>"; } } # from John Walker's demoroniser: # # De-moron-ise Text from Microsoft Applications # # by John Walker -- January 1998 # http://www.fourmilab.ch/ # # This program is in the public domain. # sub demoronise { local($s) = @_; local($i, $c); # Eliminate idiot MS-DOS carriage returns from line terminator $s =~ s/\s+$//; $s .= "\n"; # Map strategically incompatible non-ISO characters in the # range 0x82 -- 0x9F into plausible substitutes where # possible. $s =~ s/\x82/,/g; $s =~ s-\x83-<em>f</em>-g; $s =~ s/\x84/,,/g; $s =~ s/\x85/.../g; $s =~ s/\x88/^/g; $s =~ s-\x89- <B0>/<B0><B0>-g; $s =~ s/\x8B/</g; $s =~ s/\x8C/Oe/g; $s =~ s/\x91/`/g; $s =~ s/\x92/'/g; $s =~ s/\x93/"/g; $s =~ s/\x94/"/g; $s =~ s/\x95/*/g; $s =~ s/\x96/-/g; $s =~ s/\x97/--/g; $s =~ s-\x98-<sup>~</sup>-g; $s =~ s-\x99-<sup>TM</sup>-g; $s =~ s/\x9B/>/g; $s =~ s/\x9C/oe/g; # Now check for any remaining untranslated characters. if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) { for ($i = 0; $i < length($s); $i++) { $c = substr($s, $i, 1); if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) { my $err = sprintf "untranslated character 0x%02X in MS-HTML input\n", unpack('C', $c); warn $err; } } } # Supply missing semicolon at end of numeric entity if # Billy's bozos left it out. $s =~ s/(&#[0-2]\d\d)\s/$1; /g; # Fix dimbulb obscure numeric rendering of < > & $s =~ s/&/&/g; $s =~ s/</</g; $s =~ s/>/>/g; # Fix unquoted non-alphanumeric characters in table tags $s =~ s/(<TABLE\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi; $s =~ s/(<TD\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi; $s =~ s/(<TH\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi; # Correct PowerPoint mis-nesting of tags $s =~ s-(<Font .*>\s*<STRONG>.*)(</FONT>\s*</STRONG>)-$1</STRONG></Font>-gi; # Translate bonehead PowerPoint misuse of <UL> to achieve # paragraph breaks. $s =~ s-<P>\s*<UL>-<p>-gi; $s =~ s-</UL><UL>-<p>-gi; $s =~ s-</UL>\s*</P>--gi; # Repair PowerPoint depredations in "text-only slides" $s =~ s-<P></P>--gi; $s =~ s- <TD HEIGHT=100- <tr><TD HEIGHT=100-ig; $s =~ s-<LI><H2>-<H2>-ig; $s; } # ------------------------------------------------------------------------- 1;