package HTML::WebMake::MSHTMLtoHTML;
# -------------------------------------------------------------------------
sub convert {
my ($self, $contobj, $txt) = @_;
# use John Walker's demoroniser first off
$txt = demoronise($txt);
# first, trim off
and tails. Set a metadatum from the title
# if possible.
if ($txt =~ s/^(.*?)(]*>)//is) {
my $head = $1;
if ($head =~ /(.*?)<\/title>/i) {
$txt = "". $1 . "" . $txt;
}
}
$txt =~ s{.*?$}{}gis;
# ...
$txt =~ s{}{}gs;
$txt =~ s{}{}gs;
# remove Mso classes on HTML tags
$txt =~ s{<(\S+)\s+class=\"Mso\S+\"}{<$1}gis;
# Office-namespace tags
$txt =~ s{?o:\S+(?:\s[^>]+|)>}{}gis;
# Word-namespace tags
$txt =~ s{?w:\S+(?:\s[^>]+|)>}{}gis;
$txt =~ s{(.*?)}{
_mshtml_fix_styles('span', $1, $2);
}gies;
$txt =~ s{(.*?)}{
_mshtml_fix_styles('span', $1, $2);
}gies;
# nbsp's at the end of a paragraph
$txt =~ s{(?:\ )+(<\/p>)}{$1}gis;
# empty tags
foreach my $tag (qw(b i u em font small big strong code div ul ol
blockquote h1 h2 h3 h4 h5 h6 pre table))
{
$txt =~ s{
<${tag}(?:\s+[^>]*|\s*)>
(?:\s+|<\s*br\s*/?>|\ )*
<\/${tag}>
}{}gisx;
}
$txt;
}
sub _mshtml_fix_styles {
my ($tag, $style, $text) = @_;
$style =~ s/\s+/ /gs;
my @styles = split (/\s*;\s*/s, $style);
my @newstyles = ();
foreach my $st (@styles) {
if ($st !~ /^mso-/) { push (@newstyles, $st); }
}
if (scalar @newstyles == 0) {
return $text;
} else {
return "<$tag style=\"". join (';', @newstyles). "\">$text$tag>";
}
}
# from John Walker's demoroniser:
#
# De-moron-ise Text from Microsoft Applications
#
# by John Walker -- January 1998
# http://www.fourmilab.ch/
#
# This program is in the public domain.
#
sub demoronise {
local($s) = @_;
local($i, $c);
# Eliminate idiot MS-DOS carriage returns from line terminator
$s =~ s/\s+$//;
$s .= "\n";
# Map strategically incompatible non-ISO characters in the
# range 0x82 -- 0x9F into plausible substitutes where
# possible.
$s =~ s/\x82/,/g;
$s =~ s-\x83-f-g;
$s =~ s/\x84/,,/g;
$s =~ s/\x85/.../g;
$s =~ s/\x88/^/g;
$s =~ s-\x89- /-g;
$s =~ s/\x8B/~-g;
$s =~ s-\x99-TM-g;
$s =~ s/\x9B/>/g;
$s =~ s/\x9C/oe/g;
# Now check for any remaining untranslated characters.
if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) {
for ($i = 0; $i < length($s); $i++) {
$c = substr($s, $i, 1);
if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) {
my $err = sprintf "untranslated character 0x%02X in MS-HTML input\n", unpack('C', $c);
warn $err;
}
}
}
# Supply missing semicolon at end of numeric entity if
# Billy's bozos left it out.
$s =~ s/([0-2]\d\d)\s/$1; /g;
# Fix dimbulb obscure numeric rendering of < > &
$s =~ s/&/&/g;
$s =~ s/</</g;
$s =~ s/>/>/g;
# Fix unquoted non-alphanumeric characters in table tags
$s =~ s/(\s*.*)(\s*)-$1-gi;
# Translate bonehead PowerPoint misuse of