src/PMG/HTMLMail.pm

   1 package PMG::HTMLMail;
   2
   3 use strict;
   4 use warnings;
   5 use Encode;
   6 use Data::Dumper;
   7 use MIME::Head;
   8 use File::Path;
   9 use HTML::Entities;
  10 use MIME::Parser;
  11 use MIME::Base64;
  12 use HTML::TreeBuilder;
  13 use HTML::Scrubber;
  14
  15 use PMG::Utils;
  16 use PMG::MIMEUtils;
  17
  18 sub dump_html {
  19     my ($tree, $cid_hash) = @_;
  20
  21     my @html = ();
  22
  23     my($tag, $node, $start, $depth);
  24
  25     $tree->traverse(
  26         sub {
  27             ($node, $start) = @_;
  28             if(ref $node) {
  29                 $tag = $node->{'_tag'};
  30
  31                 # try to open a new window when user activates a anchor
  32                 $node->{target} = '_blank' if $tag eq 'a';
  33
  34                 if ($tag eq 'img') {
  35                     if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
  36                         if (my $datauri = $cid_hash->{$1}) {
  37                             $node->{src} = $datauri;
  38                         }
  39                     }
  40                 }
  41
  42                 if($start) { # on the way in
  43                     push(@html, $node->starttag);
  44                 } else {
  45                     # on the way out
  46                     push(@html, $node->endtag);
  47                 }
  48             } else {
  49                 # simple text content
  50                 $node = encode_entities($node)
  51                     # That does magic things if $entities is undef.
  52                     unless $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} };
  53                 # To keep from amp-escaping children of script et al.
  54                 # That doesn't deal with descendants; but then, CDATA
  55                 #  parents shouldn't /have/ descendants other than a
  56                 #  text children (or comments?)
  57                 push(@html, $node);
  58             }
  59             1; # keep traversing
  60         }
  61     );
  62
  63     return join('', @html, "\n");
  64 }
  65
  66 sub getscrubber {
  67     my ($viewimages, $allowhref) = @_;
  68
  69     # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
  70
  71     my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
  72
  73     my @rules = ( script => 0 );
  74
  75     my @default = (
  76         0 =>  # default rule, deny all tags
  77         {
  78             '*' => 0, # default rule, deny all attributes
  79             abbr => 1,
  80             accept => 1,
  81             'accept-charset' => 1,
  82             accesskey => 1,
  83             align => 1,
  84             alt => 1,
  85             axis => 1,
  86             border => 1,
  87             bgcolor => 1,
  88             cellpadding => 1,
  89             cellspacing => 1,
  90             char => 1,
  91             charoff => 1,
  92             charset => 1,
  93             checked => 1,
  94             cite => 1,
  95             class => 1,
  96             clear => 1,
  97             cols => 1,
  98             colspan => 1,
  99             color => 1,
 100             compact => 1,
 101             coords => 1,
 102             datetime => 1,
 103             dir => 1,
 104             disabled => 1,
 105             enctype => 1,
 106             frame => 1,
 107             headers => 1,
 108             height => 1,
 109             # only allow http:// and https:// hrefs
 110             'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
 111             hreflang => 1,
 112             hspace => 1,
 113             id => 1,
 114             ismap => 1,
 115             label => 1,
 116             lang => 1,
 117             longdesc => 1,
 118             maxlength => 1,
 119             media => 1,
 120             method => 1,
 121             multiple => 1,
 122             name => 1,
 123             nohref => 1,
 124             noshade => 1,
 125             nowrap => 1,
 126             prompt => 1,
 127             readonly => 1,
 128             rel => 1,
 129             rev => 1,
 130             rows => 1,
 131             rowspan => 1,
 132             rules => 1,
 133             scope => 1,
 134             selected => 1,
 135             shape => 1,
 136             size => 1,
 137             span => 1,
 138             src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
 139             start => 1,
 140             style => 1,
 141             summary => 1,
 142             tabindex => 1,
 143             target => 1,
 144             title => 1,
 145             type => 1,
 146             usemap => 1,
 147             valign => 1,
 148             value => 1,
 149             vspace => 1,
 150             width => 1,
 151         }
 152     );
 153
 154     my $scrubber = HTML::Scrubber->new(
 155         allow   => \@allow,
 156         rules   => \@rules,
 157         default => \@default,
 158         comment => 0,
 159         process => 0,
 160     );
 161
 162     $scrubber->style(1);
 163
 164     return $scrubber;
 165 }
 166
 167 sub read_raw_email {
 168     my ($path, $maxbytes) = @_;
 169
 170     open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
 171
 172     my $data = '';
 173     my $raw_header = '';
 174
 175     # read header
 176     my $header;
 177     while (defined(my $line = <$fh>)) {
 178         $raw_header .= $line;
 179         chomp $line;
 180         push @$header, $line;
 181         last if $line =~ m/^\s*$/;
 182     }
 183
 184     my $head = MIME::Head->new($header);
 185
 186     my $cs = $head->mime_attr("content-type.charset");
 187
 188     my $bytes = 0;
 189
 190     while (defined(my $line = <$fh>)) {
 191         $bytes += length ($line);
 192         if ($cs) {
 193             $data .= decode($cs, $line);
 194         } else {
 195             $data .= $line;
 196         }
 197         if (defined($maxbytes) && ($bytes >= $maxbytes)) {
 198             $data .= "\n... mail truncated (> $maxbytes bytes)\n";
 199             last;
 200         }
 201     }
 202
 203     close($fh);
 204
 205     return ($raw_header, $data);
 206 }
 207
 208 my $read_part = sub {
 209     my ($part) = @_;
 210
 211     my $io = $part->open("r");
 212     return undef if !$io;
 213
 214     my $raw = '';
 215     while (defined(my $line = $io->getline)) { $raw .= $line; }
 216     $io->close;
 217
 218     return $raw;
 219 };
 220
 221 my $find_images = sub {
 222     my ($cid_hash, $entity) = @_;
 223
 224     foreach my $part ($entity->parts)  {
 225         if (my $rawcid = $part->head->get('Content-Id')) {
 226             if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
 227                 my $cid = $1;
 228                 my $ctype = $part->head->mime_attr('Content-type') // '';
 229                 if ($ctype =~ m!^image/!) {
 230                     if (defined(my $raw = $read_part->($part))) {
 231                         $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
 232                     }
 233                 }
 234             }
 235         }
 236     }
 237 };
 238
 239 sub entity_to_html {
 240     my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
 241
 242     my $mime_type = lc($entity->mime_type);;
 243
 244     if ($mime_type eq 'text/plain') {
 245         my $raw = $read_part->($entity) // '';
 246         my $html = "<pre>\n";
 247
 248         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 249             $html .= PMG::Utils::decode_to_html($cs, $raw);
 250         } else {
 251             $html .= encode_entities($raw);
 252         }
 253
 254         $html .= "</pre>\n";
 255
 256         return $html;
 257
 258     } elsif ($mime_type eq 'text/html') {
 259         my $raw = $read_part->($entity) // '';
 260
 261         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 262             eval { $raw = decode($cs, $raw); }; # ignore errors here
 263         }
 264
 265         # create a well formed tree
 266         my $tree = HTML::TreeBuilder->new();
 267         $tree->parse($raw);
 268         $tree->eof();
 269
 270         my $whtml = dump_html($tree, $viewimages ? $cid_hash : {});
 271         $tree->delete;
 272
 273         # remove dangerous/unneeded elements
 274         my $scrubber = getscrubber($viewimages, $allowhref);
 275         return $scrubber->scrub($whtml);
 276
 277     } elsif ($mime_type =~ m|^multipart/|i) {
 278         my $multi_part;
 279         my $html_part;
 280         my $text_part;
 281
 282         foreach my $part ($entity->parts)  {
 283             my $subtype = lc($part->mime_type);
 284             $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
 285             $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
 286             $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
 287         }
 288
 289         # get related/embedded images as data uris
 290         $find_images->($cid_hash, $entity);
 291
 292         my $alt = $multi_part || $html_part || $text_part;
 293
 294         return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
 295     }
 296
 297     return undef;
 298 }
 299
 300 sub email_to_html {
 301     my ($path, $raw, $viewimages, $allowhref) = @_;
 302
 303     my $dumpdir = "/tmp/.proxdumpview_$$";
 304
 305     my $html = '';
 306
 307     eval {
 308         if ($raw) {
 309
 310             my ($header, $content) = read_raw_email($path);
 311
 312             $html .= "<pre>\n" .
 313                 encode_entities($header) .
 314                 "\n" .
 315                 encode_entities($content) .
 316                 "</pre>\n";
 317
 318         } else {
 319
 320             my $parser = PMG::MIMEUtils::new_mime_parser({
 321                 dumpdir => $dumpdir,
 322             });
 323
 324             my $entity = $parser->parse_open($path);
 325
 326             PMG::MIMEUtils::fixup_multipart($entity);
 327
 328             $html = entity_to_html($entity, {}, $viewimages, $allowhref);
 329         }
 330     };
 331     my $err = $@;
 332
 333     rmtree $dumpdir;
 334
 335     die "unable to parse mail: $err" if $err;
 336
 337     return $html;
 338 }
 339
 340 1;