src/PMG/HTMLMail.pm

   1 package PMG::HTMLMail;
   2
   3 use strict;
   4 use warnings;
   5 use Encode;
   6 use Data::Dumper;
   7 use MIME::Head;
   8 use File::Path;
   9 use HTML::Entities;
  10 use MIME::Parser;
  11 use MIME::Base64;
  12 use HTML::TreeBuilder;
  13 use HTML::Scrubber;
  14
  15 use PMG::Utils;
  16 use PMG::MIMEUtils;
  17
  18 # $value is a ref to a string scalar
  19 my sub remove_urls {
  20     my ($value) = @_;
  21     return if !defined $$value;
  22
  23     # convert 'url([..])' to '___([..])' so the browser does not load it
  24     $$value =~ s|url\(|___(|gi;
  25
  26     # similar for all protocols
  27     $$value =~ s|[a-z0-9]+://|_|gi;
  28 }
  29
  30 my sub remove_urls_from_attr {
  31     my ($obj, $tag_name, $attr_name, $value) = @_;
  32
  33     remove_urls(\$value);
  34
  35     return $value;
  36 }
  37
  38 sub dump_html {
  39     my ($tree, $cid_hash, $view_images) = @_;
  40
  41     my @html = ();
  42
  43     $tree->traverse(sub {
  44         my ($node, $start, $depth) = @_;
  45         if (ref $node) {
  46             my $tag = $node->{'_tag'};
  47
  48             # try to open a new window when user activates a anchor
  49             $node->{target} = '_blank' if $tag eq 'a';
  50
  51             if ($tag eq 'img' && $view_images) {
  52                 if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
  53                     if (my $datauri = $cid_hash->{$1}) {
  54                         $node->{src} = $datauri;
  55                     }
  56                 }
  57             }
  58
  59             if ($tag eq 'style' && !$view_images) {
  60                 remove_urls($_) for grep { !ref $$_ } $node->content_refs_list();
  61             }
  62
  63             if ($start) { # on the way in
  64                 push(@html, $node->starttag);
  65             } else { # on the way out
  66                 push(@html, $node->endtag);
  67             }
  68         } else { # simple text content
  69             # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
  70             # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
  71             if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) {
  72                 # That does magic things if $entities is undef.
  73                 $node = encode_entities($node);
  74             }
  75             push(@html, $node);
  76         }
  77         return 1; # keep traversing
  78     });
  79
  80     return join('', @html, "\n");
  81 }
  82
  83 sub getscrubber {
  84     my ($viewimages, $allowhref) = @_;
  85
  86     # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
  87
  88     my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
  89
  90     my @rules = ( script => 0 );
  91
  92     my @default = (
  93         0 =>  # default rule, deny all tags
  94         {
  95             '*' => 0, # default rule, deny all attributes
  96             abbr => 1,
  97             accept => 1,
  98             'accept-charset' => 1,
  99             accesskey => 1,
 100             align => 1,
 101             alt => 1,
 102             axis => 1,
 103             border => 1,
 104             bgcolor => 1,
 105             cellpadding => 1,
 106             cellspacing => 1,
 107             char => 1,
 108             charoff => 1,
 109             charset => 1,
 110             checked => 1,
 111             cite => 1,
 112             class => 1,
 113             clear => 1,
 114             cols => 1,
 115             colspan => 1,
 116             color => 1,
 117             compact => 1,
 118             coords => 1,
 119             datetime => 1,
 120             dir => 1,
 121             disabled => 1,
 122             enctype => 1,
 123             frame => 1,
 124             headers => 1,
 125             height => 1,
 126             # only allow http:// and https:// hrefs
 127             'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
 128             hreflang => 1,
 129             hspace => 1,
 130             id => 1,
 131             ismap => 1,
 132             label => 1,
 133             lang => 1,
 134             longdesc => 1,
 135             maxlength => 1,
 136             media => 1,
 137             method => 1,
 138             multiple => 1,
 139             name => 1,
 140             nohref => 1,
 141             noshade => 1,
 142             nowrap => 1,
 143             prompt => 1,
 144             readonly => 1,
 145             rel => 1,
 146             rev => 1,
 147             rows => 1,
 148             rowspan => 1,
 149             rules => 1,
 150             scope => 1,
 151             selected => 1,
 152             shape => 1,
 153             size => 1,
 154             span => 1,
 155             src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
 156             start => 1,
 157             style => $viewimages ? 1 : \&remove_urls_from_attr,
 158             summary => 1,
 159             tabindex => 1,
 160             target => 1,
 161             title => 1,
 162             type => 1,
 163             usemap => 1,
 164             valign => 1,
 165             value => 1,
 166             vspace => 1,
 167             width => 1,
 168         }
 169     );
 170
 171     my $scrubber = HTML::Scrubber->new(
 172         allow   => \@allow,
 173         rules   => \@rules,
 174         default => \@default,
 175         comment => 0,
 176         process => 0,
 177     );
 178
 179     $scrubber->style(1);
 180
 181     return $scrubber;
 182 }
 183
 184 sub read_raw_email {
 185     my ($path, $maxbytes) = @_;
 186
 187     open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
 188
 189     my $data = '';
 190     my $raw_header = '';
 191
 192     # read header
 193     my $header;
 194     while (defined(my $line = <$fh>)) {
 195         $raw_header .= $line;
 196         chomp $line;
 197         push @$header, $line;
 198         last if $line =~ m/^\s*$/;
 199     }
 200
 201     my $head = MIME::Head->new($header);
 202
 203     my $cs = $head->mime_attr("content-type.charset");
 204
 205     my $bytes = 0;
 206
 207     while (defined(my $line = <$fh>)) {
 208         $bytes += length ($line);
 209         if ($cs) {
 210             $data .= decode($cs, $line);
 211         } else {
 212             $data .= $line;
 213         }
 214         if (defined($maxbytes) && ($bytes >= $maxbytes)) {
 215             $data .= "\n... mail truncated (> $maxbytes bytes)\n";
 216             last;
 217         }
 218     }
 219
 220     close($fh);
 221
 222     return ($raw_header, $data);
 223 }
 224
 225 my $read_part = sub {
 226     my ($part) = @_;
 227
 228     my $io = $part->open("r");
 229     return undef if !$io;
 230
 231     my $raw = '';
 232     while (defined(my $line = $io->getline)) { $raw .= $line; }
 233     $io->close;
 234
 235     return $raw;
 236 };
 237
 238 my $find_images = sub {
 239     my ($cid_hash, $entity) = @_;
 240
 241     foreach my $part ($entity->parts)  {
 242         if (my $rawcid = $part->head->get('Content-Id')) {
 243             if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
 244                 my $cid = $1;
 245                 my $ctype = $part->head->mime_attr('Content-type') // '';
 246                 if ($ctype =~ m!^image/!) {
 247                     if (defined(my $raw = $read_part->($part))) {
 248                         $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
 249                     }
 250                 }
 251             }
 252         }
 253     }
 254 };
 255
 256 sub entity_to_html {
 257     my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
 258
 259     my $mime_type = lc($entity->mime_type);;
 260
 261     if ($mime_type eq 'text/plain') {
 262         my $raw = $read_part->($entity) // '';
 263         my $html = "<pre>\n";
 264
 265         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 266             $html .= PMG::Utils::decode_to_html($cs, $raw);
 267         } else {
 268             $html .= encode_entities($raw);
 269         }
 270
 271         $html .= "</pre>\n";
 272
 273         return $html;
 274
 275     } elsif ($mime_type eq 'text/html') {
 276         my $raw = $read_part->($entity) // '';
 277
 278         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 279             eval { $raw = decode($cs, $raw); }; # ignore errors here
 280         }
 281
 282         # create a well formed tree
 283         my $tree = HTML::TreeBuilder->new();
 284         $tree->parse($raw);
 285         $tree->eof();
 286
 287         # normalizes html, replaces CID references with data uris and scrubs style tags
 288         my $whtml = dump_html($tree, $cid_hash, $viewimages);
 289         $tree->delete;
 290
 291         # remove dangerous/unneeded elements
 292         my $scrubber = getscrubber($viewimages, $allowhref);
 293         return $scrubber->scrub($whtml);
 294
 295     } elsif ($mime_type =~ m|^multipart/|i) {
 296         my $multi_part;
 297         my $html_part;
 298         my $text_part;
 299
 300         foreach my $part ($entity->parts)  {
 301             my $subtype = lc($part->mime_type);
 302             $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
 303             $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
 304             $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
 305         }
 306
 307         # get related/embedded images as data uris
 308         $find_images->($cid_hash, $entity);
 309
 310         my $alt = $multi_part || $html_part || $text_part;
 311
 312         return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
 313     }
 314
 315     return undef;
 316 }
 317
 318 sub email_to_html {
 319     my ($path, $raw, $viewimages, $allowhref) = @_;
 320
 321     my $dumpdir = "/tmp/.proxdumpview_$$";
 322
 323     my $html = '';
 324
 325     eval {
 326         if ($raw) {
 327
 328             my ($header, $content) = read_raw_email($path);
 329
 330             $html .= "<pre>\n" .
 331                 encode_entities($header) .
 332                 "\n" .
 333                 encode_entities($content) .
 334                 "</pre>\n";
 335
 336         } else {
 337
 338             my $parser = PMG::MIMEUtils::new_mime_parser({
 339                 dumpdir => $dumpdir,
 340             });
 341
 342             my $entity = $parser->parse_open($path);
 343
 344             PMG::MIMEUtils::fixup_multipart($entity);
 345
 346             $html = entity_to_html($entity, {}, $viewimages, $allowhref);
 347         }
 348     };
 349     my $err = $@;
 350
 351     rmtree $dumpdir;
 352
 353     die "unable to parse mail: $err" if $err;
 354
 355     return $html;
 356 }
 357
 358 1;