src/PMG/HTMLMail.pm

   1 package PMG::HTMLMail;
   2
   3 use strict;
   4 use warnings;
   5 use Encode;
   6 use Data::Dumper;
   7 use MIME::Head;
   8 use File::Path;
   9 use HTML::Entities;
  10 use MIME::Parser;
  11 use MIME::Base64;
  12 use HTML::TreeBuilder;
  13 use HTML::Scrubber;
  14
  15 use PMG::Utils;
  16 use PMG::MIMEUtils;
  17
  18 # $value is a ref to a string scalar
  19 my sub remove_urls {
  20     my ($value) = @_;
  21     return if !defined $$value;
  22
  23     # convert 'url([..])' to '___([..])' so the browser does not load it
  24     $$value =~ s|url\(|___(|gi;
  25
  26     # similar for all protocols
  27     $$value =~ s|[a-z0-9]+://|_|gi;
  28 }
  29
  30 my sub remove_urls_from_attr {
  31     my ($obj, $tag_name, $attr_name, $value) = @_;
  32
  33     remove_urls(\$value);
  34
  35     return $value;
  36 }
  37
  38 sub dump_html {
  39     my ($tree, $cid_hash, $view_images) = @_;
  40
  41     my @html = ();
  42
  43     $tree->traverse(sub {
  44         my ($node, $start, $depth) = @_;
  45         if (ref $node) {
  46             my $tag = $node->{'_tag'};
  47
  48             # try to open a new window when user activates a anchor
  49             $node->{target} = '_blank' if $tag eq 'a';
  50
  51             if ($tag eq 'img' && $view_images) {
  52                 if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
  53                     if (my $datauri = $cid_hash->{$1}) {
  54                         $node->{src} = $datauri;
  55                     }
  56                 }
  57             }
  58
  59             if ($tag eq 'style' && !$view_images) {
  60                 remove_urls($_) for grep { !ref $$_ } $node->content_refs_list();
  61             }
  62
  63             if ($start) { # on the way in
  64                 push(@html, $node->starttag);
  65             } else { # on the way out
  66                 push(@html, $node->endtag);
  67             }
  68         } else { # simple text content
  69             # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
  70             # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
  71             if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) {
  72                 # That does magic things if $entities is undef.
  73                 $node = encode_entities($node);
  74             }
  75             push(@html, $node);
  76         }
  77         return 1; # keep traversing
  78     });
  79
  80     return join('', @html, "\n");
  81 }
  82
  83 sub getscrubber {
  84     my ($viewimages, $allowhref) = @_;
  85
  86     # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
  87
  88     my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
  89
  90     my @rules = ( script => 0 );
  91
  92     my @default = (
  93         0 =>  # default rule, deny all tags
  94         {
  95             '*' => 0, # default rule, deny all attributes
  96             abbr => 1,
  97             accept => 1,
  98             'accept-charset' => 1,
  99             accesskey => 1,
 100             align => 1,
 101             alt => 1,
 102             axis => 1,
 103             border => 1,
 104             bgcolor => 1,
 105             cellpadding => 1,
 106             cellspacing => 1,
 107             char => 1,
 108             charoff => 1,
 109             charset => 1,
 110             checked => 1,
 111             cite => 1,
 112             class => 1,
 113             clear => 1,
 114             cols => 1,
 115             colspan => 1,
 116             color => 1,
 117             compact => 1,
 118             coords => 1,
 119             datetime => 1,
 120             dir => 1,
 121             disabled => 1,
 122             enctype => 1,
 123             frame => 1,
 124             headers => 1,
 125             height => 1,
 126             # only allow http:// and https:// hrefs
 127             'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
 128             hreflang => 1,
 129             hspace => 1,
 130             id => 1,
 131             ismap => 1,
 132             label => 1,
 133             lang => 1,
 134             longdesc => 1,
 135             maxlength => 1,
 136             media => 1,
 137             method => 1,
 138             multiple => 1,
 139             name => 1,
 140             nohref => 1,
 141             noshade => 1,
 142             nowrap => 1,
 143             prompt => 1,
 144             readonly => 1,
 145             rel => 1,
 146             rev => 1,
 147             rows => 1,
 148             rowspan => 1,
 149             rules => 1,
 150             scope => 1,
 151             selected => 1,
 152             shape => 1,
 153             size => 1,
 154             span => 1,
 155             src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
 156             start => 1,
 157             style => $viewimages ? 1 : \&remove_urls_from_attr,
 158             summary => 1,
 159             tabindex => 1,
 160             target => 1,
 161             title => 1,
 162             type => 1,
 163             usemap => 1,
 164             valign => 1,
 165             value => 1,
 166             vspace => 1,
 167             width => 1,
 168         }
 169     );
 170
 171     my $scrubber = HTML::Scrubber->new(
 172         allow   => \@allow,
 173         rules   => \@rules,
 174         default => \@default,
 175         comment => 0,
 176         process => 0,
 177     );
 178
 179     $scrubber->style(1);
 180
 181     return $scrubber;
 182 }
 183
 184 sub read_raw_email {
 185     my ($path, $maxbytes) = @_;
 186
 187     open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
 188
 189     my $data = '';
 190     my $raw_header = '';
 191
 192     # read header
 193     my $header;
 194     while (defined(my $line = <$fh>)) {
 195         my $decoded_line = PMG::Utils::try_decode_utf8($line);
 196         $raw_header .= $decoded_line;
 197         chomp $decoded_line;
 198         push @$header, $decoded_line;
 199         last if $line =~ m/^\s*$/;
 200     }
 201
 202     my $head = MIME::Head->new($header);
 203
 204     my $cs = $head->mime_attr("content-type.charset");
 205
 206     my $bytes = 0;
 207
 208     while (defined(my $line = <$fh>)) {
 209         $bytes += length ($line);
 210         if ($cs) {
 211             $data .= decode($cs, $line);
 212         } else {
 213             $data .= $line;
 214         }
 215         if (defined($maxbytes) && ($bytes >= $maxbytes)) {
 216             $data .= "\n... mail truncated (> $maxbytes bytes)\n";
 217             last;
 218         }
 219     }
 220
 221     close($fh);
 222
 223     return ($raw_header, $data);
 224 }
 225
 226 my $read_part = sub {
 227     my ($part) = @_;
 228
 229     my $io = $part->open("r");
 230     return undef if !$io;
 231
 232     my $raw = '';
 233     while (defined(my $line = $io->getline)) { $raw .= $line; }
 234     $io->close;
 235
 236     return $raw;
 237 };
 238
 239 my $find_images = sub {
 240     my ($cid_hash, $entity) = @_;
 241
 242     foreach my $part ($entity->parts)  {
 243         if (my $rawcid = $part->head->get('Content-Id')) {
 244             if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
 245                 my $cid = $1;
 246                 my $ctype = $part->head->mime_attr('Content-type') // '';
 247                 if ($ctype =~ m!^image/!) {
 248                     if (defined(my $raw = $read_part->($part))) {
 249                         $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
 250                     }
 251                 }
 252             }
 253         }
 254     }
 255 };
 256
 257 sub entity_to_html {
 258     my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
 259
 260     my $mime_type = lc($entity->mime_type);;
 261
 262     if ($mime_type eq 'text/plain') {
 263         my $raw = $read_part->($entity) // '';
 264         my $html = "<pre>\n";
 265
 266         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 267             $html .= PMG::Utils::decode_to_html($cs, $raw);
 268         } else {
 269             $html .= encode_entities($raw);
 270         }
 271
 272         $html .= "</pre>\n";
 273
 274         return $html;
 275
 276     } elsif ($mime_type eq 'text/html') {
 277         my $raw = $read_part->($entity) // '';
 278
 279         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 280             eval { $raw = decode($cs, $raw); }; # ignore errors here
 281         }
 282
 283         # create a well formed tree
 284         my $tree = HTML::TreeBuilder->new();
 285         $tree->parse($raw);
 286         $tree->eof();
 287
 288         # normalizes html, replaces CID references with data uris and scrubs style tags
 289         my $whtml = dump_html($tree, $cid_hash, $viewimages);
 290         $tree->delete;
 291
 292         # remove dangerous/unneeded elements
 293         my $scrubber = getscrubber($viewimages, $allowhref);
 294         return $scrubber->scrub($whtml);
 295
 296     } elsif ($mime_type =~ m|^multipart/|i) {
 297         my $multi_part;
 298         my $html_part;
 299         my $text_part;
 300
 301         foreach my $part ($entity->parts)  {
 302             my $subtype = lc($part->mime_type);
 303             $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
 304             $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
 305             $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
 306         }
 307
 308         # get related/embedded images as data uris
 309         $find_images->($cid_hash, $entity);
 310
 311         my $alt = $multi_part || $html_part || $text_part;
 312
 313         return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
 314     }
 315
 316     return undef;
 317 }
 318
 319 sub email_to_html {
 320     my ($path, $raw, $viewimages, $allowhref) = @_;
 321
 322     my $dumpdir = "/tmp/.proxdumpview_$$";
 323
 324     my $html = '';
 325
 326     eval {
 327         if ($raw) {
 328
 329             my ($header, $content) = read_raw_email($path);
 330
 331             $html .= "<pre>\n" .
 332                 encode_entities($header) .
 333                 "\n" .
 334                 encode_entities($content) .
 335                 "</pre>\n";
 336
 337         } else {
 338
 339             my $parser = PMG::MIMEUtils::new_mime_parser({
 340                 dumpdir => $dumpdir,
 341             });
 342
 343             my $entity = $parser->parse_open($path);
 344
 345             PMG::MIMEUtils::fixup_multipart($entity);
 346
 347             $html = entity_to_html($entity, {}, $viewimages, $allowhref);
 348         }
 349     };
 350     my $err = $@;
 351
 352     rmtree $dumpdir;
 353
 354     die "unable to parse mail: $err" if $err;
 355
 356     return $html;
 357 }
 358
 359 1;