PMG/HTMLMail.pm

   1 package PMG::HTMLMail;
   2
   3 use strict;
   4 use warnings;
   5 use Encode;
   6 use Data::Dumper;
   7 use MIME::Head;
   8 use File::Path;
   9 use HTML::Entities;
  10 use MIME::Parser;
  11 use MIME::Base64;
  12 use HTML::TreeBuilder;
  13 use HTML::Scrubber;
  14
  15 sub dump_html {
  16     my ($tree, $cid_hash) = @_;
  17
  18     my @html = ();
  19
  20     my($tag, $node, $start, $depth);
  21
  22     $tree->traverse(
  23         sub {
  24             ($node, $start) = @_;
  25             if(ref $node) {
  26                 $tag = $node->{'_tag'};
  27
  28                 # try to open a new window when user activates a anchor
  29                 $node->{target} = '_blank' if $tag eq 'a';
  30
  31                 if ($tag eq 'img') {
  32                     if ($node->{src} =~ m/^cid:(\S+)$/) {
  33                         if (my $datauri = $cid_hash->{$1}) {
  34                             $node->{src} = $datauri;
  35                         }
  36                     }
  37                 }
  38
  39                 if($start) { # on the way in
  40                     push(@html, $node->starttag);
  41                 } else {
  42                     # on the way out
  43                     push(@html, $node->endtag);
  44                 }
  45             } else {
  46                 # simple text content
  47                 $node = encode_entities($node)
  48                     # That does magic things if $entities is undef.
  49                     unless $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} };
  50                 # To keep from amp-escaping children of script et al.
  51                 # That doesn't deal with descendants; but then, CDATA
  52                 #  parents shouldn't /have/ descendants other than a
  53                 #  text children (or comments?)
  54                 push(@html, $node);
  55             }
  56             1; # keep traversing
  57         }
  58     );
  59
  60     return join('', @html, "\n");
  61 }
  62
  63 sub getscrubber {
  64     my ($viewimages, $allowhref) = @_;
  65
  66     # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
  67
  68     my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
  69
  70     my @rules = ( script => 0 );
  71
  72     my @default = (
  73         0 =>  # default rule, deny all tags
  74         {
  75             '*' => 0, # default rule, deny all attributes
  76             abbr => 1,
  77             accept => 1,
  78             'accept-charset' => 1,
  79             accesskey => 1,
  80             align => 1,
  81             alt => 1,
  82             axis => 1,
  83             border => 1,
  84             bgcolor => 1,
  85             cellpadding => 1,
  86             cellspacing => 1,
  87             char => 1,
  88             charoff => 1,
  89             charset => 1,
  90             checked => 1,
  91             cite => 1,
  92             class => 1,
  93             clear => 1,
  94             cols => 1,
  95             colspan => 1,
  96             color => 1,
  97             compact => 1,
  98             coords => 1,
  99             datetime => 1,
 100             dir => 1,
 101             disabled => 1,
 102             enctype => 1,
 103             frame => 1,
 104             headers => 1,
 105             height => 1,
 106             # only allow http:// and https:// hrefs
 107             'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
 108             hreflang => 1,
 109             hspace => 1,
 110             id => 1,
 111             ismap => 1,
 112             label => 1,
 113             lang => 1,
 114             longdesc => 1,
 115             maxlength => 1,
 116             media => 1,
 117             method => 1,
 118             multiple => 1,
 119             name => 1,
 120             nohref => 1,
 121             noshade => 1,
 122             nowrap => 1,
 123             prompt => 1,
 124             readonly => 1,
 125             rel => 1,
 126             rev => 1,
 127             rows => 1,
 128             rowspan => 1,
 129             rules => 1,
 130             scope => 1,
 131             selected => 1,
 132             shape => 1,
 133             size => 1,
 134             span => 1,
 135             src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
 136             start => 1,
 137             style => 1,
 138             summary => 1,
 139             tabindex => 1,
 140             target => 1,
 141             title => 1,
 142             type => 1,
 143             usemap => 1,
 144             valign => 1,
 145             value => 1,
 146             vspace => 1,
 147             width => 1,
 148         }
 149     );
 150
 151     my $scrubber = HTML::Scrubber->new(
 152         allow   => \@allow,
 153         rules   => \@rules,
 154         default => \@default,
 155         comment => 0,
 156         process => 0,
 157     );
 158
 159     $scrubber->style(1);
 160
 161     return $scrubber;
 162 }
 163
 164 sub read_raw_email {
 165     my ($path, $maxbytes) = @_;
 166
 167     open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
 168
 169     my $data = '';
 170     my $raw_header = '';
 171
 172     # read header
 173     my $header;
 174     while (defined(my $line = <$fh>)) {
 175         $raw_header .= $line;
 176         chomp $line;
 177         push @$header, $line;
 178         last if $line =~ m/^\s*$/;
 179     }
 180
 181     my $head = MIME::Head->new($header);
 182
 183     my $cs = $head->mime_attr("content-type.charset");
 184
 185     my $bytes = 0;
 186
 187     while (defined(my $line = <$fh>)) {
 188         $bytes += length ($line);
 189         if ($cs) {
 190             $data .= decode($cs, $line);
 191         } else {
 192             $data .= $line;
 193         }
 194         if (defined($maxbytes) && ($bytes >= $maxbytes)) {
 195             $data .= "\n... mail truncated (> $maxbytes bytes)\n";
 196             last;
 197         }
 198     }
 199
 200     close($fh);
 201
 202     return ($raw_header, $data);
 203 }
 204
 205 my $read_part = sub {
 206     my ($part) = @_;
 207
 208     my $io = $part->open("r");
 209     return undef if !$io;
 210
 211     my $raw = '';
 212     while (defined(my $line = $io->getline)) { $raw .= $line; }
 213     $io->close;
 214
 215     return $raw;
 216 };
 217
 218 my $find_images = sub {
 219     my ($cid_hash, $entity) = @_;
 220
 221     foreach my $part ($entity->parts)  {
 222         if (my $rawcid = $part->head->get('Content-Id')) {
 223             if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
 224                 my $cid = $1;
 225                 my $ctype = $part->head->mime_attr('Content-type') // '';
 226                 if ($ctype =~ m!^image/!) {
 227                     if (defined(my $raw = $read_part->($part))) {
 228                         $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
 229                     }
 230                 }
 231             }
 232         }
 233     }
 234 };
 235
 236 sub entity_to_html {
 237     my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
 238
 239     my $mime_type = lc($entity->mime_type);;
 240
 241     if ($mime_type eq 'text/plain') {
 242         my $raw = $read_part->($entity) // '';
 243         my $html = "<pre>\n";
 244
 245         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 246             $html .= PMG::Utils::decode_to_html($cs, $raw);
 247         } else {
 248             $html .= encode_entities($raw);
 249         }
 250
 251         $html .= "</pre>\n";
 252
 253         return $html;
 254
 255     } elsif ($mime_type eq 'text/html') {
 256         my $raw = $read_part->($entity) // '';
 257
 258         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 259             eval { $raw = decode($cs, $raw); }; # ignore errors here
 260         }
 261
 262         # create a well formed tree
 263         my $tree = HTML::TreeBuilder->new();
 264         $tree->parse($raw);
 265         $tree->eof();
 266
 267         my $whtml = dump_html($tree, $viewimages ? $cid_hash : {});
 268         $tree->delete;
 269
 270         # remove dangerous/unneeded elements
 271         my $scrubber = getscrubber($viewimages, $allowhref);
 272         return $scrubber->scrub($whtml);
 273
 274     } elsif ($mime_type =~ m|^multipart/|i) {
 275         my $multi_part;
 276         my $html_part;
 277         my $text_part;
 278
 279         foreach my $part ($entity->parts)  {
 280             my $subtype = lc($part->mime_type);
 281             $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
 282             $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
 283             $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
 284         }
 285
 286         # get related/embedded images as data uris
 287         $find_images->($cid_hash, $entity);
 288
 289         my $alt = $multi_part || $html_part || $text_part;
 290
 291         return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
 292     }
 293
 294     return undef;
 295 }
 296
 297 sub email_to_html {
 298     my ($path, $raw, $viewimages, $allowhref) = @_;
 299
 300     my $dumpdir = "/tmp/.proxdumpview_$$";
 301
 302     my $html = '';
 303
 304     eval {
 305         if ($raw) {
 306
 307             my ($header, $content) = read_raw_email($path);
 308
 309             $html .= "<pre>\n" .
 310                 encode_entities($header) .
 311                 "\n" .
 312                 encode_entities($content) .
 313                 "</pre>\n";
 314
 315         } else {
 316
 317             my $parser = new MIME::Parser;
 318             $parser->extract_nested_messages(0);
 319
 320             rmtree $dumpdir;
 321
 322             # Create and set the output directory:
 323             (-d $dumpdir || mkdir($dumpdir ,0755)) ||
 324                 die "can't create $dumpdir: $! : ERROR";
 325             (-w $dumpdir) ||
 326                 die "can't write to directory $dumpdir: $! : ERROR";
 327
 328             $parser->output_dir($dumpdir);
 329
 330             my $entity = $parser->parse_open($path);
 331
 332             # bug fix for bin/tests/content/mimeparser.txt
 333             if ($entity->mime_type =~ m|multipart/|i && !$entity->head->multipart_boundary) {
 334                 $entity->head->mime_attr('Content-type' => "application/x-unparseable-multipart");
 335             }
 336
 337             $html = entity_to_html($entity, {}, $viewimages, $allowhref);
 338         }
 339     };
 340     my $err = $@;
 341
 342     rmtree $dumpdir;
 343
 344     die "unable to parse mail: $err" if $err;
 345
 346     return $html;
 347 }
 348
 349 1;