PMG/HTMLMail.pm

   1 package PMG::HTMLMail;
   2
   3 use strict;
   4 use warnings;
   5 use Encode;
   6 use Data::Dumper;
   7 use MIME::Head;
   8 use File::Path;
   9 use HTML::Entities;
  10 use MIME::Parser;
  11 use MIME::Base64;
  12 use HTML::TreeBuilder;
  13 use HTML::Scrubber;
  14
  15 sub dump_html {
  16     my ($tree, $cid_hash) = @_;
  17
  18     my @html = ();
  19
  20     my($tag, $node, $start, $depth);
  21
  22     $tree->traverse(
  23         sub {
  24             ($node, $start) = @_;
  25             if(ref $node) {
  26                 $tag = $node->{'_tag'};
  27
  28                 # try to open a new window when user activates a anchor
  29                 $node->{target} = '_blank' if $tag eq 'a';
  30
  31                 if ($tag eq 'img') {
  32                     if ($node->{src} =~ m/^cid:(\S+)$/) {
  33                         if (my $datauri = $cid_hash->{$1}) {
  34                             $node->{src} = $datauri;
  35                         }
  36                     }
  37                 }
  38
  39                 if($start) { # on the way in
  40                     push(@html, $node->starttag);
  41                 } else {
  42                     # on the way out
  43                     push(@html, $node->endtag);
  44                 }
  45             } else {
  46                 # simple text content
  47                 $node = encode_entities($node)
  48                     # That does magic things if $entities is undef.
  49                     unless $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} };
  50                 # To keep from amp-escaping children of script et al.
  51                 # That doesn't deal with descendants; but then, CDATA
  52                 #  parents shouldn't /have/ descendants other than a
  53                 #  text children (or comments?)
  54                 push(@html, $node);
  55             }
  56             1; # keep traversing
  57         }
  58     );
  59
  60     return join('', @html, "\n");
  61 }
  62
  63 sub getscrubber {
  64     my ($viewimages, $allowhref) = @_;
  65
  66     # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
  67
  68     my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
  69
  70     my @rules = ( script => 0 );
  71
  72     my @default = (
  73         0 =>  # default rule, deny all tags
  74         {
  75             '*' => 0, # default rule, deny all attributes
  76             abbr => 1,
  77             accept => 1,
  78             'accept-charset' => 1,
  79             accesskey => 1,
  80             align => 1,
  81             alt => 1,
  82             axis => 1,
  83             border => 1,
  84             bgcolor => 1,
  85             cellpadding => 1,
  86             cellspacing => 1,
  87             char => 1,
  88             charoff => 1,
  89             charset => 1,
  90             checked => 1,
  91             cite => 1,
  92             class => 1,
  93             clear => 1,
  94             cols => 1,
  95             colspan => 1,
  96             color => 1,
  97             compact => 1,
  98             coords => 1,
  99             datetime => 1,
 100             dir => 1,
 101             disabled => 1,
 102             enctype => 1,
 103             frame => 1,
 104             headers => 1,
 105             height => 1,
 106             # only allow http:// and https:// hrefs
 107             'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
 108             hreflang => 1,
 109             hspace => 1,
 110             id => 1,
 111             ismap => 1,
 112             label => 1,
 113             lang => 1,
 114             longdesc => 1,
 115             maxlength => 1,
 116             media => 1,
 117             method => 1,
 118             multiple => 1,
 119             name => 1,
 120             nohref => 1,
 121             noshade => 1,
 122             nowrap => 1,
 123             prompt => 1,
 124             readonly => 1,
 125             rel => 1,
 126             rev => 1,
 127             rows => 1,
 128             rowspan => 1,
 129             rules => 1,
 130             scope => 1,
 131             selected => 1,
 132             shape => 1,
 133             size => 1,
 134             span => 1,
 135             src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
 136             start => 1,
 137             style => 1,
 138             summary => 1,
 139             tabindex => 1,
 140             target => 1,
 141             title => 1,
 142             type => 1,
 143             usemap => 1,
 144             valign => 1,
 145             value => 1,
 146             vspace => 1,
 147             width => 1,
 148         }
 149     );
 150
 151     my $scrubber = HTML::Scrubber->new(
 152         allow   => \@allow,
 153         rules   => \@rules,
 154         default => \@default,
 155         comment => 0,
 156         process => 0,
 157     );
 158
 159     $scrubber->style(1);
 160
 161     return $scrubber;
 162 }
 163
 164 sub read_raw_email {
 165     my ($path, $maxbytes) = @_;
 166
 167     open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
 168
 169     my $data = '';
 170     my $raw_header = '';
 171
 172     # read header
 173     my $header;
 174     while (defined(my $line = <$fh>)) {
 175         $raw_header .= $line;
 176         chomp $line;
 177         push @$header, $line;
 178         last if $line =~ m/^\s*$/;
 179     }
 180
 181     my $head = MIME::Head->new($header);
 182
 183     my $cs = $head->mime_attr("content-type.charset");
 184
 185     my $bytes = 0;
 186
 187     while (defined(my $line = <$fh>)) {
 188         $bytes += length ($line);
 189         if ($cs) {
 190             $data .= decode($cs, $line);
 191         } else {
 192             $data .= $line;
 193         }
 194         if (defined($maxbytes) && ($bytes >= $maxbytes)) {
 195             $data .= "\n... mail truncated (> $maxbytes bytes)\n";
 196             last;
 197         }
 198     }
 199
 200     close($fh);
 201
 202     return ($raw_header, $data);
 203 }
 204
 205 my $read_part = sub {
 206     my ($part) = @_;
 207
 208     my $io = $part->open("r");
 209     return undef if !$io;
 210
 211     my $raw = '';
 212     while (defined(my $line = $io->getline)) { $raw .= $line; }
 213     $io->close;
 214
 215     return $raw;
 216 };
 217
 218 my $find_images = sub {
 219     my ($entity) = @_;
 220
 221     my $res = {};
 222
 223     foreach my $part ($entity->parts)  {
 224         if (my $rawcid = $part->head->get('Content-Id')) {
 225             if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
 226                 my $cid = $1;
 227                 my $ctype = $part->head->mime_attr('Content-type') // '';
 228                 if ($ctype =~ m!^image/!) {
 229                     if (defined(my $raw = $read_part->($part))) {
 230                         $res->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
 231                     }
 232                 }
 233             }
 234         }
 235     }
 236
 237     return $res;
 238 };
 239
 240 sub entity_to_html {
 241     my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
 242
 243     my $mime_type = lc($entity->mime_type);;
 244
 245     if ($mime_type eq 'text/plain') {
 246         my $raw = $read_part->($entity) // '';
 247         my $html = "<pre>\n";
 248
 249         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 250             $html .= PMG::Utils::decode_to_html($cs, $raw);
 251         } else {
 252             $html .= encode_entities($raw);
 253         }
 254
 255         $html .= "</pre>\n";
 256
 257         return $html;
 258
 259     } elsif ($mime_type eq 'text/html') {
 260         my $raw = $read_part->($entity) // '';
 261
 262         if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
 263             eval { $raw = decode($cs, $raw); }; # ignore errors here
 264         }
 265
 266         # create a well formed tree
 267         my $tree = HTML::TreeBuilder->new();
 268         $tree->parse($raw);
 269         $tree->eof();
 270
 271         my $whtml = dump_html($tree, $viewimages ? $cid_hash : {});
 272         $tree->delete;
 273
 274         # remove dangerous/unneeded elements
 275         my $scrubber = getscrubber($viewimages, $allowhref);
 276         return $scrubber->scrub($whtml);
 277
 278     } elsif ($mime_type =~ m|^multipart/|i) {
 279         my $multi_part;
 280         my $html_part;
 281         my $text_part;
 282
 283         foreach my $part ($entity->parts)  {
 284             my $subtype = lc($part->mime_type);
 285             $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
 286             $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
 287             $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
 288         }
 289
 290         # get related/embedded images as data uris
 291         my $cid_hash = $find_images->($entity);
 292
 293         my $alt = $multi_part || $html_part || $text_part;
 294
 295         return entity_to_html ($alt, $cid_hash, $viewimages, $allowhref) if $alt;
 296     }
 297
 298     return undef;
 299 }
 300
 301 sub email_to_html {
 302     my ($path, $raw, $viewimages, $allowhref) = @_;
 303
 304     my $dumpdir = "/tmp/.proxdumpview_$$";
 305
 306     my $html = '';
 307
 308     eval {
 309         if ($raw) {
 310
 311             my ($header, $content) = read_raw_email($path);
 312
 313             $html .= "<pre>\n" .
 314                 encode_entities($header) .
 315                 "\n" .
 316                 encode_entities($content) .
 317                 "</pre>\n";
 318
 319         } else {
 320
 321             my $parser = new MIME::Parser;
 322             $parser->extract_nested_messages(0);
 323
 324             rmtree $dumpdir;
 325
 326             # Create and set the output directory:
 327             (-d $dumpdir || mkdir($dumpdir ,0755)) ||
 328                 die "can't create $dumpdir: $! : ERROR";
 329             (-w $dumpdir) ||
 330                 die "can't write to directory $dumpdir: $! : ERROR";
 331
 332             $parser->output_dir($dumpdir);
 333
 334             my $entity = $parser->parse_open($path);
 335
 336             # bug fix for bin/tests/content/mimeparser.txt
 337             if ($entity->mime_type =~ m|multipart/|i && !$entity->head->multipart_boundary) {
 338                 $entity->head->mime_attr('Content-type' => "application/x-unparseable-multipart");
 339             }
 340
 341             $html = entity_to_html($entity, {}, $viewimages, $allowhref);
 342         }
 343     };
 344     my $err = $@;
 345
 346     rmtree $dumpdir;
 347
 348     die "unable to parse mail: $err" if $err;
 349
 350     return $html;
 351 }
 352
 353 1;