]> git.proxmox.com Git - pmg-api.git/blame - src/PMG/HTMLMail.pm
dkim: add QID in warnings
[pmg-api.git] / src / PMG / HTMLMail.pm
CommitLineData
08d6921a
DM
1package PMG::HTMLMail;
2
3use strict;
4use warnings;
5use Encode;
6use Data::Dumper;
7use MIME::Head;
8use File::Path;
9use HTML::Entities;
10use MIME::Parser;
11use MIME::Base64;
12use HTML::TreeBuilder;
13use HTML::Scrubber;
14
18598b2c
DC
15use PMG::Utils;
16use PMG::MIMEUtils;
17
d970f290
DC
18# $value is a ref to a string scalar
19my sub remove_urls {
20 my ($value) = @_;
d6bf3645
TL
21 return if !defined $$value;
22
d970f290
DC
23 # convert 'url([..])' to '___([..])' so the browser does not load it
24 $$value =~ s|url\(|___(|gi;
25
26 # similar for all protocols
27 $$value =~ s|[a-z0-9]+://|_|gi;
28}
29
30my sub remove_urls_from_attr {
31 my ($obj, $tag_name, $attr_name, $value) = @_;
32
33 remove_urls(\$value);
34
35 return $value;
36}
37
08d6921a 38sub dump_html {
d970f290 39 my ($tree, $cid_hash, $view_images) = @_;
08d6921a
DM
40
41 my @html = ();
42
66a0e574
TL
43 $tree->traverse(sub {
44 my ($node, $start, $depth) = @_;
45 if (ref $node) {
46 my $tag = $node->{'_tag'};
47
48 # try to open a new window when user activates a anchor
49 $node->{target} = '_blank' if $tag eq 'a';
50
51 if ($tag eq 'img' && $view_images) {
52 if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
53 if (my $datauri = $cid_hash->{$1}) {
54 $node->{src} = $datauri;
08d6921a
DM
55 }
56 }
66a0e574 57 }
08d6921a 58
66a0e574
TL
59 if ($tag eq 'style' && !$view_images) {
60 remove_urls($_) for grep { !ref $$_ } $node->content_refs_list();
61 }
d970f290 62
66a0e574
TL
63 if ($start) { # on the way in
64 push(@html, $node->starttag);
65 } else { # on the way out
66 push(@html, $node->endtag);
67 }
68 } else { # simple text content
69 # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
70 # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
71 if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) {
72 # That does magic things if $entities is undef.
73 $node = encode_entities($node);
08d6921a 74 }
66a0e574 75 push(@html, $node);
08d6921a 76 }
66a0e574
TL
77 return 1; # keep traversing
78 });
08d6921a
DM
79
80 return join('', @html, "\n");
81}
82
83sub getscrubber {
84 my ($viewimages, $allowhref) = @_;
85
86 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
87
98dac1e6 88 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
08d6921a 89
3f819452
DM
90 my @rules = ( script => 0 );
91
08d6921a
DM
92 my @default = (
93 0 => # default rule, deny all tags
94 {
95 '*' => 0, # default rule, deny all attributes
96 abbr => 1,
97 accept => 1,
98 'accept-charset' => 1,
99 accesskey => 1,
100 align => 1,
101 alt => 1,
102 axis => 1,
103 border => 1,
7cfe043a 104 bgcolor => 1,
08d6921a
DM
105 cellpadding => 1,
106 cellspacing => 1,
107 char => 1,
108 charoff => 1,
109 charset => 1,
110 checked => 1,
111 cite => 1,
112 class => 1,
113 clear => 1,
114 cols => 1,
115 colspan => 1,
116 color => 1,
117 compact => 1,
118 coords => 1,
119 datetime => 1,
120 dir => 1,
121 disabled => 1,
122 enctype => 1,
123 frame => 1,
124 headers => 1,
125 height => 1,
126 # only allow http:// and https:// hrefs
127 'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
128 hreflang => 1,
129 hspace => 1,
130 id => 1,
131 ismap => 1,
132 label => 1,
133 lang => 1,
134 longdesc => 1,
135 maxlength => 1,
136 media => 1,
137 method => 1,
138 multiple => 1,
139 name => 1,
140 nohref => 1,
141 noshade => 1,
142 nowrap => 1,
143 prompt => 1,
144 readonly => 1,
145 rel => 1,
146 rev => 1,
147 rows => 1,
148 rowspan => 1,
149 rules => 1,
150 scope => 1,
151 selected => 1,
152 shape => 1,
153 size => 1,
154 span => 1,
3f819452 155 src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
08d6921a 156 start => 1,
8a1a3461 157 style => $viewimages ? 1 : \&remove_urls_from_attr,
08d6921a
DM
158 summary => 1,
159 tabindex => 1,
160 target => 1,
161 title => 1,
162 type => 1,
163 usemap => 1,
164 valign => 1,
165 value => 1,
166 vspace => 1,
167 width => 1,
168 }
169 );
170
171 my $scrubber = HTML::Scrubber->new(
172 allow => \@allow,
173 rules => \@rules,
174 default => \@default,
175 comment => 0,
08d6921a
DM
176 process => 0,
177 );
178
98dac1e6
DM
179 $scrubber->style(1);
180
08d6921a
DM
181 return $scrubber;
182}
183
184sub read_raw_email {
b75acee7 185 my ($path, $maxbytes) = @_;
08d6921a
DM
186
187 open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
188
189 my $data = '';
190 my $raw_header = '';
191
192 # read header
193 my $header;
194 while (defined(my $line = <$fh>)) {
07f7275e
SI
195 my $decoded_line = PMG::Utils::try_decode_utf8($line);
196 $raw_header .= $decoded_line;
197 chomp $decoded_line;
198 push @$header, $decoded_line;
08d6921a
DM
199 last if $line =~ m/^\s*$/;
200 }
201
202 my $head = MIME::Head->new($header);
203
204 my $cs = $head->mime_attr("content-type.charset");
205
b75acee7
DM
206 my $bytes = 0;
207
08d6921a 208 while (defined(my $line = <$fh>)) {
b75acee7 209 $bytes += length ($line);
08d6921a
DM
210 if ($cs) {
211 $data .= decode($cs, $line);
212 } else {
213 $data .= $line;
214 }
b75acee7
DM
215 if (defined($maxbytes) && ($bytes >= $maxbytes)) {
216 $data .= "\n... mail truncated (> $maxbytes bytes)\n";
217 last;
218 }
08d6921a
DM
219 }
220
221 close($fh);
222
223 return ($raw_header, $data);
224}
225
226my $read_part = sub {
227 my ($part) = @_;
228
229 my $io = $part->open("r");
230 return undef if !$io;
231
232 my $raw = '';
233 while (defined(my $line = $io->getline)) { $raw .= $line; }
234 $io->close;
235
236 return $raw;
237};
238
239my $find_images = sub {
59424fd5 240 my ($cid_hash, $entity) = @_;
08d6921a
DM
241
242 foreach my $part ($entity->parts) {
243 if (my $rawcid = $part->head->get('Content-Id')) {
244 if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
245 my $cid = $1;
246 my $ctype = $part->head->mime_attr('Content-type') // '';
247 if ($ctype =~ m!^image/!) {
248 if (defined(my $raw = $read_part->($part))) {
59424fd5 249 $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
08d6921a
DM
250 }
251 }
252 }
253 }
254 }
08d6921a
DM
255};
256
257sub entity_to_html {
258 my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
259
260 my $mime_type = lc($entity->mime_type);;
261
262 if ($mime_type eq 'text/plain') {
263 my $raw = $read_part->($entity) // '';
264 my $html = "<pre>\n";
265
266 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
267 $html .= PMG::Utils::decode_to_html($cs, $raw);
268 } else {
269 $html .= encode_entities($raw);
270 }
271
272 $html .= "</pre>\n";
273
274 return $html;
275
276 } elsif ($mime_type eq 'text/html') {
277 my $raw = $read_part->($entity) // '';
278
279 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
280 eval { $raw = decode($cs, $raw); }; # ignore errors here
281 }
282
283 # create a well formed tree
284 my $tree = HTML::TreeBuilder->new();
285 $tree->parse($raw);
286 $tree->eof();
287
d970f290
DC
288 # normalizes html, replaces CID references with data uris and scrubs style tags
289 my $whtml = dump_html($tree, $cid_hash, $viewimages);
08d6921a
DM
290 $tree->delete;
291
292 # remove dangerous/unneeded elements
293 my $scrubber = getscrubber($viewimages, $allowhref);
294 return $scrubber->scrub($whtml);
295
296 } elsif ($mime_type =~ m|^multipart/|i) {
297 my $multi_part;
298 my $html_part;
299 my $text_part;
300
301 foreach my $part ($entity->parts) {
302 my $subtype = lc($part->mime_type);
303 $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
304 $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
305 $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
306 }
307
308 # get related/embedded images as data uris
59424fd5 309 $find_images->($cid_hash, $entity);
08d6921a
DM
310
311 my $alt = $multi_part || $html_part || $text_part;
312
59424fd5 313 return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
08d6921a
DM
314 }
315
316 return undef;
317}
318
319sub email_to_html {
320 my ($path, $raw, $viewimages, $allowhref) = @_;
321
322 my $dumpdir = "/tmp/.proxdumpview_$$";
323
324 my $html = '';
325
326 eval {
327 if ($raw) {
328
329 my ($header, $content) = read_raw_email($path);
330
331 $html .= "<pre>\n" .
332 encode_entities($header) .
333 "\n" .
334 encode_entities($content) .
335 "</pre>\n";
336
337 } else {
338
18598b2c
DC
339 my $parser = PMG::MIMEUtils::new_mime_parser({
340 dumpdir => $dumpdir,
341 });
08d6921a
DM
342
343 my $entity = $parser->parse_open($path);
344
18598b2c 345 PMG::MIMEUtils::fixup_multipart($entity);
08d6921a
DM
346
347 $html = entity_to_html($entity, {}, $viewimages, $allowhref);
348 }
349 };
350 my $err = $@;
351
352 rmtree $dumpdir;
353
354 die "unable to parse mail: $err" if $err;
355
356 return $html;
357}
358
3591;