]> git.proxmox.com Git - pmg-api.git/blame - src/PMG/HTMLMail.pm
subscription: handle missing subscription info
[pmg-api.git] / src / PMG / HTMLMail.pm
CommitLineData
08d6921a
DM
1package PMG::HTMLMail;
2
3use strict;
4use warnings;
5use Encode;
6use Data::Dumper;
7use MIME::Head;
8use File::Path;
9use HTML::Entities;
10use MIME::Parser;
11use MIME::Base64;
12use HTML::TreeBuilder;
13use HTML::Scrubber;
14
18598b2c
DC
15use PMG::Utils;
16use PMG::MIMEUtils;
17
d970f290
DC
18# $value is a ref to a string scalar
19my sub remove_urls {
20 my ($value) = @_;
d6bf3645
TL
21 return if !defined $$value;
22
d970f290
DC
23 # convert 'url([..])' to '___([..])' so the browser does not load it
24 $$value =~ s|url\(|___(|gi;
25
26 # similar for all protocols
27 $$value =~ s|[a-z0-9]+://|_|gi;
28}
29
30my sub remove_urls_from_attr {
31 my ($obj, $tag_name, $attr_name, $value) = @_;
32
33 remove_urls(\$value);
34
35 return $value;
36}
37
08d6921a 38sub dump_html {
d970f290 39 my ($tree, $cid_hash, $view_images) = @_;
08d6921a
DM
40
41 my @html = ();
42
66a0e574
TL
43 $tree->traverse(sub {
44 my ($node, $start, $depth) = @_;
45 if (ref $node) {
46 my $tag = $node->{'_tag'};
47
48 # try to open a new window when user activates a anchor
49 $node->{target} = '_blank' if $tag eq 'a';
50
51 if ($tag eq 'img' && $view_images) {
52 if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
53 if (my $datauri = $cid_hash->{$1}) {
54 $node->{src} = $datauri;
08d6921a
DM
55 }
56 }
66a0e574 57 }
08d6921a 58
66a0e574
TL
59 if ($tag eq 'style' && !$view_images) {
60 remove_urls($_) for grep { !ref $$_ } $node->content_refs_list();
61 }
d970f290 62
66a0e574
TL
63 if ($start) { # on the way in
64 push(@html, $node->starttag);
65 } else { # on the way out
66 push(@html, $node->endtag);
67 }
68 } else { # simple text content
69 # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
70 # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
71 if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) {
72 # That does magic things if $entities is undef.
73 $node = encode_entities($node);
08d6921a 74 }
66a0e574 75 push(@html, $node);
08d6921a 76 }
66a0e574
TL
77 return 1; # keep traversing
78 });
08d6921a
DM
79
80 return join('', @html, "\n");
81}
82
83sub getscrubber {
84 my ($viewimages, $allowhref) = @_;
85
86 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
87
98dac1e6 88 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
08d6921a 89
3f819452
DM
90 my @rules = ( script => 0 );
91
08d6921a
DM
92 my @default = (
93 0 => # default rule, deny all tags
94 {
95 '*' => 0, # default rule, deny all attributes
96 abbr => 1,
97 accept => 1,
98 'accept-charset' => 1,
99 accesskey => 1,
100 align => 1,
101 alt => 1,
102 axis => 1,
103 border => 1,
7cfe043a 104 bgcolor => 1,
08d6921a
DM
105 cellpadding => 1,
106 cellspacing => 1,
107 char => 1,
108 charoff => 1,
109 charset => 1,
110 checked => 1,
111 cite => 1,
112 class => 1,
113 clear => 1,
114 cols => 1,
115 colspan => 1,
116 color => 1,
117 compact => 1,
118 coords => 1,
119 datetime => 1,
120 dir => 1,
121 disabled => 1,
122 enctype => 1,
123 frame => 1,
124 headers => 1,
125 height => 1,
126 # only allow http:// and https:// hrefs
127 'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
128 hreflang => 1,
129 hspace => 1,
130 id => 1,
131 ismap => 1,
132 label => 1,
133 lang => 1,
134 longdesc => 1,
135 maxlength => 1,
136 media => 1,
137 method => 1,
138 multiple => 1,
139 name => 1,
140 nohref => 1,
141 noshade => 1,
142 nowrap => 1,
143 prompt => 1,
144 readonly => 1,
145 rel => 1,
146 rev => 1,
147 rows => 1,
148 rowspan => 1,
149 rules => 1,
150 scope => 1,
151 selected => 1,
152 shape => 1,
153 size => 1,
154 span => 1,
3f819452 155 src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
08d6921a 156 start => 1,
8a1a3461 157 style => $viewimages ? 1 : \&remove_urls_from_attr,
08d6921a
DM
158 summary => 1,
159 tabindex => 1,
160 target => 1,
161 title => 1,
162 type => 1,
163 usemap => 1,
164 valign => 1,
165 value => 1,
166 vspace => 1,
167 width => 1,
168 }
169 );
170
171 my $scrubber = HTML::Scrubber->new(
172 allow => \@allow,
173 rules => \@rules,
174 default => \@default,
175 comment => 0,
08d6921a
DM
176 process => 0,
177 );
178
98dac1e6
DM
179 $scrubber->style(1);
180
08d6921a
DM
181 return $scrubber;
182}
183
184sub read_raw_email {
b75acee7 185 my ($path, $maxbytes) = @_;
08d6921a
DM
186
187 open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
188
189 my $data = '';
190 my $raw_header = '';
191
192 # read header
193 my $header;
194 while (defined(my $line = <$fh>)) {
195 $raw_header .= $line;
196 chomp $line;
197 push @$header, $line;
198 last if $line =~ m/^\s*$/;
199 }
200
201 my $head = MIME::Head->new($header);
202
203 my $cs = $head->mime_attr("content-type.charset");
204
b75acee7
DM
205 my $bytes = 0;
206
08d6921a 207 while (defined(my $line = <$fh>)) {
b75acee7 208 $bytes += length ($line);
08d6921a
DM
209 if ($cs) {
210 $data .= decode($cs, $line);
211 } else {
212 $data .= $line;
213 }
b75acee7
DM
214 if (defined($maxbytes) && ($bytes >= $maxbytes)) {
215 $data .= "\n... mail truncated (> $maxbytes bytes)\n";
216 last;
217 }
08d6921a
DM
218 }
219
220 close($fh);
221
222 return ($raw_header, $data);
223}
224
225my $read_part = sub {
226 my ($part) = @_;
227
228 my $io = $part->open("r");
229 return undef if !$io;
230
231 my $raw = '';
232 while (defined(my $line = $io->getline)) { $raw .= $line; }
233 $io->close;
234
235 return $raw;
236};
237
238my $find_images = sub {
59424fd5 239 my ($cid_hash, $entity) = @_;
08d6921a
DM
240
241 foreach my $part ($entity->parts) {
242 if (my $rawcid = $part->head->get('Content-Id')) {
243 if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
244 my $cid = $1;
245 my $ctype = $part->head->mime_attr('Content-type') // '';
246 if ($ctype =~ m!^image/!) {
247 if (defined(my $raw = $read_part->($part))) {
59424fd5 248 $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
08d6921a
DM
249 }
250 }
251 }
252 }
253 }
08d6921a
DM
254};
255
256sub entity_to_html {
257 my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
258
259 my $mime_type = lc($entity->mime_type);;
260
261 if ($mime_type eq 'text/plain') {
262 my $raw = $read_part->($entity) // '';
263 my $html = "<pre>\n";
264
265 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
266 $html .= PMG::Utils::decode_to_html($cs, $raw);
267 } else {
268 $html .= encode_entities($raw);
269 }
270
271 $html .= "</pre>\n";
272
273 return $html;
274
275 } elsif ($mime_type eq 'text/html') {
276 my $raw = $read_part->($entity) // '';
277
278 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
279 eval { $raw = decode($cs, $raw); }; # ignore errors here
280 }
281
282 # create a well formed tree
283 my $tree = HTML::TreeBuilder->new();
284 $tree->parse($raw);
285 $tree->eof();
286
d970f290
DC
287 # normalizes html, replaces CID references with data uris and scrubs style tags
288 my $whtml = dump_html($tree, $cid_hash, $viewimages);
08d6921a
DM
289 $tree->delete;
290
291 # remove dangerous/unneeded elements
292 my $scrubber = getscrubber($viewimages, $allowhref);
293 return $scrubber->scrub($whtml);
294
295 } elsif ($mime_type =~ m|^multipart/|i) {
296 my $multi_part;
297 my $html_part;
298 my $text_part;
299
300 foreach my $part ($entity->parts) {
301 my $subtype = lc($part->mime_type);
302 $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
303 $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
304 $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
305 }
306
307 # get related/embedded images as data uris
59424fd5 308 $find_images->($cid_hash, $entity);
08d6921a
DM
309
310 my $alt = $multi_part || $html_part || $text_part;
311
59424fd5 312 return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
08d6921a
DM
313 }
314
315 return undef;
316}
317
318sub email_to_html {
319 my ($path, $raw, $viewimages, $allowhref) = @_;
320
321 my $dumpdir = "/tmp/.proxdumpview_$$";
322
323 my $html = '';
324
325 eval {
326 if ($raw) {
327
328 my ($header, $content) = read_raw_email($path);
329
330 $html .= "<pre>\n" .
331 encode_entities($header) .
332 "\n" .
333 encode_entities($content) .
334 "</pre>\n";
335
336 } else {
337
18598b2c
DC
338 my $parser = PMG::MIMEUtils::new_mime_parser({
339 dumpdir => $dumpdir,
340 });
08d6921a
DM
341
342 my $entity = $parser->parse_open($path);
343
18598b2c 344 PMG::MIMEUtils::fixup_multipart($entity);
08d6921a
DM
345
346 $html = entity_to_html($entity, {}, $viewimages, $allowhref);
347 }
348 };
349 my $err = $@;
350
351 rmtree $dumpdir;
352
353 die "unable to parse mail: $err" if $err;
354
355 return $html;
356}
357
3581;