]> git.proxmox.com Git - pmg-api.git/blame - src/PMG/HTMLMail.pm
followup: indentation and description improvement
[pmg-api.git] / src / PMG / HTMLMail.pm
CommitLineData
08d6921a
DM
1package PMG::HTMLMail;
2
3use strict;
4use warnings;
5use Encode;
6use Data::Dumper;
7use MIME::Head;
8use File::Path;
9use HTML::Entities;
10use MIME::Parser;
11use MIME::Base64;
12use HTML::TreeBuilder;
13use HTML::Scrubber;
14
18598b2c
DC
15use PMG::Utils;
16use PMG::MIMEUtils;
17
08d6921a
DM
18sub dump_html {
19 my ($tree, $cid_hash) = @_;
20
21 my @html = ();
22
23 my($tag, $node, $start, $depth);
24
25 $tree->traverse(
26 sub {
27 ($node, $start) = @_;
28 if(ref $node) {
29 $tag = $node->{'_tag'};
30
31 # try to open a new window when user activates a anchor
32 $node->{target} = '_blank' if $tag eq 'a';
33
34 if ($tag eq 'img') {
35 if ($node->{src} =~ m/^cid:(\S+)$/) {
36 if (my $datauri = $cid_hash->{$1}) {
37 $node->{src} = $datauri;
38 }
39 }
40 }
41
42 if($start) { # on the way in
43 push(@html, $node->starttag);
44 } else {
45 # on the way out
46 push(@html, $node->endtag);
47 }
48 } else {
49 # simple text content
50 $node = encode_entities($node)
51 # That does magic things if $entities is undef.
52 unless $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} };
53 # To keep from amp-escaping children of script et al.
54 # That doesn't deal with descendants; but then, CDATA
55 # parents shouldn't /have/ descendants other than a
56 # text children (or comments?)
57 push(@html, $node);
58 }
59 1; # keep traversing
60 }
61 );
62
63 return join('', @html, "\n");
64}
65
66sub getscrubber {
67 my ($viewimages, $allowhref) = @_;
68
69 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
70
98dac1e6 71 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
08d6921a 72
3f819452
DM
73 my @rules = ( script => 0 );
74
08d6921a
DM
75 my @default = (
76 0 => # default rule, deny all tags
77 {
78 '*' => 0, # default rule, deny all attributes
79 abbr => 1,
80 accept => 1,
81 'accept-charset' => 1,
82 accesskey => 1,
83 align => 1,
84 alt => 1,
85 axis => 1,
86 border => 1,
7cfe043a 87 bgcolor => 1,
08d6921a
DM
88 cellpadding => 1,
89 cellspacing => 1,
90 char => 1,
91 charoff => 1,
92 charset => 1,
93 checked => 1,
94 cite => 1,
95 class => 1,
96 clear => 1,
97 cols => 1,
98 colspan => 1,
99 color => 1,
100 compact => 1,
101 coords => 1,
102 datetime => 1,
103 dir => 1,
104 disabled => 1,
105 enctype => 1,
106 frame => 1,
107 headers => 1,
108 height => 1,
109 # only allow http:// and https:// hrefs
110 'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
111 hreflang => 1,
112 hspace => 1,
113 id => 1,
114 ismap => 1,
115 label => 1,
116 lang => 1,
117 longdesc => 1,
118 maxlength => 1,
119 media => 1,
120 method => 1,
121 multiple => 1,
122 name => 1,
123 nohref => 1,
124 noshade => 1,
125 nowrap => 1,
126 prompt => 1,
127 readonly => 1,
128 rel => 1,
129 rev => 1,
130 rows => 1,
131 rowspan => 1,
132 rules => 1,
133 scope => 1,
134 selected => 1,
135 shape => 1,
136 size => 1,
137 span => 1,
3f819452 138 src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
08d6921a 139 start => 1,
98dac1e6 140 style => 1,
08d6921a
DM
141 summary => 1,
142 tabindex => 1,
143 target => 1,
144 title => 1,
145 type => 1,
146 usemap => 1,
147 valign => 1,
148 value => 1,
149 vspace => 1,
150 width => 1,
151 }
152 );
153
154 my $scrubber = HTML::Scrubber->new(
155 allow => \@allow,
156 rules => \@rules,
157 default => \@default,
158 comment => 0,
08d6921a
DM
159 process => 0,
160 );
161
98dac1e6
DM
162 $scrubber->style(1);
163
08d6921a
DM
164 return $scrubber;
165}
166
167sub read_raw_email {
b75acee7 168 my ($path, $maxbytes) = @_;
08d6921a
DM
169
170 open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
171
172 my $data = '';
173 my $raw_header = '';
174
175 # read header
176 my $header;
177 while (defined(my $line = <$fh>)) {
178 $raw_header .= $line;
179 chomp $line;
180 push @$header, $line;
181 last if $line =~ m/^\s*$/;
182 }
183
184 my $head = MIME::Head->new($header);
185
186 my $cs = $head->mime_attr("content-type.charset");
187
b75acee7
DM
188 my $bytes = 0;
189
08d6921a 190 while (defined(my $line = <$fh>)) {
b75acee7 191 $bytes += length ($line);
08d6921a
DM
192 if ($cs) {
193 $data .= decode($cs, $line);
194 } else {
195 $data .= $line;
196 }
b75acee7
DM
197 if (defined($maxbytes) && ($bytes >= $maxbytes)) {
198 $data .= "\n... mail truncated (> $maxbytes bytes)\n";
199 last;
200 }
08d6921a
DM
201 }
202
203 close($fh);
204
205 return ($raw_header, $data);
206}
207
208my $read_part = sub {
209 my ($part) = @_;
210
211 my $io = $part->open("r");
212 return undef if !$io;
213
214 my $raw = '';
215 while (defined(my $line = $io->getline)) { $raw .= $line; }
216 $io->close;
217
218 return $raw;
219};
220
221my $find_images = sub {
59424fd5 222 my ($cid_hash, $entity) = @_;
08d6921a
DM
223
224 foreach my $part ($entity->parts) {
225 if (my $rawcid = $part->head->get('Content-Id')) {
226 if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
227 my $cid = $1;
228 my $ctype = $part->head->mime_attr('Content-type') // '';
229 if ($ctype =~ m!^image/!) {
230 if (defined(my $raw = $read_part->($part))) {
59424fd5 231 $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
08d6921a
DM
232 }
233 }
234 }
235 }
236 }
08d6921a
DM
237};
238
239sub entity_to_html {
240 my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
241
242 my $mime_type = lc($entity->mime_type);;
243
244 if ($mime_type eq 'text/plain') {
245 my $raw = $read_part->($entity) // '';
246 my $html = "<pre>\n";
247
248 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
249 $html .= PMG::Utils::decode_to_html($cs, $raw);
250 } else {
251 $html .= encode_entities($raw);
252 }
253
254 $html .= "</pre>\n";
255
256 return $html;
257
258 } elsif ($mime_type eq 'text/html') {
259 my $raw = $read_part->($entity) // '';
260
261 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
262 eval { $raw = decode($cs, $raw); }; # ignore errors here
263 }
264
265 # create a well formed tree
266 my $tree = HTML::TreeBuilder->new();
267 $tree->parse($raw);
268 $tree->eof();
269
270 my $whtml = dump_html($tree, $viewimages ? $cid_hash : {});
271 $tree->delete;
272
273 # remove dangerous/unneeded elements
274 my $scrubber = getscrubber($viewimages, $allowhref);
275 return $scrubber->scrub($whtml);
276
277 } elsif ($mime_type =~ m|^multipart/|i) {
278 my $multi_part;
279 my $html_part;
280 my $text_part;
281
282 foreach my $part ($entity->parts) {
283 my $subtype = lc($part->mime_type);
284 $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
285 $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
286 $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
287 }
288
289 # get related/embedded images as data uris
59424fd5 290 $find_images->($cid_hash, $entity);
08d6921a
DM
291
292 my $alt = $multi_part || $html_part || $text_part;
293
59424fd5 294 return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
08d6921a
DM
295 }
296
297 return undef;
298}
299
300sub email_to_html {
301 my ($path, $raw, $viewimages, $allowhref) = @_;
302
303 my $dumpdir = "/tmp/.proxdumpview_$$";
304
305 my $html = '';
306
307 eval {
308 if ($raw) {
309
310 my ($header, $content) = read_raw_email($path);
311
312 $html .= "<pre>\n" .
313 encode_entities($header) .
314 "\n" .
315 encode_entities($content) .
316 "</pre>\n";
317
318 } else {
319
18598b2c
DC
320 my $parser = PMG::MIMEUtils::new_mime_parser({
321 dumpdir => $dumpdir,
322 });
08d6921a
DM
323
324 my $entity = $parser->parse_open($path);
325
18598b2c 326 PMG::MIMEUtils::fixup_multipart($entity);
08d6921a
DM
327
328 $html = entity_to_html($entity, {}, $viewimages, $allowhref);
329 }
330 };
331 my $err = $@;
332
333 rmtree $dumpdir;
334
335 die "unable to parse mail: $err" if $err;
336
337 return $html;
338}
339
3401;