]> git.proxmox.com Git - pmg-api.git/blob - PMG/HTMLMail.pm
close #1917: add pmg-system-report command
[pmg-api.git] / PMG / HTMLMail.pm
1 package PMG::HTMLMail;
2
3 use strict;
4 use warnings;
5 use Encode;
6 use Data::Dumper;
7 use MIME::Head;
8 use File::Path;
9 use HTML::Entities;
10 use MIME::Parser;
11 use MIME::Base64;
12 use HTML::TreeBuilder;
13 use HTML::Scrubber;
14
15 sub dump_html {
16 my ($tree, $cid_hash) = @_;
17
18 my @html = ();
19
20 my($tag, $node, $start, $depth);
21
22 $tree->traverse(
23 sub {
24 ($node, $start) = @_;
25 if(ref $node) {
26 $tag = $node->{'_tag'};
27
28 # try to open a new window when user activates a anchor
29 $node->{target} = '_blank' if $tag eq 'a';
30
31 if ($tag eq 'img') {
32 if ($node->{src} =~ m/^cid:(\S+)$/) {
33 if (my $datauri = $cid_hash->{$1}) {
34 $node->{src} = $datauri;
35 }
36 }
37 }
38
39 if($start) { # on the way in
40 push(@html, $node->starttag);
41 } else {
42 # on the way out
43 push(@html, $node->endtag);
44 }
45 } else {
46 # simple text content
47 $node = encode_entities($node)
48 # That does magic things if $entities is undef.
49 unless $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} };
50 # To keep from amp-escaping children of script et al.
51 # That doesn't deal with descendants; but then, CDATA
52 # parents shouldn't /have/ descendants other than a
53 # text children (or comments?)
54 push(@html, $node);
55 }
56 1; # keep traversing
57 }
58 );
59
60 return join('', @html, "\n");
61 }
62
63 sub getscrubber {
64 my ($viewimages, $allowhref) = @_;
65
66 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
67
68 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
69
70 my @rules = ( script => 0 );
71
72 my @default = (
73 0 => # default rule, deny all tags
74 {
75 '*' => 0, # default rule, deny all attributes
76 abbr => 1,
77 accept => 1,
78 'accept-charset' => 1,
79 accesskey => 1,
80 align => 1,
81 alt => 1,
82 axis => 1,
83 border => 1,
84 bgcolor => 1,
85 cellpadding => 1,
86 cellspacing => 1,
87 char => 1,
88 charoff => 1,
89 charset => 1,
90 checked => 1,
91 cite => 1,
92 class => 1,
93 clear => 1,
94 cols => 1,
95 colspan => 1,
96 color => 1,
97 compact => 1,
98 coords => 1,
99 datetime => 1,
100 dir => 1,
101 disabled => 1,
102 enctype => 1,
103 frame => 1,
104 headers => 1,
105 height => 1,
106 # only allow http:// and https:// hrefs
107 'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
108 hreflang => 1,
109 hspace => 1,
110 id => 1,
111 ismap => 1,
112 label => 1,
113 lang => 1,
114 longdesc => 1,
115 maxlength => 1,
116 media => 1,
117 method => 1,
118 multiple => 1,
119 name => 1,
120 nohref => 1,
121 noshade => 1,
122 nowrap => 1,
123 prompt => 1,
124 readonly => 1,
125 rel => 1,
126 rev => 1,
127 rows => 1,
128 rowspan => 1,
129 rules => 1,
130 scope => 1,
131 selected => 1,
132 shape => 1,
133 size => 1,
134 span => 1,
135 src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
136 start => 1,
137 style => 1,
138 summary => 1,
139 tabindex => 1,
140 target => 1,
141 title => 1,
142 type => 1,
143 usemap => 1,
144 valign => 1,
145 value => 1,
146 vspace => 1,
147 width => 1,
148 }
149 );
150
151 my $scrubber = HTML::Scrubber->new(
152 allow => \@allow,
153 rules => \@rules,
154 default => \@default,
155 comment => 0,
156 process => 0,
157 );
158
159 $scrubber->style(1);
160
161 return $scrubber;
162 }
163
164 sub read_raw_email {
165 my ($path, $maxbytes) = @_;
166
167 open (my $fh, '<', $path) || die "unable to open '$path' - $!\n";
168
169 my $data = '';
170 my $raw_header = '';
171
172 # read header
173 my $header;
174 while (defined(my $line = <$fh>)) {
175 $raw_header .= $line;
176 chomp $line;
177 push @$header, $line;
178 last if $line =~ m/^\s*$/;
179 }
180
181 my $head = MIME::Head->new($header);
182
183 my $cs = $head->mime_attr("content-type.charset");
184
185 my $bytes = 0;
186
187 while (defined(my $line = <$fh>)) {
188 $bytes += length ($line);
189 if ($cs) {
190 $data .= decode($cs, $line);
191 } else {
192 $data .= $line;
193 }
194 if (defined($maxbytes) && ($bytes >= $maxbytes)) {
195 $data .= "\n... mail truncated (> $maxbytes bytes)\n";
196 last;
197 }
198 }
199
200 close($fh);
201
202 return ($raw_header, $data);
203 }
204
205 my $read_part = sub {
206 my ($part) = @_;
207
208 my $io = $part->open("r");
209 return undef if !$io;
210
211 my $raw = '';
212 while (defined(my $line = $io->getline)) { $raw .= $line; }
213 $io->close;
214
215 return $raw;
216 };
217
218 my $find_images = sub {
219 my ($cid_hash, $entity) = @_;
220
221 foreach my $part ($entity->parts) {
222 if (my $rawcid = $part->head->get('Content-Id')) {
223 if ($rawcid =~ m/^\s*<(\S+)>\s*$/) {
224 my $cid = $1;
225 my $ctype = $part->head->mime_attr('Content-type') // '';
226 if ($ctype =~ m!^image/!) {
227 if (defined(my $raw = $read_part->($part))) {
228 $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
229 }
230 }
231 }
232 }
233 }
234 };
235
236 sub entity_to_html {
237 my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
238
239 my $mime_type = lc($entity->mime_type);;
240
241 if ($mime_type eq 'text/plain') {
242 my $raw = $read_part->($entity) // '';
243 my $html = "<pre>\n";
244
245 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
246 $html .= PMG::Utils::decode_to_html($cs, $raw);
247 } else {
248 $html .= encode_entities($raw);
249 }
250
251 $html .= "</pre>\n";
252
253 return $html;
254
255 } elsif ($mime_type eq 'text/html') {
256 my $raw = $read_part->($entity) // '';
257
258 if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
259 eval { $raw = decode($cs, $raw); }; # ignore errors here
260 }
261
262 # create a well formed tree
263 my $tree = HTML::TreeBuilder->new();
264 $tree->parse($raw);
265 $tree->eof();
266
267 my $whtml = dump_html($tree, $viewimages ? $cid_hash : {});
268 $tree->delete;
269
270 # remove dangerous/unneeded elements
271 my $scrubber = getscrubber($viewimages, $allowhref);
272 return $scrubber->scrub($whtml);
273
274 } elsif ($mime_type =~ m|^multipart/|i) {
275 my $multi_part;
276 my $html_part;
277 my $text_part;
278
279 foreach my $part ($entity->parts) {
280 my $subtype = lc($part->mime_type);
281 $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
282 $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
283 $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
284 }
285
286 # get related/embedded images as data uris
287 $find_images->($cid_hash, $entity);
288
289 my $alt = $multi_part || $html_part || $text_part;
290
291 return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
292 }
293
294 return undef;
295 }
296
297 sub email_to_html {
298 my ($path, $raw, $viewimages, $allowhref) = @_;
299
300 my $dumpdir = "/tmp/.proxdumpview_$$";
301
302 my $html = '';
303
304 eval {
305 if ($raw) {
306
307 my ($header, $content) = read_raw_email($path);
308
309 $html .= "<pre>\n" .
310 encode_entities($header) .
311 "\n" .
312 encode_entities($content) .
313 "</pre>\n";
314
315 } else {
316
317 my $parser = new MIME::Parser;
318 $parser->extract_nested_messages(0);
319
320 rmtree $dumpdir;
321
322 # Create and set the output directory:
323 (-d $dumpdir || mkdir($dumpdir ,0755)) ||
324 die "can't create $dumpdir: $! : ERROR";
325 (-w $dumpdir) ||
326 die "can't write to directory $dumpdir: $! : ERROR";
327
328 $parser->output_dir($dumpdir);
329
330 my $entity = $parser->parse_open($path);
331
332 # bug fix for bin/tests/content/mimeparser.txt
333 if ($entity->mime_type =~ m|multipart/|i && !$entity->head->multipart_boundary) {
334 $entity->head->mime_attr('Content-type' => "application/x-unparseable-multipart");
335 }
336
337 $html = entity_to_html($entity, {}, $viewimages, $allowhref);
338 }
339 };
340 my $err = $@;
341
342 rmtree $dumpdir;
343
344 die "unable to parse mail: $err" if $err;
345
346 return $html;
347 }
348
349 1;