]>
Commit | Line | Data |
---|---|---|
08d6921a DM |
1 | package PMG::HTMLMail; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | use Encode; | |
6 | use Data::Dumper; | |
7 | use MIME::Head; | |
8 | use File::Path; | |
9 | use HTML::Entities; | |
10 | use MIME::Parser; | |
11 | use MIME::Base64; | |
12 | use HTML::TreeBuilder; | |
13 | use HTML::Scrubber; | |
14 | ||
18598b2c DC |
15 | use PMG::Utils; |
16 | use PMG::MIMEUtils; | |
17 | ||
d970f290 DC |
18 | # $value is a ref to a string scalar |
19 | my sub remove_urls { | |
20 | my ($value) = @_; | |
d6bf3645 TL |
21 | return if !defined $$value; |
22 | ||
d970f290 DC |
23 | # convert 'url([..])' to '___([..])' so the browser does not load it |
24 | $$value =~ s|url\(|___(|gi; | |
25 | ||
26 | # similar for all protocols | |
27 | $$value =~ s|[a-z0-9]+://|_|gi; | |
28 | } | |
29 | ||
30 | my sub remove_urls_from_attr { | |
31 | my ($obj, $tag_name, $attr_name, $value) = @_; | |
32 | ||
33 | remove_urls(\$value); | |
34 | ||
35 | return $value; | |
36 | } | |
37 | ||
08d6921a | 38 | sub dump_html { |
d970f290 | 39 | my ($tree, $cid_hash, $view_images) = @_; |
08d6921a DM |
40 | |
41 | my @html = (); | |
42 | ||
66a0e574 TL |
43 | $tree->traverse(sub { |
44 | my ($node, $start, $depth) = @_; | |
45 | if (ref $node) { | |
46 | my $tag = $node->{'_tag'}; | |
47 | ||
48 | # try to open a new window when user activates a anchor | |
49 | $node->{target} = '_blank' if $tag eq 'a'; | |
50 | ||
51 | if ($tag eq 'img' && $view_images) { | |
52 | if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) { | |
53 | if (my $datauri = $cid_hash->{$1}) { | |
54 | $node->{src} = $datauri; | |
08d6921a DM |
55 | } |
56 | } | |
66a0e574 | 57 | } |
08d6921a | 58 | |
66a0e574 TL |
59 | if ($tag eq 'style' && !$view_images) { |
60 | remove_urls($_) for grep { !ref $$_ } $node->content_refs_list(); | |
61 | } | |
d970f290 | 62 | |
66a0e574 TL |
63 | if ($start) { # on the way in |
64 | push(@html, $node->starttag); | |
65 | } else { # on the way out | |
66 | push(@html, $node->endtag); | |
67 | } | |
68 | } else { # simple text content | |
69 | # To keep from amp-escaping children of script et al. That doesn't deal with descendants; | |
70 | # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?) | |
71 | if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) { | |
72 | # That does magic things if $entities is undef. | |
73 | $node = encode_entities($node); | |
08d6921a | 74 | } |
66a0e574 | 75 | push(@html, $node); |
08d6921a | 76 | } |
66a0e574 TL |
77 | return 1; # keep traversing |
78 | }); | |
08d6921a DM |
79 | |
80 | return join('', @html, "\n"); | |
81 | } | |
82 | ||
83 | sub getscrubber { | |
84 | my ($viewimages, $allowhref) = @_; | |
85 | ||
86 | # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html | |
87 | ||
98dac1e6 | 88 | my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body); |
08d6921a | 89 | |
3f819452 DM |
90 | my @rules = ( script => 0 ); |
91 | ||
08d6921a DM |
92 | my @default = ( |
93 | 0 => # default rule, deny all tags | |
94 | { | |
95 | '*' => 0, # default rule, deny all attributes | |
96 | abbr => 1, | |
97 | accept => 1, | |
98 | 'accept-charset' => 1, | |
99 | accesskey => 1, | |
100 | align => 1, | |
101 | alt => 1, | |
102 | axis => 1, | |
103 | border => 1, | |
7cfe043a | 104 | bgcolor => 1, |
08d6921a DM |
105 | cellpadding => 1, |
106 | cellspacing => 1, | |
107 | char => 1, | |
108 | charoff => 1, | |
109 | charset => 1, | |
110 | checked => 1, | |
111 | cite => 1, | |
112 | class => 1, | |
113 | clear => 1, | |
114 | cols => 1, | |
115 | colspan => 1, | |
116 | color => 1, | |
117 | compact => 1, | |
118 | coords => 1, | |
119 | datetime => 1, | |
120 | dir => 1, | |
121 | disabled => 1, | |
122 | enctype => 1, | |
123 | frame => 1, | |
124 | headers => 1, | |
125 | height => 1, | |
126 | # only allow http:// and https:// hrefs | |
127 | 'href' => $allowhref ? qr{^https?://[^/]+/}i : 0, | |
128 | hreflang => 1, | |
129 | hspace => 1, | |
130 | id => 1, | |
131 | ismap => 1, | |
132 | label => 1, | |
133 | lang => 1, | |
134 | longdesc => 1, | |
135 | maxlength => 1, | |
136 | media => 1, | |
137 | method => 1, | |
138 | multiple => 1, | |
139 | name => 1, | |
140 | nohref => 1, | |
141 | noshade => 1, | |
142 | nowrap => 1, | |
143 | prompt => 1, | |
144 | readonly => 1, | |
145 | rel => 1, | |
146 | rev => 1, | |
147 | rows => 1, | |
148 | rowspan => 1, | |
149 | rules => 1, | |
150 | scope => 1, | |
151 | selected => 1, | |
152 | shape => 1, | |
153 | size => 1, | |
154 | span => 1, | |
3f819452 | 155 | src => $viewimages ? qr{^(?!(?:java)?script)}i : 0, |
08d6921a | 156 | start => 1, |
8a1a3461 | 157 | style => $viewimages ? 1 : \&remove_urls_from_attr, |
08d6921a DM |
158 | summary => 1, |
159 | tabindex => 1, | |
160 | target => 1, | |
161 | title => 1, | |
162 | type => 1, | |
163 | usemap => 1, | |
164 | valign => 1, | |
165 | value => 1, | |
166 | vspace => 1, | |
167 | width => 1, | |
168 | } | |
169 | ); | |
170 | ||
171 | my $scrubber = HTML::Scrubber->new( | |
172 | allow => \@allow, | |
173 | rules => \@rules, | |
174 | default => \@default, | |
175 | comment => 0, | |
08d6921a DM |
176 | process => 0, |
177 | ); | |
178 | ||
98dac1e6 DM |
179 | $scrubber->style(1); |
180 | ||
08d6921a DM |
181 | return $scrubber; |
182 | } | |
183 | ||
184 | sub read_raw_email { | |
b75acee7 | 185 | my ($path, $maxbytes) = @_; |
08d6921a DM |
186 | |
187 | open (my $fh, '<', $path) || die "unable to open '$path' - $!\n"; | |
188 | ||
189 | my $data = ''; | |
190 | my $raw_header = ''; | |
191 | ||
192 | # read header | |
193 | my $header; | |
194 | while (defined(my $line = <$fh>)) { | |
07f7275e SI |
195 | my $decoded_line = PMG::Utils::try_decode_utf8($line); |
196 | $raw_header .= $decoded_line; | |
197 | chomp $decoded_line; | |
198 | push @$header, $decoded_line; | |
08d6921a DM |
199 | last if $line =~ m/^\s*$/; |
200 | } | |
201 | ||
202 | my $head = MIME::Head->new($header); | |
203 | ||
204 | my $cs = $head->mime_attr("content-type.charset"); | |
205 | ||
b75acee7 DM |
206 | my $bytes = 0; |
207 | ||
08d6921a | 208 | while (defined(my $line = <$fh>)) { |
b75acee7 | 209 | $bytes += length ($line); |
08d6921a DM |
210 | if ($cs) { |
211 | $data .= decode($cs, $line); | |
212 | } else { | |
213 | $data .= $line; | |
214 | } | |
b75acee7 DM |
215 | if (defined($maxbytes) && ($bytes >= $maxbytes)) { |
216 | $data .= "\n... mail truncated (> $maxbytes bytes)\n"; | |
217 | last; | |
218 | } | |
08d6921a DM |
219 | } |
220 | ||
221 | close($fh); | |
222 | ||
223 | return ($raw_header, $data); | |
224 | } | |
225 | ||
226 | my $read_part = sub { | |
227 | my ($part) = @_; | |
228 | ||
229 | my $io = $part->open("r"); | |
230 | return undef if !$io; | |
231 | ||
232 | my $raw = ''; | |
233 | while (defined(my $line = $io->getline)) { $raw .= $line; } | |
234 | $io->close; | |
235 | ||
236 | return $raw; | |
237 | }; | |
238 | ||
239 | my $find_images = sub { | |
59424fd5 | 240 | my ($cid_hash, $entity) = @_; |
08d6921a DM |
241 | |
242 | foreach my $part ($entity->parts) { | |
243 | if (my $rawcid = $part->head->get('Content-Id')) { | |
244 | if ($rawcid =~ m/^\s*<(\S+)>\s*$/) { | |
245 | my $cid = $1; | |
246 | my $ctype = $part->head->mime_attr('Content-type') // ''; | |
247 | if ($ctype =~ m!^image/!) { | |
248 | if (defined(my $raw = $read_part->($part))) { | |
59424fd5 | 249 | $cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, ''); |
08d6921a DM |
250 | } |
251 | } | |
252 | } | |
253 | } | |
254 | } | |
08d6921a DM |
255 | }; |
256 | ||
257 | sub entity_to_html { | |
258 | my ($entity, $cid_hash, $viewimages, $allowhref) = @_; | |
259 | ||
260 | my $mime_type = lc($entity->mime_type);; | |
261 | ||
262 | if ($mime_type eq 'text/plain') { | |
263 | my $raw = $read_part->($entity) // ''; | |
264 | my $html = "<pre>\n"; | |
265 | ||
266 | if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) { | |
267 | $html .= PMG::Utils::decode_to_html($cs, $raw); | |
268 | } else { | |
269 | $html .= encode_entities($raw); | |
270 | } | |
271 | ||
272 | $html .= "</pre>\n"; | |
273 | ||
274 | return $html; | |
275 | ||
276 | } elsif ($mime_type eq 'text/html') { | |
277 | my $raw = $read_part->($entity) // ''; | |
278 | ||
279 | if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) { | |
280 | eval { $raw = decode($cs, $raw); }; # ignore errors here | |
281 | } | |
282 | ||
283 | # create a well formed tree | |
284 | my $tree = HTML::TreeBuilder->new(); | |
285 | $tree->parse($raw); | |
286 | $tree->eof(); | |
287 | ||
d970f290 DC |
288 | # normalizes html, replaces CID references with data uris and scrubs style tags |
289 | my $whtml = dump_html($tree, $cid_hash, $viewimages); | |
08d6921a DM |
290 | $tree->delete; |
291 | ||
292 | # remove dangerous/unneeded elements | |
293 | my $scrubber = getscrubber($viewimages, $allowhref); | |
294 | return $scrubber->scrub($whtml); | |
295 | ||
296 | } elsif ($mime_type =~ m|^multipart/|i) { | |
297 | my $multi_part; | |
298 | my $html_part; | |
299 | my $text_part; | |
300 | ||
301 | foreach my $part ($entity->parts) { | |
302 | my $subtype = lc($part->mime_type); | |
303 | $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i; | |
304 | $html_part = $part if !defined($html_part) && $subtype eq 'text/html'; | |
305 | $text_part = $part if !defined($text_part) && $subtype eq 'text/plain'; | |
306 | } | |
307 | ||
308 | # get related/embedded images as data uris | |
59424fd5 | 309 | $find_images->($cid_hash, $entity); |
08d6921a DM |
310 | |
311 | my $alt = $multi_part || $html_part || $text_part; | |
312 | ||
59424fd5 | 313 | return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt; |
08d6921a DM |
314 | } |
315 | ||
316 | return undef; | |
317 | } | |
318 | ||
319 | sub email_to_html { | |
320 | my ($path, $raw, $viewimages, $allowhref) = @_; | |
321 | ||
322 | my $dumpdir = "/tmp/.proxdumpview_$$"; | |
323 | ||
324 | my $html = ''; | |
325 | ||
326 | eval { | |
327 | if ($raw) { | |
328 | ||
329 | my ($header, $content) = read_raw_email($path); | |
330 | ||
331 | $html .= "<pre>\n" . | |
332 | encode_entities($header) . | |
333 | "\n" . | |
334 | encode_entities($content) . | |
335 | "</pre>\n"; | |
336 | ||
337 | } else { | |
338 | ||
18598b2c DC |
339 | my $parser = PMG::MIMEUtils::new_mime_parser({ |
340 | dumpdir => $dumpdir, | |
341 | }); | |
08d6921a DM |
342 | |
343 | my $entity = $parser->parse_open($path); | |
344 | ||
18598b2c | 345 | PMG::MIMEUtils::fixup_multipart($entity); |
08d6921a DM |
346 | |
347 | $html = entity_to_html($entity, {}, $viewimages, $allowhref); | |
348 | } | |
349 | }; | |
350 | my $err = $@; | |
351 | ||
352 | rmtree $dumpdir; | |
353 | ||
354 | die "unable to parse mail: $err" if $err; | |
355 | ||
356 | return $html; | |
357 | } | |
358 | ||
359 | 1; |