]>
git.proxmox.com Git - pmg-api.git/blob - src/PMG/HTMLMail.pm
12 use HTML
:: TreeBuilder
;
18 # $value is a ref to a string scalar
21 return if ! defined $$value ;
23 # convert 'url([..])' to '___([..])' so the browser does not load it
24 $$value =~ s
| url\
(| ___
(| gi
;
26 # similar for all protocols
27 $$value =~ s
|[ a-z0-9
]+: // | _
| gi
;
30 my sub remove_urls_from_attr
{
31 my ( $obj, $tag_name, $attr_name, $value ) = @_ ;
39 my ( $tree, $cid_hash, $view_images ) = @_ ;
44 my ( $node, $start, $depth ) = @_ ;
46 my $tag = $node ->{ '_tag' };
48 # try to open a new window when user activates a anchor
49 $node ->{ target
} = '_blank' if $tag eq 'a' ;
51 if ( $tag eq 'img' && $view_images ) {
52 if ( $node ->{ src
} && $node ->{ src
} =~ m/^cid:(\S+)$/ ) {
53 if ( my $datauri = $cid_hash ->{ $1 }) {
54 $node ->{ src
} = $datauri ;
59 if ( $tag eq 'style' && ! $view_images ) {
60 remove_urls
( $_ ) for grep { ! ref $$_ } $node -> content_refs_list ();
63 if ( $start ) { # on the way in
64 push ( @html, $node -> starttag );
65 } else { # on the way out
66 push ( @html, $node -> endtag );
68 } else { # simple text content
69 # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
70 # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
71 if (! $HTML :: Tagset
:: isCDATA_Parent
{ $_ [ 3 ]{ '_tag' } }) {
72 # That does magic things if $entities is undef.
73 $node = encode_entities
( $node );
77 return 1 ; # keep traversing
80 return join ( '' , @html, " \n " );
84 my ( $viewimages, $allowhref ) = @_ ;
86 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
88 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body) ;
90 my @rules = ( script
=> 0 );
93 0 => # default rule, deny all tags
95 '*' => 0 , # default rule, deny all attributes
98 'accept-charset' => 1 ,
126 # only allow http:// and https:// hrefs
127 'href' => $allowhref ?
qr{^https?://[^/]+/}i : 0 ,
155 src
=> $viewimages ?
qr{^(?!(?:java)?script)}i : 0 ,
157 style
=> $viewimages ?
1 : \
& remove_urls_from_attr
,
171 my $scrubber = HTML
:: Scrubber-
> new (
174 default => \
@default,
185 my ( $path, $maxbytes ) = @_ ;
187 open ( my $fh, '<' , $path ) || die "unable to open ' $path ' - $!\n " ;
194 while ( defined ( my $line = < $fh >)) {
195 my $decoded_line = PMG
:: Utils
:: try_decode_utf8
( $line );
196 $raw_header .= $decoded_line ;
198 push @$header, $decoded_line ;
199 last if $line =~ m/^\s*$/ ;
202 my $head = MIME
:: Head-
> new ( $header );
204 my $cs = $head -> mime_attr ( "content-type.charset" );
208 while ( defined ( my $line = < $fh >)) {
209 $bytes += length ( $line );
211 $data .= decode
( $cs, $line );
215 if ( defined ( $maxbytes ) && ( $bytes >= $maxbytes )) {
216 $data .= " \n ... mail truncated (> $maxbytes bytes) \n " ;
223 return ( $raw_header, $data );
226 my $read_part = sub {
229 my $io = $part -> open ( "r" );
230 return undef if ! $io ;
233 while ( defined ( my $line = $io -> getline )) { $raw .= $line ; }
239 my $find_images = sub {
240 my ( $cid_hash, $entity ) = @_ ;
242 foreach my $part ( $entity -> parts ) {
243 if ( my $rawcid = $part -> head -> get ( 'Content-Id' )) {
244 if ( $rawcid =~ m/^\s*<(\S+)>\s*$/ ) {
246 my $ctype = $part -> head -> mime_attr ( 'Content-type' ) // '' ;
247 if ( $ctype =~ m!^image/! ) {
248 if ( defined ( my $raw = $read_part ->( $part ))) {
249 $cid_hash ->{ $cid } = "data: $ctype ;base64," . encode_base64
( $raw, '' );
258 my ( $entity, $cid_hash, $viewimages, $allowhref ) = @_ ;
260 my $mime_type = lc ( $entity -> mime_type );;
262 if ( $mime_type eq 'text/plain' ) {
263 my $raw = $read_part ->( $entity ) // '' ;
264 my $html = "<pre> \n " ;
266 if ( defined ( my $cs = $entity -> head -> mime_attr ( "content-type.charset" ))) {
267 $html .= PMG
:: Utils
:: decode_to_html
( $cs, $raw );
269 $html .= encode_entities
( $raw );
276 } elsif ( $mime_type eq 'text/html' ) {
277 my $raw = $read_part ->( $entity ) // '' ;
279 if ( defined ( my $cs = $entity -> head -> mime_attr ( "content-type.charset" ))) {
280 eval { $raw = decode
( $cs, $raw ); }; # ignore errors here
283 # create a well formed tree
284 my $tree = HTML
:: TreeBuilder-
> new ();
288 # normalizes html, replaces CID references with data uris and scrubs style tags
289 my $whtml = dump_html
( $tree, $cid_hash, $viewimages );
292 # remove dangerous/unneeded elements
293 my $scrubber = getscrubber
( $viewimages, $allowhref );
294 return $scrubber -> scrub ( $whtml );
296 } elsif ( $mime_type =~ m
|^ multipart
/| i
) {
301 foreach my $part ( $entity -> parts ) {
302 my $subtype = lc ( $part -> mime_type );
303 $multi_part = $part if ! defined ( $multi_part ) && $subtype =~ m
| multipart
/| i
;
304 $html_part = $part if ! defined ( $html_part ) && $subtype eq 'text/html' ;
305 $text_part = $part if ! defined ( $text_part ) && $subtype eq 'text/plain' ;
308 # get related/embedded images as data uris
309 $find_images ->( $cid_hash, $entity );
311 my $alt = $multi_part || $html_part || $text_part ;
313 return entity_to_html
( $alt, $cid_hash, $viewimages, $allowhref ) if $alt ;
320 my ( $path, $raw, $viewimages, $allowhref ) = @_ ;
322 my $dumpdir = "/tmp/.proxdumpview_ $$ " ;
329 my ( $header, $content ) = read_raw_email
( $path );
332 encode_entities
( $header ) .
334 encode_entities
( $content ) .
339 my $parser = PMG
:: MIMEUtils
:: new_mime_parser
({
343 my $entity = $parser -> parse_open ( $path );
345 PMG
:: MIMEUtils
:: fixup_multipart
( $entity );
347 $html = entity_to_html
( $entity, {}, $viewimages, $allowhref );
354 die "unable to parse mail: $err " if $err ;