]>
git.proxmox.com Git - pmg-api.git/blob - src/PMG/HTMLMail.pm
12 use HTML
:: TreeBuilder
;
18 # $value is a ref to a string scalar
21 return if ! defined $$value ;
23 # convert 'url([..])' to '___([..])' so the browser does not load it
24 $$value =~ s
| url\
(| ___
(| gi
;
26 # similar for all protocols
27 $$value =~ s
|[ a-z0-9
]+: // | _
| gi
;
30 my sub remove_urls_from_attr
{
31 my ( $obj, $tag_name, $attr_name, $value ) = @_ ;
39 my ( $tree, $cid_hash, $view_images ) = @_ ;
44 my ( $node, $start, $depth ) = @_ ;
46 my $tag = $node ->{ '_tag' };
48 # try to open a new window when user activates a anchor
49 $node ->{ target
} = '_blank' if $tag eq 'a' ;
51 if ( $tag eq 'img' && $view_images ) {
52 if ( $node ->{ src
} && $node ->{ src
} =~ m/^cid:(\S+)$/ ) {
53 if ( my $datauri = $cid_hash ->{ $1 }) {
54 $node ->{ src
} = $datauri ;
59 if ( $tag eq 'style' && ! $view_images ) {
60 remove_urls
( $_ ) for grep { ! ref $$_ } $node -> content_refs_list ();
63 if ( $start ) { # on the way in
64 push ( @html, $node -> starttag );
65 } else { # on the way out
66 push ( @html, $node -> endtag );
68 } else { # simple text content
69 # To keep from amp-escaping children of script et al. That doesn't deal with descendants;
70 # but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
71 if (! $HTML :: Tagset
:: isCDATA_Parent
{ $_ [ 3 ]{ '_tag' } }) {
72 # That does magic things if $entities is undef.
73 $node = encode_entities
( $node );
77 return 1 ; # keep traversing
80 return join ( '' , @html, " \n " );
84 my ( $viewimages, $allowhref ) = @_ ;
86 # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
88 my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body) ;
90 my @rules = ( script
=> 0 );
93 0 => # default rule, deny all tags
95 '*' => 0 , # default rule, deny all attributes
98 'accept-charset' => 1 ,
126 # only allow http:// and https:// hrefs
127 'href' => $allowhref ?
qr{^https?://[^/]+/}i : 0 ,
155 src
=> $viewimages ?
qr{^(?!(?:java)?script)}i : 0 ,
157 style
=> $viewimages ?
1 : \
& remove_urls_from_attr
,
171 my $scrubber = HTML
:: Scrubber-
> new (
174 default => \
@default,
185 my ( $path, $maxbytes ) = @_ ;
187 open ( my $fh, '<' , $path ) || die "unable to open ' $path ' - $!\n " ;
194 while ( defined ( my $line = < $fh >)) {
195 $raw_header .= $line ;
197 push @$header, $line ;
198 last if $line =~ m/^\s*$/ ;
201 my $head = MIME
:: Head-
> new ( $header );
203 my $cs = $head -> mime_attr ( "content-type.charset" );
207 while ( defined ( my $line = < $fh >)) {
208 $bytes += length ( $line );
210 $data .= decode
( $cs, $line );
214 if ( defined ( $maxbytes ) && ( $bytes >= $maxbytes )) {
215 $data .= " \n ... mail truncated (> $maxbytes bytes) \n " ;
222 return ( $raw_header, $data );
225 my $read_part = sub {
228 my $io = $part -> open ( "r" );
229 return undef if ! $io ;
232 while ( defined ( my $line = $io -> getline )) { $raw .= $line ; }
238 my $find_images = sub {
239 my ( $cid_hash, $entity ) = @_ ;
241 foreach my $part ( $entity -> parts ) {
242 if ( my $rawcid = $part -> head -> get ( 'Content-Id' )) {
243 if ( $rawcid =~ m/^\s*<(\S+)>\s*$/ ) {
245 my $ctype = $part -> head -> mime_attr ( 'Content-type' ) // '' ;
246 if ( $ctype =~ m!^image/! ) {
247 if ( defined ( my $raw = $read_part ->( $part ))) {
248 $cid_hash ->{ $cid } = "data: $ctype ;base64," . encode_base64
( $raw, '' );
257 my ( $entity, $cid_hash, $viewimages, $allowhref ) = @_ ;
259 my $mime_type = lc ( $entity -> mime_type );;
261 if ( $mime_type eq 'text/plain' ) {
262 my $raw = $read_part ->( $entity ) // '' ;
263 my $html = "<pre> \n " ;
265 if ( defined ( my $cs = $entity -> head -> mime_attr ( "content-type.charset" ))) {
266 $html .= PMG
:: Utils
:: decode_to_html
( $cs, $raw );
268 $html .= encode_entities
( $raw );
275 } elsif ( $mime_type eq 'text/html' ) {
276 my $raw = $read_part ->( $entity ) // '' ;
278 if ( defined ( my $cs = $entity -> head -> mime_attr ( "content-type.charset" ))) {
279 eval { $raw = decode
( $cs, $raw ); }; # ignore errors here
282 # create a well formed tree
283 my $tree = HTML
:: TreeBuilder-
> new ();
287 # normalizes html, replaces CID references with data uris and scrubs style tags
288 my $whtml = dump_html
( $tree, $cid_hash, $viewimages );
291 # remove dangerous/unneeded elements
292 my $scrubber = getscrubber
( $viewimages, $allowhref );
293 return $scrubber -> scrub ( $whtml );
295 } elsif ( $mime_type =~ m
|^ multipart
/| i
) {
300 foreach my $part ( $entity -> parts ) {
301 my $subtype = lc ( $part -> mime_type );
302 $multi_part = $part if ! defined ( $multi_part ) && $subtype =~ m
| multipart
/| i
;
303 $html_part = $part if ! defined ( $html_part ) && $subtype eq 'text/html' ;
304 $text_part = $part if ! defined ( $text_part ) && $subtype eq 'text/plain' ;
307 # get related/embedded images as data uris
308 $find_images ->( $cid_hash, $entity );
310 my $alt = $multi_part || $html_part || $text_part ;
312 return entity_to_html
( $alt, $cid_hash, $viewimages, $allowhref ) if $alt ;
319 my ( $path, $raw, $viewimages, $allowhref ) = @_ ;
321 my $dumpdir = "/tmp/.proxdumpview_ $$ " ;
328 my ( $header, $content ) = read_raw_email
( $path );
331 encode_entities
( $header ) .
333 encode_entities
( $content ) .
338 my $parser = PMG
:: MIMEUtils
:: new_mime_parser
({
342 my $entity = $parser -> parse_open ( $path );
344 PMG
:: MIMEUtils
:: fixup_multipart
( $entity );
346 $html = entity_to_html
( $entity, {}, $viewimages, $allowhref );
353 die "unable to parse mail: $err " if $err ;