[pmg-api.git] / src / PMG / HTMLMail.pm

package PMG::HTMLMail;

use strict;
use warnings;
use Encode;
use Data::Dumper;
use MIME::Head;
use File::Path;
use HTML::Entities;
use MIME::Parser;
use MIME::Base64;
use HTML::TreeBuilder;
use HTML::Scrubber;

use PMG::Utils;
use PMG::MIMEUtils;

# Neutralise anything in a CSS snippet that could make the browser fetch a
# remote resource. The string is modified in place.
#
# $value is a ref to a string scalar; nothing happens if the referenced
# scalar is undefined.
my sub remove_urls {
    my ($value) = @_;
    return if !defined $$value;

    # convert 'url([..])' to '___([..])' so the browser does not load it
    $$value =~ s|url\(|___(|gi;

    # similar for all protocols
    $$value =~ s|[a-z0-9]+://|_|gi;

    # scheme-relative references ('//host/path', e.g. in '@import "//host/x.css"')
    # contain neither 'url(' nor a protocol, but the browser still resolves them
    # to a remote host. Only match where a reference can start (string start,
    # whitespace, quote, '(', ',' or '=') so that adjacent CSS comments
    # ('*//*') are left intact.
    $$value =~ s{(\A|[\s"'(,=])//}{${1}_}g;
}

# Attribute rule callback (used for 'style' in getscrubber): receives
# (object, tag name, attribute name, attribute value) and returns the value
# with URLs neutralised by remove_urls(). Only the value is looked at.
my sub remove_urls_from_attr {
    my $attr_value = $_[3];

    # work on the private copy, the caller's value is left alone
    remove_urls(\$attr_value);

    return $attr_value;
}

# Serialise a parsed HTML tree back into a string while adjusting some nodes.
#
# $tree        - tree object providing traverse() (built by the caller)
# $cid_hash    - hash ref mapping content ids to data URIs
# $view_images - if true, 'cid:' image sources are replaced by their data URI;
#                if false, URLs inside <style> text are neutralised instead
#
# Returns the generated HTML followed by a newline.
sub dump_html {
    my ($tree, $cid_hash, $view_images) = @_;

    my @chunks;

    my $visit = sub {
	my ($node, $entering, undef, $parent) = @_;

	if (!ref $node) {
	    # plain text: escape it, except below CDATA parents (script et al.) whose
	    # content must not be amp-escaped. Such parents should only have text
	    # (or comment) children, so deeper descendants are not a concern.
	    my $text = $node;
	    if (!$HTML::Tagset::isCDATA_Parent{ $parent->{'_tag'} }) {
		$text = encode_entities($text);
	    }
	    push(@chunks, $text);
	    return 1; # keep traversing
	}

	my $tag = $node->{'_tag'};

	# try to open a new window when the user activates an anchor
	if ($tag eq 'a') {
	    $node->{target} = '_blank';
	}

	# swap references to embedded images for the matching data URI
	if ($view_images && $tag eq 'img') {
	    my $src = $node->{src};
	    if ($src && $src =~ m/^cid:(\S+)$/) {
		my $datauri = $cid_hash->{$1};
		$node->{src} = $datauri if $datauri;
	    }
	}

	# without images, nothing in a style sheet may point to a URL either
	if (!$view_images && $tag eq 'style') {
	    for my $content_ref ($node->content_refs_list()) {
		next if ref $$content_ref; # only plain text children
		remove_urls($content_ref);
	    }
	}

	# $entering is true on the way in and false on the way out
	push(@chunks, $entering ? $node->starttag : $node->endtag);

	return 1; # keep traversing
    };

    $tree->traverse($visit);

    return join('', @chunks, "\n");
}

# Build the HTML::Scrubber instance used to sanitise mail HTML.
#
# $viewimages - if true, 'src' attributes are kept (unless they start with
#               'script' or 'javascript') and 'style' attributes pass unchanged;
#               otherwise 'src' is dropped and URLs in 'style' are neutralised
# $allowhref  - if true, http:// and https:// 'href' values are kept,
#               otherwise every 'href' is dropped
#
# Returns the configured scrubber object.
sub getscrubber {
    my ($viewimages, $allowhref) = @_;

    # see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html

    # tags which are passed through
    my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);

    # attributes which are always allowed, whatever their value
    my @plain_attributes = qw(
	abbr accept accept-charset accesskey align alt axis border bgcolor
	cellpadding cellspacing char charoff charset checked cite class clear
	cols colspan color compact coords datetime dir disabled enctype frame
	headers height hreflang hspace id ismap label lang longdesc maxlength
	media method multiple name nohref noshade nowrap prompt readonly rel
	rev rows rowspan rules scope selected shape size span start summary
	tabindex target title type usemap valign value vspace width
    );

    my %attribute_rules = (
	'*' => 0, # default rule, deny all attributes
	(map { $_ => 1 } @plain_attributes),
	# only allow http:// and https:// hrefs (with a '/' after the host part)
	'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
	# no sources at all unless images are shown, and never script ones
	src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
	# inline styles must not reference URLs while images are hidden
	style => $viewimages ? 1 : \&remove_urls_from_attr,
    );

    my $scrubber = HTML::Scrubber->new(
	allow   => \@allow,
	rules   => [ script => 0 ],
	# default rule, deny all tags; attributes as per table above
	default => [ 0 => \%attribute_rules ],
	comment => 0,
	process => 0,
    );

    # NOTE(review): presumably needed so <style> elements are not dropped - confirm
    # against the HTML::Scrubber documentation
    $scrubber->style(1);

    return $scrubber;
}

# Read a mail file for the raw (unparsed) view.
#
# $path     - file to read; dies with "unable to open ..." if it cannot be opened
# $maxbytes - optional; once at least that many body bytes were read, reading
#             stops and a truncation notice is appended
#
# Returns ($raw_header, $data): the header block (every line passed through
# PMG::Utils::try_decode_utf8, including the terminating blank line) and the
# body, decoded with the charset from the Content-Type header if that charset
# is known to Encode.
sub read_raw_email {
    my ($path, $maxbytes) = @_;

    open(my $fh, '<', $path) or die "unable to open '$path' - $!\n";

    my $data = '';
    my $raw_header = '';

    # read header
    my $header;
    while (defined(my $line = <$fh>)) {
	my $decoded_line = PMG::Utils::try_decode_utf8($line);
	$raw_header .= $decoded_line;
	chomp $decoded_line;
	push @$header, $decoded_line;
	last if $line =~ m/^\s*$/; # blank line ends the header
    }

    my $head = MIME::Head->new($header);

    my $cs = $head->mime_attr("content-type.charset");

    # decode() croaks on a charset Encode does not know. The charset comes
    # straight from the (untrusted) mail, so show the body undecoded in that
    # case instead of failing the whole raw view.
    $cs = undef if $cs && !Encode::find_encoding($cs);

    my $bytes = 0;

    while (defined(my $line = <$fh>)) {
	$bytes += length($line);
	$data .= $cs ? decode($cs, $line) : $line;

	if (defined($maxbytes) && ($bytes >= $maxbytes)) {
	    $data .= "\n... mail truncated (> $maxbytes bytes)\n";
	    last;
	}
    }

    close($fh);

    return ($raw_header, $data);
}

# Return the complete content of a part as one string.
#
# $part must provide open("r") returning an object with getline() and close().
# Returns undef if the part cannot be opened.
my $read_part = sub {
    my ($part) = @_;

    my $io = $part->open("r") or return undef;

    my @lines;
    while (defined(my $line = $io->getline)) {
	push @lines, $line;
    }
    $io->close;

    return join('', @lines);
};

# Collect the embedded images among the direct parts of $entity.
#
# For every part with a Content-Id of the form '<id>' and an image/* content
# type, $cid_hash->{id} is set to a base64 'data:' URI holding the part's
# content. Parts which cannot be read are skipped.
my $find_images = sub {
    my ($cid_hash, $entity) = @_;

    for my $part ($entity->parts) {
	my $rawcid = $part->head->get('Content-Id') or next;

	# the id is whatever sits between the angle brackets
	my ($cid) = $rawcid =~ m/^\s*<(\S+)>\s*$/ or next;

	my $ctype = $part->head->mime_attr('Content-type') // '';
	next if $ctype !~ m!^image/!;

	my $raw = $read_part->($part);
	next if !defined($raw);

	$cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
    }
};

# Render a MIME entity as HTML.
#
# $entity     - the entity to render
# $cid_hash   - hash ref collecting content id => data URI for embedded images;
#               filled while descending into multipart entities
# $viewimages - passed on to dump_html() and getscrubber()
# $allowhref  - passed on to getscrubber()
#
# text/plain is returned inside <pre>, text/html is normalised and scrubbed.
# For multipart/* the first nested multipart, the first text/html and the
# first text/plain part are tried in that order and the first one that
# renders is returned.
#
# Returns the HTML string, or undef if nothing displayable was found.
sub entity_to_html {
    my ($entity, $cid_hash, $viewimages, $allowhref) = @_;

    my $mime_type = lc($entity->mime_type);

    if ($mime_type eq 'text/plain') {
	my $raw = $read_part->($entity) // '';
	my $html = "<pre>\n";

	if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
	    $html .= PMG::Utils::decode_to_html($cs, $raw);
	} else {
	    $html .= encode_entities($raw);
	}

	$html .= "</pre>\n";

	return $html;

    } elsif ($mime_type eq 'text/html') {
	my $raw = $read_part->($entity) // '';

	if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
	    eval { $raw = decode($cs, $raw); }; # ignore errors here
	}

	# create a well formed tree
	my $tree = HTML::TreeBuilder->new();

	# normalizes html, replaces CID references with data uris and scrubs style tags
	my $whtml = eval {
	    $tree->parse($raw);
	    $tree->eof();
	    dump_html($tree, $cid_hash, $viewimages);
	};
	my $err = $@;

	# always release the tree, also if parsing or dumping failed
	$tree->delete;

	die $err if $err;

	# remove dangerous/unneeded elements
	my $scrubber = getscrubber($viewimages, $allowhref);
	return $scrubber->scrub($whtml);

    } elsif ($mime_type =~ m|^multipart/|i) {
	my $multi_part;
	my $html_part;
	my $text_part;

	foreach my $part ($entity->parts)  {
	    my $subtype = lc($part->mime_type);
	    $multi_part = $part if !defined($multi_part) && $subtype =~ m|multipart/|i;
	    $html_part = $part if !defined($html_part) && $subtype eq 'text/html';
	    $text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
	}

	# get related/embedded images as data uris
	$find_images->($cid_hash, $entity);

	# A nested multipart is preferred, but it may contain nothing that can be
	# displayed (e.g. only attachments). Fall back to the html/text sibling
	# in that case instead of giving up.
	foreach my $alt (grep { defined($_) } $multi_part, $html_part, $text_part) {
	    my $html = entity_to_html($alt, $cid_hash, $viewimages, $allowhref);
	    return $html if defined($html);
	}
    }

    return undef;
}

# Convert the mail stored at $path into HTML for display.
#
# $path       - mail file
# $raw        - if true, show header and body as escaped text inside <pre>;
#               otherwise parse the MIME structure and render it
# $viewimages - passed on to entity_to_html()
# $allowhref  - passed on to entity_to_html()
#
# Returns the HTML (whatever entity_to_html() returned in the parsed case).
# Dies with "unable to parse mail: ..." if anything failed; the dump
# directory is removed in either case.
sub email_to_html {
    my ($path, $raw, $viewimages, $allowhref) = @_;

    # per-process directory handed to the MIME parser as 'dumpdir'
    my $dumpdir = "/tmp/.proxdumpview_$$";

    my $html = '';

    eval {
	if (!$raw) {
	    my $parser = PMG::MIMEUtils::new_mime_parser({ dumpdir => $dumpdir });
	    my $entity = $parser->parse_open($path);

	    PMG::MIMEUtils::fixup_multipart($entity);

	    $html = entity_to_html($entity, {}, $viewimages, $allowhref);
	} else {
	    my ($header, $content) = read_raw_email($path);

	    my $escaped_header = encode_entities($header);
	    my $escaped_content = encode_entities($content);

	    $html .= "<pre>\n" . $escaped_header . "\n" . $escaped_content . "</pre>\n";
	}
    };
    my $err = $@; # remember the error, cleanup comes first

    rmtree($dumpdir);

    die "unable to parse mail: $err" if $err;

    return $html;
}

1;
Commit	Line	Data
08d6921a DM	1	package PMG::HTMLMail;
	2
	3	use strict;
	4	use warnings;
	5	use Encode;
	6	use Data::Dumper;
	7	use MIME::Head;
	8	use File::Path;
	9	use HTML::Entities;
	10	use MIME::Parser;
	11	use MIME::Base64;
	12	use HTML::TreeBuilder;
	13	use HTML::Scrubber;
	14
18598b2c DC	15	use PMG::Utils;
	16	use PMG::MIMEUtils;
	17
d970f290 DC	18	# $value is a ref to a string scalar
	19	my sub remove_urls {
	20	my ($value) = @_;
d6bf3645 TL	21	return if !defined $$value;
d6bf3645 TL	22
d970f290 DC	23	# convert 'url([..])' to '___([..])' so the browser does not load it
	24	$$value =~ s\|url\(\|___(\|gi;
	25
	26	# similar for all protocols
	27	$$value =~ s\|[a-z0-9]+://\|_\|gi;
	28	}
	29
	30	my sub remove_urls_from_attr {
	31	my ($obj, $tag_name, $attr_name, $value) = @_;
	32
	33	remove_urls(\$value);
	34
	35	return $value;
	36	}
	37
08d6921a	38	sub dump_html {
d970f290	39	my ($tree, $cid_hash, $view_images) = @_;
08d6921a DM	40
	41	my @html = ();
	42
66a0e574 TL	43	$tree->traverse(sub {
	44	my ($node, $start, $depth) = @_;
	45	if (ref $node) {
	46	my $tag = $node->{'_tag'};
	47
	48	# try to open a new window when user activates a anchor
	49	$node->{target} = '_blank' if $tag eq 'a';
	50
	51	if ($tag eq 'img' && $view_images) {
	52	if ($node->{src} && $node->{src} =~ m/^cid:(\S+)$/) {
	53	if (my $datauri = $cid_hash->{$1}) {
	54	$node->{src} = $datauri;
08d6921a DM	55	}
08d6921a DM	56	}
66a0e574	57	}
08d6921a	58
66a0e574 TL	59	if ($tag eq 'style' && !$view_images) {
	60	remove_urls($_) for grep { !ref $$_ } $node->content_refs_list();
	61	}
d970f290	62
66a0e574 TL	63	if ($start) { # on the way in
	64	push(@html, $node->starttag);
	65	} else { # on the way out
	66	push(@html, $node->endtag);
	67	}
	68	} else { # simple text content
	69	# To keep from amp-escaping children of script et al. That doesn't deal with descendants;
	70	# but then, CDATA parents shouldn't /have/ descendants other than a text children (or comments?)
	71	if (!$HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }) {
	72	# That does magic things if $entities is undef.
	73	$node = encode_entities($node);
08d6921a	74	}
66a0e574	75	push(@html, $node);
08d6921a	76	}
66a0e574 TL	77	return 1; # keep traversing
66a0e574 TL	78	});
08d6921a DM	79
	80	return join('', @html, "\n");
	81	}
	82
	83	sub getscrubber {
	84	my ($viewimages, $allowhref) = @_;
	85
	86	# see http://web.archive.org/web/20110726052341/http://feedparser.org/docs/html-sanitization.html
	87
98dac1e6	88	my @allow = qw(a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 head hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span style strike strong sub sup title table tbody td textarea tfoot th thead tr tt u ul var html body);
08d6921a	89
3f819452 DM	90	my @rules = ( script => 0 );
3f819452 DM	91
08d6921a DM	92	my @default = (
	93	0 => # default rule, deny all tags
	94	{
	95	'*' => 0, # default rule, deny all attributes
	96	abbr => 1,
	97	accept => 1,
	98	'accept-charset' => 1,
	99	accesskey => 1,
	100	align => 1,
	101	alt => 1,
	102	axis => 1,
	103	border => 1,
7cfe043a	104	bgcolor => 1,
08d6921a DM	105	cellpadding => 1,
	106	cellspacing => 1,
	107	char => 1,
	108	charoff => 1,
	109	charset => 1,
	110	checked => 1,
	111	cite => 1,
	112	class => 1,
	113	clear => 1,
	114	cols => 1,
	115	colspan => 1,
	116	color => 1,
	117	compact => 1,
	118	coords => 1,
	119	datetime => 1,
	120	dir => 1,
	121	disabled => 1,
	122	enctype => 1,
	123	frame => 1,
	124	headers => 1,
	125	height => 1,
	126	# only allow http:// and https:// hrefs
	127	'href' => $allowhref ? qr{^https?://[^/]+/}i : 0,
	128	hreflang => 1,
	129	hspace => 1,
	130	id => 1,
	131	ismap => 1,
	132	label => 1,
	133	lang => 1,
	134	longdesc => 1,
	135	maxlength => 1,
	136	media => 1,
	137	method => 1,
	138	multiple => 1,
	139	name => 1,
	140	nohref => 1,
	141	noshade => 1,
	142	nowrap => 1,
	143	prompt => 1,
	144	readonly => 1,
	145	rel => 1,
	146	rev => 1,
	147	rows => 1,
	148	rowspan => 1,
	149	rules => 1,
	150	scope => 1,
	151	selected => 1,
	152	shape => 1,
	153	size => 1,
	154	span => 1,
3f819452	155	src => $viewimages ? qr{^(?!(?:java)?script)}i : 0,
08d6921a	156	start => 1,
8a1a3461	157	style => $viewimages ? 1 : \&remove_urls_from_attr,
08d6921a DM	158	summary => 1,
	159	tabindex => 1,
	160	target => 1,
	161	title => 1,
	162	type => 1,
	163	usemap => 1,
	164	valign => 1,
	165	value => 1,
	166	vspace => 1,
	167	width => 1,
	168	}
	169	);
	170
	171	my $scrubber = HTML::Scrubber->new(
	172	allow => \@allow,
	173	rules => \@rules,
	174	default => \@default,
	175	comment => 0,
08d6921a DM	176	process => 0,
	177	);
	178
98dac1e6 DM	179	$scrubber->style(1);
98dac1e6 DM	180
08d6921a DM	181	return $scrubber;
	182	}
	183
	184	sub read_raw_email {
b75acee7	185	my ($path, $maxbytes) = @_;
08d6921a DM	186
	187	open (my $fh, '<', $path) \|\| die "unable to open '$path' - $!\n";
	188
	189	my $data = '';
	190	my $raw_header = '';
	191
	192	# read header
	193	my $header;
	194	while (defined(my $line = <$fh>)) {
07f7275e SI	195	my $decoded_line = PMG::Utils::try_decode_utf8($line);
	196	$raw_header .= $decoded_line;
	197	chomp $decoded_line;
	198	push @$header, $decoded_line;
08d6921a DM	199	last if $line =~ m/^\s*$/;
	200	}
	201
	202	my $head = MIME::Head->new($header);
	203
	204	my $cs = $head->mime_attr("content-type.charset");
	205
b75acee7 DM	206	my $bytes = 0;
b75acee7 DM	207
08d6921a	208	while (defined(my $line = <$fh>)) {
b75acee7	209	$bytes += length ($line);
08d6921a DM	210	if ($cs) {
	211	$data .= decode($cs, $line);
	212	} else {
	213	$data .= $line;
	214	}
b75acee7 DM	215	if (defined($maxbytes) && ($bytes >= $maxbytes)) {
	216	$data .= "\n... mail truncated (> $maxbytes bytes)\n";
	217	last;
	218	}
08d6921a DM	219	}
	220
	221	close($fh);
	222
	223	return ($raw_header, $data);
	224	}
	225
	226	my $read_part = sub {
	227	my ($part) = @_;
	228
	229	my $io = $part->open("r");
	230	return undef if !$io;
	231
	232	my $raw = '';
	233	while (defined(my $line = $io->getline)) { $raw .= $line; }
	234	$io->close;
	235
	236	return $raw;
	237	};
	238
	239	my $find_images = sub {
59424fd5	240	my ($cid_hash, $entity) = @_;
08d6921a DM	241
	242	foreach my $part ($entity->parts) {
	243	if (my $rawcid = $part->head->get('Content-Id')) {
	244	if ($rawcid =~ m/^\s<(\S+)>\s$/) {
	245	my $cid = $1;
	246	my $ctype = $part->head->mime_attr('Content-type') // '';
	247	if ($ctype =~ m!^image/!) {
	248	if (defined(my $raw = $read_part->($part))) {
59424fd5	249	$cid_hash->{$cid} = "data:$ctype;base64," . encode_base64($raw, '');
08d6921a DM	250	}
	251	}
	252	}
	253	}
	254	}
08d6921a DM	255	};
	256
	257	sub entity_to_html {
	258	my ($entity, $cid_hash, $viewimages, $allowhref) = @_;
	259
	260	my $mime_type = lc($entity->mime_type);;
	261
	262	if ($mime_type eq 'text/plain') {
	263	my $raw = $read_part->($entity) // '';
	264	my $html = "<pre>\n";
	265
	266	if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
	267	$html .= PMG::Utils::decode_to_html($cs, $raw);
	268	} else {
	269	$html .= encode_entities($raw);
	270	}
	271
	272	$html .= "</pre>\n";
	273
	274	return $html;
	275
	276	} elsif ($mime_type eq 'text/html') {
	277	my $raw = $read_part->($entity) // '';
	278
	279	if (defined(my $cs = $entity->head->mime_attr("content-type.charset"))) {
	280	eval { $raw = decode($cs, $raw); }; # ignore errors here
	281	}
	282
	283	# create a well formed tree
	284	my $tree = HTML::TreeBuilder->new();
	285	$tree->parse($raw);
	286	$tree->eof();
	287
d970f290 DC	288	# normalizes html, replaces CID references with data uris and scrubs style tags
d970f290 DC	289	my $whtml = dump_html($tree, $cid_hash, $viewimages);
08d6921a DM	290	$tree->delete;
	291
	292	# remove dangerous/unneeded elements
	293	my $scrubber = getscrubber($viewimages, $allowhref);
	294	return $scrubber->scrub($whtml);
	295
	296	} elsif ($mime_type =~ m\|^multipart/\|i) {
	297	my $multi_part;
	298	my $html_part;
	299	my $text_part;
	300
	301	foreach my $part ($entity->parts) {
	302	my $subtype = lc($part->mime_type);
	303	$multi_part = $part if !defined($multi_part) && $subtype =~ m\|multipart/\|i;
	304	$html_part = $part if !defined($html_part) && $subtype eq 'text/html';
	305	$text_part = $part if !defined($text_part) && $subtype eq 'text/plain';
	306	}
	307
	308	# get related/embedded images as data uris
59424fd5	309	$find_images->($cid_hash, $entity);
08d6921a DM	310
	311	my $alt = $multi_part \|\| $html_part \|\| $text_part;
	312
59424fd5	313	return entity_to_html($alt, $cid_hash, $viewimages, $allowhref) if $alt;
08d6921a DM	314	}
	315
	316	return undef;
	317	}
	318
	319	sub email_to_html {
	320	my ($path, $raw, $viewimages, $allowhref) = @_;
	321
	322	my $dumpdir = "/tmp/.proxdumpview_$$";
	323
	324	my $html = '';
	325
	326	eval {
	327	if ($raw) {
	328
	329	my ($header, $content) = read_raw_email($path);
	330
	331	$html .= "<pre>\n" .
	332	encode_entities($header) .
	333	"\n" .
	334	encode_entities($content) .
	335	"</pre>\n";
	336
	337	} else {
	338
18598b2c DC	339	my $parser = PMG::MIMEUtils::new_mime_parser({
	340	dumpdir => $dumpdir,
	341	});
08d6921a DM	342
	343	my $entity = $parser->parse_open($path);
	344
18598b2c	345	PMG::MIMEUtils::fixup_multipart($entity);
08d6921a DM	346
	347	$html = entity_to_html($entity, {}, $viewimages, $allowhref);
	348	}
	349	};
	350	my $err = $@;
	351
	352	rmtree $dumpdir;
	353
	354	die "unable to parse mail: $err" if $err;
	355
	356	return $html;
	357	}
	358
	359	1;