From a966c5640e432f723753e63354394406a7c3dcf8 Mon Sep 17 00:00:00 2001
From: kernelOfTruth <kerneloftruth@gmail.com>
Date: Fri, 21 Aug 2015 03:43:10 +0200
Subject: [PATCH] Reintroduce zfs_remove() synchronous deletes

Reintroduce a slightly adapted version of the Illumos logic for
synchronous unlinks.  The basic idea here is that only files
smaller than zfs_delete_blocks (20480) blocks should be deleted
synchronously.  Unlinking larger files should be handled
asynchronously to minimize impact to the caller.

To accomplish this iput() which is responsible for calling
zfs_znode_delete() on Linux is only called in the delete_now
path.  Otherwise zfs_async_iput() is used which allows the
last reference to be dropped by a taskq thread effectively
making the removal asynchronous.

Porting notes:
- Add zfs_delete_blocks module option for performance analysis.
  The default value is DMU_MAX_DELETEBLKCNT which is the same
  as upstream.  Reducing this value means that smaller files
  will be unlinked asynchronously like large files.
- All occurrences of zfsvfs changes to zsb.

Ported-by: KernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 man/man5/zfs-module-parameters.5 | 15 ++++++
 module/zfs/zfs_vnops.c           | 78 +++++++++++++++++++++++++++-----
 2 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index e0c61ec21..f801f257b 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -715,6 +715,21 @@ Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
 Default value: \fB500,000\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_delete_blocks\fR (ulong)
+.ad
+.RS 12n
+This is the used to define a large file for the purposes of delete.  Files
+containing more than \fBzfs_delete_blocks\fR will be deleted asynchronously
+while smaller files are deleted synchronously.  Decreasing this value will
+reduce the time spent in an unlink(2) system call at the expense of a longer
+delay before the freed space is available.
+.sp
+Default value: \fB20,480\fR.
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2f003de9f..f386b98a7 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -418,6 +418,7 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
 #endif /* _KERNEL */
 
 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 
 /*
  * Read bytes from specified file into supplied buffer.
@@ -1520,13 +1521,13 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr)
 	struct inode	*ip;
 	zfs_sb_t	*zsb = ITOZSB(dip);
 	zilog_t		*zilog;
-	uint64_t	xattr_obj;
+	uint64_t	acl_obj, xattr_obj;
 	uint64_t	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
-	boolean_t	may_delete_now;
-	boolean_t	unlinked;
+	boolean_t	may_delete_now, delete_now = FALSE;
+	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 #ifdef HAVE_PN_UTILS
@@ -1590,9 +1591,10 @@ top:
 	mutex_exit(&zp->z_lock);
 
 	/*
-	 * We never delete the znode and always place it in the unlinked
-	 * set.  The dentry cache will always hold the last reference and
-	 * is responsible for safely freeing the znode.
+	 * We may delete the znode now, or we may put it in the unlinked set;
+	 * it depends on whether we're the last link, and on whether there are
+	 * other holds on the inode.  So we dmu_tx_hold() the right things to
+	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zsb->z_os);
@@ -1600,6 +1602,12 @@ top:
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
+	if (may_delete_now) {
+		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+		/* if the file is too big, only hold_free a token amount */
+		dmu_tx_hold_free(tx, zp->z_id, 0,
+		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
@@ -1611,6 +1619,11 @@ top:
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
+	mutex_enter(&zp->z_lock);
+	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+	mutex_exit(&zp->z_lock);
+
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
 
@@ -1662,6 +1675,42 @@ top:
 		mutex_enter(&zp->z_lock);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+		delete_now = may_delete_now && !toobig &&
+		    atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
+		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
+		    acl_obj;
+	}
+
+	if (delete_now) {
+		if (xattr_obj_unlinked) {
+			ASSERT3U(xzp->z_links, ==, 2);
+			mutex_enter(&xzp->z_lock);
+			xzp->z_unlinked = 1;
+			xzp->z_links = 0;
+			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zsb),
+			    &xzp->z_links, sizeof (xzp->z_links), tx);
+			ASSERT3U(error,  ==,  0);
+			mutex_exit(&xzp->z_lock);
+			zfs_unlinked_add(xzp, tx);
+
+			if (zp->z_is_sa)
+				error = sa_remove(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zsb), tx);
+			else
+				error = sa_update(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zsb), &null_xattr,
+				    sizeof (uint64_t), tx);
+			ASSERT0(error);
+		}
+		/*
+		 * Add to the unlinked set because a new reference could be
+		 * taken concurrently resulting in a deferred destruction.
+		 */
+		zfs_unlinked_add(zp, tx);
+		mutex_exit(&zp->z_lock);
+		zfs_inode_update(zp);
+		iput(ip);
+	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 	}
@@ -1682,13 +1731,16 @@ out:
 
 	zfs_dirent_unlock(dl);
 	zfs_inode_update(dzp);
-	zfs_inode_update(zp);
-	if (xzp)
-		zfs_inode_update(xzp);
 
-	iput(ip);
-	if (xzp)
-		iput(ZTOI(xzp));
+	if (!delete_now) {
+		zfs_inode_update(zp);
+		zfs_iput_async(ip);
+	}
+
+	if (xzp) {
+		zfs_inode_update(xzp);
+		zfs_iput_async(ZTOI(xzp));
+	}
 
 	if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
@@ -4710,6 +4762,8 @@ zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
 #endif /* HAVE_UIO_ZEROCOPY */
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_delete_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 module_param(zfs_read_chunk_size, long, 0644);
 MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
 #endif
-- 
2.39.5