From a966c5640e432f723753e63354394406a7c3dcf8 Mon Sep 17 00:00:00 2001 From: kernelOfTruth Date: Fri, 21 Aug 2015 03:43:10 +0200 Subject: [PATCH] Reintroduce zfs_remove() synchronous deletes Reintroduce a slightly adapted version of the Illumos logic for synchronous unlinks. The basic idea here is that only files smaller than zfs_delete_blocks (20480) blocks should be deleted synchronously. Unlinking larger files should be handled asynchronously to minimize impact to the caller. To accomplish this iput() which is responsible for calling zfs_znode_delete() on Linux is only called in the delete_now path. Otherwise zfs_iput_async() is used which allows the last reference to be dropped by a taskq thread effectively making the removal asynchronous. Porting notes: - Add zfs_delete_blocks module option for performance analysis. The default value is DMU_MAX_DELETEBLKCNT which is the same as upstream. Reducing this value means that smaller files will be unlinked asynchronously like large files. - All occurrences of zfsvfs changed to zsb. Ported-by: KernelOfTruth kerneloftruth@gmail.com Signed-off-by: Brian Behlendorf --- man/man5/zfs-module-parameters.5 | 15 ++++++ module/zfs/zfs_vnops.c | 78 +++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index e0c61ec21..f801f257b 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -715,6 +715,21 @@ Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. Default value: \fB500,000\fR. .RE +.sp +.ne 2 +.na +\fBzfs_delete_blocks\fR (ulong) +.ad +.RS 12n +This is used to define a large file for the purposes of delete. Files +containing more than \fBzfs_delete_blocks\fR blocks will be deleted asynchronously +while smaller files are deleted synchronously. 
Decreasing this value will +reduce the time spent in an unlink(2) system call at the expense of a longer +delay before the freed space is available. +.sp +Default value: \fB20,480\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 2f003de9f..f386b98a7 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -418,6 +418,7 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio) #endif /* _KERNEL */ unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Read bytes from specified file into supplied buffer. @@ -1520,13 +1521,13 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr) struct inode *ip; zfs_sb_t *zsb = ITOZSB(dip); zilog_t *zilog; - uint64_t xattr_obj; + uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; - boolean_t may_delete_now; - boolean_t unlinked; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; #ifdef HAVE_PN_UTILS @@ -1590,9 +1591,10 @@ top: mutex_exit(&zp->z_lock); /* - * We never delete the znode and always place it in the unlinked - * set. The dentry cache will always hold the last reference and - * is responsible for safely freeing the znode. + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. So we dmu_tx_hold() the right things to + * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zsb->z_os); @@ -1600,6 +1602,12 @@ top: dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? 
DMU_MAX_ACCESS : DMU_OBJECT_END)); + } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), @@ -1611,6 +1619,11 @@ top: dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); @@ -1662,6 +1675,42 @@ top: mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == + acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_links, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = 1; + xzp->z_links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zsb), + &xzp->z_links, sizeof (xzp->z_links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zsb), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zsb), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. 
+ */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + zfs_inode_update(zp); + iput(ip); + } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } @@ -1682,13 +1731,16 @@ out: zfs_dirent_unlock(dl); zfs_inode_update(dzp); - zfs_inode_update(zp); - if (xzp) - zfs_inode_update(xzp); - iput(ip); - if (xzp) - iput(ZTOI(xzp)); + if (!delete_now) { + zfs_inode_update(zp); + zfs_iput_async(ip); + } + + if (xzp) { + zfs_inode_update(xzp); + zfs_iput_async(ZTOI(xzp)); + } if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -4710,6 +4762,8 @@ zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) #endif /* HAVE_UIO_ZEROCOPY */ #if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_delete_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); module_param(zfs_read_chunk_size, long, 0644); MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); #endif -- 2.39.5