/*
* Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
*/
#ifndef _SYS_DATASET_KSTATS_H
aggsum_t das_nwritten;
aggsum_t das_reads;
aggsum_t das_nread;
+ aggsum_t das_nunlinks;
+ aggsum_t das_nunlinked;
} dataset_aggsum_stats_t;
typedef struct dataset_kstat_values {
kstat_named_t dkv_nwritten;
kstat_named_t dkv_reads;
kstat_named_t dkv_nread;
+ /*
+ * nunlinks is initialized to the unlinked set size on mount and
+ * is incremented whenever a new entry is added to the unlinked set
+ */
+ kstat_named_t dkv_nunlinks;
+ /*
+ * nunlinked is initialized to zero on mount and is incremented when an
+ * entry is removed from the unlinked set
+ */
+ kstat_named_t dkv_nunlinked;
} dataset_kstat_values_t;
typedef struct dataset_kstats {
void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t);
void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t);
+void dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *, int64_t);
+void dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *, int64_t);
+
#endif /* _SYS_DATASET_KSTATS_H */
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
+ struct taskq *dp_unlinked_drain_taskq;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
+taskq_t *dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp);
int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, uint64_t now, dmu_tx_t *tx);
extern boolean_t zfs_dirempty(znode_t *);
extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs);
extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
extern int zfs_get_xattrdir(znode_t *, struct inode **, cred_t *, int);
extern int zfs_make_xattrdir(znode_t *, vattr_t *, struct inode **, cred_t *);
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */
+ boolean_t z_draining; /* is true when drain is active */
+ boolean_t z_drain_cancel; /* signal the unlinked drain to stop */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
dataset_kstats_t z_kstat; /* fs kstats */
uint64_t z_hold_size; /* znode hold array size */
avl_tree_t *z_hold_trees; /* znode hold trees */
kmutex_t *z_hold_locks; /* znode hold locks */
+ taskqid_t z_drain_task; /* task id for the unlink drain task */
};
#define ZSB_XATTR 0x0001 /* Enable user xattrs */
Default value: 20
.RE
+.sp
+.ne 2
+.na
+\fBzfs_unlink_suspend_progress\fR (uint)
+.ad
+.RS 12n
+When enabled, files will not be asynchronously removed from the list of pending
+unlinks and the space they consume will be leaked. Once this option has been
+disabled and the dataset is remounted, the pending unlinks will be processed
+and the freed space returned to the pool.
+This option is used by the test suite to facilitate testing.
+.sp
+Uses \fB0\fR (default) to allow progress and \fB1\fR to pause progress.
+.RE
+
.sp
.ne 2
.na
/*
* Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
*/
#include <sys/dataset_kstats.h>
{ "nwritten", KSTAT_DATA_UINT64 },
{ "reads", KSTAT_DATA_UINT64 },
{ "nread", KSTAT_DATA_UINT64 },
+ { "nunlinks", KSTAT_DATA_UINT64 },
+ { "nunlinked", KSTAT_DATA_UINT64 },
};
static int
aggsum_value(&dk->dk_aggsums.das_reads);
dkv->dkv_nread.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nread);
+ dkv->dkv_nunlinks.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinks);
+ dkv->dkv_nunlinked.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinked);
return (0);
}
aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
aggsum_init(&dk->dk_aggsums.das_reads, 0);
aggsum_init(&dk->dk_aggsums.das_nread, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
}
void
aggsum_fini(&dk->dk_aggsums.das_nwritten);
aggsum_fini(&dk->dk_aggsums.das_reads);
aggsum_fini(&dk->dk_aggsums.das_nread);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinks);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinked);
}
void
aggsum_add(&dk->dk_aggsums.das_reads, 1);
aggsum_add(&dk->dk_aggsums.das_nread, nread);
}
+
+void
+dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
+}
+
+void
+dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
+}
dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+ max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
return (dp);
}
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
cv_destroy(&dp->dp_spaceavail_cv);
+ taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_iput_taskq);
if (dp->dp_blkstats != NULL) {
mutex_destroy(&dp->dp_blkstats->zab_lock);
return (dp->dp_iput_taskq);
}
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_unlinked_drain_taskq);
+}
+
/*
* Walk through the pool-wide zap object of temporary snapshot user holds
* and release them.
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
}
/*
* Clean up any znodes that had no links when we either crashed or
* (force) umounted the file system.
*/
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+static void
+zfs_unlinked_drain_task(void *arg)
{
+ zfsvfs_t *zfsvfs = arg;
zap_cursor_t zc;
zap_attribute_t zap;
dmu_object_info_t doi;
znode_t *zp;
int error;
+ ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
/*
* Iterate over the contents of the unlinked set.
*/
for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
zap_cursor_advance(&zc)) {
/*
continue;
zp->z_unlinked = B_TRUE;
+
+ /*
+ * iput() is Linux's equivalent to illumos' VN_RELE(). It will
+ * decrement the inode's ref count and may cause the inode to be
+ * synchronously freed. We interrupt freeing of this inode, by
+ * checking the return value of dmu_objset_zfs_unmounting() in
+ * dmu_free_long_range(), when an unmount is requested.
+ */
iput(ZTOI(zp));
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
}
zap_cursor_fini(&zc);
+
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+ zfsvfs->z_draining = B_TRUE;
+ zfsvfs->z_drain_cancel = B_FALSE;
+
+ zfsvfs->z_drain_task = taskq_dispatch(
+ dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+ zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+ if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+ zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ zfs_unlinked_drain_task(zfsvfs);
+ }
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+ if (zfsvfs->z_draining) {
+ zfsvfs->z_drain_cancel = B_TRUE;
+ taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ }
}
/*
VERIFY3U(0, ==,
zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
zfs_znode_delete(zp, tx);
dmu_tx_commit(tx);
return (error);
}
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_cancel = B_TRUE;
+
*zfvp = zfsvfs;
return (0);
}
* operations out since we closed the ZIL.
*/
if (mounting) {
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
- if (readonly != 0)
+ if (readonly != 0) {
readonly_changed_cb(zfsvfs, B_FALSE);
- else
+ } else {
+ zap_stats_t zs;
+ if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ &zs) == 0) {
+ dataset_kstats_update_nunlinks_kstat(
+ &zfsvfs->z_kstat, zs.zs_num_entries);
+ }
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "num_entries in unlinked set: %llu",
+ zs.zs_num_entries);
zfs_unlinked_drain(zfsvfs);
+ }
/*
* Parse and replay the intent log.
/* restore readonly bit */
if (readonly != 0)
readonly_changed_cb(zfsvfs, B_TRUE);
-
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
}
/*
{
znode_t *zp;
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+
/*
* If someone has not already unmounted this file system,
* drain the iput_taskq to ensure all active references to the
/* zfsvfs is NULL when zfs_domount fails during mount */
if (zfsvfs) {
+ zfs_unlinked_drain_stop_wait(zfsvfs);
zfsctl_destroy(sb->s_fs_info);
/*
* Wait for iput_async before entering evict_inodes in
}
mutex_exit(&zfsvfs->z_znodes_lock);
+ if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
bail:
/* release the VFS ops */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
/*
* This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
* z_rangelock. It will modify the offset and length of the lock to reflect
*/
if (zp->z_unlinked) {
ASSERT(!zfsvfs->z_issnap);
- if (!zfs_is_readonly(zfsvfs)) {
+ if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
mutex_exit(&zp->z_lock);
zfs_znode_hold_exit(zfsvfs, zh);
zfs_rmnode(zp);
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
#endif
tags = ['functional', 'mmp']
[tests/functional/mount]
-tests = ['umount_001', 'umountall_001']
+tests = ['umount_001', 'umount_unlinked_drain', 'umountall_001']
tags = ['functional', 'mount']
[tests/functional/mv_files]
setup.ksh \
cleanup.ksh \
umount_001.ksh \
+ umount_unlinked_drain.ksh \
umountall_001.ksh
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Datto Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Test async unlinked drain to ensure mounting is not held up when there are
+# entries in the unlinked set. We also try to test that the list is able to be
+# filled up and drained at the same time.
+#
+# STRATEGY:
+# 1. Use zfs_unlink_suspend_progress tunable to disable freeing to build up
+# the unlinked set
+# 2. Make sure mount happens even when there are entries in the unlinked set
+# 3. Drain and build up the unlinked list at the same time to test for races
+#
+
+function cleanup
+{
+ log_must set_tunable32 zfs_unlink_suspend_progress $default_unlink_sp
+ for fs in $(seq 1 3); do
+ mounted $TESTDIR.$fs || zfs mount $TESTPOOL/$TESTFS.$fs
+ rm -f $TESTDIR.$fs/file-*
+ zfs set xattr=on $TESTPOOL/$TESTFS.$fs
+ done
+}
+
+function unlinked_size_is
+{
+ MAX_ITERS=5 # iteration to do before we consider reported number stable
+ iters=0
+ last_usize=0
+ while [[ $iters -le $MAX_ITERS ]]; do
+ kstat_file=$(grep -nrwl /proc/spl/kstat/zfs/$2/objset-0x* -e $3)
+ nunlinks=`cat $kstat_file | grep nunlinks | awk '{print $3}'`
+ nunlinked=`cat $kstat_file | grep nunlinked | awk '{print $3}'`
+ usize=$(($nunlinks - $nunlinked))
+ if [[ $iters == $MAX_ITERS && $usize == $1 ]]; then
+ return 0
+ fi
+ if [[ $usize == $last_usize ]]; then
+ (( iters++ ))
+ else
+ iters=0
+ fi
+ last_usize=$usize
+ done
+
+ log_note "Unexpected unlinked set size: $last_usize, expected $1"
+ return 1
+}
+
+
+UNLINK_SP_PARAM=/sys/module/zfs/parameters/zfs_unlink_suspend_progress
+default_unlink_sp=$(get_tunable zfs_unlink_suspend_progress)
+
+log_onexit cleanup
+
+log_assert "Unlinked list drain does not hold up mounting of fs"
+
+for fs in 1 2 3; do
+ set -A xattrs on sa off
+ for xa in ${xattrs[@]}; do
+ # setup fs and ensure all deleted files got into unliked set
+ log_must mounted $TESTDIR.$fs
+
+ log_must zfs set xattr=$xa $TESTPOOL/$TESTFS.$fs
+
+ if [[ $xa == off ]]; then
+ for fn in $(seq 1 175); do
+ log_must mkfile 128k $TESTDIR.$fs/file-$fn
+ done
+ else
+ log_must xattrtest -f 175 -x 3 -r -k -p $TESTDIR.$fs
+ fi
+
+ log_must set_tunable32 zfs_unlink_suspend_progress 1
+ log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs
+
+ # build up unlinked set
+ for fn in $(seq 1 100); do
+ log_must eval "rm $TESTDIR.$fs/file-$fn &"
+ done
+ log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs
+
+ # test that we can mount fs without emptying the unlinked list
+ log_must zfs umount $TESTPOOL/$TESTFS.$fs
+ log_must unmounted $TESTDIR.$fs
+ log_must zfs mount $TESTPOOL/$TESTFS.$fs
+ log_must mounted $TESTDIR.$fs
+ log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs
+
+ # confirm we can drain and add to unlinked set at the same time
+ log_must set_tunable32 zfs_unlink_suspend_progress 0
+ log_must zfs umount $TESTPOOL/$TESTFS.$fs
+ log_must zfs mount $TESTPOOL/$TESTFS.$fs
+ for fn in $(seq 101 175); do
+ log_must eval "rm $TESTDIR.$fs/file-$fn &"
+ done
+ log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs
+ done
+done
+
+log_pass "Confirmed unlinked list drain does not hold up mounting of fs"