git.proxmox.com Git - mirror_zfs.git/commitdiff
Illumos #3464
author Matthew Ahrens <mahrens@delphix.com>
Wed, 4 Sep 2013 12:00:57 +0000 (07:00 -0500)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 4 Sep 2013 23:01:24 +0000 (16:01 -0700)
3464 zfs synctask code needs restructuring
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

References:
  https://www.illumos.org/issues/3464
  illumos/illumos-gate@3b2aab18808792cbd248a12f1edf139b89833c13

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1495

86 files changed:
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/zhack/zhack.c
cmd/ztest/ztest.c
include/libzfs.h
include/libzfs_core.h
include/sys/Makefile.am
include/sys/arc.h
include/sys/dbuf.h
include/sys/dmu.h
include/sys/dmu_objset.h
include/sys/dmu_send.h [new file with mode: 0644]
include/sys/dmu_tx.h
include/sys/dsl_dataset.h
include/sys/dsl_destroy.h [new file with mode: 0644]
include/sys/dsl_dir.h
include/sys/dsl_pool.h
include/sys/dsl_prop.h
include/sys/dsl_synctask.h
include/sys/dsl_userhold.h [new file with mode: 0644]
include/sys/metaslab.h
include/sys/nvpair.h
include/sys/refcount.h
include/sys/rrwlock.h
include/sys/spa.h
include/sys/space_map.h
include/sys/txg.h
include/sys/zfeature.h
include/sys/zfs_context.h
include/sys/zfs_debug.h
include/sys/zfs_ioctl.h
include/sys/zfs_znode.h
include/sys/zil.h
include/sys/zvol.h
lib/libzfs/libzfs_config.c
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_diff.c
lib/libzfs/libzfs_fru.c
lib/libzfs/libzfs_graph.c
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_iter.c
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_sendrecv.c
lib/libzfs_core/libzfs_core.c
lib/libzpool/Makefile.am
lib/libzpool/kernel.c
man/man8/zfs.8
module/nvpair/fnvpair.c
module/zfs/Makefile.in
module/zfs/arc.c
module/zfs/bplist.c
module/zfs/bpobj.c
module/zfs/dbuf.c
module/zfs/dmu.c
module/zfs/dmu_diff.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_traverse.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deleg.c
module/zfs/dsl_destroy.c [new file with mode: 0644]
module/zfs/dsl_dir.c
module/zfs/dsl_pool.c
module/zfs/dsl_prop.c
module/zfs/dsl_scan.c
module/zfs/dsl_synctask.c
module/zfs/dsl_userhold.c [new file with mode: 0644]
module/zfs/metaslab.c
module/zfs/refcount.c
module/zfs/rrwlock.c
module/zfs/sa.c
module/zfs/spa.c
module/zfs/spa_history.c
module/zfs/spa_misc.c
module/zfs/space_map.c
module/zfs/txg.c
module/zfs/zfs_ctldir.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_vfsops.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zvol.c
module/zpios/pios.c

index 060498ae2c5029d70b12e1f07fdeae4ed8690f43..b119a16e2f036f414f2da07d34a0362bd667ee36 100644 (file)
@@ -1725,7 +1725,9 @@ dump_dir(objset_t *os)
        int print_header = 1;
        int i, error;
 
+       dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
        dmu_objset_fast_stat(os, &dds);
+       dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
        if (dds.dds_type < DMU_OST_NUMTYPES)
                type = objset_types[dds.dds_type];
@@ -2171,7 +2173,6 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
                zio_nowait(zio_read(NULL, spa, bp, data, size,
                    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
-
        }
 
        zcb->zcb_readfails = 0;
@@ -2365,8 +2366,10 @@ dump_block_stats(spa_t *spa)
         */
        (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
            count_block_cb, &zcb, NULL);
-       (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
-           count_block_cb, &zcb, NULL);
+       if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+               (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+                   count_block_cb, &zcb, NULL);
+       }
        if (spa_feature_is_active(spa,
            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
                VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
index 7176c94169b594e3a441819566826f008e56b754..365f93ba4281f0ac0280feaeb2da11c1c7319042 100644 (file)
@@ -892,6 +892,7 @@ typedef struct destroy_cbdata {
        boolean_t       cb_parsable;
        boolean_t       cb_dryrun;
        nvlist_t        *cb_nvl;
+       nvlist_t        *cb_batchedsnaps;
 
        /* first snap in contiguous run */
        char            *cb_firstsnap;
@@ -988,9 +989,27 @@ destroy_callback(zfs_handle_t *zhp, void *data)
                zfs_close(zhp);
                return (0);
        }
+       if (cb->cb_dryrun) {
+               zfs_close(zhp);
+               return (0);
+       }
+
+       /*
+        * We batch up all contiguous snapshots (even of different
+        * filesystems) and destroy them with one ioctl.  We can't
+        * simply do all snap deletions and then all fs deletions,
+        * because we must delete a clone before its origin.
+        */
+       if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+               fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+       } else {
+               int error = zfs_destroy_snaps_nvl(g_zfs,
+                   cb->cb_batchedsnaps, B_FALSE);
+               fnvlist_free(cb->cb_batchedsnaps);
+               cb->cb_batchedsnaps = fnvlist_alloc();
 
-       if (!cb->cb_dryrun) {
-               if (zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+               if (error != 0 ||
+                   zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
                    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
                        zfs_close(zhp);
                        return (-1);
@@ -1146,8 +1165,10 @@ static int
 zfs_do_destroy(int argc, char **argv)
 {
        destroy_cbdata_t cb = { 0 };
+       int rv = 0;
+       int err = 0;
        int c;
-       zfs_handle_t *zhp;
+       zfs_handle_t *zhp = NULL;
        char *at;
        zfs_type_t type = ZFS_TYPE_DATASET;
 
@@ -1201,11 +1222,9 @@ zfs_do_destroy(int argc, char **argv)
 
        at = strchr(argv[0], '@');
        if (at != NULL) {
-               int err = 0;
 
                /* Build the list of snaps to destroy in cb_nvl. */
-               if (nvlist_alloc(&cb.cb_nvl, NV_UNIQUE_NAME, 0) != 0)
-                       nomem();
+               cb.cb_nvl = fnvlist_alloc();
 
                *at = '\0';
                zhp = zfs_open(g_zfs, argv[0],
@@ -1216,17 +1235,15 @@ zfs_do_destroy(int argc, char **argv)
                cb.cb_snapspec = at + 1;
                if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
                    cb.cb_error) {
-                       zfs_close(zhp);
-                       nvlist_free(cb.cb_nvl);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
                if (nvlist_empty(cb.cb_nvl)) {
                        (void) fprintf(stderr, gettext("could not find any "
                            "snapshots to destroy; check snapshot names.\n"));
-                       zfs_close(zhp);
-                       nvlist_free(cb.cb_nvl);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
                if (cb.cb_verbose) {
@@ -1245,18 +1262,26 @@ zfs_do_destroy(int argc, char **argv)
                }
 
                if (!cb.cb_dryrun) {
-                       if (cb.cb_doclones)
+                       if (cb.cb_doclones) {
+                               cb.cb_batchedsnaps = fnvlist_alloc();
                                err = destroy_clones(&cb);
+                               if (err == 0) {
+                                       err = zfs_destroy_snaps_nvl(g_zfs,
+                                           cb.cb_batchedsnaps, B_FALSE);
+                               }
+                               if (err != 0) {
+                                       rv = 1;
+                                       goto out;
+                               }
+                       }
                        if (err == 0) {
-                               err = zfs_destroy_snaps_nvl(zhp, cb.cb_nvl,
+                               err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
                                    cb.cb_defer_destroy);
                        }
                }
 
-               zfs_close(zhp);
-               nvlist_free(cb.cb_nvl);
                if (err != 0)
-                       return (1);
+                       rv = 1;
        } else {
                /* Open the given dataset */
                if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
@@ -1277,8 +1302,8 @@ zfs_do_destroy(int argc, char **argv)
                            zfs_get_name(zhp));
                        (void) fprintf(stderr, gettext("use 'zpool destroy %s' "
                            "to destroy the pool itself\n"), zfs_get_name(zhp));
-                       zfs_close(zhp);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
                /*
@@ -1288,30 +1313,42 @@ zfs_do_destroy(int argc, char **argv)
                if (!cb.cb_doclones &&
                    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
                    &cb) != 0) {
-                       zfs_close(zhp);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
                if (cb.cb_error) {
-                       zfs_close(zhp);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
+               cb.cb_batchedsnaps = fnvlist_alloc();
                if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
                    &cb) != 0) {
-                       zfs_close(zhp);
-                       return (1);
+                       rv = 1;
+                       goto out;
                }
 
                /*
                 * Do the real thing.  The callback will close the
                 * handle regardless of whether it succeeds or not.
                 */
-               if (destroy_callback(zhp, &cb) != 0)
-                       return (1);
+               err = destroy_callback(zhp, &cb);
+               zhp = NULL;
+               if (err == 0) {
+                       err = zfs_destroy_snaps_nvl(g_zfs,
+                           cb.cb_batchedsnaps, cb.cb_defer_destroy);
+               }
+               if (err != 0)
+                       rv = 1;
        }
 
-       return (0);
+out:
+       fnvlist_free(cb.cb_batchedsnaps);
+       fnvlist_free(cb.cb_nvl);
+       if (zhp != NULL)
+               zfs_close(zhp);
+       return (rv);
 }
 
 static boolean_t
@@ -5081,28 +5118,12 @@ cleanup2:
        return (error);
 }
 
-/*
- * zfs allow [-r] [-t] <tag> <snap> ...
- *
- *     -r      Recursively hold
- *     -t      Temporary hold (hidden option)
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
 static int
 zfs_do_allow(int argc, char **argv)
 {
        return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
 }
 
-/*
- * zfs unallow [-r] [-t] <tag> <snap> ...
- *
- *     -r      Recursively hold
- *     -t      Temporary hold (hidden option)
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
 static int
 zfs_do_unallow(int argc, char **argv)
 {
@@ -5116,7 +5137,6 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
        int i;
        const char *tag;
        boolean_t recursive = B_FALSE;
-       boolean_t temphold = B_FALSE;
        const char *opts = holding ? "rt" : "r";
        int c;
 
@@ -5126,9 +5146,6 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
                case 'r':
                        recursive = B_TRUE;
                        break;
-               case 't':
-                       temphold = B_TRUE;
-                       break;
                case '?':
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
                            optopt);
@@ -5177,7 +5194,7 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
                }
                if (holding) {
                        if (zfs_hold(zhp, delim+1, tag, recursive,
-                           temphold, B_FALSE, -1, 0, 0) != 0)
+                           B_FALSE, -1) != 0)
                                ++errors;
                } else {
                        if (zfs_release(zhp, delim+1, tag, recursive) != 0)
@@ -5193,7 +5210,6 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
  * zfs hold [-r] [-t] <tag> <snap> ...
  *
  *     -r      Recursively hold
- *     -t      Temporary hold (hidden option)
  *
  * Apply a user-hold with the given tag to the list of snapshots.
  */
index 4f80dde6957b0af933fefdcaf36dd224b319fb9c..99d26719450e005aab90880433acccd43f9f5586 100644 (file)
@@ -46,6 +46,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
 #undef ZFS_MAXNAMELEN
 #include <libzfs.h>
 
@@ -123,7 +124,7 @@ import_pool(const char *target, boolean_t readonly)
        spa_t *spa;
        nvpair_t *elem;
        nvlist_t *props;
-       const char *name;
+       char *name;
 
        kernel_init(readonly ? FREAD : (FREAD | FWRITE));
        g_zfs = libzfs_init();
@@ -273,10 +274,10 @@ zhack_do_feature_stat(int argc, char **argv)
 }
 
 static void
-feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_enable_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       zfeature_info_t *feature = arg2;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       zfeature_info_t *feature = arg;
 
        spa_feature_enable(spa, feature, tx);
        spa_history_log_internal(spa, "zhack enable feature", tx,
@@ -344,8 +345,8 @@ zhack_do_feature_enable(int argc, char **argv)
        if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
                fatal("feature already enabled: %s", feature.fi_guid);
 
-       VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
-           feature_enable_sync, spa, &feature, 5));
+       VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+           feature_enable_sync, &feature, 5));
 
        spa_close(spa, FTAG);
 
@@ -353,10 +354,10 @@ zhack_do_feature_enable(int argc, char **argv)
 }
 
 static void
-feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_incr_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       zfeature_info_t *feature = arg2;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       zfeature_info_t *feature = arg;
 
        spa_feature_incr(spa, feature, tx);
        spa_history_log_internal(spa, "zhack feature incr", tx,
@@ -364,10 +365,10 @@ feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+feature_decr_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       zfeature_info_t *feature = arg2;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       zfeature_info_t *feature = arg;
 
        spa_feature_decr(spa, feature, tx);
        spa_history_log_internal(spa, "zhack feature decr", tx,
@@ -442,8 +443,8 @@ zhack_do_feature_ref(int argc, char **argv)
        if (decr && !spa_feature_is_active(spa, &feature))
                fatal("feature refcount already 0: %s", feature.fi_guid);
 
-       VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
-           decr ? feature_decr_sync : feature_incr_sync, spa, &feature, 5));
+       VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+           decr ? feature_decr_sync : feature_incr_sync, &feature, 5));
 
        spa_close(spa, FTAG);
 }
index 28570a09075ca69c359316e3c13134414858a817..e192ab17a16b17187069308b353cd759f073607c 100644 (file)
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
 #include <sys/dsl_scan.h>
 #include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -367,7 +369,7 @@ ztest_info_t ztest_info[] = {
        { ztest_scrub,                          1,      &zopt_rarely    },
        { ztest_spa_upgrade,                    1,      &zopt_rarely    },
        { ztest_dsl_dataset_promote_busy,       1,      &zopt_rarely    },
-       { ztest_vdev_attach_detach,             1,      &zopt_rarely    },
+       { ztest_vdev_attach_detach,             1,      &zopt_sometimes },
        { ztest_vdev_LUN_growth,                1,      &zopt_rarely    },
        { ztest_vdev_add_remove,                1,
            &ztest_opts.zo_vdevtime                             },
@@ -1031,9 +1033,8 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
        uint64_t curval;
        int error;
 
-       error = dsl_prop_set(osname, propname,
-           (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
-           sizeof (value), 1, &value);
+       error = dsl_prop_set_int(osname, propname,
+           (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
 
        if (error == ENOSPC) {
                ztest_record_enospc(FTAG);
@@ -1042,8 +1043,7 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
        ASSERT0(error);
 
        setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
-       VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
-           1, &curval, setpoint), ==, 0);
+       VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
 
        if (ztest_opts.zo_verbose >= 6) {
                VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
@@ -2484,8 +2484,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
        int error;
 
        mutex_enter(&ztest_vdev_lock);
-       leaves =
-               MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+       leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
 
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -2507,7 +2506,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
                 * prevent a race between removing a slog (dmu_objset_find)
                 * and destroying a dataset. Removing the slog will
                 * grab a reference on the dataset which may cause
-                * dmu_objset_destroy() to fail with EBUSY thus
+                * dsl_destroy_head() to fail with EBUSY thus
                 * leaving the dataset in an inconsistent state.
                 */
                rw_enter(&ztest_name_lock, RW_WRITER);
@@ -3196,7 +3195,7 @@ ztest_objset_destroy_cb(const char *name, void *arg)
        /*
         * Verify that the dataset contains a directory object.
         */
-       VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+       VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
        error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
        if (error != ENOENT) {
                /* We could have crashed in the middle of destroying it */
@@ -3204,12 +3203,16 @@ ztest_objset_destroy_cb(const char *name, void *arg)
                ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
                ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
        }
-       dmu_objset_rele(os, FTAG);
+       dmu_objset_disown(os, FTAG);
 
        /*
         * Destroy the dataset.
         */
-       VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
+       if (strchr(name, '@') != NULL) {
+               VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
+       } else {
+               VERIFY0(dsl_destroy_head(name));
+       }
        return (0);
 }
 
@@ -3219,16 +3222,17 @@ ztest_snapshot_create(char *osname, uint64_t id)
        char snapname[MAXNAMELEN];
        int error;
 
-       (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
-           (u_longlong_t)id);
+       (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
 
-       error = dmu_objset_snapshot_one(osname, strchr(snapname, '@') + 1);
+       error = dmu_objset_snapshot_one(osname, snapname);
        if (error == ENOSPC) {
                ztest_record_enospc(FTAG);
                return (B_FALSE);
        }
-       if (error != 0 && error != EEXIST)
-               fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+       if (error != 0 && error != EEXIST) {
+               fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+                   snapname, error);
+       }
        return (B_TRUE);
 }
 
@@ -3241,7 +3245,7 @@ ztest_snapshot_destroy(char *osname, uint64_t id)
        (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
            (u_longlong_t)id);
 
-       error = dmu_objset_destroy(snapname, B_FALSE);
+       error = dsl_destroy_snapshot(snapname, B_FALSE);
        if (error != 0 && error != ENOENT)
                fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
        return (B_TRUE);
@@ -3269,7 +3273,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 
        /*
         * If this dataset exists from a previous run, process its replay log
-        * half of the time.  If we don't replay it, then dmu_objset_destroy()
+        * half of the time.  If we don't replay it, then dsl_destroy_head()
         * (invoked from ztest_objset_destroy_cb()) should just throw it away.
         */
        if (ztest_random(2) == 0 &&
@@ -3291,7 +3295,8 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
        /*
         * Verify that the destroyed dataset is no longer in the namespace.
         */
-       VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
+       VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+           FTAG, &os));
 
        /*
         * Verify that we can create a new dataset.
@@ -3305,8 +3310,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_create(%s) = %d", name, error);
        }
 
-       VERIFY3U(0, ==,
-           dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+       VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
 
        ztest_zd_init(zdtmp, NULL, os);
 
@@ -3396,21 +3400,21 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
        (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
            clone1name, (u_longlong_t)id);
 
-       error = dmu_objset_destroy(clone2name, B_FALSE);
+       error = dsl_destroy_head(clone2name);
        if (error && error != ENOENT)
-               fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
-       error = dmu_objset_destroy(snap3name, B_FALSE);
+               fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
+       error = dsl_destroy_snapshot(snap3name, B_FALSE);
        if (error && error != ENOENT)
-               fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
-       error = dmu_objset_destroy(snap2name, B_FALSE);
+               fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
+       error = dsl_destroy_snapshot(snap2name, B_FALSE);
        if (error && error != ENOENT)
-               fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
-       error = dmu_objset_destroy(clone1name, B_FALSE);
+               fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
+       error = dsl_destroy_head(clone1name);
        if (error && error != ENOENT)
-               fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
-       error = dmu_objset_destroy(snap1name, B_FALSE);
+               fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
+       error = dsl_destroy_snapshot(snap1name, B_FALSE);
        if (error && error != ENOENT)
-               fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
+               fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
 
        umem_free(snap1name, MAXNAMELEN);
        umem_free(clone1name, MAXNAMELEN);
@@ -3425,8 +3429,7 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 void
 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *clone;
-       dsl_dataset_t *ds;
+       objset_t *os;
        char *snap1name;
        char *clone1name;
        char *snap2name;
@@ -3465,12 +3468,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
        }
 
-       error = dmu_objset_hold(snap1name, FTAG, &clone);
-       if (error)
-               fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
-
-       error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
-       dmu_objset_rele(clone, FTAG);
+       error = dmu_objset_clone(clone1name, snap1name);
        if (error) {
                if (error == ENOSPC) {
                        ztest_record_enospc(FTAG);
@@ -3497,12 +3495,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
        }
 
-       error = dmu_objset_hold(snap3name, FTAG, &clone);
-       if (error)
-               fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
-
-       error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
-       dmu_objset_rele(clone, FTAG);
+       error = dmu_objset_clone(clone2name, snap3name);
        if (error) {
                if (error == ENOSPC) {
                        ztest_record_enospc(FTAG);
@@ -3511,14 +3504,14 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
        }
 
-       error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
+       error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
        if (error)
-               fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
+               fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
        error = dsl_dataset_promote(clone2name, NULL);
        if (error != EBUSY)
                fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
                    error);
-       dsl_dataset_disown(ds, FTAG);
+       dmu_objset_disown(os, FTAG);
 
 out:
        ztest_dsl_dataset_cleanup(osname, id);
@@ -4392,7 +4385,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
        }
 
        count = -1ULL;
-       VERIFY(zap_count(os, object, &count) == 0);
+       VERIFY0(zap_count(os, object, &count));
        ASSERT(count != -1ULL);
 
        /*
@@ -4710,6 +4703,22 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
        (void) rw_exit(&ztest_name_lock);
 }
 
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+       nvlist_t *snaps, *holds;
+       int error;
+
+       snaps = fnvlist_alloc();
+       holds = fnvlist_alloc();
+       fnvlist_add_boolean(holds, holdname);
+       fnvlist_add_nvlist(snaps, snapname, holds);
+       fnvlist_free(holds);
+       error = dsl_dataset_user_release(snaps, NULL);
+       fnvlist_free(snaps);
+       return (error);
+}
+
 /*
  * Test snapshot hold/release and deferred destroy.
  */
@@ -4724,22 +4733,30 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
        char clonename[100];
        char tag[100];
        char osname[MAXNAMELEN];
+       nvlist_t *holds;
 
        (void) rw_enter(&ztest_name_lock, RW_READER);
 
        dmu_objset_name(os, osname);
 
-       (void) snprintf(snapname, 100, "sh1_%llu", (u_longlong_t)id);
-       (void) snprintf(fullname, 100, "%s@%s", osname, snapname);
-       (void) snprintf(clonename, 100, "%s/ch1_%llu",osname,(u_longlong_t)id);
-       (void) snprintf(tag, 100, "tag_%llu", (u_longlong_t)id);
+       (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", (long long unsigned int)id);
+       (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+       (void) snprintf(clonename, sizeof (clonename),
+           "%s/ch1_%llu", osname, (long long unsigned int)id);
+       (void) snprintf(tag, sizeof (tag), "tag_%llu", (long long unsigned int)id);
 
        /*
         * Clean up from any previous run.
         */
-       (void) dmu_objset_destroy(clonename, B_FALSE);
-       (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
-       (void) dmu_objset_destroy(fullname, B_FALSE);
+       error = dsl_destroy_head(clonename);
+       if (error != ENOENT)
+               ASSERT0(error);
+       error = user_release_one(fullname, tag);
+       if (error != ESRCH && error != ENOENT)
+               ASSERT0(error);
+       error = dsl_destroy_snapshot(fullname, B_FALSE);
+       if (error != ENOENT)
+               ASSERT0(error);
 
        /*
         * Create snapshot, clone it, mark snap for deferred destroy,
@@ -4754,12 +4771,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
        }
 
-       error = dmu_objset_hold(fullname, FTAG, &origin);
-       if (error)
-               fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
-
-       error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
-       dmu_objset_rele(origin, FTAG);
+       error = dmu_objset_clone(clonename, fullname);
        if (error) {
                if (error == ENOSPC) {
                        ztest_record_enospc("dmu_objset_clone");
@@ -4768,15 +4780,15 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
        }
 
-       error = dmu_objset_destroy(fullname, B_TRUE);
+       error = dsl_destroy_snapshot(fullname, B_TRUE);
        if (error) {
-               fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+               fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
                    fullname, error);
        }
 
-       error = dmu_objset_destroy(clonename, B_FALSE);
+       error = dsl_destroy_head(clonename);
        if (error)
-               fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
+               fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
 
        error = dmu_objset_hold(fullname, FTAG, &origin);
        if (error != ENOENT)
@@ -4796,28 +4808,31 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
        }
 
-       error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE,
-           B_TRUE, -1);
+       holds = fnvlist_alloc();
+       fnvlist_add_string(holds, fullname, tag);
+       error = dsl_dataset_user_hold(holds, 0, NULL);
+       fnvlist_free(holds);
+
        if (error)
                fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
 
-       error = dmu_objset_destroy(fullname, B_FALSE);
+       error = dsl_destroy_snapshot(fullname, B_FALSE);
        if (error != EBUSY) {
-               fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
+               fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
                    fullname, error);
        }
 
-       error = dmu_objset_destroy(fullname, B_TRUE);
+       error = dsl_destroy_snapshot(fullname, B_TRUE);
        if (error) {
-               fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+               fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
                    fullname, error);
        }
 
-       error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+       error = user_release_one(fullname, tag);
        if (error)
-               fatal(0, "dsl_dataset_user_release(%s)", fullname, tag);
+               fatal(0, "user_release_one(%s)", fullname, tag);
 
-       VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
+       VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
 
 out:
        (void) rw_exit(&ztest_name_lock);
@@ -4947,7 +4962,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
                         * prevent a race between offlining a slog and
                         * destroying a dataset. Offlining the slog will
                         * grab a reference on the dataset which may cause
-                        * dmu_objset_destroy() to fail with EBUSY thus
+                        * dsl_destroy_head() to fail with EBUSY thus
                         * leaving the dataset in an inconsistent state.
                         */
                        if (islog)
@@ -5084,8 +5099,12 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
         */
        for (i = 0; i < copies; i++) {
                uint64_t offset = i * blocksize;
-               VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
-                   DMU_READ_NO_PREFETCH) == 0);
+               int error = dmu_buf_hold(os, object, offset, FTAG, &db,
+                   DMU_READ_NO_PREFETCH);
+               if (error != 0) {
+                       fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %d",
+                           os, (u_longlong_t)object, (u_longlong_t)offset, error);
+               }
                ASSERT(db->db_offset == offset);
                ASSERT(db->db_size == blocksize);
                ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
@@ -5300,6 +5319,7 @@ ztest_spa_import_export(char *oldname, char *newname)
        nvlist_t *config, *newconfig;
        uint64_t pool_guid;
        spa_t *spa;
+       int error;
 
        if (ztest_opts.zo_verbose >= 4) {
                (void) printf("import/export: old = %s, new = %s\n",
@@ -5344,7 +5364,12 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Import it under the new name.
         */
-       VERIFY3U(0, ==, spa_import(newname, config, NULL, 0));
+       error = spa_import(newname, config, NULL, 0);
+       if (error != 0) {
+               dump_nvlist(config, 0);
+               fatal(B_FALSE, "couldn't import pool %s as %s: error %d",
+                   oldname, newname, error);
+       }
 
        ztest_walk_pool_directory("pools after import");
 
@@ -5551,7 +5576,7 @@ ztest_dataset_open(int d)
        }
        ASSERT(error == 0 || error == EEXIST);
 
-       VERIFY0(dmu_objset_hold(name, zd, &os));
+       VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
        (void) rw_exit(&ztest_name_lock);
 
        ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
@@ -5592,7 +5617,7 @@ ztest_dataset_close(int d)
        ztest_ds_t *zd = &ztest_ds[d];
 
        zil_close(zd->zd_zilog);
-       dmu_objset_rele(zd->zd_os, zd);
+       dmu_objset_disown(zd->zd_os, zd);
 
        ztest_zd_fini(zd);
 }
@@ -5638,13 +5663,14 @@ ztest_run(ztest_shared_t *zs)
         * Open our pool.
         */
        kernel_init(FREAD | FWRITE);
-       VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0);
+       VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
        spa->spa_debug = B_TRUE;
        ztest_spa = spa;
 
-       VERIFY3U(0, ==, dmu_objset_hold(ztest_opts.zo_pool, FTAG, &os));
+       VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
+           DMU_OST_ANY, B_TRUE, FTAG, &os));
        zs->zs_guid = dmu_objset_fsid_guid(os);
-       dmu_objset_rele(os, FTAG);
+       dmu_objset_disown(os, FTAG);
 
        spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
index d51a71d7598b2174287996428d2c8db2dd4cd872..3826c2cc83381904fc302a33bb5d8137f1f6e097 100644 (file)
@@ -563,7 +563,7 @@ extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
 extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
 extern int zfs_destroy(zfs_handle_t *, boolean_t);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
-extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t);
+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
@@ -606,8 +606,8 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *,
     sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
 
 extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
-    boolean_t, boolean_t, int, uint64_t, uint64_t);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *,
+    boolean_t, boolean_t, int);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
index 9edc884a14d1980d7aa6c2d47db68dcb2ccfd267..f5fd6cda9f0da46411a740d7187d7ff6a6201bf0 100644 (file)
@@ -46,6 +46,10 @@ int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist);
 int lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
     uint64_t *usedp);
 
+int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist);
+int lzc_release(nvlist_t *holds, nvlist_t **errlist);
+int lzc_get_holds(const char *snapname, nvlist_t **holdsp);
+
 int lzc_send(const char *snapname, const char *fromsnap, int fd);
 int lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
     boolean_t force, int fd);
index 2245ff445564ffd325fd2d3ef12eeb2b12826349..34c715101054185647b1a1d91c39678c71583f31 100644 (file)
@@ -12,6 +12,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/dmu.h \
        $(top_srcdir)/include/sys/dmu_impl.h \
        $(top_srcdir)/include/sys/dmu_objset.h \
+       $(top_srcdir)/include/sys/dmu_send.h \
        $(top_srcdir)/include/sys/dmu_traverse.h \
        $(top_srcdir)/include/sys/dmu_tx.h \
        $(top_srcdir)/include/sys/dmu_zfetch.h \
@@ -19,11 +20,13 @@ COMMON_H = \
        $(top_srcdir)/include/sys/dsl_dataset.h \
        $(top_srcdir)/include/sys/dsl_deadlist.h \
        $(top_srcdir)/include/sys/dsl_deleg.h \
+       $(top_srcdir)/include/sys/dsl_destroy.h \
        $(top_srcdir)/include/sys/dsl_dir.h \
        $(top_srcdir)/include/sys/dsl_pool.h \
        $(top_srcdir)/include/sys/dsl_prop.h \
        $(top_srcdir)/include/sys/dsl_scan.h \
        $(top_srcdir)/include/sys/dsl_synctask.h \
+       $(top_srcdir)/include/sys/dsl_userhold.h \
        $(top_srcdir)/include/sys/efi_partition.h \
        $(top_srcdir)/include/sys/metaslab.h \
        $(top_srcdir)/include/sys/metaslab_impl.h \
@@ -65,8 +68,8 @@ COMMON_H = \
        $(top_srcdir)/include/sys/zfs_sa.h \
        $(top_srcdir)/include/sys/zfs_stat.h \
        $(top_srcdir)/include/sys/zfs_vfsops.h \
-       $(top_srcdir)/include/sys/zfs_znode.h \
        $(top_srcdir)/include/sys/zfs_vnops.h \
+       $(top_srcdir)/include/sys/zfs_znode.h \
        $(top_srcdir)/include/sys/zil.h \
        $(top_srcdir)/include/sys/zil_impl.h \
        $(top_srcdir)/include/sys/zio_checksum.h \
index 67882197a5e1d27a77ad12c1b371e341afabf352..8c10d947c38697b3bb3aa051de6213e8dede9748 100644 (file)
@@ -100,7 +100,7 @@ arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
index 394fdfb151d7a379c059b3587b0dc2296cc8208b..8cd1fde01f9c8cd4f2e583c886c64bbe27ee687d 100644 (file)
@@ -307,20 +307,17 @@ void dbuf_fini(void);
 
 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
-#define        DBUF_IS_METADATA(_db)   \
-       (dbuf_is_metadata(_db))
-
 #define        DBUF_GET_BUFC_TYPE(_db) \
-       (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+       (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 #define        DBUF_IS_CACHEABLE(_db)                                          \
        ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
-       (DBUF_IS_METADATA(_db) &&                                       \
+       (dbuf_is_metadata(_db) &&                                       \
        ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
 #define        DBUF_IS_L2CACHEABLE(_db)                                        \
        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
-       (DBUF_IS_METADATA(_db) &&                                       \
+       (dbuf_is_metadata(_db) &&                                       \
        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #define        DBUF_IS_L2COMPRESSIBLE(_db)                                     \
index c50df391ecb32a6dc59813ce3b4d5c55fe6b70f1..b0db7604da7e76113c1e55fcf5064d06c5608a0b 100644 (file)
@@ -214,6 +214,11 @@ typedef enum dmu_object_type {
        DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
+typedef enum txg_how {
+       TXG_WAIT = 1,
+       TXG_NOWAIT,
+} txg_how_t;
+
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
@@ -252,22 +257,19 @@ void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
-int dmu_objset_evict_dbufs(objset_t *os);
+void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
-    uint64_t flags);
-int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer,
+int dmu_objset_clone(const char *name, const char *origin);
+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
-int dmu_objset_snapshot(struct nvlist *snaps, struct nvlist *, struct nvlist *);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_snapshot_tmp(const char *, const char *, int);
-int dmu_objset_rename(const char *name, const char *newname,
-    boolean_t recursive);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
+int dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 typedef struct dmu_buf {
        uint64_t db_object;             /* object that this buffer is part of */
@@ -537,7 +539,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 
@@ -785,36 +787,8 @@ typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
-int dmu_send(objset_t *tosnap, objset_t *fromsnap,
-    int outfd, struct vnode *vp, offset_t *off);
-int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep);
-
-typedef struct dmu_recv_cookie {
-       /*
-        * This structure is opaque!
-        *
-        * If logical and real are different, we are recving the stream
-        * into the "real" temporary clone, and then switching it with
-        * the "logical" target.
-        */
-       struct dsl_dataset *drc_logical_ds;
-       struct dsl_dataset *drc_real_ds;
-       struct drr_begin *drc_drrb;
-       char *drc_tosnap;
-       char *drc_top_ds;
-       boolean_t drc_newfs;
-       boolean_t drc_force;
-       struct avl_tree *drc_guid_to_ds_map;
-} dmu_recv_cookie_t;
-
-int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
-    boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
-    int cleanup_fd, uint64_t *action_handlep);
-int dmu_recv_end(dmu_recv_cookie_t *drc);
-
-int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp,
-    offset_t *off);
+int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct vnode *vp, offset_t *offp);
 
 /* CRC64 table */
 #define        ZFS_CRC64_POLY  0xC96C5795D7870F42ULL   /* ECMA-182, reflected form */
index 79d3a6bc05a9cf9d6b7ac3f306d89f17c1531632..7fe91bebef7eedda6e3cc71d94393983771080f8 100644 (file)
@@ -44,6 +44,7 @@ extern "C" {
 
 extern krwlock_t os_lock;
 
+struct dsl_pool;
 struct dsl_dataset;
 struct dmu_tx;
 
@@ -115,8 +116,6 @@ struct objset {
        /* stuff we store for the user */
        kmutex_t os_user_ptr_lock;
        void *os_user_ptr;
-
-       /* SA layout/attribute registration */
        sa_os_t *os_sa;
 };
 
@@ -146,10 +145,10 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find_spa(spa_t *spa, const char *name,
-    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
-int dmu_objset_prefetch(const char *name, void *arg);
-int dmu_objset_evict_dbufs(objset_t *os);
+int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
+    int func(struct dsl_pool *, struct dsl_dataset *, void *),
+    void *arg, int flags);
+void dmu_objset_evict_dbufs(objset_t *os);
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
@@ -165,6 +164,7 @@ void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
+int dmu_fsname(const char *snapname, char *buf);
 
 void dmu_objset_init(void);
 void dmu_objset_fini(void);
diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h
new file mode 100644 (file)
index 0000000..ee0885a
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _DMU_SEND_H
+#define        _DMU_SEND_H
+
+#include <sys/inttypes.h>
+#include <sys/spa.h>
+
+struct vnode;
+struct dsl_dataset;
+struct drr_begin;
+struct avl_tree;
+
+int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
+    struct vnode *vp, offset_t *off);
+int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
+    uint64_t *sizep);
+int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    int outfd, struct vnode *vp, offset_t *off);
+
+typedef struct dmu_recv_cookie {
+       struct dsl_dataset *drc_ds;
+       struct drr_begin *drc_drrb;
+       const char *drc_tofs;
+       const char *drc_tosnap;
+       boolean_t drc_newfs;
+       boolean_t drc_byteswap;
+       boolean_t drc_force;
+       struct avl_tree *drc_guid_to_ds_map;
+       zio_cksum_t drc_cksum;
+       uint64_t drc_newsnapobj;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+    boolean_t force, char *origin, dmu_recv_cookie_t *drc);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep);
+int dmu_recv_end(dmu_recv_cookie_t *drc);
+
+#endif /* _DMU_SEND_H */
index 40c1ded5de8c4d6dba84161c04835f9b2416cf52..48a507e347a5172e68507d1502e648f7305673a9 100644 (file)
@@ -22,6 +22,9 @@
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #ifndef        _SYS_DMU_TX_H
 #define        _SYS_DMU_TX_H
@@ -133,10 +136,11 @@ extern dmu_tx_stats_t dmu_tx_stats;
  * These routines are defined in dmu.h, and are called by the user.
  */
 dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx);
 void dmu_tx_wait(dmu_tx_t *tx);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
index 735ccbbd3058fb0123bdc12dd055a49f51a7cd24..494f11b90296d1ed2ef2652389e9445e133d1c91 100644 (file)
@@ -35,6 +35,7 @@
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/refcount.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,10 +49,8 @@ struct dsl_pool;
 #define        DS_IS_INCONSISTENT(ds)  \
        ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
 /*
- * NB: nopromote can not yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later.  It
- * will be needed when we implement 'zfs split' (where the split off
- * clone should not be promoted).
+ * Note: nopromote can not yet be set, but we want support for it in this
+ * on-disk version, so that we don't need to upgrade for it later.
  */
 #define        DS_FLAG_NOPROMOTE       (1ULL<<1)
 
@@ -76,6 +75,8 @@ struct dsl_pool;
  */
 #define        DS_FLAG_CI_DATASET      (1ULL<<16)
 
+#define        DS_CREATE_FLAG_NODIRTY  (1ULL<<24)
+
 typedef struct dsl_dataset_phys {
        uint64_t ds_dir_obj;            /* DMU_OT_DSL_DIR */
        uint64_t ds_prev_snap_obj;      /* DMU_OT_DSL_DATASET */
@@ -125,9 +126,6 @@ typedef struct dsl_dataset {
        dsl_deadlist_t ds_deadlist;
        bplist_t ds_pending_deadlist;
 
-       /* to protect against multiple concurrent incremental recv */
-       kmutex_t ds_recvlock;
-
        /* protected by lock on pool's dp_dirty_datasets list */
        txg_node_t ds_dirty_link;
        list_node_t ds_synced_link;
@@ -139,13 +137,15 @@ typedef struct dsl_dataset {
        kmutex_t ds_lock;
        objset_t *ds_objset;
        uint64_t ds_userrefs;
+       void *ds_owner;
 
        /*
-        * ds_owner is protected by the ds_rwlock and the ds_lock
+        * Long holds prevent the ds from being destroyed; they allow the
+        * ds to remain held even after dropping the dp_config_rwlock.
+        * Owning counts as a long hold.  See the comments above
+        * dsl_pool_hold() for details.
         */
-       krwlock_t ds_rwlock;
-       kcondvar_t ds_exclusive_cv;
-       void *ds_owner;
+       refcount_t ds_longholds;
 
        /* no locking; only for making guesses */
        uint64_t ds_trysnap_txg;
@@ -163,76 +163,42 @@ typedef struct dsl_dataset {
        char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
-struct dsl_ds_destroyarg {
-       dsl_dataset_t *ds;              /* ds to destroy */
-       dsl_dataset_t *rm_origin;       /* also remove our origin? */
-       boolean_t is_origin_rm;         /* set if removing origin snap */
-       boolean_t defer;                /* destroy -d requested? */
-       boolean_t releasing;            /* destroying due to release? */
-       boolean_t need_prep;            /* do we need to retry due to EBUSY? */
-};
-
 /*
  * The max length of a temporary tag prefix is the number of hex digits
  * required to express UINT64_MAX plus one for the hyphen.
  */
 #define        MAX_TAG_PREFIX_LEN      17
 
-struct dsl_ds_holdarg {
-       dsl_sync_task_group_t *dstg;
-       const char *htag;
-       char *snapname;
-       boolean_t recursive;
-       boolean_t gotone;
-       boolean_t temphold;
-       char failed[MAXPATHLEN];
-};
-
 #define        dsl_dataset_is_snapshot(ds) \
        ((ds)->ds_phys->ds_num_children != 0)
 
 #define        DS_UNIQUE_IS_ACCURATE(ds)       \
        (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
-int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
-int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
-    void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
+    dsl_dataset_t **dsp);
+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
+    dsl_dataset_t **);
+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_own(struct dsl_pool *dp, const char *name,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
-    boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
-void dsl_dataset_name(dsl_dataset_t *ds, char *name);
-void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+    void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
-boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
-    void *tag);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
 void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
     minor_t minor);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
-dsl_checkfunc_t dsl_dataset_destroy_check;
-dsl_syncfunc_t dsl_dataset_destroy_sync;
-dsl_syncfunc_t dsl_dataset_user_hold_sync;
-int dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *, dmu_tx_t *tx);
-void dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *, dmu_tx_t *tx);
-int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
+int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
 int dsl_dataset_promote(const char *name, char *conflsnap);
-int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
-    boolean_t force);
-int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold, int cleanup_fd);
-int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
-    boolean_t temphold);
-int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
-    boolean_t recursive);
-int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
-    char *htag, boolean_t retry);
-int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
+int dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
@@ -271,13 +237,35 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used,
     uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t quota);
-dsl_syncfunc_t dsl_dataset_set_quota_sync;
-int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
-int dsl_destroy_inconsistent(const char *dsname, void *arg);
+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier);
+void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
+
+int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force);
+void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx);
+int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx);
+void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx);
+
+void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx);
+void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
+int dsl_dataset_get_snapname(dsl_dataset_t *ds);
+int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
+    uint64_t *value);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx);
+void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx);
+int dsl_dataset_rollback(const char *fsname);
 
 #ifdef ZFS_DEBUG
 #define        dprintf_ds(ds, fmt, ...) do { \
diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h
new file mode 100644 (file)
index 0000000..c5a70bb
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef        _SYS_DSL_DESTROY_H
+#define        _SYS_DSL_DESTROY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nvlist;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
+    struct nvlist *errlist);
+int dsl_destroy_snapshot(const char *name, boolean_t defer);
+int dsl_destroy_head(const char *name);
+int dsl_destroy_head_check_impl(struct dsl_dataset *ds, int expected_holds);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *ds, struct dmu_tx *tx);
+int dsl_destroy_inconsistent(const char *dsname, void *arg);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *ds,
+    boolean_t defer, struct dmu_tx *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DESTROY_H */
index 65ad202bba2d2e761f42175895e531b5a26937e8..2477e89af64ac037a4404f13bab1676be0cc4761 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DIR_H
@@ -101,18 +102,15 @@ struct dsl_dir {
        char dd_myname[MAXNAMELEN];
 };
 
-void dsl_dir_close(dsl_dir_t *dd, void *tag);
-int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
-int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
-    const char **tailp);
-int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+    dsl_dir_t **, const char **tail);
+int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **);
 void dsl_dir_name(dsl_dir_t *dd, char *buf);
 int dsl_dir_namelen(dsl_dir_t *dd);
 uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
     const char *name, dmu_tx_t *tx);
-dsl_checkfunc_t dsl_dir_destroy_check;
-dsl_syncfunc_t dsl_dir_destroy_sync;
 void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
 uint64_t dsl_dir_space_available(dsl_dir_t *dd,
     dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
@@ -131,14 +129,15 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation);
-int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
+int dsl_dir_rename(const char *oldname, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
-int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
 void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
 timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
+void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
+    dmu_tx_t *tx);
 
 /* internal reserved dir name */
 #define        MOS_DIR_NAME "$MOS"
index 4a4bf76ef527baf2900d11a97ec145142ae58f63..51b588e6aecc88cf72f919f32761553fd4eb468a 100644 (file)
@@ -36,6 +36,7 @@
 #include <sys/arc.h>
 #include <sys/bpobj.h>
 #include <sys/bptree.h>
+#include <sys/rrwlock.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -129,7 +130,7 @@ typedef struct dsl_pool {
         * syncing context does not need to ever have it for read, since
         * nobody else could possibly have it for write.
         */
-       krwlock_t dp_config_rwlock;
+       rrwlock_t dp_config_rwlock;
 
        zfs_all_blkstats_t *dp_blkstats;
 } dsl_pool_t;
@@ -155,15 +156,20 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
+boolean_t dsl_pool_config_held(dsl_pool_t *dp);
 
 taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
 
-extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
-    const char *tag, uint64_t *now, dmu_tx_t *tx);
-extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t now, dmu_tx_t *tx);
+int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, dmu_tx_t *tx);
-extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
 int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
+void dsl_pool_rele(dsl_pool_t *dp, void *tag);
 
 void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs);
 
index b0d9a52cdfd7fc17043670ce6bd2c31da0e2d55f..5fe18d6a7c550b621e9df0ff8458b703c6f1bcf3 100644 (file)
@@ -54,58 +54,47 @@ typedef struct dsl_props_arg {
        zprop_source_t pa_source;
 } dsl_props_arg_t;
 
-typedef struct dsl_prop_set_arg {
-       const char *psa_name;
-       zprop_source_t psa_source;
-       int psa_intsz;
-       int psa_numints;
-       const void *psa_value;
-
-       /*
-        * Used to handle the special requirements of the quota and reservation
-        * properties.
-        */
-       uint64_t psa_effective_value;
-} dsl_prop_setarg_t;
-
 int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg);
 int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_numcb(struct dsl_dataset *ds);
+void dsl_prop_notify_all(struct dsl_dir *dd);
+boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
 
 int dsl_prop_get(const char *ddname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_integer(const char *ddname, const char *propname,
     uint64_t *valuep, char *setpoint);
 int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
-int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(const char *dsname, nvlist_t **nvp);
 int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname,
+    uint64_t *valuep);
 int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
     int intsz, int numints, void *buf, char *setpoint,
     boolean_t snapshot);
 
-dsl_syncfunc_t dsl_props_set_sync;
-int dsl_prop_set(const char *ddname, const char *propname,
-    zprop_source_t source, int intsz, int numints, const void *buf);
+void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source,
+    nvlist_t *props, dmu_tx_t *tx);
+void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname,
+    zprop_source_t source, int intsz, int numints, const void *value,
+    dmu_tx_t *tx);
 int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
+int dsl_prop_set_int(const char *dsname, const char *propname,
+    zprop_source_t source, uint64_t value);
+int dsl_prop_set_string(const char *dsname, const char *propname,
+    zprop_source_t source, const char *value);
+int dsl_prop_inherit(const char *dsname, const char *propname,
+    zprop_source_t source);
 
-void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
-    zprop_source_t source, uint64_t *value);
-int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
-#ifdef ZFS_DEBUG
-void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
-#define        DSL_PROP_CHECK_PREDICTION(dd, psa)      \
-       dsl_prop_check_prediction((dd), (psa))
-#else
-#define        DSL_PROP_CHECK_PREDICTION(dd, psa)      /* nothing */
-#endif
+int dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+    zprop_source_t source, uint64_t value, uint64_t *newvalp);
 
 /* flag first receive on or after SPA_VERSION_RECVD_PROPS */
-boolean_t dsl_prop_get_hasrecvd(objset_t *os);
-void dsl_prop_set_hasrecvd(objset_t *os);
-void dsl_prop_unset_hasrecvd(objset_t *os);
+boolean_t dsl_prop_get_hasrecvd(const char *dsname);
+int dsl_prop_set_hasrecvd(const char *dsname);
+void dsl_prop_unset_hasrecvd(const char *dsname);
 
 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
 void dsl_prop_nvlist_add_string(nvlist_t *nv,
diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h
index 9126290cdb5be47a0a87cfc20b69bfda1ff00f44..ef86fb64cf0c58d62fcef3a17491f4903ddf4220 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_SYNCTASK_H
@@ -34,43 +35,26 @@ extern "C" {
 
 struct dsl_pool;
 
-typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
 
 typedef struct dsl_sync_task {
-       list_node_t dst_node;
+       txg_node_t dst_node;
+       struct dsl_pool *dst_pool;
+       uint64_t dst_txg;
+       int dst_space;
        dsl_checkfunc_t *dst_checkfunc;
        dsl_syncfunc_t *dst_syncfunc;
-       void *dst_arg1;
-       void *dst_arg2;
-       int dst_err;
+       void *dst_arg;
+       int dst_error;
+       boolean_t dst_nowaiter;
 } dsl_sync_task_t;
 
-typedef struct dsl_sync_task_group {
-       txg_node_t dstg_node;
-       list_t dstg_tasks;
-       struct dsl_pool *dstg_pool;
-       uint64_t dstg_txg;
-       int dstg_err;
-       int dstg_space;
-       boolean_t dstg_nowaiter;
-} dsl_sync_task_group_t;
-
-dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
-void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
-    dsl_checkfunc_t *, dsl_syncfunc_t *,
-    void *arg1, void *arg2, int blocks_modified);
-int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-
-int dsl_sync_task_do(struct dsl_pool *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified);
-void dsl_sync_task_do_nowait(struct dsl_pool *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx);
+void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx);
+int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified);
+void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc,
+    void *arg, int blocks_modified, dmu_tx_t *tx);
 
 #ifdef __cplusplus
 }
diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h
new file mode 100644 (file)
index 0000000..56c6c8f
--- /dev/null
@@ -0,0 +1,57 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef        _SYS_DSL_USERHOLD_H
+#define        _SYS_DSL_USERHOLD_H
+
+#include <sys/nvpair.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor,
+    nvlist_t *errlist);
+int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl);
+void dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+    const char *htag);
+int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag,
+    boolean_t temphold, struct dmu_tx *tx);
+void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag,
+    minor_t minor, uint64_t now, struct dmu_tx *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_USERHOLD_H */
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 99912424b3dd3ce959a48cadd0451ff3d67c9fcc..70f7af0a5480628db57878106611daa9b217b692 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -57,6 +57,7 @@ extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
 extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
 extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
 
diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h
index c502568a61c9f7fd97d4c7af9e437765e16a37ff..7a67870455f4ab5ea40ee2591d5fa9d70511c605 100644 (file)
@@ -285,6 +285,7 @@ void fnvlist_pack_free(char *, size_t);
 nvlist_t *fnvlist_unpack(char *, size_t);
 nvlist_t *fnvlist_dup(nvlist_t *);
 void fnvlist_merge(nvlist_t *, nvlist_t *);
+size_t fnvlist_num_pairs(nvlist_t *);
 
 void fnvlist_add_boolean(nvlist_t *, const char *);
 void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
diff --git a/include/sys/refcount.h b/include/sys/refcount.h
index 1752c64e3e8be2dbc636dd7dcaa188526178780d..e767a2389dab1c38d027ef8235b046e19c4ec747 100644 (file)
@@ -50,15 +50,17 @@ typedef struct reference {
 
 typedef struct refcount {
        kmutex_t rc_mtx;
+       boolean_t rc_tracked;
        list_t rc_list;
        list_t rc_removed;
        int64_t rc_count;
        int64_t rc_removed_count;
 } refcount_t;
 
-/* Note: refcount_t must be initialized with refcount_create() */
+/* Note: refcount_t must be initialized with refcount_create[_untracked]() */
 
 void refcount_create(refcount_t *rc);
+void refcount_create_untracked(refcount_t *rc);
 void refcount_destroy(refcount_t *rc);
 void refcount_destroy_many(refcount_t *rc, uint64_t number);
 int refcount_is_zero(refcount_t *rc);
@@ -79,6 +81,7 @@ typedef struct refcount {
 } refcount_t;
 
 #define        refcount_create(rc) ((rc)->rc_count = 0)
+#define        refcount_create_untracked(rc) ((rc)->rc_count = 0)
 #define        refcount_destroy(rc) ((rc)->rc_count = 0)
 #define        refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
 #define        refcount_is_zero(rc) ((rc)->rc_count == 0)
diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h
index 8fde3a3beb58675d150965773f1ffdcc70c6b37c..25c8a52467e724284f2271e49b9945a00bd73378 100644 (file)
@@ -60,6 +60,7 @@ typedef struct rrwlock {
        refcount_t      rr_anon_rcount;
        refcount_t      rr_linked_rcount;
        boolean_t       rr_writer_wanted;
+       boolean_t       rr_track_all;
 } rrwlock_t;
 
 /*
@@ -67,15 +68,19 @@ typedef struct rrwlock {
  * 'tag' must be the same in a rrw_enter() as in its
  * corresponding rrw_exit().
  */
-void rrw_init(rrwlock_t *rrl);
+void rrw_init(rrwlock_t *rrl, boolean_t track_all);
 void rrw_destroy(rrwlock_t *rrl);
 void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_write(rrwlock_t *rrl);
 void rrw_exit(rrwlock_t *rrl, void *tag);
 boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
 void rrw_tsd_destroy(void *arg);
 
 #define        RRW_READ_HELD(x)        rrw_held(x, RW_READER)
 #define        RRW_WRITE_HELD(x)       rrw_held(x, RW_WRITER)
+#define        RRW_LOCK_HELD(x) \
+       (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
 
 #ifdef __cplusplus
 }
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 1af9137f8d0040de6955303f3977e89fcd0caf58..401ae8343e96f678776afee5413c7dbeb93ff9fe 100644 (file)
@@ -422,7 +422,7 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     nvlist_t *zplprops);
 extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 2da80d29b7c9e7f3e3fe2963cf9ea5647ef2fbfa..c53074a00672ef452bcdf328c9119a5484f02213 100644 (file)
@@ -149,6 +149,8 @@ extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
 extern boolean_t space_map_contains(space_map_t *sm,
     uint64_t start, uint64_t size);
+extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start,
+    uint64_t size, avl_index_t *wherep);
 extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
diff --git a/include/sys/txg.h b/include/sys/txg.h
index f9d6dd421810cfd57fc8dc8e9f16597ede588bef..b9bbba8be2919125089547888fb457e5e91cc7eb 100644 (file)
@@ -45,9 +45,6 @@ extern "C" {
 /* Number of txgs worth of frees we defer adding to in-core spacemaps */
 #define        TXG_DEFER_SIZE          2
 
-#define        TXG_WAIT                1ULL
-#define        TXG_NOWAIT              2ULL
-
 typedef struct tx_cpu tx_cpu_t;
 
 typedef struct txg_handle {
@@ -125,11 +122,11 @@ extern void txg_wait_callbacks(struct dsl_pool *dp);
 extern void txg_list_create(txg_list_t *tl, size_t offset);
 extern void txg_list_destroy(txg_list_t *tl);
 extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
-extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
-extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
 extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
-extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
 extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
 
diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h
index 481e85b1bad00f02612b2d8994bf6ce00b4076db..1a081e422d4a50300d9de5128c5074746fbcf8e0 100644 (file)
@@ -26,7 +26,6 @@
 #ifndef _SYS_ZFEATURE_H
 #define        _SYS_ZFEATURE_H
 
-#include <sys/dmu.h>
 #include <sys/nvpair.h>
 #include "zfeature_common.h"
 
 extern "C" {
 #endif
 
-extern boolean_t feature_is_supported(objset_t *os, uint64_t obj,
+struct spa;
+struct dmu_tx;
+struct objset;
+
+extern boolean_t feature_is_supported(struct objset *os, uint64_t obj,
     uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat);
 
-struct spa;
-extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *);
-extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *);
-extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *);
-extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *);
+extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
+extern void spa_feature_enable(struct spa *, zfeature_info_t *,
+    struct dmu_tx *);
+extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *);
+extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *);
 extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *);
 extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *);
 
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 599b97a9b6c99d943f8cd1073ca821ac261994d3..a126c058ebd9d411195ec4988339ee845a43409c 100644 (file)
@@ -209,8 +209,6 @@ typedef struct kthread {
        void *          t_arg;
 } kthread_t;
 
-#define        tsd_get(key)                    pthread_getspecific(key)
-#define        tsd_set(key, val)               pthread_setspecific(key, val)
 #define        curthread                       zk_thread_current()
 #define        thread_exit                     zk_thread_exit
 #define        thread_create(stk, stksize, func, arg, len, pp, state, pri)     \
@@ -284,6 +282,12 @@ typedef int krw_t;
 #define        RW_WRITE_HELD(x)        ((x)->rw_wr_owner == curthread)
 #define        RW_LOCK_HELD(x)         (RW_READ_HELD(x) || RW_WRITE_HELD(x))
 
+#undef RW_LOCK_HELD
+#define        RW_LOCK_HELD(x)         (RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
+#undef RW_LOCK_HELD
+#define        RW_LOCK_HELD(x)         (RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
@@ -320,6 +324,22 @@ extern void cv_broadcast(kcondvar_t *cv);
 #define cv_wait_interruptible(cv, mp)          cv_wait(cv, mp)
 #define cv_wait_io(cv, mp)                     cv_wait(cv, mp)
 
+/*
+ * Thread-specific data
+ */
+#define        tsd_get(k) pthread_getspecific(k)
+#define        tsd_set(k, v) pthread_setspecific(k, v)
+#define        tsd_create(kp, d) pthread_key_create(kp, d)
+#define        tsd_destroy(kp) /* nothing */
+
+/*
+ * Thread-specific data
+ */
+#define        tsd_get(k) pthread_getspecific(k)
+#define        tsd_set(k, v) pthread_setspecific(k, v)
+#define        tsd_create(kp, d) pthread_key_create(kp, d)
+#define        tsd_destroy(kp) /* nothing */
+
 /*
  * kstat creation, installation and deletion
  */
@@ -592,7 +612,7 @@ typedef struct callb_cpr {
 
 extern char *kmem_vasprintf(const char *fmt, va_list adx);
 extern char *kmem_asprintf(const char *fmt, ...);
-#define        strfree(str) kmem_free((str), strlen(str)+1)
+#define        strfree(str) kmem_free((str), strlen(str) + 1)
 
 /*
  * Hostname information
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 591d0df8f7114e99fc9c08455cc3e68211e971ce..7632d7420c14fe11dd773f53907aa37769d01f1b 100644 (file)
 extern int zfs_flags;
 extern int zfs_recover;
 
-#define        ZFS_DEBUG_DPRINTF       0x0001
-#define        ZFS_DEBUG_DBUF_VERIFY   0x0002
-#define        ZFS_DEBUG_DNODE_VERIFY  0x0004
-#define        ZFS_DEBUG_SNAPNAMES     0x0008
-#define        ZFS_DEBUG_MODIFY        0x0010
+#define        ZFS_DEBUG_DPRINTF       (1<<0)
+#define        ZFS_DEBUG_DBUF_VERIFY   (1<<1)
+#define        ZFS_DEBUG_DNODE_VERIFY  (1<<2)
+#define        ZFS_DEBUG_SNAPNAMES     (1<<3)
+#define        ZFS_DEBUG_MODIFY        (1<<4)
+#define        ZFS_DEBUG_SPA           (1<<5)
+#define        ZFS_DEBUG_ZIO_FREE      (1<<6)
 
 /*
  * Always log zfs debug messages to the spl debug subsystem as SS_USER1.
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h
index 21bfe2b8f9b0510048c5abe912bc4eccbe9162c8..8838322a9a51eb04ba562b1c9d0625432a41408b 100644 (file)
@@ -302,7 +302,6 @@ typedef struct zfs_cmd {
        uint64_t        zc_history;             /* really (char *) */
        char            zc_value[MAXPATHLEN * 2];
        char            zc_string[MAXNAMELEN];
-       char            zc_top_ds[MAXPATHLEN];
        uint64_t        zc_guid;
        uint64_t        zc_nvlist_conf;         /* really (char *) */
        uint64_t        zc_nvlist_conf_size;
@@ -352,7 +351,8 @@ extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from,
     const char *to, cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
-extern int zfs_unmount_snap(const char *, void *);
+extern void zfs_unmount_snap(const char *);
+extern void zfs_destroy_unmount_origin(const char *);
 
 enum zfsdev_state_type {
        ZST_ONEXIT,
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index bdddcc366b8d7328c8f77ee2a5735bc91abb6853..aa9d9d288d138aa63f46440589ed60dcc46dc62c 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_FS_ZFS_ZNODE_H
@@ -254,7 +255,7 @@ typedef struct znode {
  */
 #define        ZFS_ENTER(zsb) \
        { \
-               rrw_enter(&(zsb)->z_teardown_lock, RW_READER, FTAG); \
+               rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \
                if ((zsb)->z_unmounted) { \
                        ZFS_EXIT(zsb); \
                        return (EIO); \
diff --git a/include/sys/zil.h b/include/sys/zil.h
index 589e28f83752f2e79b87f2a9cdf79878ffc8fe71..f3e00101ba3de23e6fef0373015f1a69ee681b21 100644 (file)
@@ -470,8 +470,8 @@ extern int  zil_check_log_chain(const char *osname, void *txarg);
 extern void    zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
-extern int     zil_suspend(zilog_t *zilog);
-extern void    zil_resume(zilog_t *zilog);
+extern int     zil_suspend(const char *osname, void **cookiep);
+extern void    zil_resume(void *cookie);
 
 extern void    zil_add_block(zilog_t *zilog, const blkptr_t *bp);
 extern int     zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
diff --git a/include/sys/zvol.h b/include/sys/zvol.h
index c05f81a5ff54fd4c8466fd5661afb1d4dd1b3a76..640538c2cdaa7097de9041c1061d825a232bc4b8 100644 (file)
@@ -39,7 +39,7 @@ extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
 extern boolean_t zvol_is_zvol(const char *);
 extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 extern int zvol_create_minor(const char *);
-extern int zvol_create_minors(const char *);
+extern int zvol_create_minors(char *);
 extern int zvol_remove_minor(const char *);
 extern void zvol_remove_minors(const char *);
 extern int zvol_set_volsize(const char *, uint64_t);
diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c
index 99b6e67c37c2233eda4d6a628fa581cfb8e02ac6..41756353378d88232c5b605e6d05e0b198c40ba5 100644 (file)
@@ -106,7 +106,7 @@ namespace_reload(libzfs_handle_t *hdl)
        nvlist_t *config;
        config_node_t *cn;
        nvpair_t *elem;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        void *cookie;
 
        if (hdl->libzfs_ns_gen == 0) {
@@ -261,7 +261,7 @@ zpool_get_features(zpool_handle_t *zhp)
 int
 zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int error;
        nvlist_t *config;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 041750bca8c36c8a28cf668e28eb80ce0e562904..e43c7c6bdeac8296d33c8856adb9fb6e76d23d3e 100644 (file)
@@ -313,7 +313,7 @@ get_recvd_props_ioctl(zfs_handle_t *zhp)
 {
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        nvlist_t *recvdprops;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int err;
 
        if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
@@ -376,7 +376,7 @@ static int
 get_stats(zfs_handle_t *zhp)
 {
        int rc = 0;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
                return (-1);
@@ -439,7 +439,7 @@ make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
 zfs_handle_t *
 make_dataset_handle(libzfs_handle_t *hdl, const char *path)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
 
@@ -1427,7 +1427,7 @@ zfs_is_namespace_prop(zfs_prop_t prop)
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int ret = -1;
        prop_changelist_t *cl = NULL;
        char errbuf[1024];
@@ -1553,7 +1553,7 @@ error:
 int
 zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int ret;
        prop_changelist_t *cl;
        libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -1728,7 +1728,7 @@ static int
 get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
     char **source, uint64_t *val)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        nvlist_t *zplprops = NULL;
        struct mnttab mnt;
        char *mntopt_on = NULL;
@@ -2002,10 +2002,7 @@ get_clones_cb(zfs_handle_t *zhp, void *arg)
            NULL, NULL, 0, B_TRUE) != 0)
                goto out;
        if (strcmp(gca->buf, gca->origin) == 0) {
-               if (nvlist_add_boolean(gca->value, zfs_get_name(zhp)) != 0) {
-                       zfs_close(zhp);
-                       return (no_memory(zhp->zfs_hdl));
-               }
+               fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
                gca->numclones--;
        }
 
@@ -2580,7 +2577,7 @@ zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue, zfs_userquota_prop_t *typep)
 {
        int err;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
@@ -2640,7 +2637,7 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue)
 {
        int err;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        const char *snapname;
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -2760,7 +2757,7 @@ static int
 check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
     boolean_t accept_ancestor, int *prefixlen)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char parent[ZFS_MAXNAMELEN];
        char *slash;
        zfs_handle_t *zhp;
@@ -3120,7 +3117,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
 int
 zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
@@ -3200,46 +3197,50 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
                    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
                    zhp->zfs_name, snapname);
        } else {
-               ret = zfs_destroy_snaps_nvl(zhp, dd.nvl, defer);
+               ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
        }
        nvlist_free(dd.nvl);
        return (ret);
 }
 
 /*
- * Destroys all the snapshots named in the nvlist.  They must be underneath
- * the zhp (either snapshots of it, or snapshots of its descendants).
+ * Destroys all the snapshots named in the nvlist.
  */
 int
-zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer)
+zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
 {
        int ret;
        nvlist_t *errlist;
+       nvpair_t *pair;
 
        ret = lzc_destroy_snaps(snaps, defer, &errlist);
 
-       if (ret != 0) {
-               nvpair_t *pair;
-               for (pair = nvlist_next_nvpair(errlist, NULL);
-                   pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
-                       char errbuf[1024];
-                       (void) snprintf(errbuf, sizeof (errbuf),
-                           dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
-                           nvpair_name(pair));
+       if (ret == 0)
+               return (0);
 
-                       switch (fnvpair_value_int32(pair)) {
-                       case EEXIST:
-                               zfs_error_aux(zhp->zfs_hdl,
-                                   dgettext(TEXT_DOMAIN,
-                                   "snapshot is cloned"));
-                               ret = zfs_error(zhp->zfs_hdl, EZFS_EXISTS,
-                                   errbuf);
-                               break;
-                       default:
-                               ret = zfs_standard_error(zhp->zfs_hdl, errno,
-                                   errbuf);
-                               break;
-                       }
+       if (nvlist_next_nvpair(errlist, NULL) == NULL) {
+               char errbuf[1024];
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
+
+               ret = zfs_standard_error(hdl, ret, errbuf);
+       }
+       for (pair = nvlist_next_nvpair(errlist, NULL);
+           pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
+               char errbuf[1024];
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
+                   nvpair_name(pair));
+
+               switch (fnvpair_value_int32(pair)) {
+               case EEXIST:
+                       zfs_error_aux(hdl,
+                           dgettext(TEXT_DOMAIN, "snapshot is cloned"));
+                       ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
+                       break;
+               default:
+                       ret = zfs_standard_error(hdl, errno, errbuf);
+                       break;
                }
        }
 
@@ -3388,7 +3389,7 @@ int
 zfs_promote(zfs_handle_t *zhp)
 {
        libzfs_handle_t *hdl = zhp->zfs_hdl;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char parent[MAXPATHLEN];
        char *cp;
        int ret;
@@ -3726,7 +3727,7 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 {
        rollback_data_t cb = { 0 };
        int err;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        boolean_t restore_resv = 0;
        uint64_t old_volsize = 0, new_volsize;
        zfs_prop_t resv_prop = { 0 };
@@ -3813,7 +3814,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
     boolean_t force_unmount)
 {
        int ret;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char *delim;
        prop_changelist_t *cl = NULL;
        zfs_handle_t *zhrp = NULL;
@@ -4032,7 +4033,7 @@ zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
 static int
 zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char path[MAXPATHLEN];
        int error;
 
@@ -4096,7 +4097,7 @@ zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
 int
 zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int timeout = 3000; /* in milliseconds */
        int error = 0;
        int i;
@@ -4289,7 +4290,7 @@ static int
 zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
     zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        nvlist_t *nvlist = NULL;
        int error;
 
@@ -4371,7 +4372,7 @@ int
 zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
     zfs_userspace_cb_t func, void *arg)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_useracct_t buf[100];
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        int ret;
@@ -4408,37 +4409,83 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
        return (0);
 }
 
+struct holdarg {
+       nvlist_t *nvl;
+       const char *snapname;
+       const char *tag;
+       boolean_t recursive;
+};
+
+static int
+zfs_hold_one(zfs_handle_t *zhp, void *arg)
+{
+       struct holdarg *ha = arg;
+       zfs_handle_t *szhp;
+       char name[ZFS_MAXNAMELEN];
+       int rv = 0;
+
+       (void) snprintf(name, sizeof (name),
+           "%s@%s", zhp->zfs_name, ha->snapname);
+
+       szhp = make_dataset_handle(zhp->zfs_hdl, name);
+       if (szhp) {
+               fnvlist_add_string(ha->nvl, name, ha->tag);
+               zfs_close(szhp);
+       }
+
+       if (ha->recursive)
+               rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha);
+       zfs_close(zhp);
+       return (rv);
+}
+
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok,
-    int cleanup_fd, uint64_t dsobj, uint64_t createtxg)
+    boolean_t recursive, boolean_t enoent_ok, int cleanup_fd)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       int ret;
+       struct holdarg ha;
+       nvlist_t *errors;
        libzfs_handle_t *hdl = zhp->zfs_hdl;
+       char errbuf[1024];
+       nvpair_t *elem;
 
-       ASSERT(!recursive || dsobj == 0);
+       ha.nvl = fnvlist_alloc();
+       ha.snapname = snapname;
+       ha.tag = tag;
+       ha.recursive = recursive;
+       (void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
+       ret = lzc_hold(ha.nvl, cleanup_fd, &errors);
+       fnvlist_free(ha.nvl);
 
-       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-       (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-       if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-           >= sizeof (zc.zc_string))
-               return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-       zc.zc_cookie = recursive;
-       zc.zc_temphold = temphold;
-       zc.zc_cleanup_fd = cleanup_fd;
-       zc.zc_sendobj = dsobj;
-       zc.zc_createtxg = createtxg;
+       if (ret == 0)
+               return (0);
 
-       if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
-               char errbuf[ZFS_MAXNAMELEN+32];
+       if (nvlist_next_nvpair(errors, NULL) == NULL) {
+               /* no hold-specific errors */
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN, "cannot hold"));
+               switch (ret) {
+               case ENOTSUP:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "pool must be upgraded"));
+                       (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+                       break;
+               case EINVAL:
+                       (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+                       break;
+               default:
+                       (void) zfs_standard_error(hdl, ret, errbuf);
+               }
+       }
 
-               /*
-                * if it was recursive, the one that actually failed will be in
-                * zc.zc_name.
-                */
-               (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-                   "cannot hold '%s@%s'"), zc.zc_name, snapname);
-               switch (errno) {
+       for (elem = nvlist_next_nvpair(errors, NULL);
+           elem != NULL;
+           elem = nvlist_next_nvpair(errors, elem)) {
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "cannot hold snapshot '%s'"), nvpair_name(elem));
+               switch (fnvpair_value_int32(elem)) {
                case E2BIG:
                        /*
                         * Temporary tags wind up having the ds object id
@@ -4446,77 +4493,133 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
                         * above, it's still possible for the tag to wind
                         * up being slightly too long.
                         */
-                       return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf));
-               case ENOTSUP:
-                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "pool must be upgraded"));
-                       return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+                       (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
+                       break;
                case EINVAL:
-                       return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+                       (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+                       break;
                case EEXIST:
-                       return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+                       (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
+                       break;
                case ENOENT:
                        if (enoent_ok)
                                return (ENOENT);
                        /* FALLTHROUGH */
                default:
-                       return (zfs_standard_error_fmt(hdl, errno, errbuf));
+                       (void) zfs_standard_error(hdl,
+                           fnvpair_value_int32(elem), errbuf);
                }
        }
 
-       return (0);
+       fnvlist_free(errors);
+       return (ret);
+}
+
+struct releasearg {
+       nvlist_t *nvl;
+       const char *snapname;
+       const char *tag;
+       boolean_t recursive;
+};
+
+static int
+zfs_release_one(zfs_handle_t *zhp, void *arg)
+{
+       struct holdarg *ha = arg;
+       zfs_handle_t *szhp;
+       char name[ZFS_MAXNAMELEN];
+       int rv = 0;
+
+       (void) snprintf(name, sizeof (name),
+           "%s@%s", zhp->zfs_name, ha->snapname);
+
+       szhp = make_dataset_handle(zhp->zfs_hdl, name);
+       if (szhp) {
+               nvlist_t *holds = fnvlist_alloc();
+               fnvlist_add_boolean(holds, ha->tag);
+               fnvlist_add_nvlist(ha->nvl, name, holds);
+               zfs_close(szhp);
+       }
+
+       if (ha->recursive)
+               rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
+       zfs_close(zhp);
+       return (rv);
 }
 
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       int ret;
+       struct holdarg ha;
+       nvlist_t *errors;
+       nvpair_t *elem;
        libzfs_handle_t *hdl = zhp->zfs_hdl;
 
-       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-       (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-       if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-           >= sizeof (zc.zc_string))
-               return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-       zc.zc_cookie = recursive;
+       ha.nvl = fnvlist_alloc();
+       ha.snapname = snapname;
+       ha.tag = tag;
+       ha.recursive = recursive;
+       (void) zfs_release_one(zfs_handle_dup(zhp), &ha);
+       ret = lzc_release(ha.nvl, &errors);
+       fnvlist_free(ha.nvl);
 
-       if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
-               char errbuf[ZFS_MAXNAMELEN+32];
+       if (ret == 0)
+               return (0);
+
+       if (nvlist_next_nvpair(errors, NULL) == NULL) {
+               /* no hold-specific errors */
+               char errbuf[1024];
 
-               /*
-                * if it was recursive, the one that actually failed will be in
-                * zc.zc_name.
-                */
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-                   "cannot release '%s' from '%s@%s'"), tag, zc.zc_name,
-                   snapname);
+                   "cannot release"));
                switch (errno) {
-               case ESRCH:
-                       return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
                case ENOTSUP:
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "pool must be upgraded"));
-                       return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+                       (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+                       break;
+               default:
+                       (void) zfs_standard_error_fmt(hdl, errno, errbuf);
+               }
+       }
+
+       for (elem = nvlist_next_nvpair(errors, NULL);
+           elem != NULL;
+           elem = nvlist_next_nvpair(errors, elem)) {
+               char errbuf[1024];
+
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "cannot release hold from snapshot '%s'"),
+                   nvpair_name(elem));
+               switch (fnvpair_value_int32(elem)) {
+               case ESRCH:
+                       (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
+                       break;
                case EINVAL:
-                       return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+                       (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+                       break;
                default:
-                       return (zfs_standard_error_fmt(hdl, errno, errbuf));
+                       (void) zfs_standard_error_fmt(hdl,
+                           fnvpair_value_int32(elem), errbuf);
                }
        }
 
-       return (0);
+       fnvlist_free(errors);
+       return (ret);
 }
 
 int
 zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        int nvsz = 2048;
        void *nvbuf;
        int err = 0;
-       char errbuf[ZFS_MAXNAMELEN+32];
+       char errbuf[1024];
 
        assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
            zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
@@ -4578,10 +4681,10 @@ out:
 int
 zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        char *nvbuf;
-       char errbuf[ZFS_MAXNAMELEN+32];
+       char errbuf[1024];
        size_t nvsz;
        int err;
 
@@ -4632,38 +4735,18 @@ zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
 int
 zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
-       libzfs_handle_t *hdl = zhp->zfs_hdl;
-       int nvsz = 2048;
-       void *nvbuf;
-       int err = 0;
-       char errbuf[ZFS_MAXNAMELEN+32];
-
-       assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-
-tryagain:
-
-       nvbuf = malloc(nvsz);
-       if (nvbuf == NULL) {
-               err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
-               goto out;
-       }
+       int err;
+       char errbuf[1024];
 
-       zc.zc_nvlist_dst_size = nvsz;
-       zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+       err = lzc_get_holds(zhp->zfs_name, nvl);
 
-       (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
+       if (err != 0) {
+               libzfs_handle_t *hdl = zhp->zfs_hdl;
 
-       if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) {
                (void) snprintf(errbuf, sizeof (errbuf),
                    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
-                   zc.zc_name);
-               switch (errno) {
-               case ENOMEM:
-                       free(nvbuf);
-                       nvsz = zc.zc_nvlist_dst_size;
-                       goto tryagain;
-
+                   zhp->zfs_name);
+               switch (err) {
                case ENOTSUP:
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "pool must be upgraded"));
@@ -4679,19 +4762,8 @@ tryagain:
                        err = zfs_standard_error_fmt(hdl, errno, errbuf);
                        break;
                }
-       } else {
-               /* success */
-               int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
-               if (rc) {
-                       (void) snprintf(errbuf, sizeof (errbuf),
-                           dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
-                           zc.zc_name);
-                       err = zfs_standard_error_fmt(hdl, rc, errbuf);
-               }
        }
 
-       free(nvbuf);
-out:
        return (err);
 }
 
index d8ef6ff025b1d7765ee73547a2cda613932e67b4..7472d246b01213c574451b6c39fd1f506c4d69bf 100644 (file)
@@ -90,7 +90,7 @@ static int
 get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
     char *pn, int maxlen, zfs_stat_t *sb)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int error;
 
        (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
@@ -379,7 +379,7 @@ describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf,
 static int
 write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *lhdl = di->zhp->zfs_hdl;
        char fobjname[MAXPATHLEN];
 
@@ -507,7 +507,7 @@ static int
 make_temp_snapshot(differ_info_t *di)
 {
        libzfs_handle_t *hdl = di->zhp->zfs_hdl;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) snprintf(zc.zc_value, sizeof (zc.zc_value),
            ZDIFF_PREFIX, getpid());
@@ -749,7 +749,7 @@ int
 zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap,
     const char *tosnap, int flags)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char errbuf[1024];
        differ_info_t di = { 0 };
        pthread_t tid;
index aa84aa30deb589966c4b785d9f2c7856ec70f189..4e2fe9d07bfc4a9f28759fbd58dd6dad789a18a9 100644 (file)
@@ -361,7 +361,7 @@ libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
 int
 zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value));
index 3c5bdcc6767c4e09b2973f9a6e1b7b12d8053775..63d9138ef33f01d78ec0fe81809f1c01c4aa4b47 100644 (file)
@@ -379,7 +379,7 @@ zfs_graph_add(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *source,
 static int
 iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_vertex_t *zvp;
 
        /*
@@ -473,7 +473,7 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
 static boolean_t
 external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        /*
         * Check whether this dataset is a clone or has clones since
index af6a43d8388f0c190263f6dab64bb3506f83ee0e..53609f2cb73a79bf0aeb536516d87e8f60dd8806 100644 (file)
@@ -365,7 +365,7 @@ static nvlist_t *
 refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
 {
        nvlist_t *nvl;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int err;
 
        if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
index ff76f9f367e2d4686ee90f2380841550e780b656..3d8bc5e14803dcc9273fb9ceecfc213615decbde 100644 (file)
@@ -103,7 +103,7 @@ top:
 int
 zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_handle_t *nzhp;
        int ret;
 
@@ -140,7 +140,7 @@ int
 zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func,
     void *data)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_handle_t *nzhp;
        int ret;
 
index 45c39cc0facd6cf6d744f5de117cf89853e9e7f1..e8e7efca5c0825984e798dabcd54387ab2d4cb5a 100644 (file)
@@ -64,7 +64,7 @@ typedef struct prop_flags {
 static int
 zpool_get_all_props(zpool_handle_t *zhp)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
@@ -692,7 +692,7 @@ error:
 int
 zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int ret = -1;
        char errbuf[1024];
        nvlist_t *nvl = NULL;
@@ -1141,7 +1141,7 @@ int
 zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
     nvlist_t *props, nvlist_t *fsprops)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        nvlist_t *zc_fsprops = NULL;
        nvlist_t *zc_props = NULL;
        char msg[1024];
@@ -1275,7 +1275,7 @@ create_failed:
 int
 zpool_destroy(zpool_handle_t *zhp, const char *log_str)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_handle_t *zfp = NULL;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
        char msg[1024];
@@ -1319,7 +1319,7 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str)
 int
 zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int ret;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
        char msg[1024];
@@ -1446,7 +1446,7 @@ static int
 zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
     const char *log_str)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
 
        (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
@@ -1721,7 +1721,7 @@ int
 zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
     nvlist_t *props, int flags)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zpool_rewind_policy_t policy;
        nvlist_t *nv = NULL;
        nvlist_t *nvinfo = NULL;
@@ -1913,7 +1913,7 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
 int
 zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
@@ -2389,7 +2389,7 @@ int
 zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
     vdev_state_t *newstate)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tgt;
        boolean_t avail_spare, l2cache, islog;
@@ -2473,7 +2473,7 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
 int
 zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tgt;
        boolean_t avail_spare, l2cache;
@@ -2523,7 +2523,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
 int
 zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
@@ -2558,7 +2558,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 int
 zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
@@ -2612,7 +2612,7 @@ int
 zpool_vdev_attach(zpool_handle_t *zhp,
     const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        int ret;
        nvlist_t *tgt;
@@ -2788,7 +2788,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 int
 zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tgt;
        boolean_t avail_spare, l2cache;
@@ -2886,7 +2886,7 @@ int
 zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
     nvlist_t *props, splitflags_t flags)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
        nvlist_t **varray = NULL, *zc_props = NULL;
@@ -3097,7 +3097,7 @@ out:
 int
 zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tgt;
        boolean_t avail_spare, l2cache, islog;
@@ -3142,7 +3142,7 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
 int
 zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        nvlist_t *tgt;
        zpool_rewind_policy_t policy;
@@ -3218,7 +3218,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
 int
 zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
@@ -3244,7 +3244,7 @@ zpool_reguid(zpool_handle_t *zhp)
 {
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) snprintf(msg, sizeof (msg),
            dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
@@ -3262,7 +3262,7 @@ zpool_reguid(zpool_handle_t *zhp)
 int
 zpool_reopen(zpool_handle_t *zhp)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
@@ -3342,7 +3342,7 @@ path_to_devid(const char *path)
 static void
 set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value));
@@ -3517,7 +3517,7 @@ zbookmark_compare(const void *a, const void *b)
 int
 zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        uint64_t count;
        zbookmark_t *zb = NULL;
        int i;
@@ -3613,7 +3613,7 @@ nomem:
 int
 zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
        (void) strcpy(zc.zc_name, zhp->zpool_name);
@@ -3641,7 +3641,7 @@ zfs_save_arguments(int argc, char **argv, char *string, int len)
 int
 zpool_log_history(libzfs_handle_t *hdl, const char *message)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        nvlist_t *args;
        int err;
 
@@ -3667,7 +3667,7 @@ zpool_log_history(libzfs_handle_t *hdl, const char *message)
 static int
 get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
@@ -3804,7 +3804,7 @@ int
 zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp,
     int *dropped, int block, int cleanup_fd)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int error = 0;
 
        *nvp = NULL;
@@ -3863,7 +3863,7 @@ out:
 int
 zpool_events_clear(libzfs_handle_t *hdl, int *count)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char msg[1024];
 
        (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
@@ -3882,7 +3882,7 @@ void
 zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
     char *pathname, size_t len)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        boolean_t mounted = B_FALSE;
        char *mntpnt = NULL;
        char dsname[MAXNAMELEN];
index 5d0ab0eb4b71daabfd927717534b81affad3a99f..28751b215d2c95abeb23c9231464fa49554e4586 100644 (file)
@@ -812,7 +812,7 @@ static int
 estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
     boolean_t fromorigin, uint64_t *sizep)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zfs_hdl;
 
        assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
@@ -876,7 +876,7 @@ static int
 dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
     boolean_t fromorigin, int outfd, nvlist_t *debugnv)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        nvlist_t *thisdbg;
 
@@ -978,9 +978,7 @@ hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
         */
        if (pzhp) {
                error = zfs_hold(pzhp, thissnap, sdd->holdtag,
-                   B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd,
-                   zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID),
-                   zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG));
+                   B_FALSE, B_TRUE, sdd->cleanup_fd);
                zfs_close(pzhp);
        }
 
@@ -992,7 +990,7 @@ send_progress_thread(void *arg)
 {
        progress_arg_t *pa = arg;
 
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        zfs_handle_t *zhp = pa->pa_zhp;
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        unsigned long long bytes;
@@ -1195,7 +1193,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg)
        int rv = 0;
        send_dump_data_t *sdd = arg;
        boolean_t missingfrom = B_FALSE;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
            zhp->zfs_name, sdd->tosnap);
@@ -1683,7 +1681,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
     int baselen, char *newname, recvflags_t *flags)
 {
        static int seq;
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int err;
        prop_changelist_t *clp;
        zfs_handle_t *zhp;
@@ -1719,12 +1717,11 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
                err = ENOENT;
        }
 
-       if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) {
+       if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) {
                seq++;
 
-               (void) strncpy(newname, name, baselen);
-               (void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen,
-                   "recv-%ld-%u", (long) getpid(), seq);
+               (void) snprintf(newname, ZFS_MAXNAMELEN, "%.*srecv-%u-%u",
+                   baselen, name, getpid(), seq);
                (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value));
 
                if (flags->verbose) {
@@ -1756,7 +1753,7 @@ static int
 recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
     char *newname, recvflags_t *flags)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int err = 0;
        prop_changelist_t *clp;
        zfs_handle_t *zhp;
@@ -2015,7 +2012,7 @@ again:
                            stream_originguid, originguid)) {
                        case 1: {
                                /* promote it! */
-                               zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+                               zfs_cmd_t zc = {"\0"};
                                nvlist_t *origin_nvfs;
                                char *origin_fsname;
 
@@ -2087,7 +2084,7 @@ again:
                        if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops",
                            &props) && 0 == nvlist_lookup_nvlist(props,
                            stream_snapname, &props)) {
-                               zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+                               zfs_cmd_t zc = {"\0"};
 
                                zc.zc_cookie = B_TRUE; /* received */
                                (void) snprintf(zc.zc_name, sizeof (zc.zc_name),
@@ -2518,7 +2515,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
     nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
     uint64_t *action_handlep)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        time_t begin_time;
        int ioctl_err, ioctl_errno, err;
        char *cp;
@@ -2649,7 +2646,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
        /*
         * Determine name of destination snapshot, store in zc_value.
         */
-       (void) strcpy(zc.zc_top_ds, tosnap);
        (void) strcpy(zc.zc_value, tosnap);
        (void) strlcat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
        free(cp);
@@ -2892,7 +2888,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
        zcmd_free_nvlists(&zc);
 
        if (err == 0 && snapprops_nvlist) {
-               zfs_cmd_t zc2 = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+               zfs_cmd_t zc2 = {"\0"};
 
                (void) strcpy(zc2.zc_name, zc.zc_value);
                zc2.zc_cookie = B_TRUE; /* received */
index bff6902caaf9df6af35c0e5e2b4a447643b084f6..44a2070d602826d640b8a42c6799c7adadd5c68e 100644 (file)
@@ -118,7 +118,7 @@ static int
 lzc_ioctl(zfs_ioc_t ioc, const char *name,
     nvlist_t *source, nvlist_t **resultp)
 {
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        int error = 0;
        char *packed;
        size_t size;
@@ -132,6 +132,7 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name,
        zc.zc_nvlist_src_size = size;
 
        if (resultp != NULL) {
+               *resultp = NULL;
                zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
                zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
                    malloc(zc.zc_nvlist_dst_size);
@@ -159,8 +160,6 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name,
        if (zc.zc_nvlist_dst_filled) {
                *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
                    zc.zc_nvlist_dst_size);
-       } else if (resultp != NULL) {
-               *resultp = NULL;
        }
 
 out:
@@ -209,7 +208,7 @@ lzc_clone(const char *fsname, const char *origin,
  * The value will be the (int32) error code.
  *
  * The return value will be 0 if all snapshots were created, otherwise it will
- * be the errno of a (undetermined) snapshot that failed.
+ * be the errno of an (unspecified) snapshot that failed.
  */
 int
 lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
@@ -258,7 +257,7 @@ lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
  * The return value will be 0 if all snapshots were destroyed (or marked for
  * later destruction if 'defer' is set) or didn't exist to begin with.
  *
- * Otherwise the return value will be the errno of a (undetermined) snapshot
+ * Otherwise the return value will be the errno of an (unspecified) snapshot
  * that failed, no snapshots will be destroyed, and the errlist will have an
  * entry for each snapshot that failed.  The value in the errlist will be
  * the (int32) error code.
@@ -326,12 +325,107 @@ lzc_exists(const char *dataset)
         * The objset_stats ioctl is still legacy, so we need to construct our
         * own zfs_cmd_t rather than using zfsc_ioctl().
         */
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
 
        (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
        return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
 }
 
+/*
+ * Create "user holds" on snapshots.  If there is a hold on a snapshot,
+ * the snapshot can not be destroyed.  (However, it can be marked for deletion
+ * by lzc_destroy_snaps(defer=B_TRUE).)
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is the name of the hold (string type).
+ *
+ * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
+ * In this case, when the cleanup_fd is closed (including on process
+ * termination), the holds will be released.  If the system is shut down
+ * uncleanly, the holds will be released when the pool is next opened
+ * or imported.
+ *
+ * The return value will be 0 if all holds were created. Otherwise the return
+ * value will be the errno of an (unspecified) hold that failed, no holds will
+ * be created, and the errlist will have an entry for each hold that
+ * failed (name = snapshot).  The value in the errlist will be the error
+ * code (int32).
+ */
+int
+lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
+{
+       char pool[MAXNAMELEN];
+       nvlist_t *args;
+       nvpair_t *elem;
+       int error;
+
+       /* determine the pool name */
+       elem = nvlist_next_nvpair(holds, NULL);
+       if (elem == NULL)
+               return (0);
+       (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+       pool[strcspn(pool, "/@")] = '\0';
+
+       args = fnvlist_alloc();
+       fnvlist_add_nvlist(args, "holds", holds);
+       if (cleanup_fd != -1)
+               fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
+
+       error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
+       nvlist_free(args);
+       return (error);
+}
+
+/*
+ * Release "user holds" on snapshots.  If the snapshot has been marked for
+ * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
+ * any clones, and all the user holds are removed, then the snapshot will be
+ * destroyed.
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is a nvlist whose keys are the holds to remove.
+ *
+ * The return value will be 0 if all holds were removed.
+ * Otherwise the return value will be the errno of an (unspecified) release
+ * that failed, no holds will be released, and the errlist will have an
+ * entry for each snapshot that has failed releases (name = snapshot).
+ * The value in the errlist will be the error code (int32) of a failed release.
+ */
+int
+lzc_release(nvlist_t *holds, nvlist_t **errlist)
+{
+       char pool[MAXNAMELEN];
+       nvpair_t *elem;
+
+       /* determine the pool name */
+       elem = nvlist_next_nvpair(holds, NULL);
+       if (elem == NULL)
+               return (0);
+       (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+       pool[strcspn(pool, "/@")] = '\0';
+
+       return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
+}
+
+/*
+ * Retrieve list of user holds on the specified snapshot.
+ *
+ * On success, *holdsp will be set to a nvlist which the caller must free.
+ * The keys are the names of the holds, and the value is the creation time
+ * of the hold (uint64) in seconds since the epoch.
+ */
+int
+lzc_get_holds(const char *snapname, nvlist_t **holdsp)
+{
+       int error;
+       nvlist_t *innvl = fnvlist_alloc();
+       error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
+       fnvlist_free(innvl);
+       return (error);
+}
+
 /*
  * If fromsnap is NULL, a full (non-incremental) stream will be sent.
  */
@@ -411,7 +505,7 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
         * The receive ioctl is still legacy, so we need to construct our own
         * zfs_cmd_t rather than using zfsc_ioctl().
         */
-       zfs_cmd_t zc = {"\0", 0, 0, 0, 0, 0, 0, 0, "\0", "\0", "\0"};
+       zfs_cmd_t zc = {"\0"};
        char *atp;
        char *packed = NULL;
        size_t size;
index cbba38896f27d46ae72d770fec16e3c5cee1224a..637dc15da8939fe62be75f59dc6343ccfd8fb3f9 100644 (file)
@@ -45,6 +45,8 @@ libzpool_la_SOURCES = \
        $(top_srcdir)/module/zfs/dsl_prop.c \
        $(top_srcdir)/module/zfs/dsl_scan.c \
        $(top_srcdir)/module/zfs/dsl_synctask.c \
+       $(top_srcdir)/module/zfs/dsl_destroy.c \
+       $(top_srcdir)/module/zfs/dsl_userhold.c \
        $(top_srcdir)/module/zfs/fm.c \
        $(top_srcdir)/module/zfs/gzip.c \
        $(top_srcdir)/module/zfs/lzjb.c \
index 0293b5eb5a2dd8f92e16d82527279e0770c01c36..e4d645cf7797db2519834c2ea95fe47bc0f1789e 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/stat.h>
 #include <sys/processor.h>
 #include <sys/zfs_context.h>
+#include <sys/rrwlock.h>
 #include <sys/utsname.h>
 #include <sys/time.h>
 #include <sys/systeminfo.h>
@@ -1042,6 +1043,8 @@ umem_out_of_memory(void)
 void
 kernel_init(int mode)
 {
+       extern uint_t rrw_tsd_key;
+
        umem_nofail_callback(umem_out_of_memory);
 
        physmem = sysconf(_SC_PHYS_PAGES);
@@ -1059,6 +1062,8 @@ kernel_init(int mode)
        system_taskq_init();
 
        spa_init(mode);
+
+       tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 }
 
 void
index bd8bbfdcf69a94f04113d6c89399966d8e75575e..2067437dd7ceb73459caea5dcfd2df9ea728d1db 100644 (file)
@@ -1511,7 +1511,9 @@ Destroy (or mark for deferred destruction) all snapshots with this name in desce
 .ad
 .sp .6
 .RS 4n
-Recursively destroy all dependents.
+Recursively destroy all clones of these snapshots, including the clones,
+snapshots, and children.  If this flag is specified, the \fB-d\fR flag will
+have no effect.
 .RE
 
 .sp
@@ -1547,7 +1549,7 @@ Print verbose information about the deleted data.
 .RE
 
 .sp
-Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR
 options, as they can destroy large portions of a pool and cause unexpected
 behavior for mounted file systems in use.
 .RE
index 7faea0fceb9a73c1111e43c676c630e2c503db47..a91b9524d8a081515ae9ea82b4db3a37400f999d 100644 (file)
@@ -26,6 +26,7 @@
 #include <sys/nvpair.h>
 #include <sys/kmem.h>
 #include <sys/debug.h>
+#include <sys/param.h>
 #ifndef _KERNEL
 #include <stdlib.h>
 #endif
@@ -114,6 +115,18 @@ fnvlist_merge(nvlist_t *dst, nvlist_t *src)
        VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
 }
 
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+       size_t count = 0;
+       nvpair_t *pair;
+
+       for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL;
+           pair = nvlist_next_nvpair(nvl, pair))
+               count++;
+       return (count);
+}
+
 void
 fnvlist_add_boolean(nvlist_t *nvl, const char *name)
 {
@@ -563,5 +576,6 @@ EXPORT_SYMBOL(fnvpair_value_int64);
 EXPORT_SYMBOL(fnvpair_value_uint64);
 EXPORT_SYMBOL(fnvpair_value_string);
 EXPORT_SYMBOL(fnvpair_value_nvlist);
+EXPORT_SYMBOL(fnvlist_num_pairs);
 
 #endif
index 81b1680e41329a3568486a7cf9ead4b691eb0f76..e71228454110d48861f2bec8a181916fd9189998 100644 (file)
@@ -93,3 +93,5 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_super.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_xattr.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o
index ce4a0239c0c0ba18370fd9ea43cb2b176fb90e05..1298c5b91bc6a5157f47c038ef3aa203b8a0b7ef 100644 (file)
@@ -1643,12 +1643,12 @@ arc_buf_free(arc_buf_t *buf, void *tag)
        }
 }
 
-int
+boolean_t
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
        kmutex_t *hash_lock = NULL;
-       int no_callback = (buf->b_efunc == NULL);
+       boolean_t no_callback = (buf->b_efunc == NULL);
 
        if (hdr->b_state == arc_anon) {
                ASSERT(hdr->b_datacnt == 1);
@@ -1854,7 +1854,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
        /*
-        * We have just evicted some date into the ghost state, make
+        * We have just evicted some data into the ghost state, make
         * sure we also adjust the ghost state size if necessary.
         */
        if (arc_no_grow &&
@@ -2772,7 +2772,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
        if (zio == NULL || zio->io_error == 0)
                bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-       VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+       VERIFY(arc_buf_remove_ref(buf, arg));
 }
 
 /* a generic arc_done_func_t */
@@ -2781,7 +2781,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
        arc_buf_t **bufp = arg;
        if (zio && zio->io_error) {
-               VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+               VERIFY(arc_buf_remove_ref(buf, arg));
                *bufp = NULL;
        } else {
                *bufp = buf;
index d196351dcf964ec9097771268e2b732e54fe4f48..c3927e74a2d8e44750029afac529affe8e06badb 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bplist.h>
@@ -52,6 +53,12 @@ bplist_append(bplist_t *bpl, const blkptr_t *bp)
        mutex_exit(&bpl->bpl_lock);
 }
 
+/*
+ * To aid debugging, we keep the most recently removed entry.  This way if
+ * we are in the callback, we can easily locate the entry.
+ */
+static bplist_entry_t *bplist_iterate_last_removed;
+
 void
 bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
@@ -59,6 +66,7 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 
        mutex_enter(&bpl->bpl_lock);
        while ((bpe = list_head(&bpl->bpl_list))) {
+               bplist_iterate_last_removed = bpe;
                list_remove(&bpl->bpl_list, bpe);
                mutex_exit(&bpl->bpl_lock);
                func(arg, &bpe->bpe_blk, tx);
index 1920da4408c8c5c4de83666d56045f213bc20a63..4ba9f8002e2d6671df7518e0cec300423cbc82cc 100644 (file)
@@ -366,6 +366,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 {
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
+       ASSERTV(dmu_object_info_t doi);
 
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
@@ -392,6 +393,9 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
                    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
        }
 
+       ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
+       ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+
        mutex_enter(&bpo->bpo_lock);
        dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
            bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
index faa6cc345b8f53b02bbc2e646efa3eb8040fcacd..d655d66212ce476c1ccd4e359e87e7e10cfd05a9 100644 (file)
@@ -64,7 +64,7 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
 static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
-static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 /*
@@ -546,7 +546,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        } else {
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT3P(db->db_buf, ==, NULL);
-               VERIFY(arc_buf_remove_ref(buf, db) == 1);
+               VERIFY(arc_buf_remove_ref(buf, db));
                db->db_state = DB_UNCACHED;
        }
        cv_broadcast(&db->db_changed);
@@ -875,10 +875,12 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                        continue;
 
                /* found a level 0 buffer in the range */
-               if (dbuf_undirty(db, tx))
+               mutex_enter(&db->db_mtx);
+               if (dbuf_undirty(db, tx)) {
+                       /* mutex has been dropped and dbuf destroyed */
                        continue;
+               }
 
-               mutex_enter(&db->db_mtx);
                if (db->db_state == DB_UNCACHED ||
                    db->db_state == DB_NOFILL ||
                    db->db_state == DB_EVICTING) {
@@ -1005,7 +1007,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 
        mutex_enter(&db->db_mtx);
        dbuf_set_data(db, buf);
-       VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+       VERIFY(arc_buf_remove_ref(obuf, db));
        db->db.db_size = size;
 
        if (db->db_level == 0) {
@@ -1306,7 +1308,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        return (dr);
 }
 
-static int
+/*
+ * Return TRUE if this evicted the dbuf.
+ */
+static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
        dnode_t *dn;
@@ -1315,18 +1320,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        ASSERT(txg != 0);
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+       ASSERT0(db->db_level);
+       ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       mutex_enter(&db->db_mtx);
        /*
         * If this buffer is not dirty, we're done.
         */
        for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
                if (dr->dr_txg <= txg)
                        break;
-       if (dr == NULL || dr->dr_txg < txg) {
-               mutex_exit(&db->db_mtx);
-               return (0);
-       }
+       if (dr == NULL || dr->dr_txg < txg)
+               return (B_FALSE);
        ASSERT(dr->dr_txg == txg);
        ASSERT(dr->dr_dbuf == db);
 
@@ -1334,24 +1338,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        dn = DB_DNODE(db);
 
        /*
-        * If this buffer is currently held, we cannot undirty
-        * it, since one of the current holders may be in the
-        * middle of an update.  Note that users of dbuf_undirty()
-        * should not place a hold on the dbuf before the call.
-        * Also note: we can get here with a spill block, so
-        * test for that similar to how dbuf_dirty does.
+        * Note:  This code will probably work even if there are concurrent
+        * holders, but it is untested in that scenario, as the ZPL and
+        * ztest have additional locking (the range locks) that prevents
+        * that type of concurrent access.
         */
-       if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
-               mutex_exit(&db->db_mtx);
-               /* Make sure we don't toss this buffer at sync phase */
-               if (db->db_blkid != DMU_SPILL_BLKID) {
-                       mutex_enter(&dn->dn_mtx);
-                       dnode_clear_range(dn, db->db_blkid, 1, tx);
-                       mutex_exit(&dn->dn_mtx);
-               }
-               DB_DNODE_EXIT(db);
-               return (0);
-       }
+       ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
@@ -1380,21 +1372,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        }
        DB_DNODE_EXIT(db);
 
-       if (db->db_level == 0) {
-               if (db->db_state != DB_NOFILL) {
-                       dbuf_unoverride(dr);
+       if (db->db_state != DB_NOFILL) {
+               dbuf_unoverride(dr);
 
-                       ASSERT(db->db_buf != NULL);
-                       ASSERT(dr->dt.dl.dr_data != NULL);
-                       if (dr->dt.dl.dr_data != db->db_buf)
-                               VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-                                   db) == 1);
-               }
-       } else {
                ASSERT(db->db_buf != NULL);
-               ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-               mutex_destroy(&dr->dt.di.dr_mtx);
-               list_destroy(&dr->dt.di.dr_children);
+               ASSERT(dr->dt.dl.dr_data != NULL);
+               if (dr->dt.dl.dr_data != db->db_buf)
+                       VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
        }
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
@@ -1406,13 +1390,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
                ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
                dbuf_set_data(db, NULL);
-               VERIFY(arc_buf_remove_ref(buf, db) == 1);
+               VERIFY(arc_buf_remove_ref(buf, db));
                dbuf_evict(db);
-               return (1);
+               return (B_TRUE);
        }
 
-       mutex_exit(&db->db_mtx);
-       return (0);
+       return (B_FALSE);
 }
 
 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
@@ -1511,7 +1494,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                mutex_exit(&db->db_mtx);
                (void) dbuf_dirty(db, tx);
                bcopy(buf->b_data, db->db.db_data, db->db.db_size);
-               VERIFY(arc_buf_remove_ref(buf, db) == 1);
+               VERIFY(arc_buf_remove_ref(buf, db));
                xuio_stat_wbuf_copied();
                return;
        }
@@ -1529,10 +1512,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                                arc_release(db->db_buf, db);
                        }
                        dr->dt.dl.dr_data = buf;
-                       VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+                       VERIFY(arc_buf_remove_ref(db->db_buf, db));
                } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
                        arc_release(db->db_buf, db);
-                       VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+                       VERIFY(arc_buf_remove_ref(db->db_buf, db));
                }
                db->db_buf = NULL;
        }
@@ -2168,10 +2151,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                         * This dbuf has anonymous data associated with it.
                         */
                        dbuf_set_data(db, NULL);
-                       VERIFY(arc_buf_remove_ref(buf, db) == 1);
+                       VERIFY(arc_buf_remove_ref(buf, db));
                        dbuf_evict(db);
                } else {
-                       VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+                       VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 
                        /*
                         * A dbuf will be eligible for eviction if either the
@@ -2669,7 +2652,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                if (db->db_state != DB_NOFILL) {
                        if (dr->dt.dl.dr_data != db->db_buf)
                                VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-                                   db) == 1);
+                                   db));
                        else if (!arc_released(db->db_buf))
                                arc_set_callback(db->db_buf, dbuf_do_evict, db);
                }
index 0a903335655f4b52f2b543672f9249ae92f04a54..cbf4790b1799faae11c206eb9fdbce996f9e8fcc 100644 (file)
@@ -1382,7 +1382,7 @@ void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
        arc_return_buf(buf, FTAG);
-       VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+       VERIFY(arc_buf_remove_ref(buf, FTAG));
 }
 
 /*
index dc237780c0b05cf4822de78ca310e851e946f8c7..2d1aaa4c449d95b35e41395108164fbf1f2e18bd 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -155,51 +156,49 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 int
-dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp)
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct vnode *vp, offset_t *offp)
 {
        struct diffarg da;
-       dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-       dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
-       dsl_dataset_t *findds;
-       dsl_dataset_t *relds;
-       int err = 0;
-
-       /* make certain we are looking at snapshots */
-       if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds))
+       dsl_dataset_t *fromsnap;
+       dsl_dataset_t *tosnap;
+       dsl_pool_t *dp;
+       int error;
+       uint64_t fromtxg;
+
+       if (strchr(tosnap_name, '@') == NULL ||
+           strchr(fromsnap_name, '@') == NULL)
                return (EINVAL);
 
-       /* fromsnap must be earlier and from the same lineage as tosnap */
-       if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)
-               return (EXDEV);
-
-       relds = NULL;
-       findds = ds;
-
-       while (fromds->ds_dir != findds->ds_dir) {
-               dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-               if (!dsl_dir_is_clone(findds->ds_dir)) {
-                       if (relds)
-                               dsl_dataset_rele(relds, FTAG);
-                       return (EXDEV);
-               }
-
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
-               err = dsl_dataset_hold_obj(dp,
-                   findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds);
-               rw_exit(&dp->dp_config_rwlock);
+       error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+       if (error != 0)
+               return (error);
 
-               if (relds)
-                       dsl_dataset_rele(relds, FTAG);
+       error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
 
-               if (err)
-                       return (EXDEV);
+       error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+       if (error != 0) {
+               dsl_dataset_rele(tosnap, FTAG);
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
 
-               relds = findds;
+       if (!dsl_dataset_is_before(tosnap, fromsnap)) {
+               dsl_dataset_rele(fromsnap, FTAG);
+               dsl_dataset_rele(tosnap, FTAG);
+               dsl_pool_rele(dp, FTAG);
+               return (EXDEV);
        }
 
-       if (relds)
-               dsl_dataset_rele(relds, FTAG);
+       fromtxg = fromsnap->ds_phys->ds_creation_txg;
+       dsl_dataset_rele(fromsnap, FTAG);
+
+       dsl_dataset_long_hold(tosnap, FTAG);
+       dsl_pool_rele(dp, FTAG);
 
        da.da_vp = vp;
        da.da_offp = offp;
@@ -207,15 +206,18 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp)
        da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
        da.da_err = 0;
 
-       err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg,
+       error = traverse_dataset(tosnap, fromtxg,
            TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
 
-       if (err) {
-               da.da_err = err;
+       if (error != 0) {
+               da.da_err = error;
        } else {
                /* we set the da.da_err we return as side-effect */
                (void) write_record(&da);
        }
 
+       dsl_dataset_long_rele(tosnap, FTAG);
+       dsl_dataset_rele(tosnap, FTAG);
+
        return (da.da_err);
 }
index 0f07a4cc95d20f52374018417a0b0f9eff97b778..97a224b911872474e1a8d2bf28a06c484a94a103 100644 (file)
@@ -45,6 +45,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
@@ -283,7 +284,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                err = arc_read(NULL, spa, os->os_rootbp,
                    arc_getbuf_func, &os->os_phys_buf,
                    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
-               if (err) {
+               if (err != 0) {
                        kmem_free(os, sizeof (objset_t));
                        /* convert checksum errors into IO errors */
                        if (err == ECKSUM)
@@ -323,34 +324,49 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
         * checksum/compression/copies.
         */
        if (ds) {
-               err = dsl_prop_register(ds, "primarycache",
+               err = dsl_prop_register(ds,
+                   zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
                    primary_cache_changed_cb, os);
-               if (err == 0)
-                       err = dsl_prop_register(ds, "secondarycache",
+               if (err == 0) {
+                       err = dsl_prop_register(ds,
+                           zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
                            secondary_cache_changed_cb, os);
+               }
                if (!dsl_dataset_is_snapshot(ds)) {
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "checksum",
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                                    checksum_changed_cb, os);
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "compression",
+                       }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                                    compression_changed_cb, os);
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "copies",
+                       }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_COPIES),
                                    copies_changed_cb, os);
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "dedup",
+                       }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_DEDUP),
                                    dedup_changed_cb, os);
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "logbias",
+                       }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_LOGBIAS),
                                    logbias_changed_cb, os);
-                       if (err == 0)
-                               err = dsl_prop_register(ds, "sync",
+                       }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_SYNC),
                                    sync_changed_cb, os);
+                       }
                }
-               if (err) {
+               if (err != 0) {
                        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
-                           &os->os_phys_buf) == 1);
+                           &os->os_phys_buf));
                        kmem_free(os, sizeof (objset_t));
                        return (err);
                }
@@ -428,44 +444,66 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
        return (err);
 }
 
-/* called from zpl */
+/*
+ * Holds the pool while the objset is held.  Therefore only one objset
+ * can be held at a time.
+ */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
+       dsl_pool_t *dp;
        dsl_dataset_t *ds;
        int err;
 
-       err = dsl_dataset_hold(name, tag, &ds);
-       if (err)
+       err = dsl_pool_hold(name, tag, &dp);
+       if (err != 0)
+               return (err);
+       err = dsl_dataset_hold(dp, name, tag, &ds);
+       if (err != 0) {
+               dsl_pool_rele(dp, tag);
                return (err);
+       }
 
        err = dmu_objset_from_ds(ds, osp);
-       if (err)
+       if (err != 0) {
                dsl_dataset_rele(ds, tag);
+               dsl_pool_rele(dp, tag);
+       }
 
        return (err);
 }
 
-/* called from zpl */
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
+       dsl_pool_t *dp;
        dsl_dataset_t *ds;
        int err;
 
-       err = dsl_dataset_own(name, B_FALSE, tag, &ds);
-       if (err)
+       err = dsl_pool_hold(name, FTAG, &dp);
+       if (err != 0)
+               return (err);
+       err = dsl_dataset_own(dp, name, tag, &ds);
+       if (err != 0) {
+               dsl_pool_rele(dp, FTAG);
                return (err);
+       }
 
        err = dmu_objset_from_ds(ds, osp);
-       if (err) {
+       dsl_pool_rele(dp, FTAG);
+       if (err != 0) {
                dsl_dataset_disown(ds, tag);
        } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-               dmu_objset_disown(*osp, tag);
+               dsl_dataset_disown(ds, tag);
                return (EINVAL);
        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-               dmu_objset_disown(*osp, tag);
+               dsl_dataset_disown(ds, tag);
                return (EROFS);
        }
        return (err);
@@ -474,7 +512,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
+       dsl_pool_t *dp = dmu_objset_pool(os);
        dsl_dataset_rele(os->os_dsl_dataset, tag);
+       dsl_pool_rele(dp, tag);
 }
 
 void
@@ -483,7 +523,7 @@ dmu_objset_disown(objset_t *os, void *tag)
        dsl_dataset_disown(os->os_dsl_dataset, tag);
 }
 
-int
+void
 dmu_objset_evict_dbufs(objset_t *os)
 {
        dnode_t *dn;
@@ -518,9 +558,7 @@ dmu_objset_evict_dbufs(objset_t *os)
                mutex_enter(&os->os_lock);
                dn = next_dn;
        }
-       dn = list_head(&os->os_dnodes);
        mutex_exit(&os->os_lock);
-       return (dn != DMU_META_DNODE(os));
 }
 
 void
@@ -535,33 +573,37 @@ dmu_objset_evict(objset_t *os)
 
        if (ds) {
                if (!dsl_dataset_is_snapshot(ds)) {
-                       VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                            checksum_changed_cb, os));
-                       VERIFY(0 == dsl_prop_unregister(ds, "compression",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                            compression_changed_cb, os));
-                       VERIFY(0 == dsl_prop_unregister(ds, "copies",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_COPIES),
                            copies_changed_cb, os));
-                       VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_DEDUP),
                            dedup_changed_cb, os));
-                       VERIFY(0 == dsl_prop_unregister(ds, "logbias",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_LOGBIAS),
                            logbias_changed_cb, os));
-                       VERIFY(0 == dsl_prop_unregister(ds, "sync",
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_SYNC),
                            sync_changed_cb, os));
                }
-               VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
+               VERIFY0(dsl_prop_unregister(ds,
+                   zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
                    primary_cache_changed_cb, os));
-               VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
+               VERIFY0(dsl_prop_unregister(ds,
+                   zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
                    secondary_cache_changed_cb, os));
        }
 
        if (os->os_sa)
                sa_tear_down(os);
 
-       /*
-        * We should need only a single pass over the dnode list, since
-        * nothing can be added to the list at this point.
-        */
-       (void) dmu_objset_evict_dbufs(os);
+       dmu_objset_evict_dbufs(os);
 
        dnode_special_close(&os->os_meta_dnode);
        if (DMU_USERUSED_DNODE(os)) {
@@ -572,7 +614,7 @@ dmu_objset_evict(objset_t *os)
 
        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
-       VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+       VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
        /*
         * This is a barrier to prevent the objset from going away in
@@ -604,10 +646,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        dnode_t *mdn;
 
        ASSERT(dmu_tx_is_syncing(tx));
+
        if (ds != NULL)
-               VERIFY(0 == dmu_objset_from_ds(ds, &os));
+               VERIFY0(dmu_objset_from_ds(ds, &os));
        else
-               VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+               VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
        mdn = DMU_META_DNODE(os);
 
@@ -655,361 +698,181 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        return (os);
 }
 
-struct oscarg {
-       void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-       void *userarg;
-       dsl_dataset_t *clone_origin;
-       const char *lastname;
-       dmu_objset_type_t type;
-       uint64_t flags;
-       cred_t *cr;
-};
+typedef struct dmu_objset_create_arg {
+       const char *doca_name;
+       cred_t *doca_cred;
+       void (*doca_userfunc)(objset_t *os, void *arg,
+           cred_t *cr, dmu_tx_t *tx);
+       void *doca_userarg;
+       dmu_objset_type_t doca_type;
+       uint64_t doca_flags;
+} dmu_objset_create_arg_t;
 
 /*ARGSUSED*/
 static int
-dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       struct oscarg *oa = arg2;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
-       int err;
-       uint64_t ddobj;
-
-       err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-           oa->lastname, sizeof (uint64_t), 1, &ddobj);
-       if (err != ENOENT)
-               return (err ? err : EEXIST);
+       dmu_objset_create_arg_t *doca = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dir_t *pdd;
+       const char *tail;
+       int error;
 
-       if (oa->clone_origin != NULL) {
-               /* You can't clone across pools. */
-               if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
-                       return (EXDEV);
+       if (strchr(doca->doca_name, '@') != NULL)
+               return (EINVAL);
 
-               /* You can only clone snapshots, not the head datasets. */
-               if (!dsl_dataset_is_snapshot(oa->clone_origin))
-                       return (EINVAL);
+       error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+       if (error != 0)
+               return (error);
+       if (tail == NULL) {
+               dsl_dir_rele(pdd, FTAG);
+               return (EEXIST);
        }
+       dsl_dir_rele(pdd, FTAG);
 
        return (0);
 }
 
 static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       spa_t *spa = dd->dd_pool->dp_spa;
-       struct oscarg *oa = arg2;
-       uint64_t obj;
+       dmu_objset_create_arg_t *doca = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dir_t *pdd;
+       const char *tail;
        dsl_dataset_t *ds;
+       uint64_t obj;
        blkptr_t *bp;
+       objset_t *os;
 
-       ASSERT(dmu_tx_is_syncing(tx));
+       VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
-       obj = dsl_dataset_create_sync(dd, oa->lastname,
-           oa->clone_origin, oa->flags, oa->cr, tx);
+       obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+           doca->doca_cred, tx);
 
-       VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds));
+       VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
        bp = dsl_dataset_get_blkptr(ds);
-       if (BP_IS_HOLE(bp)) {
-               objset_t *os =
-                   dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
+       os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
+           ds, bp, doca->doca_type, tx);
 
-               if (oa->userfunc)
-                       oa->userfunc(os, oa->userarg, oa->cr, tx);
+       if (doca->doca_userfunc != NULL) {
+               doca->doca_userfunc(os, doca->doca_userarg,
+                   doca->doca_cred, tx);
        }
 
-       if (oa->clone_origin == NULL) {
-               spa_history_log_internal_ds(ds, "create", tx, "");
-       } else {
-               char namebuf[MAXNAMELEN];
-               dsl_dataset_name(oa->clone_origin, namebuf);
-               spa_history_log_internal_ds(ds, "clone", tx,
-                   "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object);
-       }
+       spa_history_log_internal_ds(ds, "create", tx, "");
        dsl_dataset_rele(ds, FTAG);
+       dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
-       dsl_dir_t *pdd;
-       const char *tail;
-       int err = 0;
-       struct oscarg oa = { 0 };
-
-       ASSERT(strchr(name, '@') == NULL);
-       err = dsl_dir_open(name, FTAG, &pdd, &tail);
-       if (err)
-               return (err);
-       if (tail == NULL) {
-               dsl_dir_close(pdd, FTAG);
-               return (EEXIST);
-       }
+       dmu_objset_create_arg_t doca;
 
-       oa.userfunc = func;
-       oa.userarg = arg;
-       oa.lastname = tail;
-       oa.type = type;
-       oa.flags = flags;
-       oa.cr = CRED();
+       doca.doca_name = name;
+       doca.doca_cred = CRED();
+       doca.doca_flags = flags;
+       doca.doca_userfunc = func;
+       doca.doca_userarg = arg;
+       doca.doca_type = type;
 
-       err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-           dmu_objset_create_sync, pdd, &oa, 5);
-       dsl_dir_close(pdd, FTAG);
-       return (err);
+       return (dsl_sync_task(name,
+           dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
 }
 
-int
-dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
+typedef struct dmu_objset_clone_arg {
+       const char *doca_clone;
+       const char *doca_origin;
+       cred_t *doca_cred;
+} dmu_objset_clone_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 {
+       dmu_objset_clone_arg_t *doca = arg;
        dsl_dir_t *pdd;
        const char *tail;
-       int err = 0;
-       struct oscarg oa = { 0 };
+       int error;
+       dsl_dataset_t *origin;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
 
-       ASSERT(strchr(name, '@') == NULL);
-       err = dsl_dir_open(name, FTAG, &pdd, &tail);
-       if (err)
-               return (err);
+       if (strchr(doca->doca_clone, '@') != NULL)
+               return (EINVAL);
+
+       error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+       if (error != 0)
+               return (error);
        if (tail == NULL) {
-               dsl_dir_close(pdd, FTAG);
+               dsl_dir_rele(pdd, FTAG);
                return (EEXIST);
        }
-
-       oa.lastname = tail;
-       oa.clone_origin = clone_origin;
-       oa.flags = flags;
-       oa.cr = CRED();
-
-       err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-           dmu_objset_create_sync, pdd, &oa, 5);
-       dsl_dir_close(pdd, FTAG);
-       return (err);
-}
-
-int
-dmu_objset_destroy(const char *name, boolean_t defer)
-{
-       dsl_dataset_t *ds;
-       int error;
-
-       error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
-       if (error == 0) {
-               error = dsl_dataset_destroy(ds, FTAG, defer);
-               /* dsl_dataset_destroy() closes the ds. */
+       /* You can't clone across pools. */
+       if (pdd->dd_pool != dp) {
+               dsl_dir_rele(pdd, FTAG);
+               return (EXDEV);
        }
+       dsl_dir_rele(pdd, FTAG);
 
-       return (error);
-}
-
-typedef struct snapallarg {
-       dsl_sync_task_group_t *saa_dstg;
-       boolean_t saa_needsuspend;
-       nvlist_t *saa_props;
-
-       /* the following are used only if 'temporary' is set: */
-       boolean_t saa_temporary;
-       const char *saa_htag;
-       struct dsl_ds_holdarg *saa_ha;
-       dsl_dataset_t *saa_newds;
-} snapallarg_t;
-
-typedef struct snaponearg {
-       const char *soa_longname; /* long snap name */
-       const char *soa_snapname; /* short snap name */
-       snapallarg_t *soa_saa;
-} snaponearg_t;
-
-static int
-snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       objset_t *os = arg1;
-       snaponearg_t *soa = arg2;
-       snapallarg_t *saa = soa->soa_saa;
-       int error;
-
-       /* The props have already been checked by zfs_check_userprops(). */
-
-       error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
-           soa->soa_snapname, tx);
-       if (error)
+       error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+       if (error != 0)
                return (error);
 
-       if (saa->saa_temporary) {
-               /*
-                * Ideally we would just call
-                * dsl_dataset_user_hold_check() and
-                * dsl_dataset_destroy_check() here.  However the
-                * dataset we want to hold and destroy is the snapshot
-                * that we just confirmed we can create, but it won't
-                * exist until after these checks are run.  Do any
-                * checks we can here and if more checks are added to
-                * those routines in the future, similar checks may be
-                * necessary here.
-                */
-               if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
-                       return (ENOTSUP);
-               /*
-                * Not checking number of tags because the tag will be
-                * unique, as it will be the only tag.
-                */
-               if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
-                       return (E2BIG);
-
-               saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg),
-                   KM_PUSHPAGE);
-               saa->saa_ha->temphold = B_TRUE;
-               saa->saa_ha->htag = saa->saa_htag;
+       /* You can't clone across pools. */
+       if (origin->ds_dir->dd_pool != dp) {
+               dsl_dataset_rele(origin, FTAG);
+               return (EXDEV);
        }
-       return (error);
-}
 
-static void
-snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       objset_t *os = arg1;
-       dsl_dataset_t *ds = os->os_dsl_dataset;
-       snaponearg_t *soa = arg2;
-       snapallarg_t *saa = soa->soa_saa;
-
-       dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx);
-
-       if (saa->saa_props != NULL) {
-               dsl_props_arg_t pa;
-               pa.pa_props = saa->saa_props;
-               pa.pa_source = ZPROP_SRC_LOCAL;
-               dsl_props_set_sync(ds->ds_prev, &pa, tx);
+       /* You can only clone snapshots, not the head datasets. */
+       if (!dsl_dataset_is_snapshot(origin)) {
+               dsl_dataset_rele(origin, FTAG);
+               return (EINVAL);
        }
+       dsl_dataset_rele(origin, FTAG);
 
-       if (saa->saa_temporary) {
-               struct dsl_ds_destroyarg da;
-
-               dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx);
-               kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg));
-               saa->saa_ha = NULL;
-               saa->saa_newds = ds->ds_prev;
-
-               da.ds = ds->ds_prev;
-               da.defer = B_TRUE;
-               dsl_dataset_destroy_sync(&da, FTAG, tx);
-       }
+       return (0);
 }
 
-static int
-snapshot_one_impl(const char *snapname, void *arg)
+static void
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
-       char *fsname;
-       snapallarg_t *saa = arg;
-       snaponearg_t *soa;
-       objset_t *os;
-       int err;
-
-       fsname = kmem_zalloc(MAXPATHLEN, KM_PUSHPAGE);
-       (void) strlcpy(fsname, snapname, MAXPATHLEN);
-       strchr(fsname, '@')[0] = '\0';
-
-       err = dmu_objset_hold(fsname, saa, &os);
-       kmem_free(fsname, MAXPATHLEN);
-       if (err != 0)
-               return (err);
-
-       /*
-        * If the objset is in an inconsistent state (eg, in the process
-        * of being destroyed), don't snapshot it.
-        */
-       if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
-               dmu_objset_rele(os, saa);
-               return (EBUSY);
-       }
-
-       if (saa->saa_needsuspend) {
-               err = zil_suspend(dmu_objset_zil(os));
-               if (err) {
-                       dmu_objset_rele(os, saa);
-                       return (err);
-               }
-       }
+       dmu_objset_clone_arg_t *doca = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dir_t *pdd;
+       const char *tail;
+       dsl_dataset_t *origin, *ds;
+       uint64_t obj;
+       char namebuf[MAXNAMELEN];
 
-       soa = kmem_zalloc(sizeof (*soa), KM_PUSHPAGE);
-       soa->soa_saa = saa;
-       soa->soa_longname = snapname;
-       soa->soa_snapname = strchr(snapname, '@') + 1;
+       VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+       VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
-       dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync,
-           os, soa, 3);
+       obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+           doca->doca_cred, tx);
 
-       return (0);
+       VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+       dsl_dataset_name(origin, namebuf);
+       spa_history_log_internal_ds(ds, "clone", tx,
+           "origin=%s (%llu)", namebuf, origin->ds_object);
+       dsl_dataset_rele(ds, FTAG);
+       dsl_dataset_rele(origin, FTAG);
+       dsl_dir_rele(pdd, FTAG);
 }
 
-/*
- * The snapshots must all be in the same pool.
- */
 int
-dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+dmu_objset_clone(const char *clone, const char *origin)
 {
-       dsl_sync_task_t *dst;
-       snapallarg_t saa = { 0 };
-       spa_t *spa;
-       int rv = 0;
-       int err;
-       nvpair_t *pair;
-
-       pair = nvlist_next_nvpair(snaps, NULL);
-       if (pair == NULL)
-               return (0);
-
-       err = spa_open(nvpair_name(pair), &spa, FTAG);
-       if (err)
-               return (err);
-       saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-       saa.saa_props = props;
-       saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
-
-       for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
-           pair = nvlist_next_nvpair(snaps, pair)) {
-               err = snapshot_one_impl(nvpair_name(pair), &saa);
-               if (err != 0) {
-                       if (errors != NULL) {
-                               fnvlist_add_int32(errors,
-                                   nvpair_name(pair), err);
-                       }
-                       rv = err;
-               }
-       }
+       dmu_objset_clone_arg_t doca;
 
-       /*
-        * If any call to snapshot_one_impl() failed, don't execute the
-        * sync task.  The error handling code below will clean up the
-        * snaponearg_t from any successful calls to
-        * snapshot_one_impl().
-        */
-       if (rv == 0)
-               err = dsl_sync_task_group_wait(saa.saa_dstg);
-       if (err != 0)
-               rv = err;
-
-       for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
-           dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
-               objset_t *os = dst->dst_arg1;
-               snaponearg_t *soa = dst->dst_arg2;
-               if (dst->dst_err != 0) {
-                       if (errors != NULL) {
-                               fnvlist_add_int32(errors,
-                                   soa->soa_longname, dst->dst_err);
-                       }
-                       rv = dst->dst_err;
-               }
-
-               if (saa.saa_needsuspend)
-                       zil_resume(dmu_objset_zil(os));
-               dmu_objset_rele(os, &saa);
-               kmem_free(soa, sizeof (*soa));
-       }
+       doca.doca_clone = clone;
+       doca.doca_origin = origin;
+       doca.doca_cred = CRED();
 
-       dsl_sync_task_group_destroy(saa.saa_dstg);
-       spa_close(spa, FTAG);
-       return (rv);
+       return (dsl_sync_task(clone,
+           dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
 }
 
 int
@@ -1020,59 +883,12 @@ dmu_objset_snapshot_one(const char *fsname, const char *snapname)
        nvlist_t *snaps = fnvlist_alloc();
 
        fnvlist_add_boolean(snaps, longsnap);
-       err = dmu_objset_snapshot(snaps, NULL, NULL);
-       fnvlist_free(snaps);
        strfree(longsnap);
+       err = dsl_dataset_snapshot(snaps, NULL, NULL);
+       fnvlist_free(snaps);
        return (err);
 }
 
-int
-dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd)
-{
-       dsl_sync_task_t *dst;
-       snapallarg_t saa = { 0 };
-       spa_t *spa;
-       minor_t minor;
-       int err;
-
-       err = spa_open(snapname, &spa, FTAG);
-       if (err)
-               return (err);
-       saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-       saa.saa_htag = tag;
-       saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
-       saa.saa_temporary = B_TRUE;
-
-       if (cleanup_fd < 0) {
-               spa_close(spa, FTAG);
-               return (EINVAL);
-       }
-       if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
-               spa_close(spa, FTAG);
-               return (err);
-       }
-
-       err = snapshot_one_impl(snapname, &saa);
-
-       if (err == 0)
-               err = dsl_sync_task_group_wait(saa.saa_dstg);
-
-       for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
-           dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
-               objset_t *os = dst->dst_arg1;
-               dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor);
-               if (saa.saa_needsuspend)
-                       zil_resume(dmu_objset_zil(os));
-               dmu_objset_rele(os, &saa);
-       }
-
-       zfs_onexit_fd_rele(cleanup_fd);
-       dsl_sync_task_group_destroy(saa.saa_dstg);
-       spa_close(spa, FTAG);
-       return (err);
-}
-
-
 static void
 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
@@ -1110,9 +926,9 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
        objset_t *os = arg;
        dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
-       ASSERT(bp == os->os_rootbp);
-       ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
-       ASSERT(BP_GET_LEVEL(bp) == 0);
+       ASSERT3P(bp, ==, os->os_rootbp);
+       ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+       ASSERT0(BP_GET_LEVEL(bp));
 
        /*
         * Update rootbp fill count: it should be the number of objects
@@ -1220,7 +1036,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 
        list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
        while ((dr = list_head(list))) {
-               ASSERT(dr->dr_dbuf->db_level == 0);
+               ASSERT0(dr->dr_dbuf->db_level);
                list_remove(list, dr);
                if (dr->dr_zio)
                        zio_nowait(dr->dr_zio);
@@ -1514,12 +1330,12 @@ dmu_objset_userspace_upgrade(objset_t *os)
                        return (EINTR);
 
                objerr = dmu_bonus_hold(os, obj, FTAG, &db);
-               if (objerr)
+               if (objerr != 0)
                        continue;
                tx = dmu_tx_create(os);
                dmu_tx_hold_bonus(tx, obj);
                objerr = dmu_tx_assign(tx, TXG_WAIT);
-               if (objerr) {
+               if (objerr != 0) {
                        dmu_tx_abort(tx);
                        continue;
                }
@@ -1602,6 +1418,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
        zap_cursor_t cursor;
        zap_attribute_t attr;
 
+       ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
        if (ds->ds_phys->ds_snapnames_zapobj == 0)
                return (ENOENT);
 
@@ -1674,42 +1492,122 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
        return (0);
 }
 
-struct findarg {
-       int (*func)(const char *, void *);
-       void *arg;
-};
-
-/* ARGSUSED */
-static int
-findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-       struct findarg *fa = arg;
-       return (fa->func(dsname, fa->arg));
-}
-
 /*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- * Perhaps change all callers to use dmu_objset_find_spa()?
+ * Find objsets under and including ddobj, call func(ds) on each.
  */
 int
-dmu_objset_find(char *name, int func(const char *, void *), void *arg,
-    int flags)
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
-       struct findarg fa;
-       fa.func = func;
-       fa.arg = arg;
-       return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
+       dsl_dir_t *dd;
+       dsl_dataset_t *ds;
+       zap_cursor_t zc;
+       zap_attribute_t *attr;
+       uint64_t thisobj;
+       int err;
+
+       ASSERT(dsl_pool_config_held(dp));
+
+       err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+       if (err != 0)
+               return (err);
+
+       /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+       if (dd->dd_myname[0] == '$') {
+               dsl_dir_rele(dd, FTAG);
+               return (0);
+       }
+
+       thisobj = dd->dd_phys->dd_head_dataset_obj;
+       attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+       /*
+        * Iterate over all children.
+        */
+       if (flags & DS_FIND_CHILDREN) {
+               for (zap_cursor_init(&zc, dp->dp_meta_objset,
+                   dd->dd_phys->dd_child_dir_zapobj);
+                   zap_cursor_retrieve(&zc, attr) == 0;
+                   (void) zap_cursor_advance(&zc)) {
+                       ASSERT3U(attr->za_integer_length, ==,
+                           sizeof (uint64_t));
+                       ASSERT3U(attr->za_num_integers, ==, 1);
+
+                       err = dmu_objset_find_dp(dp, attr->za_first_integer,
+                           func, arg, flags);
+                       if (err != 0)
+                               break;
+               }
+               zap_cursor_fini(&zc);
+
+               if (err != 0) {
+                       dsl_dir_rele(dd, FTAG);
+                       kmem_free(attr, sizeof (zap_attribute_t));
+                       return (err);
+               }
+       }
+
+       /*
+        * Iterate over all snapshots.
+        */
+       if (flags & DS_FIND_SNAPSHOTS) {
+               dsl_dataset_t *ds;
+               err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+               if (err == 0) {
+                       uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+                       dsl_dataset_rele(ds, FTAG);
+
+                       for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+                           zap_cursor_retrieve(&zc, attr) == 0;
+                           (void) zap_cursor_advance(&zc)) {
+                               ASSERT3U(attr->za_integer_length, ==,
+                                   sizeof (uint64_t));
+                               ASSERT3U(attr->za_num_integers, ==, 1);
+
+                               err = dsl_dataset_hold_obj(dp,
+                                   attr->za_first_integer, FTAG, &ds);
+                               if (err != 0)
+                                       break;
+                               err = func(dp, ds, arg);
+                               dsl_dataset_rele(ds, FTAG);
+                               if (err != 0)
+                                       break;
+                       }
+                       zap_cursor_fini(&zc);
+               }
+       }
+
+       dsl_dir_rele(dd, FTAG);
+       kmem_free(attr, sizeof (zap_attribute_t));
+
+       if (err != 0)
+               return (err);
+
+       /*
+        * Apply to self.
+        */
+       err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+       if (err != 0)
+               return (err);
+       err = func(dp, ds, arg);
+       dsl_dataset_rele(ds, FTAG);
+       return (err);
 }
 
 /*
- * Find all objsets under name, call func on each
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
  */
-int
-dmu_objset_find_spa(spa_t *spa, const char *name,
-    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+    int func(const char *, void *), void *arg, int flags)
 {
        dsl_dir_t *dd;
-       dsl_pool_t *dp;
+       dsl_pool_t *dp = spa_get_dsl(spa);
        dsl_dataset_t *ds;
        zap_cursor_t zc;
        zap_attribute_t *attr;
@@ -1717,21 +1615,23 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
        uint64_t thisobj;
        int err;
 
-       if (name == NULL)
-               name = spa_name(spa);
-       err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
-       if (err)
+       dsl_pool_config_enter(dp, FTAG);
+
+       err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+       if (err != 0) {
+               dsl_pool_config_exit(dp, FTAG);
                return (err);
+       }
 
        /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
        if (dd->dd_myname[0] == '$') {
-               dsl_dir_close(dd, FTAG);
+               dsl_dir_rele(dd, FTAG);
+               dsl_pool_config_exit(dp, FTAG);
                return (0);
        }
 
        thisobj = dd->dd_phys->dd_head_dataset_obj;
        attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE);
-       dp = dd->dd_pool;
 
        /*
         * Iterate over all children.
@@ -1741,19 +1641,24 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
                    dd->dd_phys->dd_child_dir_zapobj);
                    zap_cursor_retrieve(&zc, attr) == 0;
                    (void) zap_cursor_advance(&zc)) {
-                       ASSERT(attr->za_integer_length == sizeof (uint64_t));
-                       ASSERT(attr->za_num_integers == 1);
+                       ASSERT3U(attr->za_integer_length, ==,
+                           sizeof (uint64_t));
+                       ASSERT3U(attr->za_num_integers, ==, 1);
 
                        child = kmem_asprintf("%s/%s", name, attr->za_name);
-                       err = dmu_objset_find_spa(spa, child, func, arg, flags);
+                       dsl_pool_config_exit(dp, FTAG);
+                       err = dmu_objset_find_impl(spa, child,
+                           func, arg, flags);
+                       dsl_pool_config_enter(dp, FTAG);
                        strfree(child);
-                       if (err)
+                       if (err != 0)
                                break;
                }
                zap_cursor_fini(&zc);
 
-               if (err) {
-                       dsl_dir_close(dd, FTAG);
+               if (err != 0) {
+                       dsl_dir_rele(dd, FTAG);
+                       dsl_pool_config_exit(dp, FTAG);
                        kmem_free(attr, sizeof (zap_attribute_t));
                        return (err);
                }
@@ -1763,11 +1668,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
         * Iterate over all snapshots.
         */
        if (flags & DS_FIND_SNAPSHOTS) {
-               if (!dsl_pool_sync_context(dp))
-                       rw_enter(&dp->dp_config_rwlock, RW_READER);
                err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-               if (!dsl_pool_sync_context(dp))
-                       rw_exit(&dp->dp_config_rwlock);
 
                if (err == 0) {
                        uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -1776,64 +1677,50 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
                        for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
                            zap_cursor_retrieve(&zc, attr) == 0;
                            (void) zap_cursor_advance(&zc)) {
-                               ASSERT(attr->za_integer_length ==
+                               ASSERT3U(attr->za_integer_length, ==,
                                    sizeof (uint64_t));
-                               ASSERT(attr->za_num_integers == 1);
+                               ASSERT3U(attr->za_num_integers, ==, 1);
 
                                child = kmem_asprintf("%s@%s",
                                    name, attr->za_name);
-                               err = func(spa, attr->za_first_integer,
-                                   child, arg);
+                               dsl_pool_config_exit(dp, FTAG);
+                               err = func(child, arg);
+                               dsl_pool_config_enter(dp, FTAG);
                                strfree(child);
-                               if (err)
+                               if (err != 0)
                                        break;
                        }
                        zap_cursor_fini(&zc);
                }
        }
 
-       dsl_dir_close(dd, FTAG);
+       dsl_dir_rele(dd, FTAG);
        kmem_free(attr, sizeof (zap_attribute_t));
+       dsl_pool_config_exit(dp, FTAG);
 
-       if (err)
+       if (err != 0)
                return (err);
 
-       /*
-        * Apply to self if appropriate.
-        */
-       err = func(spa, thisobj, name, arg);
-       return (err);
+       /* Apply to self. */
+       return (func(name, arg));
 }
 
-/* ARGSUSED */
+/*
+ * See comment above dmu_objset_find_impl().
+ */
 int
-dmu_objset_prefetch(const char *name, void *arg)
+dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+    int flags)
 {
-       dsl_dataset_t *ds;
-
-       if (dsl_dataset_hold(name, FTAG, &ds))
-               return (0);
-
-       if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
-               mutex_enter(&ds->ds_opening_lock);
-               if (ds->ds_objset == NULL) {
-                       uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-                       zbookmark_t zb;
-
-                       SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
-                           ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-
-                       (void) arc_read(NULL, dsl_dataset_get_spa(ds),
-                           &ds->ds_phys->ds_bp, NULL, NULL,
-                           ZIO_PRIORITY_ASYNC_READ,
-                           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-                           &aflags, &zb);
-               }
-               mutex_exit(&ds->ds_opening_lock);
-       }
+       spa_t *spa;
+       int error;
 
-       dsl_dataset_rele(ds, FTAG);
-       return (0);
+       error = spa_open(name, &spa, FTAG);
+       if (error != 0)
+               return (error);
+       error = dmu_objset_find_impl(spa, name, func, arg, flags);
+       spa_close(spa, FTAG);
+       return (error);
 }
 
 void
@@ -1850,6 +1737,22 @@ dmu_objset_get_user(objset_t *os)
        return (os->os_user_ptr);
 }
 
+/*
+ * Determine name of filesystem, given name of snapshot.
+ * buf must be at least MAXNAMELEN bytes
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+       char *atp = strchr(snapname, '@');
+       if (atp == NULL)
+               return (EINVAL);
+       if (atp - snapname >= MAXNAMELEN)
+               return (ENAMETOOLONG);
+       (void) strlcpy(buf, snapname, atp - snapname + 1);
+       return (0);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dmu_objset_zil);
 EXPORT_SYMBOL(dmu_objset_pool);
@@ -1863,16 +1766,12 @@ EXPORT_SYMBOL(dmu_objset_disown);
 EXPORT_SYMBOL(dmu_objset_from_ds);
 EXPORT_SYMBOL(dmu_objset_create);
 EXPORT_SYMBOL(dmu_objset_clone);
-EXPORT_SYMBOL(dmu_objset_destroy);
-EXPORT_SYMBOL(dmu_objset_snapshot);
 EXPORT_SYMBOL(dmu_objset_stats);
 EXPORT_SYMBOL(dmu_objset_fast_stat);
 EXPORT_SYMBOL(dmu_objset_spa);
 EXPORT_SYMBOL(dmu_objset_space);
 EXPORT_SYMBOL(dmu_objset_fsid_guid);
 EXPORT_SYMBOL(dmu_objset_find);
-EXPORT_SYMBOL(dmu_objset_find_spa);
-EXPORT_SYMBOL(dmu_objset_prefetch);
 EXPORT_SYMBOL(dmu_objset_byteswap);
 EXPORT_SYMBOL(dmu_objset_evict_dbufs);
 EXPORT_SYMBOL(dmu_objset_snap_cmtime);
index 6552e1d9d72d8d3afb879981e8f8fcae30604166..2945be89b8d828300a9268f37d6556a94e6c4068 100644 (file)
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
 
 static char *dmu_recv_tag = "dmu_recv_tag";
+static const char *recv_clone_name = "%recv";
 
 typedef struct dump_bytes_io {
        dmu_sendarg_t   *dbi_dsp;
@@ -319,7 +322,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
        if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
            (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
                return (EINTR);
-       if (dsp->dsa_err)
+       if (dsp->dsa_err != 0)
                return (EINTR);
        return (0);
 }
@@ -369,7 +372,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        uint64_t dnobj = (zb->zb_blkid <<
                            (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
                        err = dump_dnode(dsp, dnobj, blk+i);
-                       if (err)
+                       if (err != 0)
                                break;
                }
                (void) arc_buf_remove_ref(abuf, &abuf);
@@ -417,65 +420,33 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 /*
- * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
- * For example, they could both be snapshots of the same filesystem, and
- * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
- * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
- * filesystem.  Or 'earlier' could be the origin's origin.
+ * Releases dp, ds, and fromds, using the specified tag.
  */
-static boolean_t
-is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
-{
-       dsl_pool_t *dp = later->ds_dir->dd_pool;
-       int error;
-       boolean_t ret;
-       dsl_dataset_t *origin;
-
-       if (earlier->ds_phys->ds_creation_txg >=
-           later->ds_phys->ds_creation_txg)
-               return (B_FALSE);
-
-       if (later->ds_dir == earlier->ds_dir)
-               return (B_TRUE);
-       if (!dsl_dir_is_clone(later->ds_dir))
-               return (B_FALSE);
-
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) {
-               rw_exit(&dp->dp_config_rwlock);
-               return (B_TRUE);
-       }
-       error = dsl_dataset_hold_obj(dp,
-           later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
-       rw_exit(&dp->dp_config_rwlock);
-       if (error != 0)
-               return (B_FALSE);
-       ret = is_before(origin, earlier);
-       dsl_dataset_rele(origin, FTAG);
-       return (ret);
-}
-
-int
-dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
-    offset_t *off)
+static int
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
 {
-       dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-       dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
+       objset_t *os;
        dmu_replay_record_t *drr;
        dmu_sendarg_t *dsp;
        int err;
        uint64_t fromtxg = 0;
 
-       /* tosnap must be a snapshot */
-       if (ds->ds_phys->ds_next_snap_obj == 0)
-               return (EINVAL);
-
-       /*
-        * fromsnap must be an earlier snapshot from the same fs as tosnap,
-        * or the origin's fs.
-        */
-       if (fromds != NULL && !is_before(ds, fromds))
+       if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
+               dsl_dataset_rele(fromds, tag);
+               dsl_dataset_rele(ds, tag);
+               dsl_pool_rele(dp, tag);
                return (EXDEV);
+       }
+
+       err = dmu_objset_from_ds(ds, &os);
+       if (err != 0) {
+               if (fromds != NULL)
+                       dsl_dataset_rele(fromds, tag);
+               dsl_dataset_rele(ds, tag);
+               dsl_pool_rele(dp, tag);
+               return (err);
+       }
 
        drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
        drr->drr_type = DRR_BEGIN;
@@ -484,13 +455,17 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
            DMU_SUBSTREAM);
 
 #ifdef _KERNEL
-       if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
+       if (dmu_objset_type(os) == DMU_OST_ZFS) {
                uint64_t version;
-               if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
+               if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
                        kmem_free(drr, sizeof (dmu_replay_record_t));
+                       if (fromds != NULL)
+                               dsl_dataset_rele(fromds, tag);
+                       dsl_dataset_rele(ds, tag);
+                       dsl_pool_rele(dp, tag);
                        return (EINVAL);
                }
-               if (version == ZPL_VERSION_SA) {
+               if (version >= ZPL_VERSION_SA) {
                        DMU_SET_FEATUREFLAGS(
                            drr->drr_u.drr_begin.drr_versioninfo,
                            DMU_BACKUP_FEATURE_SA_SPILL);
@@ -500,19 +475,22 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
 
        drr->drr_u.drr_begin.drr_creation_time =
            ds->ds_phys->ds_creation_time;
-       drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
+       drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
        if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
        drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
        if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 
-       if (fromds)
+       if (fromds != NULL)
                drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
        dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
 
-       if (fromds)
+       if (fromds != NULL) {
                fromtxg = fromds->ds_phys->ds_creation_txg;
+               dsl_dataset_rele(fromds, tag);
+               fromds = NULL;
+       }
 
        dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 
@@ -520,7 +498,7 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
        dsp->dsa_vp = vp;
        dsp->dsa_outfd = outfd;
        dsp->dsa_proc = curproc;
-       dsp->dsa_os = tosnap;
+       dsp->dsa_os = os;
        dsp->dsa_off = off;
        dsp->dsa_toguid = ds->ds_phys->ds_guid;
        ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
@@ -535,6 +513,9 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
                goto out;
        }
 
+       dsl_dataset_long_hold(ds, FTAG);
+       dsl_pool_rele(dp, tag);
+
        err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
            backup_cb, dsp);
 
@@ -542,8 +523,8 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
                if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
                        err = EINTR;
 
-       if (err) {
-               if (err == EINTR && dsp->dsa_err)
+       if (err != 0) {
+               if (err == EINTR && dsp->dsa_err != 0)
                        err = dsp->dsa_err;
                goto out;
        }
@@ -566,27 +547,96 @@ out:
        kmem_free(drr, sizeof (dmu_replay_record_t));
        kmem_free(dsp, sizeof (dmu_sendarg_t));
 
+       dsl_dataset_long_rele(ds, FTAG);
+       dsl_dataset_rele(ds, tag);
+
        return (err);
 }
 
 int
-dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    int outfd, vnode_t *vp, offset_t *off)
+{
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
+       dsl_dataset_t *fromds = NULL;
+       int err;
+
+       err = dsl_pool_hold(pool, FTAG, &dp);
+       if (err != 0)
+               return (err);
+
+       err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+       if (err != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (err);
+       }
+
+       if (fromsnap != 0) {
+               err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
+               if (err != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       dsl_pool_rele(dp, FTAG);
+                       return (err);
+               }
+       }
+
+       return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap,
+    int outfd, vnode_t *vp, offset_t *off)
+{
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
+       dsl_dataset_t *fromds = NULL;
+       int err;
+
+       if (strchr(tosnap, '@') == NULL)
+               return (EINVAL);
+       if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
+               return (EINVAL);
+
+       err = dsl_pool_hold(tosnap, FTAG, &dp);
+       if (err != 0)
+               return (err);
+
+       err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+       if (err != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (err);
+       }
+
+       if (fromsnap != NULL) {
+               err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+               if (err != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       dsl_pool_rele(dp, FTAG);
+                       return (err);
+               }
+       }
+       return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
+}
+
+int
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
-       dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-       dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
        int err;
        uint64_t size, recordsize;
+       ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool);
+
+       ASSERT(dsl_pool_config_held(dp));
 
        /* tosnap must be a snapshot */
-       if (ds->ds_phys->ds_next_snap_obj == 0)
+       if (!dsl_dataset_is_snapshot(ds))
                return (EINVAL);
 
        /*
         * fromsnap must be an earlier snapshot from the same fs as tosnap,
         * or the origin's fs.
         */
-       if (fromds != NULL && !is_before(ds, fromds))
+       if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
                return (EXDEV);
 
        /* Get uncompressed size estimate of changed data. */
@@ -596,7 +646,7 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
                uint64_t used, comp;
                err = dsl_dataset_space_written(fromds, ds,
                    &used, &comp, &size);
-               if (err)
+               if (err != 0)
                        return (err);
        }
 
@@ -615,11 +665,8 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
         * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
         * block, which we observe in practice.
         */
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       err = dsl_prop_get_ds(ds, "recordsize",
-           sizeof (recordsize), 1, &recordsize, NULL);
-       rw_exit(&dp->dp_config_rwlock);
-       if (err)
+       err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+       if (err != 0)
                return (err);
        size -= size / recordsize * sizeof (blkptr_t);
 
@@ -631,93 +678,40 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
        return (0);
 }
 
-struct recvbeginsyncarg {
-       const char *tofs;
-       const char *tosnap;
-       dsl_dataset_t *origin;
-       uint64_t fromguid;
-       dmu_objset_type_t type;
-       void *tag;
-       boolean_t force;
-       uint64_t dsflags;
-       char clonelastname[MAXNAMELEN];
-       dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
-       cred_t *cr;
-};
-
-/* ARGSUSED */
-static int
-recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dir_t *dd = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
-       uint64_t val;
-       int err;
-
-       err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-           strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
-
-       if (err != ENOENT)
-               return (err ? err : EEXIST);
-
-       if (rbsa->origin) {
-               /* make sure it's a snap in the same pool */
-               if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
-                       return (EXDEV);
-               if (!dsl_dataset_is_snapshot(rbsa->origin))
-                       return (EINVAL);
-               if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
-                       return (ENODEV);
-       }
-
-       return (0);
-}
-
-static void
-recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dir_t *dd = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-       uint64_t dsobj;
-
-       /* Create and open new dataset. */
-       dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
-           rbsa->origin, flags, rbsa->cr, tx);
-       VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
-           B_TRUE, dmu_recv_tag, &rbsa->ds));
-
-       if (rbsa->origin == NULL) {
-               (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
-                   rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
-       }
-
-       spa_history_log_internal_ds(rbsa->ds, "receive new", tx, "");
-}
+typedef struct dmu_recv_begin_arg {
+       const char *drba_origin;
+       dmu_recv_cookie_t *drba_cookie;
+       cred_t *drba_cred;
+} dmu_recv_begin_arg_t;
 
-/* ARGSUSED */
 static int
-recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+    uint64_t fromguid)
 {
-       dsl_dataset_t *ds = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       int err;
        uint64_t val;
+       int error;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
        /* must not have any changes since most recent snapshot */
-       if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
+       if (!drba->drba_cookie->drc_force &&
+           dsl_dataset_modified_since_lastsnap(ds))
                return (ETXTBSY);
 
+       /* temporary clone name must not exist */
+       error = zap_lookup(dp->dp_meta_objset,
+           ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+           8, 1, &val);
+       if (error != ENOENT)
+               return (error == 0 ? EBUSY : error);
+
        /* new snapshot name must not exist */
-       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
-       if (err == 0)
-               return (EEXIST);
-       if (err != ENOENT)
-               return (err);
+       error = zap_lookup(dp->dp_meta_objset,
+           ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
+           8, 1, &val);
+       if (error != ENOENT)
+               return (error == 0 ? EEXIST : error);
 
-       if (rbsa->fromguid) {
+       if (fromguid != 0) {
                /* if incremental, most recent snapshot must match fromguid */
                if (ds->ds_prev == NULL)
                        return (ENODEV);
@@ -726,20 +720,20 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
                 * most recent snapshot must match fromguid, or there are no
                 * changes since the fromguid one
                 */
-               if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
+               if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
                        uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
                        uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
                        while (obj != 0) {
                                dsl_dataset_t *snap;
-                               err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                                   obj, FTAG, &snap);
-                               if (err)
+                               error = dsl_dataset_hold_obj(dp, obj, FTAG,
+                                   &snap);
+                               if (error != 0)
                                        return (ENODEV);
                                if (snap->ds_phys->ds_creation_txg < birth) {
                                        dsl_dataset_rele(snap, FTAG);
                                        return (ENODEV);
                                }
-                               if (snap->ds_phys->ds_guid == rbsa->fromguid) {
+                               if (snap->ds_phys->ds_guid == fromguid) {
                                        dsl_dataset_rele(snap, FTAG);
                                        break; /* it's ok */
                                }
@@ -755,58 +749,153 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
                        return (ENODEV);
        }
 
-       /* temporary clone name must not exist */
-       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_dir->dd_phys->dd_child_dir_zapobj,
-           rbsa->clonelastname, 8, 1, &val);
-       if (err == 0)
-               return (EEXIST);
-       if (err != ENOENT)
-               return (err);
-
        return (0);
+
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+       dmu_recv_begin_arg_t *drba = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+       uint64_t fromguid = drrb->drr_fromguid;
+       int flags = drrb->drr_flags;
+       int error;
+       dsl_dataset_t *ds;
+       const char *tofs = drba->drba_cookie->drc_tofs;
+
+       /* already checked */
+       ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+       if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+           DMU_COMPOUNDSTREAM ||
+           drrb->drr_type >= DMU_OST_NUMTYPES ||
+           ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+               return (EINVAL);
+
+       /* Verify pool version supports SA if SA_SPILL feature set */
+       if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_SA_SPILL) &&
+           spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+               return (ENOTSUP);
+       }
+
+       error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+       if (error == 0) {
+               /* target fs already exists; recv into temp clone */
+
+               /* Can't recv a clone into an existing fs */
+               if (flags & DRR_FLAG_CLONE) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (EINVAL);
+               }
+
+               error = recv_begin_check_existing_impl(drba, ds, fromguid);
+               dsl_dataset_rele(ds, FTAG);
+       } else if (error == ENOENT) {
+               /* target fs does not exist; must be a full backup or clone */
+               char buf[MAXNAMELEN];
+
+               /*
+                * If it's a non-clone incremental, we are missing the
+                * target fs, so fail the recv.
+                */
+               if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+                       return (ENOENT);
+
+               /* Open the parent of tofs */
+               ASSERT3U(strlen(tofs), <, MAXNAMELEN);
+               (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+               error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+               if (error != 0)
+                       return (error);
+
+               if (drba->drba_origin != NULL) {
+                       dsl_dataset_t *origin;
+                       error = dsl_dataset_hold(dp, drba->drba_origin,
+                           FTAG, &origin);
+                       if (error != 0) {
+                               dsl_dataset_rele(ds, FTAG);
+                               return (error);
+                       }
+                       if (!dsl_dataset_is_snapshot(origin)) {
+                               dsl_dataset_rele(origin, FTAG);
+                               dsl_dataset_rele(ds, FTAG);
+                               return (EINVAL);
+                       }
+                       if (origin->ds_phys->ds_guid != fromguid) {
+                               dsl_dataset_rele(origin, FTAG);
+                               dsl_dataset_rele(ds, FTAG);
+                               return (ENODEV);
+                       }
+                       dsl_dataset_rele(origin, FTAG);
+               }
+               dsl_dataset_rele(ds, FTAG);
+               error = 0;
+       }
+       return (error);
 }
 
-/* ARGSUSED */
 static void
-recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ohds = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       dsl_pool_t *dp = ohds->ds_dir->dd_pool;
-       dsl_dataset_t *cds;
-       uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+       dmu_recv_begin_arg_t *drba = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+       const char *tofs = drba->drba_cookie->drc_tofs;
+       dsl_dataset_t *ds, *newds;
        uint64_t dsobj;
+       int error;
+       uint64_t crflags;
+
+       crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
+           DS_FLAG_CI_DATASET : 0;
 
-       /* create and open the temporary clone */
-       dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
-           ohds->ds_prev, flags, rbsa->cr, tx);
-       VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
+       error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+       if (error == 0) {
+               /* create temporary clone */
+               dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+                   ds->ds_prev, crflags, drba->drba_cred, tx);
+               dsl_dataset_rele(ds, FTAG);
+       } else {
+               dsl_dir_t *dd;
+               const char *tail;
+               dsl_dataset_t *origin = NULL;
+
+               VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+               if (drba->drba_origin != NULL) {
+                       VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+                           FTAG, &origin));
+               }
+
+               /* Create new dataset. */
+               dsobj = dsl_dataset_create_sync(dd,
+                   strrchr(tofs, '/') + 1,
+                   origin, crflags, drba->drba_cred, tx);
+               if (origin != NULL)
+                       dsl_dataset_rele(origin, FTAG);
+               dsl_dir_rele(dd, FTAG);
+               drba->drba_cookie->drc_newfs = B_TRUE;
+       }
+       VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+
+       dmu_buf_will_dirty(newds->ds_dbuf, tx);
+       newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
        /*
         * If we actually created a non-clone, we need to create the
         * objset in our new dataset.
         */
-       if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+       if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
                (void) dmu_objset_create_impl(dp->dp_spa,
-                   cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+                   newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
        }
 
-       rbsa->ds = cds;
-
-       spa_history_log_internal_ds(cds, "receive over existing", tx, "");
-}
-
-static boolean_t
-dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
-{
-       int featureflags;
-
-       featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+       drba->drba_cookie->drc_ds = newds;
 
-       /* Verify pool version supports SA if SA_SPILL feature set */
-       return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
-           (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
+       spa_history_log_internal_ds(newds, "receive", tx, "");
 }
 
 /*
@@ -814,132 +903,55 @@ dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
-dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
-    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
 {
-       int err = 0;
-       boolean_t byteswap;
-       struct recvbeginsyncarg rbsa = { 0 };
-       uint64_t versioninfo;
-       int flags;
-       dsl_dataset_t *ds;
-
-       if (drrb->drr_magic == DMU_BACKUP_MAGIC)
-               byteswap = FALSE;
-       else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-               byteswap = TRUE;
-       else
-               return (EINVAL);
-
-       rbsa.tofs = tofs;
-       rbsa.tosnap = tosnap;
-       rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
-       rbsa.fromguid = drrb->drr_fromguid;
-       rbsa.type = drrb->drr_type;
-       rbsa.tag = FTAG;
-       rbsa.dsflags = 0;
-       rbsa.cr = CRED();
-       versioninfo = drrb->drr_versioninfo;
-       flags = drrb->drr_flags;
-
-       if (byteswap) {
-               rbsa.type = BSWAP_32(rbsa.type);
-               rbsa.fromguid = BSWAP_64(rbsa.fromguid);
-               versioninfo = BSWAP_64(versioninfo);
-               flags = BSWAP_32(flags);
-       }
-
-       if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
-           rbsa.type >= DMU_OST_NUMTYPES ||
-           ((flags & DRR_FLAG_CLONE) && origin == NULL))
-               return (EINVAL);
-
-       if (flags & DRR_FLAG_CI_DATA)
-               rbsa.dsflags = DS_FLAG_CI_DATASET;
+       dmu_recv_begin_arg_t drba = { 0 };
+       dmu_replay_record_t *drr;
 
        bzero(drc, sizeof (dmu_recv_cookie_t));
        drc->drc_drrb = drrb;
        drc->drc_tosnap = tosnap;
-       drc->drc_top_ds = top_ds;
+       drc->drc_tofs = tofs;
        drc->drc_force = force;
 
-       /*
-        * Process the begin in syncing context.
-        */
-
-       /* open the dataset we are logically receiving into */
-       err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
-       if (err == 0) {
-               if (dmu_recv_verify_features(ds, drrb)) {
-                       dsl_dataset_rele(ds, dmu_recv_tag);
-                       return (ENOTSUP);
-               }
-               /* target fs already exists; recv into temp clone */
-
-               /* Can't recv a clone into an existing fs */
-               if (flags & DRR_FLAG_CLONE) {
-                       dsl_dataset_rele(ds, dmu_recv_tag);
-                       return (EINVAL);
-               }
-
-               /* must not have an incremental recv already in progress */
-               if (!mutex_tryenter(&ds->ds_recvlock)) {
-                       dsl_dataset_rele(ds, dmu_recv_tag);
-                       return (EBUSY);
-               }
-
-               /* tmp clone name is: tofs/%tosnap" */
-               (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
-                   "%%%s", tosnap);
-               rbsa.force = force;
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
-               if (err) {
-                       mutex_exit(&ds->ds_recvlock);
-                       dsl_dataset_rele(ds, dmu_recv_tag);
-                       return (err);
-               }
-               drc->drc_logical_ds = ds;
-               drc->drc_real_ds = rbsa.ds;
-       } else if (err == ENOENT) {
-               /* target fs does not exist; must be a full backup or clone */
-               char *cp;
-
-               /*
-                * If it's a non-clone incremental, we are missing the
-                * target fs, so fail the recv.
-                */
-               if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
-                       return (ENOENT);
-
-               /* Open the parent of tofs */
-               cp = strrchr(tofs, '/');
-               *cp = '\0';
-               err = dsl_dataset_hold(tofs, FTAG, &ds);
-               *cp = '/';
-               if (err)
-                       return (err);
+       if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+               drc->drc_byteswap = B_TRUE;
+       else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
+               return (EINVAL);
 
-               if (dmu_recv_verify_features(ds, drrb)) {
-                       dsl_dataset_rele(ds, FTAG);
-                       return (ENOTSUP);
-               }
+       drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+       drr->drr_type = DRR_BEGIN;
+       drr->drr_u.drr_begin = *drc->drc_drrb;
+       if (drc->drc_byteswap) {
+               fletcher_4_incremental_byteswap(drr,
+                   sizeof (dmu_replay_record_t), &drc->drc_cksum);
+       } else {
+               fletcher_4_incremental_native(drr,
+                   sizeof (dmu_replay_record_t), &drc->drc_cksum);
+       }
+       kmem_free(drr, sizeof (dmu_replay_record_t));
 
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
-               dsl_dataset_rele(ds, FTAG);
-               if (err)
-                       return (err);
-               drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
-               drc->drc_newfs = B_TRUE;
+       if (drc->drc_byteswap) {
+               drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+               drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
+               drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+               drrb->drr_type = BSWAP_32(drrb->drr_type);
+               drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+               drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
        }
 
-       return (err);
+       drba.drba_origin = origin;
+       drba.drba_cookie = drc;
+       drba.drba_cred = CRED();
+
+       return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
+           &drba, 5));
 }
 
 struct restorearg {
        int err;
-       int byteswap;
+       boolean_t byteswap;
        vnode_t *vp;
        char *buf;
        uint64_t voff;
@@ -975,7 +987,7 @@ free_guid_map_onexit(void *arg)
        guid_map_entry_t *gmep;
 
        while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
-               dsl_dataset_rele(gmep->gme_ds, ca);
+               dsl_dataset_long_rele(gmep->gme_ds, gmep);
                kmem_free(gmep, sizeof (guid_map_entry_t));
        }
        avl_destroy(ca);
@@ -1003,7 +1015,7 @@ restore_read(struct restorearg *ra, int len)
                        ra->err = EINVAL;
                ra->voff += len - done - resid;
                done = len - resid;
-               if (ra->err)
+               if (ra->err != 0)
                        return (NULL);
        }
 
@@ -1124,7 +1136,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 
        if (drro->drr_bonuslen) {
                data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
-               if (ra->err)
+               if (ra->err != 0)
                        return (ra->err);
        }
 
@@ -1133,7 +1145,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                tx = dmu_tx_create(os);
                dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
                err = dmu_tx_assign(tx, TXG_WAIT);
-               if (err) {
+               if (err != 0) {
                        dmu_tx_abort(tx);
                        return (err);
                }
@@ -1147,14 +1159,14 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen);
        }
-       if (err) {
+       if (err != 0) {
                return (EINVAL);
        }
 
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, drro->drr_object);
        err = dmu_tx_assign(tx, TXG_WAIT);
-       if (err) {
+       if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
@@ -1202,7 +1214,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
                        continue;
 
                err = dmu_free_object(os, obj);
-               if (err)
+               if (err != 0)
                        return (err);
        }
        return (0);
@@ -1232,7 +1244,7 @@ restore_write(struct restorearg *ra, objset_t *os,
        dmu_tx_hold_write(tx, drrw->drr_object,
            drrw->drr_offset, drrw->drr_length);
        err = dmu_tx_assign(tx, TXG_WAIT);
-       if (err) {
+       if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
@@ -1295,7 +1307,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
        dmu_tx_hold_write(tx, drrwbr->drr_object,
            drrwbr->drr_offset, drrwbr->drr_length);
        err = dmu_tx_assign(tx, TXG_WAIT);
-       if (err) {
+       if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
@@ -1336,7 +1348,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
        dmu_tx_hold_spill(tx, db->db_object);
 
        err = dmu_tx_assign(tx, TXG_WAIT);
-       if (err) {
+       if (err != 0) {
                dmu_buf_rele(db, FTAG);
                dmu_buf_rele(db_spill, FTAG);
                dmu_tx_abort(tx);
@@ -1375,6 +1387,16 @@ restore_free(struct restorearg *ra, objset_t *os,
        return (err);
 }
 
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+       char name[MAXNAMELEN];
+       dsl_dataset_name(drc->drc_ds, name);
+       dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+       (void) dsl_destroy_head(name);
+}
+
 /*
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
@@ -1388,52 +1410,24 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        zio_cksum_t pcksum;
        int featureflags;
 
-       if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-               ra.byteswap = TRUE;
-
-       {
-               /* compute checksum of drr_begin record */
-               dmu_replay_record_t *drr;
-               drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-
-               drr->drr_type = DRR_BEGIN;
-               drr->drr_u.drr_begin = *drc->drc_drrb;
-               if (ra.byteswap) {
-                       fletcher_4_incremental_byteswap(drr,
-                           sizeof (dmu_replay_record_t), &ra.cksum);
-               } else {
-                       fletcher_4_incremental_native(drr,
-                           sizeof (dmu_replay_record_t), &ra.cksum);
-               }
-               kmem_free(drr, sizeof (dmu_replay_record_t));
-       }
-
-       if (ra.byteswap) {
-               struct drr_begin *drrb = drc->drc_drrb;
-               drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-               drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
-               drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
-               drrb->drr_type = BSWAP_32(drrb->drr_type);
-               drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
-               drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
-       }
-
+       ra.byteswap = drc->drc_byteswap;
+       ra.cksum = drc->drc_cksum;
        ra.vp = vp;
        ra.voff = *voffp;
        ra.bufsize = 1<<20;
        ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
-       ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+       ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
            DMU_SUBSTREAM);
-       ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
+       ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
        /*
         * Open the objset we are modifying.
         */
-       VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
+       VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
 
-       ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+       ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
 
        featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
@@ -1446,7 +1440,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                        goto out;
                }
                ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
-               if (ra.err) {
+               if (ra.err != 0) {
                        cleanup_fd = -1;
                        goto out;
                }
@@ -1460,12 +1454,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                        ra.err = zfs_onexit_add_cb(minor,
                            free_guid_map_onexit, ra.guid_to_ds_map,
                            action_handlep);
-                       if (ra.err)
+                       if (ra.err != 0)
                                goto out;
                } else {
                        ra.err = zfs_onexit_cb_data(minor, *action_handlep,
                            (void **)&ra.guid_to_ds_map);
-                       if (ra.err)
+                       if (ra.err != 0)
                                goto out;
                }
 
@@ -1559,14 +1553,7 @@ out:
                 * destroy what we created, so we don't leave it in the
                 * inconsistent restoring state.
                 */
-               txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
-
-               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-                   B_FALSE);
-               if (drc->drc_real_ds != drc->drc_logical_ds) {
-                       mutex_exit(&drc->drc_logical_ds->ds_recvlock);
-                       dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
-               }
+               dmu_recv_cleanup_ds(drc);
        }
 
        vmem_free(ra.buf, ra.bufsize);
@@ -1574,142 +1561,179 @@ out:
        return (ra.err);
 }
 
-struct recvendsyncarg {
-       char *tosnap;
-       uint64_t creation_time;
-       uint64_t toguid;
-};
-
 static int
-recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       struct recvendsyncarg *resa = arg2;
+       dmu_recv_cookie_t *drc = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       int error;
+
+       ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+       if (!drc->drc_newfs) {
+               dsl_dataset_t *origin_head;
+
+               error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+               if (error != 0)
+                       return (error);
+               error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+                   origin_head, drc->drc_force);
+               if (error != 0) {
+                       dsl_dataset_rele(origin_head, FTAG);
+                       return (error);
+               }
+               error = dsl_dataset_snapshot_check_impl(origin_head,
+                   drc->drc_tosnap, tx);
+               dsl_dataset_rele(origin_head, FTAG);
+               if (error != 0)
+                       return (error);
 
-       return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+               error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+       } else {
+               error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+                   drc->drc_tosnap, tx);
+       }
+       return (error);
 }
 
 static void
-recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       struct recvendsyncarg *resa = arg2;
+       dmu_recv_cookie_t *drc = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+
+       spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+           tx, "snap=%s", drc->drc_tosnap);
+
+       if (!drc->drc_newfs) {
+               dsl_dataset_t *origin_head;
+
+               VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+                   &origin_head));
+               dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+                   origin_head, tx);
+               dsl_dataset_snapshot_sync_impl(origin_head,
+                   drc->drc_tosnap, tx);
+
+               /* set snapshot's creation time and guid */
+               dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+               origin_head->ds_prev->ds_phys->ds_creation_time =
+                   drc->drc_drrb->drr_creation_time;
+               origin_head->ds_prev->ds_phys->ds_guid =
+                   drc->drc_drrb->drr_toguid;
+               origin_head->ds_prev->ds_phys->ds_flags &=
+                   ~DS_FLAG_INCONSISTENT;
+
+               dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+               origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+               dsl_dataset_rele(origin_head, FTAG);
+               dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+       } else {
+               dsl_dataset_t *ds = drc->drc_ds;
 
-       dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
+               dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
-       /* set snapshot's creation time and guid */
-       dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-       ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
-       ds->ds_prev->ds_phys->ds_guid = resa->toguid;
-       ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+               /* set snapshot's creation time and guid */
+               dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+               ds->ds_prev->ds_phys->ds_creation_time =
+                   drc->drc_drrb->drr_creation_time;
+               ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
+               ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
 
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-       spa_history_log_internal_ds(ds, "finished receiving", tx, "");
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+       }
+       drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
+       /*
+        * Release the hold from dmu_recv_begin.  This must be done before
+        * we return to open context, so that when we free the dataset's dnode,
+        * we can evict its bonus buffer.
+        */
+       dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+       drc->drc_ds = NULL;
 }
 
 static int
-add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
 {
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
+       dsl_pool_t *dp;
        dsl_dataset_t *snapds;
        guid_map_entry_t *gmep;
        int err;
 
        ASSERT(guid_map != NULL);
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
+       err = dsl_pool_hold(name, FTAG, &dp);
+       if (err != 0)
+               return (err);
+       err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snapds);
        if (err == 0) {
                gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
                gmep->guid = snapds->ds_phys->ds_guid;
                gmep->gme_ds = snapds;
                avl_add(guid_map, gmep);
+               dsl_dataset_long_hold(snapds, gmep);
+               dsl_dataset_rele(snapds, FTAG);
        }
 
-       rw_exit(&dp->dp_config_rwlock);
+       dsl_pool_rele(dp, FTAG);
        return (err);
 }
 
+static int dmu_recv_end_modified_blocks = 3;
+
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
-       struct recvendsyncarg resa;
-       dsl_dataset_t *ds = drc->drc_logical_ds;
-       int err, myerr;
-
-       if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
-               err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
-                   drc->drc_force);
-               if (err)
-                       goto out;
-       } else {
-               mutex_exit(&ds->ds_recvlock);
-               dsl_dataset_rele(ds, dmu_recv_tag);
-               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-                   B_FALSE);
-               return (EBUSY);
-       }
+       int error;
 
-       resa.creation_time = drc->drc_drrb->drr_creation_time;
-       resa.toguid = drc->drc_drrb->drr_toguid;
-       resa.tosnap = drc->drc_tosnap;
+#ifdef _KERNEL
+       char *name;
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           recv_end_check, recv_end_sync, ds, &resa, 3);
-       if (err) {
-               /* swap back */
-               (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
-       }
+       /*
+        * We will be destroying the ds; make sure its origin is unmounted if
+        * necessary.
+        */
+       name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       dsl_dataset_name(drc->drc_ds, name);
+       zfs_destroy_unmount_origin(name);
+       kmem_free(name, MAXNAMELEN);
+#endif
 
-out:
-       mutex_exit(&ds->ds_recvlock);
-       if (err == 0 && drc->drc_guid_to_ds_map != NULL)
-               (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
-       dsl_dataset_disown(ds, dmu_recv_tag);
-       myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
-       ASSERT0(myerr);
-       return (err);
+       error = dsl_sync_task(drc->drc_tofs,
+           dmu_recv_end_check, dmu_recv_end_sync, drc,
+           dmu_recv_end_modified_blocks);
+
+       if (error != 0)
+               dmu_recv_cleanup_ds(drc);
+       return (error);
 }
 
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
-       struct recvendsyncarg resa;
-       dsl_dataset_t *ds = drc->drc_logical_ds;
-       int err;
-
-       /*
-        * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-        * expects it to have a ds_user_ptr (and zil), but clone_swap()
-        * can close it.
-        */
-       txg_wait_synced(ds->ds_dir->dd_pool, 0);
+       int error;
 
-       resa.creation_time = drc->drc_drrb->drr_creation_time;
-       resa.toguid = drc->drc_drrb->drr_toguid;
-       resa.tosnap = drc->drc_tosnap;
+       error = dsl_sync_task(drc->drc_tofs,
+           dmu_recv_end_check, dmu_recv_end_sync, drc,
+           dmu_recv_end_modified_blocks);
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           recv_end_check, recv_end_sync, ds, &resa, 3);
-       if (err) {
-               /* clean up the fs we just recv'd into */
-               (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
-       } else {
-               if (drc->drc_guid_to_ds_map != NULL)
-                       (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
-               /* release the hold from dmu_recv_begin */
-               dsl_dataset_disown(ds, dmu_recv_tag);
+       if (error != 0) {
+               dmu_recv_cleanup_ds(drc);
+       } else if (drc->drc_guid_to_ds_map != NULL) {
+               (void) add_ds_to_guidmap(drc->drc_tofs,
+                   drc->drc_guid_to_ds_map,
+                   drc->drc_newsnapobj);
        }
-       return (err);
+       return (error);
 }
 
 int
 dmu_recv_end(dmu_recv_cookie_t *drc)
 {
-       if (drc->drc_logical_ds != drc->drc_real_ds)
-               return (dmu_recv_existing_end(drc));
-       else
+       if (drc->drc_newfs)
                return (dmu_recv_new_end(drc));
+       else
+               return (dmu_recv_existing_end(drc));
 }
index 1c39723714379b2596c233f9b1ec6f0b192df70f..32b3e50fc005a079e28e56dc77a585e4e8db7d83 100644 (file)
@@ -265,7 +265,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err)
+               if (err != 0)
                        return (err);
                cbp = buf->b_data;
 
@@ -282,7 +282,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                            zb->zb_level - 1,
                            zb->zb_blkid * epb + i);
                        err = traverse_visitbp(td, dnp, &cbp[i], &czb);
-                       if (err) {
+                       if (err != 0) {
                                if (!hard)
                                        break;
                                lasterr = err;
@@ -295,7 +295,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err)
+               if (err != 0)
                        return (err);
                dnp = buf->b_data;
 
@@ -308,7 +308,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                for (i = 0; i < epb; i++) {
                        err = traverse_dnode(td, &dnp[i], zb->zb_objset,
                            zb->zb_blkid * epb + i);
-                       if (err) {
+                       if (err != 0) {
                                if (!hard)
                                        break;
                                lasterr = err;
@@ -321,7 +321,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err)
+               if (err != 0)
                        return (err);
 
                osp = buf->b_data;
@@ -405,7 +405,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
        for (j = 0; j < dnp->dn_nblkptr; j++) {
                SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
                err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
-               if (err) {
+               if (err != 0) {
                        if (!hard)
                                break;
                        lasterr = err;
@@ -415,7 +415,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
                SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
                err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
-               if (err) {
+               if (err != 0) {
                        if (!hard)
                                return (err);
                        lasterr = err;
@@ -518,14 +518,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
        cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
 
        /* See comment on ZIL traversal in dsl_scan_visitds. */
-       if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
-               objset_t *os;
+       if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
+               uint32_t flags = ARC_WAIT;
+               objset_phys_t *osp;
+               arc_buf_t *buf;
 
-               err = dmu_objset_from_ds(ds, &os);
-               if (err)
+               err = arc_read(NULL, td->td_spa, rootbp,
+                   arc_getbuf_func, &buf,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+               if (err != 0)
                        return (err);
 
-               traverse_zil(td, &os->os_zil_header);
+               osp = buf->b_data;
+               traverse_zil(td, &osp->os_zil_header);
+               (void) arc_buf_remove_ref(buf, &buf);
        }
 
        if (!(flags & TRAVERSE_PREFETCH_DATA) ||
@@ -591,7 +597,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
        /* visit the MOS */
        err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
            txg_start, NULL, flags, func, arg);
-       if (err)
+       if (err != 0)
                return (err);
 
        /* visit each dataset */
@@ -600,7 +606,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                dmu_object_info_t doi;
 
                err = dmu_object_info(mos, obj, &doi);
-               if (err) {
+               if (err != 0) {
                        if (!hard)
                                return (err);
                        lasterr = err;
@@ -611,10 +617,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                        dsl_dataset_t *ds;
                        uint64_t txg = txg_start;
 
-                       rw_enter(&dp->dp_config_rwlock, RW_READER);
+                       dsl_pool_config_enter(dp, FTAG);
                        err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
-                       rw_exit(&dp->dp_config_rwlock);
-                       if (err) {
+                       dsl_pool_config_exit(dp, FTAG);
+                       if (err != 0) {
                                if (!hard)
                                        return (err);
                                lasterr = err;
@@ -624,7 +630,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                                txg = ds->ds_phys->ds_prev_snap_txg;
                        err = traverse_dataset(ds, txg, flags, func, arg);
                        dsl_dataset_rele(ds, FTAG);
-                       if (err) {
+                       if (err != 0) {
                                if (!hard)
                                        return (err);
                                lasterr = err;
index 30867f9d76cbbacd0c5d68573f81403df4c02da5..3e46a02f8e573d436d716f944bae4256c6eab298 100644 (file)
@@ -917,7 +917,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 #endif
 
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
        dmu_tx_hold_t *txh;
        spa_t *spa = tx->tx_pool->dp_spa;
@@ -985,15 +985,6 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
                fudge += txh->txh_fudge;
        }
 
-       /*
-        * NB: This check must be after we've held the dnodes, so that
-        * the dmu_tx_unassign() logic will work properly
-        */
-       if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) {
-               DMU_TX_STAT_BUMP(dmu_tx_how);
-               return (ERESTART);
-       }
-
        /*
         * If a snapshot has been taken since we made our estimates,
         * assume that we won't be able to free or overwrite anything.
@@ -1076,29 +1067,28 @@ dmu_tx_unassign(dmu_tx_t *tx)
  *
  * (1) TXG_WAIT.  If the current open txg is full, waits until there's
  *     a new one.  This should be used when you're not holding locks.
- *     If will only fail if we're truly out of space (or over quota).
+ *     It will only fail if we're truly out of space (or over quota).
  *
  * (2) TXG_NOWAIT.  If we can't assign into the current open txg without
  *     blocking, returns immediately with ERESTART.  This should be used
  *     whenever you're holding locks.  On an ERESTART error, the caller
  *     should drop locks, do a dmu_tx_wait(tx), and try again.
- *
- * (3) A specific txg.  Use this if you need to ensure that multiple
- *     transactions all sync in the same txg.  Like TXG_NOWAIT, it
- *     returns ERESTART if it can't assign you into the requested txg.
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
        hrtime_t before, after;
        int err;
 
        ASSERT(tx->tx_txg == 0);
-       ASSERT(txg_how != 0);
+       ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
        ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
        before = gethrtime();
 
+       /* If we might wait, we must not hold the config lock. */
+       ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+
        while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
                dmu_tx_unassign(tx);
 
@@ -1124,6 +1114,7 @@ dmu_tx_wait(dmu_tx_t *tx)
        spa_t *spa = tx->tx_pool->dp_spa;
 
        ASSERT(tx->tx_txg == 0);
+       ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
        /*
         * It's possible that the pool has become active after this thread
@@ -1250,6 +1241,14 @@ dmu_tx_get_txg(dmu_tx_t *tx)
        return (tx->tx_txg);
 }
 
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+       ASSERT(tx->tx_pool != NULL);
+       return (tx->tx_pool);
+}
+
+
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
index d8d66513d52853d57f1ed5d78ca83434d985c70b..d88134d72b01a14a1e27fded8d5c95ec3e711f2c 100644 (file)
@@ -74,7 +74,11 @@ dnode_cons(void *arg, void *unused, int kmflag)
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
-       refcount_create(&dn->dn_holds);
+       /*
+        * Every dbuf has a reference, and dropping a tracked reference is
+        * O(number of references), so don't track dn_holds.
+        */
+       refcount_create_untracked(&dn->dn_holds);
        refcount_create(&dn->dn_tx_holds);
        list_link_init(&dn->dn_link);
 
index 76e603753cae747c78d5dddab94594dd9e8f9ab2..a1c71d4870a6a14b5a45f372366e64fe79aec97b 100644 (file)
@@ -481,6 +481,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
        dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
        dnode_evict_dbufs(dn);
        ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT3P(dn->dn_bonus, ==, NULL);
 
        /*
         * XXX - It would be nice to assert this, but we may still
index 2eca2b2044e2adb8dbb3b66bf4139a3301fab0c0..5c0ca4d96225b80c1e89a5773c3e40e5f4574ebe 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
-
-static char *dsl_reaper = "the grim reaper";
-
-static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
-static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
 
 #define        SWITCH64(x, y) \
        { \
@@ -63,9 +59,6 @@ static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
 #define        DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
 
-#define        DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
-
-
 /*
  * Figure out how much of this delta should be propogated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
@@ -256,7 +249,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 {
        dsl_dataset_t *ds = dsv;
 
-       ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
+       ASSERT(ds->ds_owner == NULL);
 
        unique_remove(ds->ds_fsid_guid);
 
@@ -264,32 +257,26 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
                dmu_objset_evict(ds->ds_objset);
 
        if (ds->ds_prev) {
-               dsl_dataset_drop_ref(ds->ds_prev, ds);
+               dsl_dataset_rele(ds->ds_prev, ds);
                ds->ds_prev = NULL;
        }
 
        bplist_destroy(&ds->ds_pending_deadlist);
-       if (db != NULL) {
+       if (ds->ds_phys->ds_deadlist_obj != 0)
                dsl_deadlist_close(&ds->ds_deadlist);
-       } else {
-               ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
-               ASSERT(!ds->ds_deadlist.dl_oldfmt);
-       }
        if (ds->ds_dir)
-               dsl_dir_close(ds->ds_dir, ds);
+               dsl_dir_rele(ds->ds_dir, ds);
 
        ASSERT(!list_link_active(&ds->ds_synced_link));
 
        mutex_destroy(&ds->ds_lock);
-       mutex_destroy(&ds->ds_recvlock);
        mutex_destroy(&ds->ds_opening_lock);
-       rw_destroy(&ds->ds_rwlock);
-       cv_destroy(&ds->ds_exclusive_cv);
+       refcount_destroy(&ds->ds_longholds);
 
        kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
-static int
+int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
        dsl_dataset_phys_t *headphys;
@@ -305,7 +292,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
 
        err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
            FTAG, &headdbuf);
-       if (err)
+       if (err != 0)
                return (err);
        headphys = headdbuf->db_data;
        err = zap_value_search(dp->dp_meta_objset,
@@ -334,8 +321,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
        return (err);
 }
 
-static int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
 {
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
        uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -355,8 +342,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
        return (err);
 }
 
-static int
-dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
        objset_t *mos = dp->dp_meta_objset;
@@ -365,11 +352,10 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
        int err;
        dmu_object_info_t doi;
 
-       ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-           dsl_pool_sync_context(dp));
+       ASSERT(dsl_pool_config_held(dp));
 
        err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
-       if (err)
+       if (err != 0)
                return (err);
 
        /* Make sure dsobj has the correct object type. */
@@ -388,12 +374,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                list_link_init(&ds->ds_synced_link);
 
                mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
-               mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
-
-               rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL);
-               cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
+               refcount_create(&ds->ds_longholds);
 
                bplist_create(&ds->ds_pending_deadlist);
                dsl_deadlist_open(&ds->ds_deadlist,
@@ -403,15 +386,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                    offsetof(dmu_sendarg_t, dsa_link));
 
                if (err == 0) {
-                       err = dsl_dir_open_obj(dp,
+                       err = dsl_dir_hold_obj(dp,
                            ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
                }
-               if (err) {
+               if (err != 0) {
                        mutex_destroy(&ds->ds_lock);
-                       mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
-                       rw_destroy(&ds->ds_rwlock);
-                       cv_destroy(&ds->ds_exclusive_cv);
+                       refcount_destroy(&ds->ds_longholds);
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
                        kmem_free(ds, sizeof (dsl_dataset_t));
@@ -421,8 +402,8 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 
                if (!dsl_dataset_is_snapshot(ds)) {
                        ds->ds_snapname[0] = '\0';
-                       if (ds->ds_phys->ds_prev_snap_obj) {
-                               err = dsl_dataset_get_ref(dp,
+                       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+                               err = dsl_dataset_hold_obj(dp,
                                    ds->ds_phys->ds_prev_snap_obj,
                                    ds, &ds->ds_prev);
                        }
@@ -438,29 +419,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                }
 
                if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
-                       /*
-                        * In sync context, we're called with either no lock
-                        * or with the write lock.  If we're not syncing,
-                        * we're always called with the read lock held.
-                        */
-                       boolean_t need_lock =
-                           !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
-                           dsl_pool_sync_context(dp);
-
-                       if (need_lock)
-                               rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-                       err = dsl_prop_get_ds(ds,
-                           "refreservation", sizeof (uint64_t), 1,
-                           &ds->ds_reserved, NULL);
+                       err = dsl_prop_get_int_ds(ds,
+                           zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+                           &ds->ds_reserved);
                        if (err == 0) {
-                               err = dsl_prop_get_ds(ds,
-                                   "refquota", sizeof (uint64_t), 1,
-                                   &ds->ds_quota, NULL);
+                               err = dsl_prop_get_int_ds(ds,
+                                   zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+                                   &ds->ds_quota);
                        }
-
-                       if (need_lock)
-                               rw_exit(&dp->dp_config_rwlock);
                } else {
                        ds->ds_reserved = ds->ds_quota = 0;
                }
@@ -473,15 +439,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
                        if (ds->ds_prev)
-                               dsl_dataset_drop_ref(ds->ds_prev, ds);
-                       dsl_dir_close(ds->ds_dir, ds);
+                               dsl_dataset_rele(ds->ds_prev, ds);
+                       dsl_dir_rele(ds->ds_dir, ds);
                        mutex_destroy(&ds->ds_lock);
-                       mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
-                       rw_destroy(&ds->ds_rwlock);
-                       cv_destroy(&ds->ds_exclusive_cv);
+                       refcount_destroy(&ds->ds_longholds);
                        kmem_free(ds, sizeof (dsl_dataset_t));
-                       if (err) {
+                       if (err != 0) {
                                dmu_buf_rele(dbuf, tag);
                                return (err);
                        }
@@ -496,170 +460,118 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
        ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
            spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
            dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
-       mutex_enter(&ds->ds_lock);
-       if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
-               mutex_exit(&ds->ds_lock);
-               dmu_buf_rele(ds->ds_dbuf, tag);
-               return (ENOENT);
-       }
-       mutex_exit(&ds->ds_lock);
        *dsp = ds;
        return (0);
 }
 
-static int
-dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
-{
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-       /*
-        * In syncing context we don't want the rwlock lock: there
-        * may be an existing writer waiting for sync phase to
-        * finish.  We don't need to worry about such writers, since
-        * sync phase is single-threaded, so the writer can't be
-        * doing anything while we are active.
-        */
-       if (dsl_pool_sync_context(dp)) {
-               ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
-               return (0);
-       }
-
-       /*
-        * Normal users will hold the ds_rwlock as a READER until they
-        * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
-        * drop their READER lock after they set the ds_owner field.
-        *
-        * If the dataset is being destroyed, the destroy thread will
-        * obtain a WRITER lock for exclusive access after it's done its
-        * open-context work and then change the ds_owner to
-        * dsl_reaper once destruction is assured.  So threads
-        * may block here temporarily, until the "destructability" of
-        * the dataset is determined.
-        */
-       ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
-       mutex_enter(&ds->ds_lock);
-       while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
-               rw_exit(&dp->dp_config_rwlock);
-               cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
-               if (DSL_DATASET_IS_DESTROYED(ds)) {
-                       mutex_exit(&ds->ds_lock);
-                       dsl_dataset_drop_ref(ds, tag);
-                       rw_enter(&dp->dp_config_rwlock, RW_READER);
-                       return (ENOENT);
-               }
-               /*
-                * The dp_config_rwlock lives above the ds_lock. And
-                * we need to check DSL_DATASET_IS_DESTROYED() while
-                * holding the ds_lock, so we have to drop and reacquire
-                * the ds_lock here.
-                */
-               mutex_exit(&ds->ds_lock);
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
-               mutex_enter(&ds->ds_lock);
-       }
-       mutex_exit(&ds->ds_lock);
-       return (0);
-}
-
-int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
-    dsl_dataset_t **dsp)
-{
-       int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
-
-       if (err)
-               return (err);
-       return (dsl_dataset_hold_ref(*dsp, tag));
-}
-
 int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+dsl_dataset_hold(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
-{
-       int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
-       if (err)
-               return (err);
-       if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
-               dsl_dataset_rele(*dsp, tag);
-               *dsp = NULL;
-               return (EBUSY);
-       }
-       return (0);
-}
-
-int
-dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
 {
        dsl_dir_t *dd;
-       dsl_pool_t *dp;
        const char *snapname;
        uint64_t obj;
        int err = 0;
 
-       err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
-       if (err)
+       err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+       if (err != 0)
                return (err);
 
-       dp = dd->dd_pool;
+       ASSERT(dsl_pool_config_held(dp));
        obj = dd->dd_phys->dd_head_dataset_obj;
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       if (obj)
-               err = dsl_dataset_get_ref(dp, obj, tag, dsp);
+       if (obj != 0)
+               err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
        else
                err = ENOENT;
-       if (err)
-               goto out;
-
-       err = dsl_dataset_hold_ref(*dsp, tag);
 
        /* we may be looking for a snapshot */
        if (err == 0 && snapname != NULL) {
-               dsl_dataset_t *ds = NULL;
+               dsl_dataset_t *ds;
 
                if (*snapname++ != '@') {
                        dsl_dataset_rele(*dsp, tag);
-                       err = ENOENT;
-                       goto out;
+                       dsl_dir_rele(dd, FTAG);
+                       return (ENOENT);
                }
 
                dprintf("looking for snapshot '%s'\n", snapname);
                err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
                if (err == 0)
-                       err = dsl_dataset_get_ref(dp, obj, tag, &ds);
+                       err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
                dsl_dataset_rele(*dsp, tag);
 
-               ASSERT3U((err == 0), ==, (ds != NULL));
-
-               if (ds) {
+               if (err == 0) {
                        mutex_enter(&ds->ds_lock);
                        if (ds->ds_snapname[0] == 0)
                                (void) strlcpy(ds->ds_snapname, snapname,
                                    sizeof (ds->ds_snapname));
                        mutex_exit(&ds->ds_lock);
-                       err = dsl_dataset_hold_ref(ds, tag);
-                       *dsp = err ? NULL : ds;
+                       *dsp = ds;
                }
        }
-out:
-       rw_exit(&dp->dp_config_rwlock);
-       dsl_dir_close(dd, FTAG);
+
+       dsl_dir_rele(dd, FTAG);
        return (err);
 }
 
 int
-dsl_dataset_own(const char *name, boolean_t inconsistentok,
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **dsp)
 {
-       int err = dsl_dataset_hold(name, tag, dsp);
-       if (err)
+       int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+       if (err != 0)
+               return (err);
+       if (!dsl_dataset_tryown(*dsp, tag)) {
+               dsl_dataset_rele(*dsp, tag);
+               *dsp = NULL;
+               return (EBUSY);
+       }
+       return (0);
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name,
+    void *tag, dsl_dataset_t **dsp)
+{
+       int err = dsl_dataset_hold(dp, name, tag, dsp);
+       if (err != 0)
                return (err);
-       if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+       if (!dsl_dataset_tryown(*dsp, tag)) {
                dsl_dataset_rele(*dsp, tag);
                return (EBUSY);
        }
        return (0);
 }
 
+/*
+ * See the comment above dsl_pool_hold() for details.  In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called.  After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+       (void) refcount_add(&ds->ds_longholds, tag);
+}
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+       (void) refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+       return (!refcount_is_zero(&ds->ds_longholds));
+}
+
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
@@ -667,7 +579,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
                (void) strcpy(name, "mos");
        } else {
                dsl_dir_name(ds->ds_dir, name);
-               VERIFY(0 == dsl_dataset_get_snapname(ds));
+               VERIFY0(dsl_dataset_get_snapname(ds));
                if (ds->ds_snapname[0]) {
                        (void) strcat(name, "@");
                        /*
@@ -685,90 +597,42 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
        }
 }
 
-static int
-dsl_dataset_namelen(dsl_dataset_t *ds)
-{
-       int result;
-
-       if (ds == NULL) {
-               result = 3;     /* "mos" */
-       } else {
-               result = dsl_dir_namelen(ds->ds_dir);
-               VERIFY(0 == dsl_dataset_get_snapname(ds));
-               if (ds->ds_snapname[0]) {
-                       ++result;       /* adding one for the @-sign */
-                       if (!MUTEX_HELD(&ds->ds_lock)) {
-                               mutex_enter(&ds->ds_lock);
-                               result += strlen(ds->ds_snapname);
-                               mutex_exit(&ds->ds_lock);
-                       } else {
-                               result += strlen(ds->ds_snapname);
-                       }
-               }
-       }
-
-       return (result);
-}
-
-void
-dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
-{
-       dmu_buf_rele(ds->ds_dbuf, tag);
-}
-
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
-       if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
-               rw_exit(&ds->ds_rwlock);
-       }
-       dsl_dataset_drop_ref(ds, tag);
+       dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-       ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
-           (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
+       ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
 
        mutex_enter(&ds->ds_lock);
        ds->ds_owner = NULL;
-       if (RW_WRITE_HELD(&ds->ds_rwlock)) {
-               rw_exit(&ds->ds_rwlock);
-               cv_broadcast(&ds->ds_exclusive_cv);
-       }
        mutex_exit(&ds->ds_lock);
-       if (ds->ds_dbuf)
-               dsl_dataset_drop_ref(ds, tag);
+       dsl_dataset_long_rele(ds, tag);
+       if (ds->ds_dbuf != NULL)
+               dsl_dataset_rele(ds, tag);
        else
                dsl_dataset_evict(NULL, ds);
 }
 
 boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
 {
        boolean_t gotit = FALSE;
 
        mutex_enter(&ds->ds_lock);
-       if (ds->ds_owner == NULL &&
-           (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
+       if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
                ds->ds_owner = tag;
-               if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
-                       rw_exit(&ds->ds_rwlock);
+               dsl_dataset_long_hold(ds, tag);
                gotit = TRUE;
        }
        mutex_exit(&ds->ds_lock);
        return (gotit);
 }
 
-void
-dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
-{
-       ASSERT3P(owner, ==, ds->ds_owner);
-       if (!RW_WRITE_HELD(&ds->ds_rwlock))
-               rw_enter(&ds->ds_rwlock, RW_WRITER);
-}
-
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
@@ -789,7 +653,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-       VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+       VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
@@ -807,7 +671,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
        if (origin == NULL) {
                dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
        } else {
-               dsl_dataset_t *ohds;
+               dsl_dataset_t *ohds; /* head of the origin snapshot */
 
                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
@@ -824,7 +688,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                origin->ds_phys->ds_num_children++;
 
-               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+               VERIFY0(dsl_dataset_hold_obj(dp,
                    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
                dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
                    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
@@ -836,9 +700,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                                    zap_create(mos,
                                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
                        }
-                       VERIFY(0 == zap_add_int(mos,
-                           origin->ds_phys->ds_next_clones_obj,
-                           dsobj, tx));
+                       VERIFY0(zap_add_int(mos,
+                           origin->ds_phys->ds_next_clones_obj, dsobj, tx));
                }
 
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -850,7 +713,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                                    zap_create(mos,
                                    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                        }
-                       VERIFY3U(0, ==, zap_add_int(mos,
+                       VERIFY0(zap_add_int(mos,
                            origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
                }
        }
@@ -866,6 +729,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
        return (dsobj);
 }
 
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       objset_t *os;
+
+       VERIFY0(dmu_objset_from_ds(ds, &os));
+       bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+       dsl_dataset_dirty(ds, tx);
+}
+
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
@@ -874,29 +747,28 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
        uint64_t dsobj, ddobj;
        dsl_dir_t *dd;
 
+       ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(lastname[0] != '@');
 
        ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
-       VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+       VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
 
-       dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
+       dsobj = dsl_dataset_create_sync_dd(dd, origin,
+           flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
        dsl_deleg_set_create_perms(dd, tx, cr);
 
-       dsl_dir_close(dd, FTAG);
+       dsl_dir_rele(dd, FTAG);
 
        /*
         * If we are creating a clone, make sure we zero out any stale
         * data from the origin snapshots zil header.
         */
-       if (origin != NULL) {
+       if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
                dsl_dataset_t *ds;
-               objset_t *os;
 
-               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-               VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
-               bzero(&os->os_zil_header, sizeof (os->os_zil_header));
-               dsl_dataset_dirty(ds, tx);
+               VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+               dsl_dataset_zero_zil(ds, tx);
                dsl_dataset_rele(ds, FTAG);
        }
 
@@ -904,304 +776,97 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 }
 
 /*
- * The snapshots must all be in the same pool.
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
  */
-int
-dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
-    nvlist_t *errlist)
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
-       int err;
-       dsl_sync_task_t *dst;
-       spa_t *spa;
-       nvpair_t *pair;
-       dsl_sync_task_group_t *dstg;
-
-       pair = nvlist_next_nvpair(snaps, NULL);
-       if (pair == NULL)
-               return (0);
-
-       err = spa_open(nvpair_name(pair), &spa, FTAG);
-       if (err)
-               return (err);
-       dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
-       for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
-           pair = nvlist_next_nvpair(snaps, pair)) {
-               dsl_dataset_t *ds;
-
-               err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
-               if (err == 0) {
-                       struct dsl_ds_destroyarg *dsda;
-
-                       dsl_dataset_make_exclusive(ds, dstg);
-                       dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
-                           KM_SLEEP);
-                       dsda->ds = ds;
-                       dsda->defer = defer;
-                       dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-                           dsl_dataset_destroy_sync, dsda, dstg, 0);
-               } else if (err == ENOENT) {
-                       err = 0;
-               } else {
-                       fnvlist_add_int32(errlist, nvpair_name(pair), err);
-                       break;
-               }
-       }
+       uint64_t mrs_used;
+       uint64_t dlused, dlcomp, dluncomp;
 
-       if (err == 0)
-               err = dsl_sync_task_group_wait(dstg);
+       ASSERT(!dsl_dataset_is_snapshot(ds));
 
-       for (dst = list_head(&dstg->dstg_tasks); dst;
-           dst = list_next(&dstg->dstg_tasks, dst)) {
-               struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
-               dsl_dataset_t *ds = dsda->ds;
+       if (ds->ds_phys->ds_prev_snap_obj != 0)
+               mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+       else
+               mrs_used = 0;
 
-               /*
-                * Return the snapshots that triggered the error.
-                */
-               if (dst->dst_err != 0) {
-                       char name[ZFS_MAXNAMELEN];
-                       dsl_dataset_name(ds, name);
-                       fnvlist_add_int32(errlist, name, dst->dst_err);
-               }
-               ASSERT3P(dsda->rm_origin, ==, NULL);
-               dsl_dataset_disown(ds, dstg);
-               kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
-       }
+       dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
-       dsl_sync_task_group_destroy(dstg);
-       spa_close(spa, FTAG);
-       return (err);
+       ASSERT3U(dlused, <=, mrs_used);
+       ds->ds_phys->ds_unique_bytes =
+           ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
 
+       if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+           SPA_VERSION_UNIQUE_ACCURATE)
+               ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
-static boolean_t
-dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx)
 {
-       boolean_t might_destroy = B_FALSE;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       int err;
+       ASSERTV(uint64_t count);
+
+       ASSERT(ds->ds_phys->ds_num_children >= 2);
+       err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+       /*
+        * The err should not be ENOENT, but a bug in a previous version
+        * of the code could cause upgrade_clones_cb() to not set
+        * ds_next_snap_obj when it should, leading to a missing entry.
+        * If we knew that the pool was created after
+        * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+        * ENOENT.  However, at least we can check that we don't have
+        * too many entries in the next_clones_obj even after failing to
+        * remove this one.
+        */
+       if (err != ENOENT)
+               VERIFY0(err);
+       ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+           &count));
+       ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+}
 
-       mutex_enter(&ds->ds_lock);
-       if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
-           DS_IS_DEFER_DESTROY(ds))
-               might_destroy = B_TRUE;
-       mutex_exit(&ds->ds_lock);
 
-       return (might_destroy);
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+       return (&ds->ds_phys->ds_bp);
 }
 
-/*
- * If we're removing a clone, and these three conditions are true:
- *     1) the clone's origin has no other children
- *     2) the clone's origin has no user references
- *     3) the clone's origin has been marked for deferred destruction
- * Then, prepare to remove the origin as part of this sync task group.
- */
-static int
-dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = dsda->ds;
-       dsl_dataset_t *origin = ds->ds_prev;
-
-       if (dsl_dataset_might_destroy_origin(origin)) {
-               char *name;
-               int namelen;
-               int error;
-
-               namelen = dsl_dataset_namelen(origin) + 1;
-               name = kmem_alloc(namelen, KM_SLEEP);
-               dsl_dataset_name(origin, name);
-#ifdef _KERNEL
-               error = zfs_unmount_snap(name, NULL);
-               if (error) {
-                       kmem_free(name, namelen);
-                       return (error);
-               }
-#endif
-               error = dsl_dataset_own(name, B_TRUE, tag, &origin);
-               kmem_free(name, namelen);
-               if (error)
-                       return (error);
-               dsda->rm_origin = origin;
-               dsl_dataset_make_exclusive(origin, tag);
+       ASSERT(dmu_tx_is_syncing(tx));
+       /* If it's the meta-objset, set dp_meta_rootbp */
+       if (ds == NULL) {
+               tx->tx_pool->dp_meta_rootbp = *bp;
+       } else {
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_phys->ds_bp = *bp;
        }
+}
 
-       return (0);
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+       return (ds->ds_dir->dd_pool->dp_spa);
 }
 
-/*
- * ds must be opened as OWNER.  On return (whether successful or not),
- * ds will be closed and caller can no longer dereference it.
- */
-int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
-       int err;
-       dsl_sync_task_group_t *dstg;
-       objset_t *os;
-       dsl_dir_t *dd;
-       uint64_t obj;
-       struct dsl_ds_destroyarg dsda = { 0 };
+       dsl_pool_t *dp;
 
-       dsda.ds = ds;
-
-       if (dsl_dataset_is_snapshot(ds)) {
-               /* Destroying a snapshot is simpler */
-               dsl_dataset_make_exclusive(ds, tag);
-
-               dsda.defer = defer;
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
-                   &dsda, tag, 0);
-               ASSERT3P(dsda.rm_origin, ==, NULL);
-               goto out;
-       } else if (defer) {
-               err = EINVAL;
-               goto out;
-       }
-
-       dd = ds->ds_dir;
-
-       if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
-           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-               /*
-                * Check for errors and mark this ds as inconsistent, in
-                * case we crash while freeing the objects.
-                */
-               err = dsl_sync_task_do(dd->dd_pool,
-                   dsl_dataset_destroy_begin_check,
-                   dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-               if (err)
-                       goto out;
-
-               err = dmu_objset_from_ds(ds, &os);
-               if (err)
-                       goto out;
-
-               /*
-                * Remove all objects while in the open context so that
-                * there is less work to do in the syncing context.
-                */
-               for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
-                   ds->ds_phys->ds_prev_snap_txg)) {
-                       /*
-                        * Ignore errors, if there is not enough disk space
-                        * we will deal with it in dsl_dataset_destroy_sync().
-                        */
-                       (void) dmu_free_object(os, obj);
-               }
-               if (err != ESRCH)
-                       goto out;
-
-               /*
-                * Sync out all in-flight IO.
-                */
-               txg_wait_synced(dd->dd_pool, 0);
-
-               /*
-                * If we managed to free all the objects in open
-                * context, the user space accounting should be zero.
-                */
-               if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-                   dmu_objset_userused_enabled(os)) {
-                       ASSERTV(uint64_t count);
-
-                       ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
-                           &count) != 0 || count == 0);
-                       ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
-                           &count) != 0 || count == 0);
-               }
-       }
-
-       rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-       err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
-       rw_exit(&dd->dd_pool->dp_config_rwlock);
-
-       if (err)
-               goto out;
-
-       /*
-        * Blow away the dsl_dir + head dataset.
-        */
-       dsl_dataset_make_exclusive(ds, tag);
-       /*
-        * If we're removing a clone, we might also need to remove its
-        * origin.
-        */
-       do {
-               dsda.need_prep = B_FALSE;
-               if (dsl_dir_is_clone(dd)) {
-                       err = dsl_dataset_origin_rm_prep(&dsda, tag);
-                       if (err) {
-                               dsl_dir_close(dd, FTAG);
-                               goto out;
-                       }
-               }
-
-               dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
-               dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-                   dsl_dataset_destroy_sync, &dsda, tag, 0);
-               dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-                   dsl_dir_destroy_sync, dd, FTAG, 0);
-               err = dsl_sync_task_group_wait(dstg);
-               dsl_sync_task_group_destroy(dstg);
-
-               /*
-                * We could be racing against 'zfs release' or 'zfs destroy -d'
-                * on the origin snap, in which case we can get EBUSY if we
-                * needed to destroy the origin snap but were not ready to
-                * do so.
-                */
-               if (dsda.need_prep) {
-                       ASSERT(err == EBUSY);
-                       ASSERT(dsl_dir_is_clone(dd));
-                       ASSERT(dsda.rm_origin == NULL);
-               }
-       } while (dsda.need_prep);
-
-       if (dsda.rm_origin != NULL)
-               dsl_dataset_disown(dsda.rm_origin, tag);
-
-       /* if it is successful, dsl_dir_destroy_sync will close the dd */
-       if (err)
-               dsl_dir_close(dd, FTAG);
-
-out:
-       dsl_dataset_disown(ds, tag);
-       return (err);
-}
-
-blkptr_t *
-dsl_dataset_get_blkptr(dsl_dataset_t *ds)
-{
-       return (&ds->ds_phys->ds_bp);
-}
-
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
-       ASSERT(dmu_tx_is_syncing(tx));
-       /* If it's the meta-objset, set dp_meta_rootbp */
-       if (ds == NULL) {
-               tx->tx_pool->dp_meta_rootbp = *bp;
-       } else {
-               dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_bp = *bp;
-       }
-}
-
-spa_t *
-dsl_dataset_get_spa(dsl_dataset_t *ds)
-{
-       return (ds->ds_dir->dd_pool->dp_spa);
-}
-
-void
-dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp;
-
-       if (ds == NULL) /* this is the meta-objset */
-               return;
+       if (ds == NULL) /* this is the meta-objset */
+               return;
 
        ASSERT(ds->ds_objset != NULL);
 
@@ -1210,7 +875,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 
        dp = ds->ds_dir->dd_pool;
 
-       if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+       if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
                /* up the hold count until we can be written out */
                dmu_buf_add_ref(ds->ds_dbuf, ds);
        }
@@ -1229,850 +894,145 @@ dsl_dataset_is_dirty(dsl_dataset_t *ds)
        return (B_FALSE);
 }
 
-/*
- * The unique space in the head dataset can be calculated by subtracting
- * the space used in the most recent snapshot, that is still being used
- * in this file system, from the space currently in use.  To figure out
- * the space in the most recent snapshot still in use, we need to take
- * the total space used in the snapshot and subtract out the space that
- * has been freed up since the snapshot was taken.
- */
-static void
-dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
-{
-       uint64_t mrs_used;
-       uint64_t dlused, dlcomp, dluncomp;
-
-       ASSERT(!dsl_dataset_is_snapshot(ds));
-
-       if (ds->ds_phys->ds_prev_snap_obj != 0)
-               mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
-       else
-               mrs_used = 0;
-
-       dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
-
-       ASSERT3U(dlused, <=, mrs_used);
-       ds->ds_phys->ds_unique_bytes =
-           ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
-
-       if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-           SPA_VERSION_UNIQUE_ACCURATE)
-               ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-}
-
-struct killarg {
-       dsl_dataset_t *ds;
-       dmu_tx_t *tx;
-};
-
-/* ARGSUSED */
 static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
-       struct killarg *ka = arg;
-       dmu_tx_t *tx = ka->tx;
+       uint64_t asize;
 
-       if (bp == NULL)
+       if (!dmu_tx_is_syncing(tx))
                return (0);
 
-       if (zb->zb_level == ZB_ZIL_LEVEL) {
-               ASSERT(zilog != NULL);
-               /*
-                * It's a block in the intent log.  It has no
-                * accounting, so just free it.
-                */
-               dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
-       } else {
-               ASSERT(zilog == NULL);
-               ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-               (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
-       }
-
-       return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       uint64_t count;
-       int err;
-
        /*
-        * Can't delete a head dataset if there are snapshots of it.
-        * (Except if the only snapshots are from the branch we cloned
-        * from.)
+        * If there's an fs-only reservation, any blocks that might become
+        * owned by the snapshot dataset must be accommodated by space
+        * outside of the reservation.
         */
-       if (ds->ds_prev != NULL &&
-           ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-               return (EBUSY);
+       ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+       asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+       if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+               return (ENOSPC);
 
        /*
-        * This is really a dsl_dir thing, but check it here so that
-        * we'll be less likely to leave this dataset inconsistent &
-        * nearly destroyed.
+        * Propagate any reserved space for this snapshot to other
+        * snapshot checks in this sync group.
         */
-       err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
-       if (err)
-               return (err);
-       if (count != 0)
-               return (EEXIST);
+       if (asize > 0)
+               dsl_dir_willuse_space(ds->ds_dir, asize, tx);
 
        return (0);
 }
 
-/* ARGSUSED */
-static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-
-       /* Mark it as inconsistent on-disk, in case we crash */
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
-       spa_history_log_internal_ds(ds, "destroy begin", tx, "");
-}
-
-static int
-dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
-    dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = dsda->ds;
-       dsl_dataset_t *ds_prev = ds->ds_prev;
-
-       if (dsl_dataset_might_destroy_origin(ds_prev)) {
-               struct dsl_ds_destroyarg ndsda = {0};
-
-               /*
-                * If we're not prepared to remove the origin, don't remove
-                * the clone either.
-                */
-               if (dsda->rm_origin == NULL) {
-                       dsda->need_prep = B_TRUE;
-                       return (EBUSY);
-               }
-
-               ndsda.ds = ds_prev;
-               ndsda.is_origin_rm = B_TRUE;
-               return (dsl_dataset_destroy_check(&ndsda, tag, tx));
-       }
-
-       /*
-        * If we're not going to remove the origin after all,
-        * undo the open context setup.
-        */
-       if (dsda->rm_origin != NULL) {
-               dsl_dataset_disown(dsda->rm_origin, tag);
-               dsda->rm_origin = NULL;
-       }
-
-       return (0);
-}
+typedef struct dsl_dataset_snapshot_arg {
+       nvlist_t *ddsa_snaps;
+       nvlist_t *ddsa_props;
+       nvlist_t *ddsa_errors;
+} dsl_dataset_snapshot_arg_t;
 
-/*
- * If you add new checks here, you may need to add
- * additional checks to the "temporary" case in
- * snapshot_check() in dmu_objset.c.
- */
-/* ARGSUSED */
 int
-dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx)
 {
-       struct dsl_ds_destroyarg *dsda = arg1;
-       dsl_dataset_t *ds = dsda->ds;
+       int error;
+       uint64_t value;
 
-       /* we have an owner hold, so noone else can destroy us */
-       ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
+       ds->ds_trysnap_txg = tx->tx_txg;
 
-       /*
-        * Only allow deferred destroy on pools that support it.
-        * NOTE: deferred destroy is only supported on snapshots.
-        */
-       if (dsda->defer) {
-               if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-                   SPA_VERSION_USERREFS)
-                       return (ENOTSUP);
-               ASSERT(dsl_dataset_is_snapshot(ds));
+       if (!dmu_tx_is_syncing(tx))
                return (0);
-       }
 
        /*
-        * Can't delete a head dataset if there are snapshots of it.
-        * (Except if the only snapshots are from the branch we cloned
-        * from.)
+        * We don't allow multiple snapshots of the same txg.  If there
+        * is already one, try again.
         */
-       if (ds->ds_prev != NULL &&
-           ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-               return (EBUSY);
+       if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+               return (EAGAIN);
 
        /*
-        * If we made changes this txg, traverse_dsl_dataset won't find
-        * them.  Try again.
+        * Check for conflicting snapshot name.
         */
-       if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
-               return (EAGAIN);
-
-       if (dsl_dataset_is_snapshot(ds)) {
-               /*
-                * If this snapshot has an elevated user reference count,
-                * we can't destroy it yet.
-                */
-               if (ds->ds_userrefs > 0 && !dsda->releasing)
-                       return (EBUSY);
+       error = dsl_dataset_snap_lookup(ds, snapname, &value);
+       if (error == 0)
+               return (EEXIST);
+       if (error != ENOENT)
+               return (error);
 
-               mutex_enter(&ds->ds_lock);
-               /*
-                * Can't delete a branch point. However, if we're destroying
-                * a clone and removing its origin due to it having a user
-                * hold count of 0 and having been marked for deferred destroy,
-                * it's OK for the origin to have a single clone.
-                */
-               if (ds->ds_phys->ds_num_children >
-                   (dsda->is_origin_rm ? 2 : 1)) {
-                       mutex_exit(&ds->ds_lock);
-                       return (EEXIST);
-               }
-               mutex_exit(&ds->ds_lock);
-       } else if (dsl_dir_is_clone(ds->ds_dir)) {
-               return (dsl_dataset_origin_check(dsda, arg2, tx));
-       }
+       error = dsl_dataset_snapshot_reserve_space(ds, tx);
+       if (error != 0)
+               return (error);
 
-       /* XXX we should do some i/o error checking... */
        return (0);
 }
 
-struct refsarg {
-       kmutex_t lock;
-       boolean_t gone;
-       kcondvar_t cv;
-};
-
-/* ARGSUSED */
-static void
-dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
+static int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
 {
-       struct refsarg *arg = argv;
-
-       mutex_enter(&arg->lock);
-       arg->gone = TRUE;
-       cv_signal(&arg->cv);
-       mutex_exit(&arg->lock);
-}
+       dsl_dataset_snapshot_arg_t *ddsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
+       int rv = 0;
 
-static void
-dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
-{
-       struct refsarg arg;
+       for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+           pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+               int error = 0;
+               dsl_dataset_t *ds;
+               char *name, *atp;
+               char dsname[MAXNAMELEN];
+
+               name = nvpair_name(pair);
+               if (strlen(name) >= MAXNAMELEN)
+                       error = ENAMETOOLONG;
+               if (error == 0) {
+                       atp = strchr(name, '@');
+                       if (atp == NULL)
+                               error = EINVAL;
+                       if (error == 0)
+                               (void) strlcpy(dsname, name, atp - name + 1);
+               }
+               if (error == 0)
+                       error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+               if (error == 0) {
+                       error = dsl_dataset_snapshot_check_impl(ds,
+                           atp + 1, tx);
+                       dsl_dataset_rele(ds, FTAG);
+               }
 
-       mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
-       cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
-       arg.gone = FALSE;
-       (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
-           dsl_dataset_refs_gone);
-       dmu_buf_rele(ds->ds_dbuf, tag);
-       mutex_enter(&arg.lock);
-       while (!arg.gone)
-               cv_wait(&arg.cv, &arg.lock);
-       ASSERT(arg.gone);
-       mutex_exit(&arg.lock);
-       ds->ds_dbuf = NULL;
-       ds->ds_phys = NULL;
-       mutex_destroy(&arg.lock);
-       cv_destroy(&arg.cv);
+               if (error != 0) {
+                       if (ddsa->ddsa_errors != NULL) {
+                               fnvlist_add_int32(ddsa->ddsa_errors,
+                                   name, error);
+                       }
+                       rv = error;
+               }
+       }
+       return (rv);
 }
 
-static void
-remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
+void
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx)
 {
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       int err;
-       ASSERTV(uint64_t count);
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dmu_buf_t *dbuf;
+       dsl_dataset_phys_t *dsphys;
+       uint64_t dsobj, crtxg;
+       objset_t *mos = dp->dp_meta_objset;
+       ASSERTV(static zil_header_t zero_zil);
+       ASSERTV(objset_t *os);
+
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
-       ASSERT(ds->ds_phys->ds_num_children >= 2);
-       err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
        /*
-        * The err should not be ENOENT, but a bug in a previous version
-        * of the code could cause upgrade_clones_cb() to not set
-        * ds_next_snap_obj when it should, leading to a missing entry.
-        * If we knew that the pool was created after
-        * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
-        * ENOENT.  However, at least we can check that we don't have
-        * too many entries in the next_clones_obj even after failing to
-        * remove this one.
+        * If we are on an old pool, the zil must not be active, in which
+        * case it will be zeroed.  Usually zil_suspend() accomplishes this.
         */
-       if (err != ENOENT) {
-               VERIFY0(err);
-       }
-       ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
-           &count));
-       ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
-}
+       ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+           dmu_objset_from_ds(ds, &os) != 0 ||
+           bcmp(&os->os_phys->os_zil_header, &zero_zil,
+           sizeof (zero_zil)) == 0);
 
-static void
-dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
-{
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       zap_cursor_t zc;
-       zap_attribute_t za;
 
        /*
-        * If it is the old version, dd_clones doesn't exist so we can't
-        * find the clones, but deadlist_remove_key() is a no-op so it
-        * doesn't matter.
-        */
-       if (ds->ds_dir->dd_phys->dd_clones == 0)
-               return;
-
-       for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
-           zap_cursor_retrieve(&zc, &za) == 0;
-           zap_cursor_advance(&zc)) {
-               dsl_dataset_t *clone;
-
-               VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                   za.za_first_integer, FTAG, &clone));
-               if (clone->ds_dir->dd_origin_txg > mintxg) {
-                       dsl_deadlist_remove_key(&clone->ds_deadlist,
-                           mintxg, tx);
-                       dsl_dataset_remove_clones_key(clone, mintxg, tx);
-               }
-               dsl_dataset_rele(clone, FTAG);
-       }
-       zap_cursor_fini(&zc);
-}
-
-struct process_old_arg {
-       dsl_dataset_t *ds;
-       dsl_dataset_t *ds_prev;
-       boolean_t after_branch_point;
-       zio_t *pio;
-       uint64_t used, comp, uncomp;
-};
-
-static int
-process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
-       struct process_old_arg *poa = arg;
-       dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
-
-       if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
-               dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
-               if (poa->ds_prev && !poa->after_branch_point &&
-                   bp->blk_birth >
-                   poa->ds_prev->ds_phys->ds_prev_snap_txg) {
-                       poa->ds_prev->ds_phys->ds_unique_bytes +=
-                           bp_get_dsize_sync(dp->dp_spa, bp);
-               }
-       } else {
-               poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
-               poa->comp += BP_GET_PSIZE(bp);
-               poa->uncomp += BP_GET_UCSIZE(bp);
-               dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
-       }
-       return (0);
-}
-
-static void
-process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
-    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
-{
-       struct process_old_arg poa = { 0 };
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-
-       ASSERT(ds->ds_deadlist.dl_oldfmt);
-       ASSERT(ds_next->ds_deadlist.dl_oldfmt);
-
-       poa.ds = ds;
-       poa.ds_prev = ds_prev;
-       poa.after_branch_point = after_branch_point;
-       poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-       VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
-           process_old_cb, &poa, tx));
-       VERIFY0(zio_wait(poa.pio));
-       ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
-
-       /* change snapused */
-       dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-           -poa.used, -poa.comp, -poa.uncomp, tx);
-
-       /* swap next's deadlist to our deadlist */
-       dsl_deadlist_close(&ds->ds_deadlist);
-       dsl_deadlist_close(&ds_next->ds_deadlist);
-       SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
-           ds->ds_phys->ds_deadlist_obj);
-       dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
-       dsl_deadlist_open(&ds_next->ds_deadlist, mos,
-           ds_next->ds_phys->ds_deadlist_obj);
-}
-
-static int
-old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-       int err;
-       struct killarg ka;
-
-       /*
-        * Free everything that we point to (that's born after
-        * the previous snapshot, if we are a clone)
-        *
-        * NB: this should be very quick, because we already
-        * freed all the objects in open context.
-        */
-       ka.ds = ds;
-       ka.tx = tx;
-       err = traverse_dataset(ds,
-           ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
-           kill_blkptr, &ka);
-       ASSERT0(err);
-       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
-       return (err);
-}
-
-void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
-       struct dsl_ds_destroyarg *dsda = arg1;
-       dsl_dataset_t *ds = dsda->ds;
-       int err = 0;
-       int after_branch_point = FALSE;
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-       dsl_dataset_t *ds_prev = NULL;
-       boolean_t wont_destroy;
-       uint64_t obj;
-
-       wont_destroy = (dsda->defer &&
-           (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
-
-       ASSERT(ds->ds_owner || wont_destroy);
-       ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
-       ASSERT(ds->ds_prev == NULL ||
-           ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-       ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
-
-       if (wont_destroy) {
-               ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
-               dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
-               spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
-               return;
-       }
-
-       /* We need to log before removing it from the namespace. */
-       spa_history_log_internal_ds(ds, "destroy", tx, "");
-
-       /* signal any waiters that this dataset is going away */
-       mutex_enter(&ds->ds_lock);
-       ds->ds_owner = dsl_reaper;
-       cv_broadcast(&ds->ds_exclusive_cv);
-       mutex_exit(&ds->ds_lock);
-
-       /* Remove our reservation */
-       if (ds->ds_reserved != 0) {
-               dsl_prop_setarg_t psa;
-               uint64_t value = 0;
-
-               dsl_prop_setarg_init_uint64(&psa, "refreservation",
-                   (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
-                   &value);
-               psa.psa_effective_value = 0;    /* predict default value */
-
-               dsl_dataset_set_reservation_sync(ds, &psa, tx);
-               ASSERT0(ds->ds_reserved);
-       }
-
-       ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
-       dsl_scan_ds_destroyed(ds, tx);
-
-       obj = ds->ds_object;
-
-       if (ds->ds_phys->ds_prev_snap_obj != 0) {
-               if (ds->ds_prev) {
-                       ds_prev = ds->ds_prev;
-               } else {
-                       VERIFY(0 == dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
-               }
-               after_branch_point =
-                   (ds_prev->ds_phys->ds_next_snap_obj != obj);
-
-               dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
-               if (after_branch_point &&
-                   ds_prev->ds_phys->ds_next_clones_obj != 0) {
-                       remove_from_next_clones(ds_prev, obj, tx);
-                       if (ds->ds_phys->ds_next_snap_obj != 0) {
-                               VERIFY(0 == zap_add_int(mos,
-                                   ds_prev->ds_phys->ds_next_clones_obj,
-                                   ds->ds_phys->ds_next_snap_obj, tx));
-                       }
-               }
-               if (after_branch_point &&
-                   ds->ds_phys->ds_next_snap_obj == 0) {
-                       /* This clone is toast. */
-                       ASSERT(ds_prev->ds_phys->ds_num_children > 1);
-                       ds_prev->ds_phys->ds_num_children--;
-
-                       /*
-                        * If the clone's origin has no other clones, no
-                        * user holds, and has been marked for deferred
-                        * deletion, then we should have done the necessary
-                        * destroy setup for it.
-                        */
-                       if (ds_prev->ds_phys->ds_num_children == 1 &&
-                           ds_prev->ds_userrefs == 0 &&
-                           DS_IS_DEFER_DESTROY(ds_prev)) {
-                               ASSERT3P(dsda->rm_origin, !=, NULL);
-                       } else {
-                               ASSERT3P(dsda->rm_origin, ==, NULL);
-                       }
-               } else if (!after_branch_point) {
-                       ds_prev->ds_phys->ds_next_snap_obj =
-                           ds->ds_phys->ds_next_snap_obj;
-               }
-       }
-
-       if (dsl_dataset_is_snapshot(ds)) {
-               dsl_dataset_t *ds_next;
-               uint64_t old_unique;
-               uint64_t used = 0, comp = 0, uncomp = 0;
-
-               VERIFY(0 == dsl_dataset_hold_obj(dp,
-                   ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-               ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
-
-               old_unique = ds_next->ds_phys->ds_unique_bytes;
-
-               dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-               ds_next->ds_phys->ds_prev_snap_obj =
-                   ds->ds_phys->ds_prev_snap_obj;
-               ds_next->ds_phys->ds_prev_snap_txg =
-                   ds->ds_phys->ds_prev_snap_txg;
-               ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-                   ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
-
-
-               if (ds_next->ds_deadlist.dl_oldfmt) {
-                       process_old_deadlist(ds, ds_prev, ds_next,
-                           after_branch_point, tx);
-               } else {
-                       /* Adjust prev's unique space. */
-                       if (ds_prev && !after_branch_point) {
-                               dsl_deadlist_space_range(&ds_next->ds_deadlist,
-                                   ds_prev->ds_phys->ds_prev_snap_txg,
-                                   ds->ds_phys->ds_prev_snap_txg,
-                                   &used, &comp, &uncomp);
-                               ds_prev->ds_phys->ds_unique_bytes += used;
-                       }
-
-                       /* Adjust snapused. */
-                       dsl_deadlist_space_range(&ds_next->ds_deadlist,
-                           ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-                           &used, &comp, &uncomp);
-                       dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-                           -used, -comp, -uncomp, tx);
-
-                       /* Move blocks to be freed to pool's free list. */
-                       dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
-                           &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
-                           tx);
-                       dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
-                           DD_USED_HEAD, used, comp, uncomp, tx);
-
-                       /* Merge our deadlist into next's and free it. */
-                       dsl_deadlist_merge(&ds_next->ds_deadlist,
-                           ds->ds_phys->ds_deadlist_obj, tx);
-               }
-               dsl_deadlist_close(&ds->ds_deadlist);
-               dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
-
-               /* Collapse range in clone heads */
-               dsl_dataset_remove_clones_key(ds,
-                   ds->ds_phys->ds_creation_txg, tx);
-
-               if (dsl_dataset_is_snapshot(ds_next)) {
-                       dsl_dataset_t *ds_nextnext;
-                       dsl_dataset_t *hds;
-
-                       /*
-                        * Update next's unique to include blocks which
-                        * were previously shared by only this snapshot
-                        * and it.  Those blocks will be born after the
-                        * prev snap and before this snap, and will have
-                        * died after the next snap and before the one
-                        * after that (ie. be on the snap after next's
-                        * deadlist).
-                        */
-                       VERIFY(0 == dsl_dataset_hold_obj(dp,
-                           ds_next->ds_phys->ds_next_snap_obj,
-                           FTAG, &ds_nextnext));
-                       dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
-                           ds->ds_phys->ds_prev_snap_txg,
-                           ds->ds_phys->ds_creation_txg,
-                           &used, &comp, &uncomp);
-                       ds_next->ds_phys->ds_unique_bytes += used;
-                       dsl_dataset_rele(ds_nextnext, FTAG);
-                       ASSERT3P(ds_next->ds_prev, ==, NULL);
-
-                       /* Collapse range in this head. */
-                       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
-                           ds->ds_dir->dd_phys->dd_head_dataset_obj,
-                           FTAG, &hds));
-                       dsl_deadlist_remove_key(&hds->ds_deadlist,
-                           ds->ds_phys->ds_creation_txg, tx);
-                       dsl_dataset_rele(hds, FTAG);
-
-               } else {
-                       ASSERT3P(ds_next->ds_prev, ==, ds);
-                       dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
-                       ds_next->ds_prev = NULL;
-                       if (ds_prev) {
-                               VERIFY(0 == dsl_dataset_get_ref(dp,
-                                   ds->ds_phys->ds_prev_snap_obj,
-                                   ds_next, &ds_next->ds_prev));
-                       }
-
-                       dsl_dataset_recalc_head_uniq(ds_next);
-
-                       /*
-                        * Reduce the amount of our unconsmed refreservation
-                        * being charged to our parent by the amount of
-                        * new unique data we have gained.
-                        */
-                       if (old_unique < ds_next->ds_reserved) {
-                               int64_t mrsdelta;
-                               uint64_t new_unique =
-                                   ds_next->ds_phys->ds_unique_bytes;
-
-                               ASSERT(old_unique <= new_unique);
-                               mrsdelta = MIN(new_unique - old_unique,
-                                   ds_next->ds_reserved - old_unique);
-                               dsl_dir_diduse_space(ds->ds_dir,
-                                   DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
-                       }
-               }
-               dsl_dataset_rele(ds_next, FTAG);
-       } else {
-               zfeature_info_t *async_destroy =
-                   &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
-               objset_t *os;
-
-               /*
-                * There's no next snapshot, so this is a head dataset.
-                * Destroy the deadlist.  Unless it's a clone, the
-                * deadlist should be empty.  (If it's a clone, it's
-                * safe to ignore the deadlist contents.)
-                */
-               dsl_deadlist_close(&ds->ds_deadlist);
-               dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
-               ds->ds_phys->ds_deadlist_obj = 0;
-
-               VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
-
-               if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
-                       err = old_synchronous_dataset_destroy(ds, tx);
-               } else {
-                       /*
-                        * Move the bptree into the pool's list of trees to
-                        * clean up and update space accounting information.
-                        */
-                       uint64_t used, comp, uncomp;
-
-                       zil_destroy_sync(dmu_objset_zil(os), tx);
-
-                       if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
-                               spa_feature_incr(dp->dp_spa, async_destroy, tx);
-                               dp->dp_bptree_obj = bptree_alloc(mos, tx);
-                               VERIFY(zap_add(mos,
-                                   DMU_POOL_DIRECTORY_OBJECT,
-                                   DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
-                                   &dp->dp_bptree_obj, tx) == 0);
-                       }
-
-                       used = ds->ds_dir->dd_phys->dd_used_bytes;
-                       comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
-                       uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
-
-                       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-                           ds->ds_phys->ds_unique_bytes == used);
-
-                       bptree_add(mos, dp->dp_bptree_obj,
-                           &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
-                           used, comp, uncomp, tx);
-                       dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-                           -used, -comp, -uncomp, tx);
-                       dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-                           used, comp, uncomp, tx);
-               }
-
-               if (ds->ds_prev != NULL) {
-                       if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-                               VERIFY3U(0, ==, zap_remove_int(mos,
-                                   ds->ds_prev->ds_dir->dd_phys->dd_clones,
-                                   ds->ds_object, tx));
-                       }
-                       dsl_dataset_rele(ds->ds_prev, ds);
-                       ds->ds_prev = ds_prev = NULL;
-               }
-       }
-
-       /*
-        * This must be done after the dsl_traverse(), because it will
-        * re-open the objset.
-        */
-       if (ds->ds_objset) {
-               dmu_objset_evict(ds->ds_objset);
-               ds->ds_objset = NULL;
-       }
-
-       if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
-               /* Erase the link in the dir */
-               dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-               ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
-               ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-               err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
-               ASSERT(err == 0);
-       } else {
-               /* remove from snapshot namespace */
-               dsl_dataset_t *ds_head;
-               ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
-               VERIFY(0 == dsl_dataset_hold_obj(dp,
-                   ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
-               VERIFY(0 == dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
-               {
-                       uint64_t val;
-
-                       err = dsl_dataset_snap_lookup(ds_head,
-                           ds->ds_snapname, &val);
-                       ASSERT0(err);
-                       ASSERT3U(val, ==, obj);
-               }
-#endif
-               err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
-               ASSERT(err == 0);
-               dsl_dataset_rele(ds_head, FTAG);
-       }
-
-       if (ds_prev && ds->ds_prev != ds_prev)
-               dsl_dataset_rele(ds_prev, FTAG);
-
-       spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-
-       if (ds->ds_phys->ds_next_clones_obj != 0) {
-               ASSERTV(uint64_t count);
-               ASSERT(0 == zap_count(mos,
-                   ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
-               VERIFY(0 == dmu_object_free(mos,
-                   ds->ds_phys->ds_next_clones_obj, tx));
-       }
-       if (ds->ds_phys->ds_props_obj != 0)
-               VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-       if (ds->ds_phys->ds_userrefs_obj != 0)
-               VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
-       dsl_dir_close(ds->ds_dir, ds);
-       ds->ds_dir = NULL;
-       dsl_dataset_drain_refs(ds, tag);
-       VERIFY(0 == dmu_object_free(mos, obj, tx));
-
-       if (dsda->rm_origin) {
-               /*
-                * Remove the origin of the clone we just destroyed.
-                */
-               struct dsl_ds_destroyarg ndsda = {0};
-
-               ndsda.ds = dsda->rm_origin;
-               dsl_dataset_destroy_sync(&ndsda, tag, tx);
-       }
-}
-
-static int
-dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-       uint64_t asize;
-
-       if (!dmu_tx_is_syncing(tx))
-               return (0);
-
-       /*
-        * If there's an fs-only reservation, any blocks that might become
-        * owned by the snapshot dataset must be accommodated by space
-        * outside of the reservation.
-        */
-       ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
-       asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-       if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
-               return (ENOSPC);
-
-       /*
-        * Propagate any reserved space for this snapshot to other
-        * snapshot checks in this sync group.
-        */
-       if (asize > 0)
-               dsl_dir_willuse_space(ds->ds_dir, asize, tx);
-
-       return (0);
-}
-
-int
-dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx)
-{
-       int err;
-       uint64_t value;
-
-       /*
-        * We don't allow multiple snapshots of the same txg.  If there
-        * is already one, try again.
-        */
-       if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
-               return (EAGAIN);
-
-       /*
-        * Check for conflicting snapshot name.
-        */
-       err = dsl_dataset_snap_lookup(ds, snapname, &value);
-       if (err == 0)
-               return (EEXIST);
-       if (err != ENOENT)
-               return (err);
-
-       /*
-        * Check that the dataset's name is not too long.  Name consists
-        * of the dataset's length + 1 for the @-sign + snapshot name's length
-        */
-       if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
-               return (ENAMETOOLONG);
-
-       err = dsl_dataset_snapshot_reserve_space(ds, tx);
-       if (err)
-               return (err);
-
-       ds->ds_trysnap_txg = tx->tx_txg;
-       return (0);
-}
-
-void
-dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       dmu_buf_t *dbuf;
-       dsl_dataset_phys_t *dsphys;
-       uint64_t dsobj, crtxg;
-       objset_t *mos = dp->dp_meta_objset;
-       int err;
-
-       ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
-       /*
-        * The origin's ds_creation_txg has to be < TXG_INITIAL
+        * The origin's ds_creation_txg has to be < TXG_INITIAL
         */
        if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
                crtxg = 1;
@@ -2081,7 +1041,7 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
 
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-       VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+       VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
@@ -2116,9 +1076,9 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
                            ds->ds_prev->ds_phys->ds_creation_txg);
                        ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
                } else if (next_clones_obj != 0) {
-                       remove_from_next_clones(ds->ds_prev,
+                       dsl_dataset_remove_from_next_clones(ds->ds_prev,
                            dsphys->ds_next_snap_obj, tx);
-                       VERIFY3U(0, ==, zap_add_int(mos,
+                       VERIFY0(zap_add_int(mos,
                            next_clones_obj, dsobj, tx));
                }
        }
@@ -2137,9 +1097,6 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
        }
 
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
-           ds->ds_dir->dd_myname, snapname, dsobj,
-           ds->ds_phys->ds_prev_snap_txg);
        ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
            UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
        dsl_deadlist_close(&ds->ds_deadlist);
@@ -2154,13 +1111,12 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
-       err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
-           snapname, 8, 1, &dsobj, tx);
-       ASSERT(err == 0);
+       VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+           snapname, 8, 1, &dsobj, tx));
 
        if (ds->ds_prev)
-               dsl_dataset_drop_ref(ds->ds_prev, ds);
-       VERIFY(0 == dsl_dataset_get_ref(dp,
+               dsl_dataset_rele(ds->ds_prev, ds);
+       VERIFY0(dsl_dataset_hold_obj(dp,
            ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
        dsl_scan_ds_snapshotted(ds, tx);
@@ -2170,88 +1126,264 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
        spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
 }
 
-void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+static void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
-       ASSERT(dmu_tx_is_syncing(tx));
-       ASSERT(ds->ds_objset != NULL);
-       ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
-
-       /*
-        * in case we had to change ds_fsid_guid when we opened it,
-        * sync it out now.
-        */
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+       dsl_dataset_snapshot_arg_t *ddsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
 
-       dmu_objset_sync(ds->ds_objset, zio, tx);
+       for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+           pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+               dsl_dataset_t *ds;
+               char *name, *atp;
+               char dsname[MAXNAMELEN];
+
+               name = nvpair_name(pair);
+               atp = strchr(name, '@');
+               (void) strlcpy(dsname, name, atp - name + 1);
+               VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+               dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+               if (ddsa->ddsa_props != NULL) {
+                       dsl_props_set_sync_impl(ds->ds_prev,
+                           ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+               }
+               dsl_dataset_rele(ds, FTAG);
+       }
 }
 
-static void
-get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 {
-       uint64_t count = 0;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       zap_cursor_t zc;
-       zap_attribute_t za;
-       nvlist_t *propval;
-       nvlist_t *val;
+       dsl_dataset_snapshot_arg_t ddsa;
+       nvpair_t *pair;
+       boolean_t needsuspend;
+       int error;
+       spa_t *spa;
+       char *firstname;
+       nvlist_t *suspended = NULL;
 
-       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-       VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-       VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       pair = nvlist_next_nvpair(snaps, NULL);
+       if (pair == NULL)
+               return (0);
+       firstname = nvpair_name(pair);
+
+       error = spa_open(firstname, &spa, FTAG);
+       if (error != 0)
+               return (error);
+       needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+       spa_close(spa, FTAG);
+
+       if (needsuspend) {
+               suspended = fnvlist_alloc();
+               for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+                   pair = nvlist_next_nvpair(snaps, pair)) {
+                       char fsname[MAXNAMELEN];
+                       char *snapname = nvpair_name(pair);
+                       char *atp;
+                       void *cookie;
+
+                       atp = strchr(snapname, '@');
+                       if (atp == NULL) {
+                               error = EINVAL;
+                               break;
+                       }
+                       (void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+                       error = zil_suspend(fsname, &cookie);
+                       if (error != 0)
+                               break;
+                       fnvlist_add_uint64(suspended, fsname,
+                           (uintptr_t)cookie);
+               }
+       }
+
+       ddsa.ddsa_snaps = snaps;
+       ddsa.ddsa_props = props;
+       ddsa.ddsa_errors = errors;
+
+       if (error == 0) {
+               error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+                   dsl_dataset_snapshot_sync, &ddsa,
+                   fnvlist_num_pairs(snaps) * 3);
+       }
+
+       if (suspended != NULL) {
+               for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+                   pair = nvlist_next_nvpair(suspended, pair)) {
+                       zil_resume((void *)(uintptr_t)
+                           fnvpair_value_uint64(pair));
+               }
+               fnvlist_free(suspended);
+       }
+
+       return (error);
+}
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+       const char *ddsta_fsname;
+       const char *ddsta_snapname;
+       minor_t ddsta_cleanup_minor;
+       const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       int error;
+
+       error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (error);
+       }
+
+       if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+               dsl_dataset_rele(ds, FTAG);
+               return (ENOTSUP);
+       }
+       error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+           B_TRUE, tx);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (error);
+       }
+
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+       dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+       dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+           ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+       dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+       dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag)
+{
+       dsl_dataset_snapshot_tmp_arg_t ddsta;
+       int error;
+       spa_t *spa;
+       boolean_t needsuspend;
+       void *cookie;
+
+       ddsta.ddsta_fsname = fsname;
+       ddsta.ddsta_snapname = snapname;
+       ddsta.ddsta_cleanup_minor = cleanup_minor;
+       ddsta.ddsta_htag = htag;
+
+       error = spa_open(fsname, &spa, FTAG);
+       if (error != 0)
+               return (error);
+       needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+       spa_close(spa, FTAG);
+
+       if (needsuspend) {
+               error = zil_suspend(fsname, &cookie);
+               if (error != 0)
+                       return (error);
+       }
+
+       error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+           dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
+
+       if (needsuspend)
+               zil_resume(cookie);
+       return (error);
+}
+
+
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+       ASSERT(dmu_tx_is_syncing(tx));
+       ASSERT(ds->ds_objset != NULL);
+       ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+       /*
+        * in case we had to change ds_fsid_guid when we opened it,
+        * sync it out now.
+        */
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+
+       dmu_objset_sync(ds->ds_objset, zio, tx);
+}
+
+static void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+       uint64_t count = 0;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       nvlist_t *propval = fnvlist_alloc();
+       nvlist_t *val = fnvlist_alloc();
+
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
        /*
-        * There may me missing entries in ds_next_clones_obj
+        * There may be missing entries in ds_next_clones_obj
         * due to a bug in a previous version of the code.
         * Only trust it if it has the right number of entries.
         */
        if (ds->ds_phys->ds_next_clones_obj != 0) {
-               ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+               ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
                    &count));
        }
-       if (count != ds->ds_phys->ds_num_children - 1) {
+       if (count != ds->ds_phys->ds_num_children - 1)
                goto fail;
-       }
        for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
            zap_cursor_retrieve(&zc, &za) == 0;
            zap_cursor_advance(&zc)) {
                dsl_dataset_t *clone;
                char buf[ZFS_MAXNAMELEN];
-               /*
-                * Even though we hold the dp_config_rwlock, the dataset
-                * may fail to open, returning ENOENT.  If there is a
-                * thread concurrently attempting to destroy this
-                * dataset, it will have the ds_rwlock held for
-                * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
-                * dsl_dataset_hold_ref() will fail its
-                * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
-                * dp_config_rwlock, and wait for the destroy progress
-                * and signal ds_exclusive_cv.  If the destroy was
-                * successful, we will see that
-                * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
-                */
-               if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                   za.za_first_integer, FTAG, &clone) != 0)
-                       continue;
+               VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+                   za.za_first_integer, FTAG, &clone));
                dsl_dir_name(clone->ds_dir, buf);
-               VERIFY(nvlist_add_boolean(val, buf) == 0);
+               fnvlist_add_boolean(val, buf);
                dsl_dataset_rele(clone, FTAG);
        }
        zap_cursor_fini(&zc);
-       VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
-       VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
-           propval) == 0);
+       fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+       fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
 fail:
        nvlist_free(val);
        nvlist_free(propval);
-       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
        uint64_t refd, avail, uobjs, aobjs, ratio;
+       ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool);
+
+       ASSERT(dsl_pool_config_held(dp));
 
        ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
            (ds->ds_phys->ds_uncompressed_bytes * 100 /
@@ -2297,10 +1429,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
                dsl_dataset_t *prev;
                int err;
 
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
                err = dsl_dataset_hold_obj(dp,
                    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-               rw_exit(&dp->dp_config_rwlock);
                if (err == 0) {
                        err = dsl_dataset_space_written(prev, ds, &written,
                            &comp, &uncomp);
@@ -2317,6 +1447,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       ASSERT(dsl_pool_config_held(dp));
+
        stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
        stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
        stat->dds_guid = ds->ds_phys->ds_guid;
@@ -2328,16 +1461,14 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
                stat->dds_is_snapshot = B_FALSE;
                stat->dds_num_clones = 0;
 
-               rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
                if (dsl_dir_is_clone(ds->ds_dir)) {
                        dsl_dataset_t *ods;
 
-                       VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
+                       VERIFY0(dsl_dataset_hold_obj(dp,
                            ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
                        dsl_dataset_name(ods, stat->dds_origin);
-                       dsl_dataset_drop_ref(ods, FTAG);
+                       dsl_dataset_rele(ods, FTAG);
                }
-               rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
        }
 }
 
@@ -2375,8 +1506,7 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
 {
        ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool);
 
-       ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-           dsl_pool_sync_context(dp));
+       ASSERT(dsl_pool_config_held(dp));
        if (ds->ds_prev == NULL)
                return (B_FALSE);
        if (ds->ds_phys->ds_bp.blk_birth >
@@ -2398,237 +1528,225 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
        return (B_FALSE);
 }
 
+typedef struct dsl_dataset_rename_snapshot_arg {
+       const char *ddrsa_fsname;
+       const char *ddrsa_oldsnapname;
+       const char *ddrsa_newsnapname;
+       boolean_t ddrsa_recursive;
+       dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
 /* ARGSUSED */
 static int
-dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-       dsl_dataset_t *ds = arg1;
-       char *newsnapname = arg2;
-       dsl_dir_t *dd = ds->ds_dir;
-       dsl_dataset_t *hds;
+       dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+       int error;
        uint64_t val;
-       int err;
-
-       err = dsl_dataset_hold_obj(dd->dd_pool,
-           dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
-       if (err)
-               return (err);
 
-       /* new name better not be in use */
-       err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
-       dsl_dataset_rele(hds, FTAG);
+       error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+       if (error != 0) {
+               /* ignore nonexistent snapshots */
+               return (error == ENOENT ? 0 : error);
+       }
 
-       if (err == 0)
-               err = EEXIST;
-       else if (err == ENOENT)
-               err = 0;
+       /* new name should not exist */
+       error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+       if (error == 0)
+               error = EEXIST;
+       else if (error == ENOENT)
+               error = 0;
 
        /* dataset name + 1 for the "@" + the new snapshot name must fit */
-       if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
-               err = ENAMETOOLONG;
+       if (dsl_dir_namelen(hds->ds_dir) + 1 +
+           strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
+               error = ENAMETOOLONG;
 
-       return (err);
+       return (error);
 }
 
-static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       const char *newsnapname = arg2;
-       dsl_dir_t *dd = ds->ds_dir;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
+       dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
        dsl_dataset_t *hds;
-       int err;
-
-       ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
-
-       VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
-           dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+       int error;
 
-       VERIFY(0 == dsl_dataset_get_snapname(ds));
-       err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
-       ASSERT0(err);
-       mutex_enter(&ds->ds_lock);
-       (void) strcpy(ds->ds_snapname, newsnapname);
-       mutex_exit(&ds->ds_lock);
-       err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
-           ds->ds_snapname, 8, 1, &ds->ds_object, tx);
-       ASSERT0(err);
+       error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+       if (error != 0)
+               return (error);
 
-       spa_history_log_internal_ds(ds, "rename", tx,
-           "-> @%s", newsnapname);
+       if (ddrsa->ddrsa_recursive) {
+               error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+                   dsl_dataset_rename_snapshot_check_impl, ddrsa,
+                   DS_FIND_CHILDREN);
+       } else {
+               error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+       }
        dsl_dataset_rele(hds, FTAG);
+       return (error);
 }
 
-struct renamesnaparg {
-       dsl_sync_task_group_t *dstg;
-       char failed[MAXPATHLEN];
-       char *oldsnap;
-       char *newsnap;
-};
-
 static int
-dsl_snapshot_rename_one(const char *name, void *arg)
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-       struct renamesnaparg *ra = arg;
-       dsl_dataset_t *ds = NULL;
-       char *snapname;
-       int err;
-
-       snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
-       (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
+       dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+       dsl_dataset_t *ds;
+       uint64_t val;
+       dmu_tx_t *tx = ddrsa->ddrsa_tx;
+       int error;
 
-       /*
-        * For recursive snapshot renames the parent won't be changing
-        * so we just pass name for both the to/from argument.
-        */
-       err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
-       if (err != 0) {
-               strfree(snapname);
-               return (err == ENOENT ? 0 : err);
+       error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+       ASSERT(error == 0 || error == ENOENT);
+       if (error == ENOENT) {
+               /* ignore nonexistent snapshots */
+               return (0);
        }
 
-#ifdef _KERNEL
-       /*
-        * For all filesystems undergoing rename, we'll need to unmount it.
-        */
-       (void) zfs_unmount_snap(snapname, NULL);
-#endif
-       err = dsl_dataset_hold(snapname, ra->dstg, &ds);
-       strfree(snapname);
-       if (err != 0)
-               return (err == ENOENT ? 0 : err);
+       VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+       /* log before we change the name */
+       spa_history_log_internal_ds(ds, "rename", tx,
+           "-> @%s", ddrsa->ddrsa_newsnapname);
 
-       dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
-           dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
+       VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+       mutex_enter(&ds->ds_lock);
+       (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
+       mutex_exit(&ds->ds_lock);
+       VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+           ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 
+       dsl_dataset_rele(ds, FTAG);
        return (0);
 }
 
-static int
-dsl_recursive_rename(char *oldname, const char *newname)
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
-       int err;
-       struct renamesnaparg *ra;
-       dsl_sync_task_t *dst;
-       spa_t *spa;
-       char *cp, *fsname = spa_strdup(oldname);
-       int len = strlen(oldname) + 1;
-
-       /* truncate the snapshot name to get the fsname */
-       cp = strchr(fsname, '@');
-       *cp = '\0';
-
-       err = spa_open(fsname, &spa, FTAG);
-       if (err) {
-               kmem_free(fsname, len);
-               return (err);
-       }
-       ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
-       ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
-       ra->oldsnap = strchr(oldname, '@') + 1;
-       ra->newsnap = strchr(newname, '@') + 1;
-       *ra->failed = '\0';
-
-       err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
-           DS_FIND_CHILDREN);
-       kmem_free(fsname, len);
-
-       if (err == 0) {
-               err = dsl_sync_task_group_wait(ra->dstg);
-       }
+       dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *hds;
 
-       for (dst = list_head(&ra->dstg->dstg_tasks); dst;
-           dst = list_next(&ra->dstg->dstg_tasks, dst)) {
-               dsl_dataset_t *ds = dst->dst_arg1;
-               if (dst->dst_err) {
-                       dsl_dir_name(ds->ds_dir, ra->failed);
-                       (void) strlcat(ra->failed, "@", sizeof (ra->failed));
-                       (void) strlcat(ra->failed, ra->newsnap,
-                           sizeof (ra->failed));
-               }
-               dsl_dataset_rele(ds, ra->dstg);
+       VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+       ddrsa->ddrsa_tx = tx;
+       if (ddrsa->ddrsa_recursive) {
+               VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+                   dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+                   DS_FIND_CHILDREN));
+       } else {
+               VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
        }
-
-       if (err)
-               (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
-
-       dsl_sync_task_group_destroy(ra->dstg);
-       kmem_free(ra, sizeof (struct renamesnaparg));
-       spa_close(spa, FTAG);
-       return (err);
+       dsl_dataset_rele(hds, FTAG);
 }
 
-static int
-dsl_valid_rename(const char *oldname, void *arg)
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
 {
-       int delta = *(int *)arg;
+       dsl_dataset_rename_snapshot_arg_t ddrsa;
 
-       if (strlen(oldname) + delta >= MAXNAMELEN)
-               return (ENAMETOOLONG);
+       ddrsa.ddrsa_fsname = fsname;
+       ddrsa.ddrsa_oldsnapname = oldsnapname;
+       ddrsa.ddrsa_newsnapname = newsnapname;
+       ddrsa.ddrsa_recursive = recursive;
 
-       return (0);
+       return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+           dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
 }
 
-#pragma weak dmu_objset_rename = dsl_dataset_rename
-int
-dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
+static int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd;
+       const char *fsname = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
        dsl_dataset_t *ds;
-       const char *tail;
-       int err;
+       int64_t unused_refres_delta;
+       int error;
 
-       err = dsl_dir_open(oldname, FTAG, &dd, &tail);
-       if (err)
-               return (err);
+       error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+       if (error != 0)
+               return (error);
 
-       if (tail == NULL) {
-               int delta = strlen(newname) - strlen(oldname);
+       /* must not be a snapshot */
+       if (dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
+       }
 
-               /* if we're growing, validate child name lengths */
-               if (delta > 0)
-                       err = dmu_objset_find(oldname, dsl_valid_rename,
-                           &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+       /* must have a most recent snapshot */
+       if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
+       }
 
-               if (err == 0)
-                       err = dsl_dir_rename(dd, newname);
-               dsl_dir_close(dd, FTAG);
-               return (err);
+       if (dsl_dataset_long_held(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EBUSY);
        }
 
-       if (tail[0] != '@') {
-               /* the name ended in a nonexistent component */
-               dsl_dir_close(dd, FTAG);
-               return (ENOENT);
+       /*
+        * Check if the snap we are rolling back to uses more than
+        * the refquota.
+        */
+       if (ds->ds_quota != 0 &&
+           ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EDQUOT);
        }
 
-       dsl_dir_close(dd, FTAG);
+       /*
+        * When we do the clone swap, we will temporarily use more space
+        * due to the refreservation (the head will no longer have any
+        * unique space, so the entire amount of the refreservation will need
+        * to be free).  We will immediately destroy the clone, freeing
+        * this space, but the freeing happens over many txg's.
+        */
+       unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+           ds->ds_phys->ds_unique_bytes);
 
-       /* new name must be snapshot in same filesystem */
-       tail = strchr(newname, '@');
-       if (tail == NULL)
-               return (EINVAL);
-       tail++;
-       if (strncmp(oldname, newname, tail - newname) != 0)
-               return (EXDEV);
+       if (unused_refres_delta > 0 &&
+           unused_refres_delta >
+           dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (ENOSPC);
+       }
 
-       if (recursive) {
-               err = dsl_recursive_rename(oldname, newname);
-       } else {
-               err = dsl_dataset_hold(oldname, FTAG, &ds);
-               if (err)
-                       return (err);
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
 
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   dsl_dataset_snapshot_rename_check,
-                   dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+static void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+       const char *fsname = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds, *clone;
+       uint64_t cloneobj;
 
-               dsl_dataset_rele(ds, FTAG);
-       }
+       VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
 
-       return (err);
+       cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+           ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+
+       VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+       dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+       dsl_dataset_zero_zil(ds, tx);
+
+       dsl_destroy_head_sync_impl(clone, tx);
+
+       dsl_dataset_rele(clone, FTAG);
+       dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_rollback(const char *fsname)
+{
+       return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+           dsl_dataset_rollback_sync, (void *)fsname, 1));
 }
 
 struct promotenode {
@@ -2636,48 +1754,66 @@ struct promotenode {
        dsl_dataset_t *ds;
 };
 
-struct promotearg {
+typedef struct dsl_dataset_promote_arg {
+       const char *ddpa_clonename;
+       dsl_dataset_t *ddpa_clone;
        list_t shared_snaps, origin_snaps, clone_snaps;
-       dsl_dataset_t *origin_origin;
+       dsl_dataset_t *origin_origin; /* origin of the origin */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
        char *err_ds;
-};
+} dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+    void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
 
 static int
-dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *hds = arg1;
-       struct promotearg *pa = arg2;
-       struct promotenode *snap = list_head(&pa->shared_snaps);
-       dsl_dataset_t *origin_ds = snap->ds;
+       dsl_dataset_promote_arg_t *ddpa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *hds;
+       struct promotenode *snap;
+       dsl_dataset_t *origin_ds;
        int err;
        uint64_t unused;
 
-       /* Check that it is a real clone */
-       if (!dsl_dir_is_clone(hds->ds_dir))
-               return (EINVAL);
+       err = promote_hold(ddpa, dp, FTAG);
+       if (err != 0)
+               return (err);
 
-       /* Since this is so expensive, don't do the preliminary check */
-       if (!dmu_tx_is_syncing(tx))
-               return (0);
+       hds = ddpa->ddpa_clone;
 
-       if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
+       if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+               promote_rele(ddpa, FTAG);
                return (EXDEV);
+       }
+
+       /*
+        * Compute and check the amount of space to transfer.  Since this is
+        * so expensive, don't do the preliminary check.
+        */
+       if (!dmu_tx_is_syncing(tx)) {
+               promote_rele(ddpa, FTAG);
+               return (0);
+       }
+
+       snap = list_head(&ddpa->shared_snaps);
+       origin_ds = snap->ds;
 
        /* compute origin's new unique space */
-       snap = list_tail(&pa->clone_snaps);
+       snap = list_tail(&ddpa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        dsl_deadlist_space_range(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-           &pa->unique, &unused, &unused);
+           &ddpa->unique, &unused, &unused);
 
        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
-        * to used for each snapshot:
+        * to used by each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
@@ -2688,18 +1824,28 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
-       pa->used = origin_ds->ds_phys->ds_referenced_bytes;
-       pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-       pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
-       for (snap = list_head(&pa->shared_snaps); snap;
-           snap = list_next(&pa->shared_snaps, snap)) {
+       ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
+       ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
+       ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+       for (snap = list_head(&ddpa->shared_snaps); snap;
+           snap = list_next(&ddpa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;
 
+               /*
+                * If there are long holds, we won't be able to evict
+                * the objset.
+                */
+               if (dsl_dataset_long_held(ds)) {
+                       err = EBUSY;
+                       goto out;
+               }
+
                /* Check that the snapshot name does not conflict */
-               VERIFY(0 == dsl_dataset_get_snapname(ds));
+               VERIFY0(dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0) {
+                       (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
                        err = EEXIST;
                        goto out;
                }
@@ -2712,26 +1858,27 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
                dsl_deadlist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp);
-               pa->used += dlused;
-               pa->comp += dlcomp;
-               pa->uncomp += dluncomp;
+               ddpa->used += dlused;
+               ddpa->comp += dlcomp;
+               ddpa->uncomp += dluncomp;
        }
 
        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
-       if (pa->origin_origin) {
-               pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
-               pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
-               pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
+       if (ddpa->origin_origin) {
+               ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
+               ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+               ddpa->uncomp -=
+                   ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }
 
        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-           pa->used);
-       if (err)
-               return (err);
+           ddpa->used);
+       if (err != 0)
+               goto out;
 
        /*
         * Compute the amounts of space that will be used by snapshots
@@ -2749,68 +1896,75 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
-               snap = list_head(&pa->origin_snaps);
-               err = snaplist_space(&pa->shared_snaps,
-                   snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
-               if (err)
-                       return (err);
+               snap = list_head(&ddpa->origin_snaps);
+               err = snaplist_space(&ddpa->shared_snaps,
+                   snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+               if (err != 0)
+                       goto out;
 
-               err = snaplist_space(&pa->clone_snaps,
+               err = snaplist_space(&ddpa->clone_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &space);
-               if (err)
-                       return (err);
-               pa->cloneusedsnap += space;
+               if (err != 0)
+                       goto out;
+               ddpa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
-               err = snaplist_space(&pa->origin_snaps,
-                   origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
-               if (err)
-                       return (err);
+               err = snaplist_space(&ddpa->origin_snaps,
+                   origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+               if (err != 0)
+                       goto out;
        }
 
-       return (0);
 out:
-       pa->err_ds =  snap->ds->ds_snapname;
+       promote_rele(ddpa, FTAG);
        return (err);
 }
 
 static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *hds = arg1;
-       struct promotearg *pa = arg2;
-       struct promotenode *snap = list_head(&pa->shared_snaps);
-       dsl_dataset_t *origin_ds = snap->ds;
+       dsl_dataset_promote_arg_t *ddpa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *hds;
+       struct promotenode *snap;
+       dsl_dataset_t *origin_ds;
        dsl_dataset_t *origin_head;
-       dsl_dir_t *dd = hds->ds_dir;
-       dsl_pool_t *dp = hds->ds_dir->dd_pool;
+       dsl_dir_t *dd;
        dsl_dir_t *odd = NULL;
        uint64_t oldnext_obj;
        int64_t delta;
 
-       ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
+       VERIFY0(promote_hold(ddpa, dp, FTAG));
+       hds = ddpa->ddpa_clone;
+
+       ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+
+       snap = list_head(&ddpa->shared_snaps);
+       origin_ds = snap->ds;
+       dd = hds->ds_dir;
 
-       snap = list_head(&pa->origin_snaps);
+       snap = list_head(&ddpa->origin_snaps);
        origin_head = snap->ds;
 
        /*
         * We need to explicitly open odd, since origin_ds's dd will be
         * changing.
         */
-       VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+       VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
            NULL, FTAG, &odd));
 
        /* change origin's next snap */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
        oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
-       snap = list_tail(&pa->clone_snaps);
+       snap = list_tail(&ddpa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
 
        /* change the origin's next clone */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
-               remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
-               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+               dsl_dataset_remove_from_next_clones(origin_ds,
+                   snap->ds->ds_object, tx);
+               VERIFY0(zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
        }
@@ -2827,39 +1981,43 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 
        /* change dd_clone entries */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+               VERIFY0(zap_remove_int(dp->dp_meta_objset,
                    odd->dd_phys->dd_clones, hds->ds_object, tx));
-               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
-                   pa->origin_origin->ds_dir->dd_phys->dd_clones,
+               VERIFY0(zap_add_int(dp->dp_meta_objset,
+                   ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
                    hds->ds_object, tx));
 
-               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-                   pa->origin_origin->ds_dir->dd_phys->dd_clones,
+               VERIFY0(zap_remove_int(dp->dp_meta_objset,
+                   ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
                    origin_head->ds_object, tx));
                if (dd->dd_phys->dd_clones == 0) {
                        dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
                            DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                }
-               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+               VERIFY0(zap_add_int(dp->dp_meta_objset,
                    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
-
        }
 
        /* move snapshots to this dir */
-       for (snap = list_head(&pa->shared_snaps); snap;
-           snap = list_next(&pa->shared_snaps, snap)) {
+       for (snap = list_head(&ddpa->shared_snaps); snap;
+           snap = list_next(&ddpa->shared_snaps, snap)) {
                dsl_dataset_t *ds = snap->ds;
 
-               /* unregister props as dsl_dir is changing */
+               /*
+                * Property callbacks are registered to a particular
+                * dsl_dir.  Since ours is changing, evict the objset
+                * so that they will be unregistered from the old dsl_dir.
+                */
                if (ds->ds_objset) {
                        dmu_objset_evict(ds->ds_objset);
                        ds->ds_objset = NULL;
                }
+
                /* move snap name entry */
-               VERIFY(0 == dsl_dataset_get_snapname(ds));
-               VERIFY(0 == dsl_dataset_snap_remove(origin_head,
+               VERIFY0(dsl_dataset_get_snapname(ds));
+               VERIFY0(dsl_dataset_snap_remove(origin_head,
                    ds->ds_snapname, tx));
-               VERIFY(0 == zap_add(dp->dp_meta_objset,
+               VERIFY0(zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));
 
@@ -2868,8 +2026,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                ds->ds_phys->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
-               dsl_dir_close(ds->ds_dir, ds);
-               VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+               dsl_dir_rele(ds->ds_dir, ds);
+               VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));
 
                /* move any clone references */
@@ -2881,1278 +2039,687 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                        for (zap_cursor_init(&zc, dp->dp_meta_objset,
                            ds->ds_phys->ds_next_clones_obj);
                            zap_cursor_retrieve(&zc, &za) == 0;
-                           zap_cursor_advance(&zc)) {
-                               dsl_dataset_t *cnds;
-                               uint64_t o;
-
-                               if (za.za_first_integer == oldnext_obj) {
-                                       /*
-                                        * We've already moved the
-                                        * origin's reference.
-                                        */
-                                       continue;
-                               }
-
-                               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
-                                   za.za_first_integer, FTAG, &cnds));
-                               o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
-
-                               VERIFY3U(zap_remove_int(dp->dp_meta_objset,
-                                   odd->dd_phys->dd_clones, o, tx), ==, 0);
-                               VERIFY3U(zap_add_int(dp->dp_meta_objset,
-                                   dd->dd_phys->dd_clones, o, tx), ==, 0);
-                               dsl_dataset_rele(cnds, FTAG);
-                       }
-                       zap_cursor_fini(&zc);
-               }
-
-               ASSERT0(dsl_prop_numcb(ds));
-       }
-
-       /*
-        * Change space accounting.
-        * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
-        * both be valid, or both be 0 (resulting in delta == 0).  This
-        * is true for each of {clone,origin} independently.
-        */
-
-       delta = pa->cloneusedsnap -
-           dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
-       ASSERT3S(delta, >=, 0);
-       ASSERT3U(pa->used, >=, delta);
-       dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
-       dsl_dir_diduse_space(dd, DD_USED_HEAD,
-           pa->used - delta, pa->comp, pa->uncomp, tx);
-
-       delta = pa->originusedsnap -
-           odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
-       ASSERT3S(delta, <=, 0);
-       ASSERT3U(pa->used, >=, -delta);
-       dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
-       dsl_dir_diduse_space(odd, DD_USED_HEAD,
-           -pa->used - delta, -pa->comp, -pa->uncomp, tx);
-
-       origin_ds->ds_phys->ds_unique_bytes = pa->unique;
-
-       /* log history record */
-       spa_history_log_internal_ds(hds, "promote", tx, "");
-
-       dsl_dir_close(odd, FTAG);
-}
-
-static char *snaplist_tag = "snaplist";
-/*
- * Make a list of dsl_dataset_t's for the snapshots between first_obj
- * (exclusive) and last_obj (inclusive).  The list will be in reverse
- * order (last_obj will be the list_head()).  If first_obj == 0, do all
- * snapshots back to this dataset's origin.
- */
-static int
-snaplist_make(dsl_pool_t *dp, boolean_t own,
-    uint64_t first_obj, uint64_t last_obj, list_t *l)
-{
-       uint64_t obj = last_obj;
-
-       ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
-
-       list_create(l, sizeof (struct promotenode),
-           offsetof(struct promotenode, link));
-
-       while (obj != first_obj) {
-               dsl_dataset_t *ds;
-               struct promotenode *snap;
-               int err;
-
-               if (own) {
-                       err = dsl_dataset_own_obj(dp, obj,
-                           0, snaplist_tag, &ds);
-                       if (err == 0)
-                               dsl_dataset_make_exclusive(ds, snaplist_tag);
-               } else {
-                       err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
-               }
-               if (err == ENOENT) {
-                       /* lost race with snapshot destroy */
-                       struct promotenode *last = list_tail(l);
-                       ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
-                       obj = last->ds->ds_phys->ds_prev_snap_obj;
-                       continue;
-               } else if (err) {
-                       return (err);
-               }
-
-               if (first_obj == 0)
-                       first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
-
-               snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
-               snap->ds = ds;
-               list_insert_tail(l, snap);
-               obj = ds->ds_phys->ds_prev_snap_obj;
-       }
-
-       return (0);
-}
-
-static int
-snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
-{
-       struct promotenode *snap;
-
-       *spacep = 0;
-       for (snap = list_head(l); snap; snap = list_next(l, snap)) {
-               uint64_t used, comp, uncomp;
-               dsl_deadlist_space_range(&snap->ds->ds_deadlist,
-                   mintxg, UINT64_MAX, &used, &comp, &uncomp);
-               *spacep += used;
-       }
-       return (0);
-}
-
-static void
-snaplist_destroy(list_t *l, boolean_t own)
-{
-       struct promotenode *snap;
-
-       if (!l || !list_link_active(&l->list_head))
-               return;
-
-       while ((snap = list_tail(l)) != NULL) {
-               list_remove(l, snap);
-               if (own)
-                       dsl_dataset_disown(snap->ds, snaplist_tag);
-               else
-                       dsl_dataset_rele(snap->ds, snaplist_tag);
-               kmem_free(snap, sizeof (struct promotenode));
-       }
-       list_destroy(l);
-}
-
-/*
- * Promote a clone.  Nomenclature note:
- * "clone" or "cds": the original clone which is being promoted
- * "origin" or "ods": the snapshot which is originally clone's origin
- * "origin head" or "ohds": the dataset which is the head
- * (filesystem/volume) for the origin
- * "origin origin": the origin of the origin's filesystem (typically
- * NULL, indicating that the clone is not a clone of a clone).
- */
-int
-dsl_dataset_promote(const char *name, char *conflsnap)
-{
-       dsl_dataset_t *ds;
-       dsl_dir_t *dd;
-       dsl_pool_t *dp;
-       dmu_object_info_t doi;
-       struct promotearg pa;
-       struct promotenode *snap;
-       int err;
-
-       bzero(&pa, sizeof(struct promotearg));
-       err = dsl_dataset_hold(name, FTAG, &ds);
-       if (err)
-               return (err);
-       dd = ds->ds_dir;
-       dp = dd->dd_pool;
-
-       err = dmu_object_info(dp->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, &doi);
-       if (err) {
-               dsl_dataset_rele(ds, FTAG);
-               return (err);
-       }
-
-       if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
-               dsl_dataset_rele(ds, FTAG);
-               return (EINVAL);
-       }
-
-       /*
-        * We are going to inherit all the snapshots taken before our
-        * origin (i.e., our new origin will be our parent's origin).
-        * Take ownership of them so that we can rename them into our
-        * namespace.
-        */
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-       err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
-           &pa.shared_snaps);
-       if (err != 0)
-               goto out;
-
-       err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
-       if (err != 0)
-               goto out;
-
-       snap = list_head(&pa.shared_snaps);
-       ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-       err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
-           snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
-       if (err != 0)
-               goto out;
-
-       if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
-               err = dsl_dataset_hold_obj(dp,
-                   snap->ds->ds_dir->dd_phys->dd_origin_obj,
-                   FTAG, &pa.origin_origin);
-               if (err != 0)
-                       goto out;
-       }
-
-out:
-       rw_exit(&dp->dp_config_rwlock);
-
-       /*
-        * Add in 128x the snapnames zapobj size, since we will be moving
-        * a bunch of snapnames to the promoted ds, and dirtying their
-        * bonus buffers.
-        */
-       if (err == 0) {
-               err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
-                   dsl_dataset_promote_sync, ds, &pa,
-                   2 + 2 * doi.doi_physical_blocks_512);
-               if (err && pa.err_ds && conflsnap)
-                       (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
-       }
-
-       snaplist_destroy(&pa.shared_snaps, B_TRUE);
-       snaplist_destroy(&pa.clone_snaps, B_FALSE);
-       snaplist_destroy(&pa.origin_snaps, B_FALSE);
-       if (pa.origin_origin)
-               dsl_dataset_rele(pa.origin_origin, FTAG);
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
-}
-
-struct cloneswaparg {
-       dsl_dataset_t *cds; /* clone dataset */
-       dsl_dataset_t *ohds; /* origin's head dataset */
-       boolean_t force;
-       int64_t unused_refres_delta; /* change in unconsumed refreservation */
-};
-
-/* ARGSUSED */
-static int
-dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       struct cloneswaparg *csa = arg1;
-
-       /* they should both be heads */
-       if (dsl_dataset_is_snapshot(csa->cds) ||
-           dsl_dataset_is_snapshot(csa->ohds))
-               return (EINVAL);
-
-       /* the branch point should be just before them */
-       if (csa->cds->ds_prev != csa->ohds->ds_prev)
-               return (EINVAL);
-
-       /* cds should be the clone (unless they are unrelated) */
-       if (csa->cds->ds_prev != NULL &&
-           csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
-           csa->ohds->ds_object !=
-           csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
-               return (EINVAL);
-
-       /* the clone should be a child of the origin */
-       if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
-               return (EINVAL);
-
-       /* ohds shouldn't be modified unless 'force' */
-       if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
-               return (ETXTBSY);
-
-       /* adjust amount of any unconsumed refreservation */
-       csa->unused_refres_delta =
-           (int64_t)MIN(csa->ohds->ds_reserved,
-           csa->ohds->ds_phys->ds_unique_bytes) -
-           (int64_t)MIN(csa->ohds->ds_reserved,
-           csa->cds->ds_phys->ds_unique_bytes);
-
-       if (csa->unused_refres_delta > 0 &&
-           csa->unused_refres_delta >
-           dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
-               return (ENOSPC);
-
-       if (csa->ohds->ds_quota != 0 &&
-           csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
-               return (EDQUOT);
-
-       return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       struct cloneswaparg *csa = arg1;
-       dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
-
-       ASSERT(csa->cds->ds_reserved == 0);
-       ASSERT(csa->ohds->ds_quota == 0 ||
-           csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
-
-       dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
-       dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
-
-       if (csa->cds->ds_objset != NULL) {
-               dmu_objset_evict(csa->cds->ds_objset);
-               csa->cds->ds_objset = NULL;
-       }
-
-       if (csa->ohds->ds_objset != NULL) {
-               dmu_objset_evict(csa->ohds->ds_objset);
-               csa->ohds->ds_objset = NULL;
-       }
-
-       /*
-        * Reset origin's unique bytes, if it exists.
-        */
-       if (csa->cds->ds_prev) {
-               dsl_dataset_t *origin = csa->cds->ds_prev;
-               uint64_t comp, uncomp;
-
-               dmu_buf_will_dirty(origin->ds_dbuf, tx);
-               dsl_deadlist_space_range(&csa->cds->ds_deadlist,
-                   origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-                   &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
-       }
-
-       /* swap blkptrs */
-       {
-               blkptr_t tmp;
-               tmp = csa->ohds->ds_phys->ds_bp;
-               csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
-               csa->cds->ds_phys->ds_bp = tmp;
-       }
-
-       /* set dd_*_bytes */
-       {
-               int64_t dused, dcomp, duncomp;
-               uint64_t cdl_used, cdl_comp, cdl_uncomp;
-               uint64_t odl_used, odl_comp, odl_uncomp;
-
-               ASSERT3U(csa->cds->ds_dir->dd_phys->
-                   dd_used_breakdown[DD_USED_SNAP], ==, 0);
-
-               dsl_deadlist_space(&csa->cds->ds_deadlist,
-                   &cdl_used, &cdl_comp, &cdl_uncomp);
-               dsl_deadlist_space(&csa->ohds->ds_deadlist,
-                   &odl_used, &odl_comp, &odl_uncomp);
-
-               dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
-                   (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
-               dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
-                   (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
-               duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
-                   cdl_uncomp -
-                   (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
-
-               dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
-                   dused, dcomp, duncomp, tx);
-               dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
-                   -dused, -dcomp, -duncomp, tx);
-
-               /*
-                * The difference in the space used by snapshots is the
-                * difference in snapshot space due to the head's
-                * deadlist (since that's the only thing that's
-                * changing that affects the snapused).
-                */
-               dsl_deadlist_space_range(&csa->cds->ds_deadlist,
-                   csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
-                   &cdl_used, &cdl_comp, &cdl_uncomp);
-               dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
-                   csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
-                   &odl_used, &odl_comp, &odl_uncomp);
-               dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
-                   DD_USED_HEAD, DD_USED_SNAP, tx);
-       }
-
-       /* swap ds_*_bytes */
-       SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
-           csa->cds->ds_phys->ds_referenced_bytes);
-       SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
-           csa->cds->ds_phys->ds_compressed_bytes);
-       SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
-           csa->cds->ds_phys->ds_uncompressed_bytes);
-       SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
-           csa->cds->ds_phys->ds_unique_bytes);
-
-       /* apply any parent delta for change in unconsumed refreservation */
-       dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
-           csa->unused_refres_delta, 0, 0, tx);
-
-       /*
-        * Swap deadlists.
-        */
-       dsl_deadlist_close(&csa->cds->ds_deadlist);
-       dsl_deadlist_close(&csa->ohds->ds_deadlist);
-       SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
-           csa->cds->ds_phys->ds_deadlist_obj);
-       dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
-           csa->cds->ds_phys->ds_deadlist_obj);
-       dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
-           csa->ohds->ds_phys->ds_deadlist_obj);
-
-       dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
-
-       spa_history_log_internal_ds(csa->cds, "clone swap", tx,
-           "parent=%s", csa->ohds->ds_dir->dd_myname);
-}
-
-/*
- * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
- * recv" into an existing fs to swizzle the file system to the new
- * version, and by "zfs rollback".  Can also be used to swap two
- * independent head datasets if neither has any snapshots.
- */
-int
-dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
-    boolean_t force)
-{
-       struct cloneswaparg csa;
-       int error;
-
-       ASSERT(clone->ds_owner);
-       ASSERT(origin_head->ds_owner);
-retry:
-       /*
-        * Need exclusive access for the swap. If we're swapping these
-        * datasets back after an error, we already hold the locks.
-        */
-       if (!RW_WRITE_HELD(&clone->ds_rwlock))
-               rw_enter(&clone->ds_rwlock, RW_WRITER);
-       if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
-           !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
-               rw_exit(&clone->ds_rwlock);
-               rw_enter(&origin_head->ds_rwlock, RW_WRITER);
-               if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
-                       rw_exit(&origin_head->ds_rwlock);
-                       goto retry;
-               }
-       }
-       csa.cds = clone;
-       csa.ohds = origin_head;
-       csa.force = force;
-       error = dsl_sync_task_do(clone->ds_dir->dd_pool,
-           dsl_dataset_clone_swap_check,
-           dsl_dataset_clone_swap_sync, &csa, NULL, 9);
-       return (error);
-}
-
-/*
- * Given a pool name and a dataset object number in that pool,
- * return the name of that dataset.
- */
-int
-dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
-{
-       spa_t *spa;
-       dsl_pool_t *dp;
-       dsl_dataset_t *ds;
-       int error;
-
-       if ((error = spa_open(pname, &spa, FTAG)) != 0)
-               return (error);
-       dp = spa_get_dsl(spa);
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
-               dsl_dataset_name(ds, buf);
-               dsl_dataset_rele(ds, FTAG);
-       }
-       rw_exit(&dp->dp_config_rwlock);
-       spa_close(spa, FTAG);
-
-       return (error);
-}
+                           zap_cursor_advance(&zc)) {
+                               dsl_dataset_t *cnds;
+                               uint64_t o;
 
-int
-dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
-    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
-{
-       int error = 0;
+                               if (za.za_first_integer == oldnext_obj) {
+                                       /*
+                                        * We've already moved the
+                                        * origin's reference.
+                                        */
+                                       continue;
+                               }
 
-       ASSERT3S(asize, >, 0);
+                               VERIFY0(dsl_dataset_hold_obj(dp,
+                                   za.za_first_integer, FTAG, &cnds));
+                               o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
 
-       /*
-        * *ref_rsrv is the portion of asize that will come from any
-        * unconsumed refreservation space.
-        */
-       *ref_rsrv = 0;
+                               VERIFY0(zap_remove_int(dp->dp_meta_objset,
+                                   odd->dd_phys->dd_clones, o, tx));
+                               VERIFY0(zap_add_int(dp->dp_meta_objset,
+                                   dd->dd_phys->dd_clones, o, tx));
+                               dsl_dataset_rele(cnds, FTAG);
+                       }
+                       zap_cursor_fini(&zc);
+               }
 
-       mutex_enter(&ds->ds_lock);
-       /*
-        * Make a space adjustment for reserved bytes.
-        */
-       if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
-               ASSERT3U(*used, >=,
-                   ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-               *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-               *ref_rsrv =
-                   asize - MIN(asize, parent_delta(ds, asize + inflight));
+               ASSERT(!dsl_prop_hascb(ds));
        }
 
-       if (!check_quota || ds->ds_quota == 0) {
-               mutex_exit(&ds->ds_lock);
-               return (0);
-       }
        /*
-        * If they are requesting more space, and our current estimate
-        * is over quota, they get to try again unless the actual
-        * on-disk is over quota and there are no pending changes (which
-        * may free up space for us).
+        * Change space accounting.
+        * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
+        * both be valid, or both be 0 (resulting in delta == 0).  This
+        * is true for each of {clone,origin} independently.
         */
-       if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
-               if (inflight > 0 ||
-                   ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
-                       error = ERESTART;
-               else
-                       error = EDQUOT;
 
-               DMU_TX_STAT_BUMP(dmu_tx_quota);
-       }
-       mutex_exit(&ds->ds_lock);
+       delta = ddpa->cloneusedsnap -
+           dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+       ASSERT3S(delta, >=, 0);
+       ASSERT3U(ddpa->used, >=, delta);
+       dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+       dsl_dir_diduse_space(dd, DD_USED_HEAD,
+           ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
-       return (error);
+       delta = ddpa->originusedsnap -
+           odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+       ASSERT3S(delta, <=, 0);
+       ASSERT3U(ddpa->used, >=, -delta);
+       dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+       dsl_dir_diduse_space(odd, DD_USED_HEAD,
+           -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+       origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
+
+       /* log history record */
+       spa_history_log_internal_ds(hds, "promote", tx, "");
+
+       dsl_dir_rele(odd, FTAG);
+       promote_rele(ddpa, FTAG);
 }
 
-/* ARGSUSED */
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive).  The list will be in reverse
+ * order (last_obj will be the list_head()).  If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
 static int
-dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+snaplist_make(dsl_pool_t *dp,
+    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_prop_setarg_t *psa = arg2;
-       int err;
+       uint64_t obj = last_obj;
 
-       if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
-               return (ENOTSUP);
+       list_create(l, sizeof (struct promotenode),
+           offsetof(struct promotenode, link));
 
-       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-               return (err);
+       while (obj != first_obj) {
+               dsl_dataset_t *ds;
+               struct promotenode *snap;
+               int err;
 
-       if (psa->psa_effective_value == 0)
-               return (0);
+               err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+               ASSERT(err != ENOENT);
+               if (err != 0)
+                       return (err);
 
-       if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
-           psa->psa_effective_value < ds->ds_reserved)
-               return (ENOSPC);
+               if (first_obj == 0)
+                       first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+
+               snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
+               snap->ds = ds;
+               list_insert_tail(l, snap);
+               obj = ds->ds_phys->ds_prev_snap_obj;
+       }
 
        return (0);
 }
 
-extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
-
-void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t effective_value = psa->psa_effective_value;
-
-       dsl_prop_set_sync(ds, psa, tx);
-       DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
-
-       if (ds->ds_quota != effective_value) {
-               dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_quota = effective_value;
+       struct promotenode *snap;
 
-               spa_history_log_internal_ds(ds, "set refquota", tx,
-                   "refquota=%lld", (longlong_t)ds->ds_quota);
+       *spacep = 0;
+       for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+               uint64_t used, comp, uncomp;
+               dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+                   mintxg, UINT64_MAX, &used, &comp, &uncomp);
+               *spacep += used;
        }
+       return (0);
 }
 
-int
-dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
+static void
+snaplist_destroy(list_t *l, void *tag)
 {
-       dsl_dataset_t *ds;
-       dsl_prop_setarg_t psa;
-       int err;
-
-       dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
-
-       err = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (err)
-               return (err);
-
-       /*
-        * If someone removes a file, then tries to set the quota, we
-        * want to make sure the file freeing takes effect.
-        */
-       txg_wait_open(ds->ds_dir->dd_pool, 0);
+       struct promotenode *snap;
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
-           ds, &psa, 0);
+       if (l == NULL || !list_link_active(&l->list_head))
+               return;
 
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
+       while ((snap = list_tail(l)) != NULL) {
+               list_remove(l, snap);
+               dsl_dataset_rele(snap->ds, tag);
+               kmem_free(snap, sizeof (*snap));
+       }
+       list_destroy(l);
 }
 
 static int
-dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t effective_value;
-       uint64_t unique;
-       int err;
+       int error;
+       dsl_dir_t *dd;
+       struct promotenode *snap;
 
-       if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-           SPA_VERSION_REFRESERVATION)
-               return (ENOTSUP);
+       error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+           &ddpa->ddpa_clone);
+       if (error != 0)
+               return (error);
+       dd = ddpa->ddpa_clone->ds_dir;
 
-       if (dsl_dataset_is_snapshot(ds))
+       if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
+           !dsl_dir_is_clone(dd)) {
+               dsl_dataset_rele(ddpa->ddpa_clone, tag);
                return (EINVAL);
+       }
 
-       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-               return (err);
-
-       effective_value = psa->psa_effective_value;
-
-       /*
-        * If we are doing the preliminary check in open context, the
-        * space estimates may be inaccurate.
-        */
-       if (!dmu_tx_is_syncing(tx))
-               return (0);
+       error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+           &ddpa->shared_snaps, tag);
+       if (error != 0)
+               goto out;
 
-       mutex_enter(&ds->ds_lock);
-       if (!DS_UNIQUE_IS_ACCURATE(ds))
-               dsl_dataset_recalc_head_uniq(ds);
-       unique = ds->ds_phys->ds_unique_bytes;
-       mutex_exit(&ds->ds_lock);
+       error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+           &ddpa->clone_snaps, tag);
+       if (error != 0)
+               goto out;
 
-       if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
-               uint64_t delta = MAX(unique, effective_value) -
-                   MAX(unique, ds->ds_reserved);
+       snap = list_head(&ddpa->shared_snaps);
+       ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
+       error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
+           snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+           &ddpa->origin_snaps, tag);
+       if (error != 0)
+               goto out;
 
-               if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
-                       return (ENOSPC);
-               if (ds->ds_quota > 0 &&
-                   effective_value > ds->ds_quota)
-                       return (ENOSPC);
+       if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+               error = dsl_dataset_hold_obj(dp,
+                   snap->ds->ds_dir->dd_phys->dd_origin_obj,
+                   tag, &ddpa->origin_origin);
+               if (error != 0)
+                       goto out;
        }
-
-       return (0);
+out:
+       if (error != 0)
+               promote_rele(ddpa, tag);
+       return (error);
 }
 
 static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t effective_value = psa->psa_effective_value;
-       uint64_t unique;
-       int64_t delta;
-
-       dsl_prop_set_sync(ds, psa, tx);
-       DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
+       snaplist_destroy(&ddpa->shared_snaps, tag);
+       snaplist_destroy(&ddpa->clone_snaps, tag);
+       snaplist_destroy(&ddpa->origin_snaps, tag);
+       if (ddpa->origin_origin != NULL)
+               dsl_dataset_rele(ddpa->origin_origin, tag);
+       dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
 
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+/*
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name.  (It must be at least MAXNAMELEN bytes long.)
+ */
+int
+dsl_dataset_promote(const char *name, char *conflsnap)
+{
+       dsl_dataset_promote_arg_t ddpa = { 0 };
+       uint64_t numsnaps;
+       int error;
+       objset_t *os;
 
-       mutex_enter(&ds->ds_dir->dd_lock);
-       mutex_enter(&ds->ds_lock);
-       ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-       unique = ds->ds_phys->ds_unique_bytes;
-       delta = MAX(0, (int64_t)(effective_value - unique)) -
-           MAX(0, (int64_t)(ds->ds_reserved - unique));
-       ds->ds_reserved = effective_value;
-       mutex_exit(&ds->ds_lock);
+       /*
+        * We will modify space proportional to the number of
+        * snapshots.  Compute numsnaps.
+        */
+       error = dmu_objset_hold(name, FTAG, &os);
+       if (error != 0)
+               return (error);
+       error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+           dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+       dmu_objset_rele(os, FTAG);
+       if (error != 0)
+               return (error);
 
-       dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
-       mutex_exit(&ds->ds_dir->dd_lock);
+       ddpa.ddpa_clonename = name;
+       ddpa.err_ds = conflsnap;
 
-       spa_history_log_internal_ds(ds, "set refreservation", tx,
-           "refreservation=%lld", (longlong_t)effective_value);
+       return (dsl_sync_task(name, dsl_dataset_promote_check,
+           dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
 }
 
 int
-dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
-    uint64_t reservation)
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force)
 {
-       dsl_dataset_t *ds;
-       dsl_prop_setarg_t psa;
-       int err;
+       int64_t unused_refres_delta;
 
-       dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
-           &reservation);
+       /* they should both be heads */
+       if (dsl_dataset_is_snapshot(clone) ||
+           dsl_dataset_is_snapshot(origin_head))
+               return (EINVAL);
 
-       err = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (err)
-               return (err);
+       /* the branch point should be just before them */
+       if (clone->ds_prev != origin_head->ds_prev)
+               return (EINVAL);
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           dsl_dataset_set_reservation_check,
-           dsl_dataset_set_reservation_sync, ds, &psa, 0);
+       /* clone should be the clone (unless they are unrelated) */
+       if (clone->ds_prev != NULL &&
+           clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+           origin_head->ds_object !=
+           clone->ds_prev->ds_phys->ds_next_snap_obj)
+               return (EINVAL);
 
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
-}
+       /* the clone should be a child of the origin */
+       if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+               return (EINVAL);
 
-typedef struct zfs_hold_cleanup_arg {
-       dsl_pool_t *dp;
-       uint64_t dsobj;
-       char htag[MAXNAMELEN];
-} zfs_hold_cleanup_arg_t;
+       /* origin_head shouldn't be modified unless 'force' */
+       if (!force && dsl_dataset_modified_since_lastsnap(origin_head))
+               return (ETXTBSY);
 
-static void
-dsl_dataset_user_release_onexit(void *arg)
-{
-       zfs_hold_cleanup_arg_t *ca = arg;
+       /* origin_head should have no long holds (e.g. is not mounted) */
+       if (dsl_dataset_long_held(origin_head))
+               return (EBUSY);
+
+       /* check amount of any unconsumed refreservation */
+       unused_refres_delta =
+           (int64_t)MIN(origin_head->ds_reserved,
+           origin_head->ds_phys->ds_unique_bytes) -
+           (int64_t)MIN(origin_head->ds_reserved,
+           clone->ds_phys->ds_unique_bytes);
+
+       if (unused_refres_delta > 0 &&
+           unused_refres_delta >
+           dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+               return (ENOSPC);
+
+       /* clone can't be over the head's refquota */
+       if (origin_head->ds_quota != 0 &&
+           clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
+               return (EDQUOT);
 
-       (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
-           B_TRUE);
-       kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+       return (0);
 }
 
 void
-dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
-    minor_t minor)
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
-       zfs_hold_cleanup_arg_t *ca;
-
-       ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
-       ca->dp = ds->ds_dir->dd_pool;
-       ca->dsobj = ds->ds_object;
-       (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
-       VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
-           dsl_dataset_user_release_onexit, ca, NULL));
-}
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       int64_t unused_refres_delta;
 
-/*
- * If you add new checks here, you may need to add
- * additional checks to the "temporary" case in
- * snapshot_check() in dmu_objset.c.
- */
-static int
-dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       struct dsl_ds_holdarg *ha = arg2;
-       const char *htag = ha->htag;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       int error = 0;
+       ASSERT(clone->ds_reserved == 0);
+       ASSERT(origin_head->ds_quota == 0 ||
+           clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
 
-       if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-               return (ENOTSUP);
+       dmu_buf_will_dirty(clone->ds_dbuf, tx);
+       dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
-       if (!dsl_dataset_is_snapshot(ds))
-               return (EINVAL);
+       if (clone->ds_objset != NULL) {
+               dmu_objset_evict(clone->ds_objset);
+               clone->ds_objset = NULL;
+       }
 
-       /* tags must be unique */
-       mutex_enter(&ds->ds_lock);
-       if (ds->ds_phys->ds_userrefs_obj) {
-               error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
-                   8, 1, tx);
-               if (error == 0)
-                       error = EEXIST;
-               else if (error == ENOENT)
-                       error = 0;
+       if (origin_head->ds_objset != NULL) {
+               dmu_objset_evict(origin_head->ds_objset);
+               origin_head->ds_objset = NULL;
        }
-       mutex_exit(&ds->ds_lock);
 
-       if (error == 0 && ha->temphold &&
-           strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
-               error = E2BIG;
+       unused_refres_delta =
+           (int64_t)MIN(origin_head->ds_reserved,
+           origin_head->ds_phys->ds_unique_bytes) -
+           (int64_t)MIN(origin_head->ds_reserved,
+           clone->ds_phys->ds_unique_bytes);
+
+       /*
+        * Reset origin's unique bytes, if it exists.
+        */
+       if (clone->ds_prev) {
+               dsl_dataset_t *origin = clone->ds_prev;
+               uint64_t comp, uncomp;
+
+               dmu_buf_will_dirty(origin->ds_dbuf, tx);
+               dsl_deadlist_space_range(&clone->ds_deadlist,
+                   origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                   &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+       }
+
+       /* swap blkptrs */
+       {
+               blkptr_t tmp;
+               tmp = origin_head->ds_phys->ds_bp;
+               origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
+               clone->ds_phys->ds_bp = tmp;
+       }
+
+       /* set dd_*_bytes */
+       {
+               int64_t dused, dcomp, duncomp;
+               uint64_t cdl_used, cdl_comp, cdl_uncomp;
+               uint64_t odl_used, odl_comp, odl_uncomp;
+
+               ASSERT3U(clone->ds_dir->dd_phys->
+                   dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+               dsl_deadlist_space(&clone->ds_deadlist,
+                   &cdl_used, &cdl_comp, &cdl_uncomp);
+               dsl_deadlist_space(&origin_head->ds_deadlist,
+                   &odl_used, &odl_comp, &odl_uncomp);
 
-       return (error);
-}
+               dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
+                   (origin_head->ds_phys->ds_referenced_bytes + odl_used);
+               dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
+                   (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
+               duncomp = clone->ds_phys->ds_uncompressed_bytes +
+                   cdl_uncomp -
+                   (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
 
-void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       struct dsl_ds_holdarg *ha = arg2;
-       const char *htag = ha->htag;
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-       uint64_t now = gethrestime_sec();
-       uint64_t zapobj;
+               dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+                   dused, dcomp, duncomp, tx);
+               dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+                   -dused, -dcomp, -duncomp, tx);
 
-       mutex_enter(&ds->ds_lock);
-       if (ds->ds_phys->ds_userrefs_obj == 0) {
                /*
-                * This is the first user hold for this dataset.  Create
-                * the userrefs zap object.
+                * The difference in the space used by snapshots is the
+                * difference in snapshot space due to the head's
+                * deadlist (since that's the only thing that's
+                * changing that affects the snapused).
                 */
-               dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               zapobj = ds->ds_phys->ds_userrefs_obj =
-                   zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
-       } else {
-               zapobj = ds->ds_phys->ds_userrefs_obj;
+               dsl_deadlist_space_range(&clone->ds_deadlist,
+                   origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+                   &cdl_used, &cdl_comp, &cdl_uncomp);
+               dsl_deadlist_space_range(&origin_head->ds_deadlist,
+                   origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+                   &odl_used, &odl_comp, &odl_uncomp);
+               dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+                   DD_USED_HEAD, DD_USED_SNAP, tx);
        }
-       ds->ds_userrefs++;
-       mutex_exit(&ds->ds_lock);
 
-       VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+       /* swap ds_*_bytes */
+       SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
+           clone->ds_phys->ds_referenced_bytes);
+       SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
+           clone->ds_phys->ds_compressed_bytes);
+       SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
+           clone->ds_phys->ds_uncompressed_bytes);
+       SWITCH64(origin_head->ds_phys->ds_unique_bytes,
+           clone->ds_phys->ds_unique_bytes);
 
-       if (ha->temphold) {
-               VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
-                   htag, &now, tx));
-       }
+       /* apply any parent delta for change in unconsumed refreservation */
+       dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+           unused_refres_delta, 0, 0, tx);
 
-       spa_history_log_internal_ds(ds, "hold", tx,
-           "tag = %s temp = %d holds now = %llu",
-           htag, (int)ha->temphold, ds->ds_userrefs);
-}
+       /*
+        * Swap deadlists.
+        */
+       dsl_deadlist_close(&clone->ds_deadlist);
+       dsl_deadlist_close(&origin_head->ds_deadlist);
+       SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
+           clone->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+           clone->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+           origin_head->ds_phys->ds_deadlist_obj);
 
-static int
-dsl_dataset_user_hold_one(const char *dsname, void *arg)
-{
-       struct dsl_ds_holdarg *ha = arg;
-       dsl_dataset_t *ds;
-       int error;
-       char *name;
+       dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
-       /* alloc a buffer to hold dsname@snapname plus terminating NULL */
-       name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-       error = dsl_dataset_hold(name, ha->dstg, &ds);
-       strfree(name);
-       if (error == 0) {
-               ha->gotone = B_TRUE;
-               dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
-                   dsl_dataset_user_hold_sync, ds, ha, 0);
-       } else if (error == ENOENT && ha->recursive) {
-               error = 0;
-       } else {
-               (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-       }
-       return (error);
+       spa_history_log_internal_ds(clone, "clone swap", tx,
+           "parent=%s", origin_head->ds_dir->dd_myname);
 }
 
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
 int
-dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
-    boolean_t temphold)
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
-       struct dsl_ds_holdarg *ha;
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
        int error;
 
-       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
-       ha->htag = htag;
-       ha->temphold = temphold;
-       error = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
-           ds, ha, 0);
-       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+       error = dsl_pool_hold(pname, FTAG, &dp);
+       if (error != 0)
+               return (error);
+
+       error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+       if (error == 0) {
+               dsl_dataset_name(ds, buf);
+               dsl_dataset_rele(ds, FTAG);
+       }
+       dsl_pool_rele(dp, FTAG);
 
        return (error);
 }
 
 int
-dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold, int cleanup_fd)
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
-       struct dsl_ds_holdarg *ha;
-       dsl_sync_task_t *dst;
-       spa_t *spa;
-       int error;
-       minor_t minor = 0;
-
-       if (cleanup_fd != -1) {
-               /* Currently we only support cleanup-on-exit of tempholds. */
-               if (!temphold)
-                       return (EINVAL);
-               error = zfs_onexit_fd_hold(cleanup_fd, &minor);
-               if (error)
-                       return (error);
-       }
+       int error = 0;
 
-       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+       ASSERT3S(asize, >, 0);
 
-       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+       /*
+        * *ref_rsrv is the portion of asize that will come from any
+        * unconsumed refreservation space.
+        */
+       *ref_rsrv = 0;
 
-       error = spa_open(dsname, &spa, FTAG);
-       if (error) {
-               kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-               if (cleanup_fd != -1)
-                       zfs_onexit_fd_rele(cleanup_fd);
-               return (error);
+       mutex_enter(&ds->ds_lock);
+       /*
+        * Make a space adjustment for reserved bytes.
+        */
+       if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+               ASSERT3U(*used, >=,
+                   ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+               *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+               *ref_rsrv =
+                   asize - MIN(asize, parent_delta(ds, asize + inflight));
        }
 
-       ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-       ha->htag = htag;
-       ha->snapname = snapname;
-       ha->recursive = recursive;
-       ha->temphold = temphold;
-
-       if (recursive) {
-               error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
-                   ha, DS_FIND_CHILDREN);
-       } else {
-               error = dsl_dataset_user_hold_one(dsname, ha);
+       if (!check_quota || ds->ds_quota == 0) {
+               mutex_exit(&ds->ds_lock);
+               return (0);
        }
-       if (error == 0)
-               error = dsl_sync_task_group_wait(ha->dstg);
-
-       for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-           dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-               dsl_dataset_t *ds = dst->dst_arg1;
-
-               if (dst->dst_err) {
-                       dsl_dataset_name(ds, ha->failed);
-                       *strchr(ha->failed, '@') = '\0';
-               } else if (error == 0 && minor != 0 && temphold) {
-                       /*
-                        * If this hold is to be released upon process exit,
-                        * register that action now.
-                        */
-                       dsl_register_onexit_hold_cleanup(ds, htag, minor);
-               }
-               dsl_dataset_rele(ds, ha->dstg);
+       /*
+        * If they are requesting more space, and our current estimate
+        * is over quota, they get to try again unless the actual
+        * on-disk is over quota and there are no pending changes (which
+        * may free up space for us).
+        */
+       if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+               if (inflight > 0 ||
+                   ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
+                       error = ERESTART;
+               else
+                       error = EDQUOT;
        }
+       mutex_exit(&ds->ds_lock);
 
-       if (error == 0 && recursive && !ha->gotone)
-               error = ENOENT;
-
-       if (error)
-               (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
-
-       dsl_sync_task_group_destroy(ha->dstg);
-
-       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-       spa_close(spa, FTAG);
-       if (cleanup_fd != -1)
-               zfs_onexit_fd_rele(cleanup_fd);
        return (error);
 }
 
-struct dsl_ds_releasearg {
-       dsl_dataset_t *ds;
-       const char *htag;
-       boolean_t own;          /* do we own or just hold ds? */
-};
+typedef struct dsl_dataset_set_qr_arg {
+       const char *ddsqra_name;
+       zprop_source_t ddsqra_source;
+       uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
 
+
+/* ARGSUSED */
 static int
-dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
-    boolean_t *might_destroy)
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       uint64_t zapobj;
-       uint64_t tmp;
+       dsl_dataset_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
        int error;
+       uint64_t newval;
 
-       *might_destroy = B_FALSE;
+       if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+               return (ENOTSUP);
 
-       mutex_enter(&ds->ds_lock);
-       zapobj = ds->ds_phys->ds_userrefs_obj;
-       if (zapobj == 0) {
-               /* The tag can't possibly exist */
-               mutex_exit(&ds->ds_lock);
-               return (ESRCH);
+       error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       if (dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
        }
 
-       /* Make sure the tag exists */
-       error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
-       if (error) {
-               mutex_exit(&ds->ds_lock);
-               if (error == ENOENT)
-                       error = ESRCH;
+       error = dsl_prop_predict(ds->ds_dir,
+           zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+           ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
                return (error);
        }
 
-       if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
-           DS_IS_DEFER_DESTROY(ds))
-               *might_destroy = B_TRUE;
+       if (newval == 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (0);
+       }
+
+       if (newval < ds->ds_phys->ds_referenced_bytes ||
+           newval < ds->ds_reserved) {
+               dsl_dataset_rele(ds, FTAG);
+               return (ENOSPC);
+       }
 
-       mutex_exit(&ds->ds_lock);
+       dsl_dataset_rele(ds, FTAG);
        return (0);
 }
 
-static int
-dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
-       struct dsl_ds_releasearg *ra = arg1;
-       dsl_dataset_t *ds = ra->ds;
-       boolean_t might_destroy;
-       int error;
+       dsl_dataset_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       uint64_t newval;
 
-       if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-               return (ENOTSUP);
+       VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
-       error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
-       if (error)
-               return (error);
+       dsl_prop_set_sync_impl(ds,
+           zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+           ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+           &ddsqra->ddsqra_value, tx);
 
-       if (might_destroy) {
-               struct dsl_ds_destroyarg dsda = {0};
+       VERIFY0(dsl_prop_get_int_ds(ds,
+           zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
 
-               if (dmu_tx_is_syncing(tx)) {
-                       /*
-                        * If we're not prepared to remove the snapshot,
-                        * we can't allow the release to happen right now.
-                        */
-                       if (!ra->own)
-                               return (EBUSY);
-               }
-               dsda.ds = ds;
-               dsda.releasing = B_TRUE;
-               return (dsl_dataset_destroy_check(&dsda, tag, tx));
+       if (ds->ds_quota != newval) {
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_quota = newval;
        }
-
-       return (0);
+       dsl_dataset_rele(ds, FTAG);
 }
 
-static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+    uint64_t refquota)
 {
-       struct dsl_ds_releasearg *ra = arg1;
-       dsl_dataset_t *ds = ra->ds;
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-       uint64_t zapobj;
-       uint64_t refs;
-       int error;
+       dsl_dataset_set_qr_arg_t ddsqra;
 
-       mutex_enter(&ds->ds_lock);
-       ds->ds_userrefs--;
-       refs = ds->ds_userrefs;
-       mutex_exit(&ds->ds_lock);
-       error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
-       VERIFY(error == 0 || error == ENOENT);
-       zapobj = ds->ds_phys->ds_userrefs_obj;
-       VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
-       if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
-           DS_IS_DEFER_DESTROY(ds)) {
-               struct dsl_ds_destroyarg dsda = {0};
-
-               ASSERT(ra->own);
-               dsda.ds = ds;
-               dsda.releasing = B_TRUE;
-               /* We already did the destroy_check */
-               dsl_dataset_destroy_sync(&dsda, tag, tx);
-       }
+       ddsqra.ddsqra_name = dsname;
+       ddsqra.ddsqra_source = source;
+       ddsqra.ddsqra_value = refquota;
+
+       return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+           dsl_dataset_set_refquota_sync, &ddsqra, 0));
 }
 
 static int
-dsl_dataset_user_release_one(const char *dsname, void *arg)
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
-       struct dsl_ds_holdarg *ha = arg;
-       struct dsl_ds_releasearg *ra;
+       dsl_dataset_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
        dsl_dataset_t *ds;
        int error;
-       void *dtag = ha->dstg;
-       char *name;
-       boolean_t own = B_FALSE;
-       boolean_t might_destroy;
-
-       /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
-       name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-       error = dsl_dataset_hold(name, dtag, &ds);
-       strfree(name);
-       if (error == ENOENT && ha->recursive)
-               return (0);
-       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-       if (error)
-               return (error);
-
-       ha->gotone = B_TRUE;
+       uint64_t newval, unique;
 
-       ASSERT(dsl_dataset_is_snapshot(ds));
+       if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+               return (ENOTSUP);
 
-       error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
-       if (error) {
-               dsl_dataset_rele(ds, dtag);
+       error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+       if (error != 0)
                return (error);
-       }
 
-       if (might_destroy) {
-#ifdef _KERNEL
-               name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-               error = zfs_unmount_snap(name, NULL);
-               strfree(name);
-               if (error) {
-                       dsl_dataset_rele(ds, dtag);
-                       return (error);
-               }
-#endif
-               if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
-                       dsl_dataset_rele(ds, dtag);
-                       return (EBUSY);
-               } else {
-                       own = B_TRUE;
-                       dsl_dataset_make_exclusive(ds, dtag);
-               }
+       if (dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
        }
 
-       ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
-       ra->ds = ds;
-       ra->htag = ha->htag;
-       ra->own = own;
-       dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
-           dsl_dataset_user_release_sync, ra, dtag, 0);
-
-       return (0);
-}
-
-int
-dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
-    boolean_t recursive)
-{
-       struct dsl_ds_holdarg *ha;
-       dsl_sync_task_t *dst;
-       spa_t *spa;
-       int error;
-
-top:
-       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
-
-       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-
-       error = spa_open(dsname, &spa, FTAG);
-       if (error) {
-               kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+       error = dsl_prop_predict(ds->ds_dir,
+           zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+           ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
                return (error);
        }
 
-       ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-       ha->htag = htag;
-       ha->snapname = snapname;
-       ha->recursive = recursive;
-       if (recursive) {
-               error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
-                   ha, DS_FIND_CHILDREN);
-       } else {
-               error = dsl_dataset_user_release_one(dsname, ha);
+       /*
+        * If we are doing the preliminary check in open context, the
+        * space estimates may be inaccurate, so skip the space checks.
+        */
+       if (!dmu_tx_is_syncing(tx)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (0);
        }
-       if (error == 0)
-               error = dsl_sync_task_group_wait(ha->dstg);
 
-       for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-           dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-               struct dsl_ds_releasearg *ra = dst->dst_arg1;
-               dsl_dataset_t *ds = ra->ds;
-
-               if (dst->dst_err)
-                       dsl_dataset_name(ds, ha->failed);
+       mutex_enter(&ds->ds_lock);
+       if (!DS_UNIQUE_IS_ACCURATE(ds))
+               dsl_dataset_recalc_head_uniq(ds);
+       unique = ds->ds_phys->ds_unique_bytes;
+       mutex_exit(&ds->ds_lock);
 
-               if (ra->own)
-                       dsl_dataset_disown(ds, ha->dstg);
-               else
-                       dsl_dataset_rele(ds, ha->dstg);
+       if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+               uint64_t delta = MAX(unique, newval) -
+                   MAX(unique, ds->ds_reserved);
 
-               kmem_free(ra, sizeof (struct dsl_ds_releasearg));
+               if (delta >
+                   dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+                   (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (ENOSPC);
+               }
        }
 
-       if (error == 0 && recursive && !ha->gotone)
-               error = ENOENT;
-
-       if (error && error != EBUSY)
-               (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
-
-       dsl_sync_task_group_destroy(ha->dstg);
-       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-       spa_close(spa, FTAG);
-
-       /*
-        * We can get EBUSY if we were racing with deferred destroy and
-        * dsl_dataset_user_release_check() hadn't done the necessary
-        * open context setup.  We can also get EBUSY if we're racing
-        * with destroy and that thread is the ds_owner.  Either way
-        * the busy condition should be transient, and we should retry
-        * the release operation.
-        */
-       if (error == EBUSY)
-               goto top;
-
-       return (error);
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
 }
 
-/*
- * Called at spa_load time (with retry == B_FALSE) to release a stale
- * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
- */
-int
-dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
-    boolean_t retry)
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds;
-       char *snap;
-       char *name;
-       int namelen;
-       int error;
+       uint64_t newval;
+       uint64_t unique;
+       int64_t delta;
 
-       do {
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
-               error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
-               rw_exit(&dp->dp_config_rwlock);
-               if (error)
-                       return (error);
-               namelen = dsl_dataset_namelen(ds)+1;
-               name = kmem_alloc(namelen, KM_SLEEP);
-               dsl_dataset_name(ds, name);
-               dsl_dataset_rele(ds, FTAG);
+       dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+           source, sizeof (value), 1, &value, tx);
 
-               snap = strchr(name, '@');
-               *snap = '\0';
-               ++snap;
-               error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
-               kmem_free(name, namelen);
+       VERIFY0(dsl_prop_get_int_ds(ds,
+           zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
 
-               /*
-                * The object can't have been destroyed because we have a hold,
-                * but it might have been renamed, resulting in ENOENT.  Retry
-                * if we've been requested to do so.
-                *
-                * It would be nice if we could use the dsobj all the way
-                * through and avoid ENOENT entirely.  But we might need to
-                * unmount the snapshot, and there's currently no way to lookup
-                * a vfsp using a ZFS object id.
-                */
-       } while ((error == ENOENT) && retry);
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       mutex_enter(&ds->ds_dir->dd_lock);
+       mutex_enter(&ds->ds_lock);
+       ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+       unique = ds->ds_phys->ds_unique_bytes;
+       delta = MAX(0, (int64_t)(newval - unique)) -
+           MAX(0, (int64_t)(ds->ds_reserved - unique));
+       ds->ds_reserved = newval;
+       mutex_exit(&ds->ds_lock);
 
-       return (error);
+       dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+       mutex_exit(&ds->ds_dir->dd_lock);
 }
 
-int
-dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
 {
+       dsl_dataset_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
        dsl_dataset_t *ds;
-       int err;
-
-       err = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (err)
-               return (err);
 
-       VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
-       if (ds->ds_phys->ds_userrefs_obj != 0) {
-               zap_attribute_t *za;
-               zap_cursor_t zc;
-
-               za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-               for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-                   ds->ds_phys->ds_userrefs_obj);
-                   zap_cursor_retrieve(&zc, za) == 0;
-                   zap_cursor_advance(&zc)) {
-                       VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
-                           za->za_first_integer));
-               }
-               zap_cursor_fini(&zc);
-               kmem_free(za, sizeof (zap_attribute_t));
-       }
+       VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+       dsl_dataset_set_refreservation_sync_impl(ds,
+           ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
        dsl_dataset_rele(ds, FTAG);
-       return (0);
 }
 
-/*
- * Note, this function is used as the callback for dmu_objset_find().  We
- * always return 0 so that we will continue to find and process
- * inconsistent datasets, even if we encounter an error trying to
- * process one of them.
- */
-/* ARGSUSED */
 int
-dsl_destroy_inconsistent(const char *dsname, void *arg)
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+    uint64_t refreservation)
 {
-       dsl_dataset_t *ds;
+       dsl_dataset_set_qr_arg_t ddsqra;
 
-       if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
-               if (DS_IS_INCONSISTENT(ds))
-                       (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
-               else
-                       dsl_dataset_disown(ds, FTAG);
-       }
-       return (0);
-}
+       ddsqra.ddsqra_name = dsname;
+       ddsqra.ddsqra_source = source;
+       ddsqra.ddsqra_value = refreservation;
 
+       return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+           dsl_dataset_set_refreservation_sync, &ddsqra, 0));
+}
 
 /*
  * Return (in *usedp) the amount of space written in new that is not
@@ -4179,6 +2746,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
        uint64_t snapobj;
        dsl_pool_t *dp = new->ds_dir->dd_pool;
 
+       ASSERT(dsl_pool_config_held(dp));
+
        *usedp = 0;
        *usedp += new->ds_phys->ds_referenced_bytes;
        *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
@@ -4191,7 +2760,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
        *uncompp += new->ds_phys->ds_uncompressed_bytes;
        *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
        snapobj = new->ds_object;
        while (snapobj != oldsnap->ds_object) {
                dsl_dataset_t *snap;
@@ -4240,7 +2808,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                }
 
        }
-       rw_exit(&dp->dp_config_rwlock);
        return (err);
 }
 
@@ -4282,7 +2849,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
 
        *usedp = *compp = *uncompp = 0;
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
        snapobj = lastsnap->ds_phys->ds_next_snap_obj;
        while (snapobj != firstsnap->ds_object) {
                dsl_dataset_t *ds;
@@ -4303,12 +2869,47 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
                ASSERT3U(snapobj, !=, 0);
                dsl_dataset_rele(ds, FTAG);
        }
-       rw_exit(&dp->dp_config_rwlock);
        return (err);
 }
 
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
+ * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
+ * filesystem.  Or 'earlier' could be the origin's origin.
+ */
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
+{
+       dsl_pool_t *dp = later->ds_dir->dd_pool;
+       int error;
+       boolean_t ret;
+       dsl_dataset_t *origin;
+
+       ASSERT(dsl_pool_config_held(dp));
+
+       if (earlier->ds_phys->ds_creation_txg >=
+           later->ds_phys->ds_creation_txg)
+               return (B_FALSE);
+
+       if (later->ds_dir == earlier->ds_dir)
+               return (B_TRUE);
+       if (!dsl_dir_is_clone(later->ds_dir))
+               return (B_FALSE);
+
+       if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+               return (B_TRUE);
+       error = dsl_dataset_hold_obj(dp,
+           later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+       if (error != 0)
+               return (B_FALSE);
+       ret = dsl_dataset_is_before(origin, earlier);
+       dsl_dataset_rele(origin, FTAG);
+       return (ret);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
-EXPORT_SYMBOL(dmu_snapshots_destroy_nvl);
 EXPORT_SYMBOL(dsl_dataset_hold);
 EXPORT_SYMBOL(dsl_dataset_hold_obj);
 EXPORT_SYMBOL(dsl_dataset_own);
@@ -4316,22 +2917,14 @@ EXPORT_SYMBOL(dsl_dataset_own_obj);
 EXPORT_SYMBOL(dsl_dataset_name);
 EXPORT_SYMBOL(dsl_dataset_rele);
 EXPORT_SYMBOL(dsl_dataset_disown);
-EXPORT_SYMBOL(dsl_dataset_drop_ref);
 EXPORT_SYMBOL(dsl_dataset_tryown);
-EXPORT_SYMBOL(dsl_dataset_make_exclusive);
 EXPORT_SYMBOL(dsl_dataset_create_sync);
 EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
-EXPORT_SYMBOL(dsl_dataset_destroy);
-EXPORT_SYMBOL(dsl_dataset_destroy_check);
-EXPORT_SYMBOL(dsl_dataset_destroy_sync);
 EXPORT_SYMBOL(dsl_dataset_snapshot_check);
 EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
-EXPORT_SYMBOL(dsl_dataset_rename);
 EXPORT_SYMBOL(dsl_dataset_promote);
-EXPORT_SYMBOL(dsl_dataset_clone_swap);
 EXPORT_SYMBOL(dsl_dataset_user_hold);
 EXPORT_SYMBOL(dsl_dataset_user_release);
-EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
 EXPORT_SYMBOL(dsl_dataset_get_holds);
 EXPORT_SYMBOL(dsl_dataset_get_blkptr);
 EXPORT_SYMBOL(dsl_dataset_set_blkptr);
@@ -4351,8 +2944,6 @@ EXPORT_SYMBOL(dsl_dataset_space);
 EXPORT_SYMBOL(dsl_dataset_fsid_guid);
 EXPORT_SYMBOL(dsl_dsobj_to_dsname);
 EXPORT_SYMBOL(dsl_dataset_check_quota);
-EXPORT_SYMBOL(dsl_dataset_set_quota);
-EXPORT_SYMBOL(dsl_dataset_set_quota_sync);
-EXPORT_SYMBOL(dsl_dataset_set_reservation);
-EXPORT_SYMBOL(dsl_destroy_inconsistent);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);
 #endif
index 48c261e63806d2891ab415308e16916b838cb1f8..d09e79f1c70e1946a37ef2247d50f8c24b640a72 100644 (file)
@@ -147,28 +147,37 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
        return (0);
 }
 
+typedef struct dsl_deleg_arg {
+       const char *dda_name;
+       nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       nvlist_t *nvp = arg2;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
+       dsl_deleg_arg_t *dda = arg;
+       dsl_dir_t *dd;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
        nvpair_t *whopair = NULL;
-       uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+       uint64_t zapobj;
+
+       VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
 
+       zapobj = dd->dd_phys->dd_deleg_zapobj;
        if (zapobj == 0) {
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
                zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
                    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
        }
 
-       while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+       while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
                const char *whokey = nvpair_name(whopair);
                nvlist_t *perms;
                nvpair_t *permpair = NULL;
                uint64_t jumpobj;
 
-               VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+               perms = fnvpair_value_nvlist(whopair);
 
                if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
                        jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
@@ -185,21 +194,27 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                            "%s %s", whokey, perm);
                }
        }
+       dsl_dir_rele(dd, FTAG);
 }
 
 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       nvlist_t *nvp = arg2;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
+       dsl_deleg_arg_t *dda = arg;
+       dsl_dir_t *dd;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
        nvpair_t *whopair = NULL;
-       uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+       uint64_t zapobj;
 
-       if (zapobj == 0)
+       VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+       zapobj = dd->dd_phys->dd_deleg_zapobj;
+       if (zapobj == 0) {
+               dsl_dir_rele(dd, FTAG);
                return;
+       }
 
-       while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+       while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
                const char *whokey = nvpair_name(whopair);
                nvlist_t *perms;
                nvpair_t *permpair = NULL;
@@ -234,35 +249,40 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                            "%s %s", whokey, perm);
                }
        }
+       dsl_dir_rele(dd, FTAG);
 }
 
-int
-dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
 {
+       dsl_deleg_arg_t *dda = arg;
        dsl_dir_t *dd;
        int error;
-       nvpair_t *whopair = NULL;
-       int blocks_modified = 0;
 
-       error = dsl_dir_open(ddname, FTAG, &dd, NULL);
-       if (error)
-               return (error);
-
-       if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) <
+       if (spa_version(dmu_tx_pool(tx)->dp_spa) <
            SPA_VERSION_DELEGATED_PERMS) {
-               dsl_dir_close(dd, FTAG);
                return (ENOTSUP);
        }
 
-       while ((whopair = nvlist_next_nvpair(nvp, whopair)))
-               blocks_modified++;
+       error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+       if (error == 0)
+               dsl_dir_rele(dd, FTAG);
+       return (error);
+}
 
-       error = dsl_sync_task_do(dd->dd_pool, NULL,
-           unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
-           dd, nvp, blocks_modified);
-       dsl_dir_close(dd, FTAG);
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+       dsl_deleg_arg_t dda;
 
-       return (error);
+       /* nvp must already have been verified to be valid */
+
+       dda.dda_name = ddname;
+       dda.dda_nvlist = nvp;
+
+       return (dsl_sync_task(ddname, dsl_deleg_check,
+           unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+           &dda, fnvlist_num_pairs(nvp)));
 }
 
 /*
@@ -293,9 +313,15 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp)
        zap_attribute_t *baseza, *za;
        char *source;
 
-       error = dsl_dir_open(ddname, FTAG, &startdd, NULL);
-       if (error)
+       error = dsl_pool_hold(ddname, FTAG, &dp);
+       if (error != 0)
+               return (error);
+
+       error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
                return (error);
+       }
 
        dp = startdd->dd_pool;
        mos = dp->dp_meta_objset;
@@ -307,20 +333,16 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp)
        source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP);
        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
        for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
                nvlist_t *sp_nvp;
                uint64_t n;
 
-               if (dd->dd_phys->dd_deleg_zapobj &&
-                   (zap_count(mos, dd->dd_phys->dd_deleg_zapobj,
-                   &n) == 0) && n) {
-                       VERIFY(nvlist_alloc(&sp_nvp,
-                           NV_UNIQUE_NAME, KM_SLEEP) == 0);
-               } else {
+               if (dd->dd_phys->dd_deleg_zapobj == 0 ||
+                   zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
+                   n == 0)
                        continue;
-               }
 
+               sp_nvp = fnvlist_alloc();
                for (zap_cursor_init(basezc, mos,
                    dd->dd_phys->dd_deleg_zapobj);
                    zap_cursor_retrieve(basezc, baseza) == 0;
@@ -330,27 +352,23 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp)
                        ASSERT(baseza->za_integer_length == 8);
                        ASSERT(baseza->za_num_integers == 1);
 
-                       VERIFY(nvlist_alloc(&perms_nvp,
-                           NV_UNIQUE_NAME, KM_SLEEP) == 0);
+                       perms_nvp = fnvlist_alloc();
                        for (zap_cursor_init(zc, mos, baseza->za_first_integer);
                            zap_cursor_retrieve(zc, za) == 0;
                            zap_cursor_advance(zc)) {
-                               VERIFY(nvlist_add_boolean(perms_nvp,
-                                   za->za_name) == 0);
+                               fnvlist_add_boolean(perms_nvp, za->za_name);
                        }
                        zap_cursor_fini(zc);
-                       VERIFY(nvlist_add_nvlist(sp_nvp, baseza->za_name,
-                           perms_nvp) == 0);
-                       nvlist_free(perms_nvp);
+                       fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp);
+                       fnvlist_free(perms_nvp);
                }
 
                zap_cursor_fini(basezc);
 
                dsl_dir_name(dd, source);
-               VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0);
+               fnvlist_add_nvlist(*nvp, source, sp_nvp);
                nvlist_free(sp_nvp);
        }
-       rw_exit(&dp->dp_config_rwlock);
 
        kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
        kmem_free(baseza, sizeof(zap_attribute_t));
@@ -358,7 +376,8 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp)
        kmem_free(za, sizeof(zap_attribute_t));
        kmem_free(zc, sizeof(zap_cursor_t));
 
-       dsl_dir_close(startdd, FTAG);
+       dsl_dir_rele(startdd, FTAG);
+       dsl_pool_rele(dp, FTAG);
        return (0);
 }
 
@@ -564,7 +583,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
        avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
            offsetof(perm_set_t, p_node));
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       ASSERT(dsl_pool_config_held(dp));
        for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
            checkflag = ZFS_DELEG_DESCENDENT) {
                uint64_t zapobj;
@@ -625,7 +644,6 @@ again:
        }
        error = EPERM;
 success:
-       rw_exit(&dp->dp_config_rwlock);
 
        cookie = NULL;
        while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -637,15 +655,19 @@ success:
 int
 dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
 {
+       dsl_pool_t *dp;
        dsl_dataset_t *ds;
        int error;
 
-       error = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (error)
+       error = dsl_pool_hold(dsname, FTAG, &dp);
+       if (error != 0)
                return (error);
-
-       error = dsl_deleg_access_impl(ds, perm, cr);
-       dsl_dataset_rele(ds, FTAG);
+       error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+       if (error == 0) {
+               error = dsl_deleg_access_impl(ds, perm, cr);
+               dsl_dataset_rele(ds, FTAG);
+       }
+       dsl_pool_rele(dp, FTAG);
 
        return (error);
 }
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
new file mode 100644 (file)
index 0000000..1fb3859
--- /dev/null
@@ -0,0 +1,940 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+
+typedef struct dmu_snapshots_destroy_arg {     /* argument block shared by the snapshot-destroy check/sync callbacks */
+       nvlist_t *dsda_snaps;   /* in: names of the snapshots to destroy */
+       nvlist_t *dsda_successful_snaps;        /* filled by the check callback: names that passed; consumed by sync */
+       boolean_t dsda_defer;   /* in: request a deferred destroy */
+       nvlist_t *dsda_errlist; /* out: snapshot name -> int32 errno for each failure */
+} dmu_snapshots_destroy_arg_t;
+
+/*
+ * Decide whether the snapshot "ds" may be destroyed.  ds must be owned.
+ *
+ * Returns 0 if the destroy may proceed, otherwise:
+ *   EINVAL  - ds is not a snapshot
+ *   EBUSY   - ds has long holds, or (non-deferred) has user references
+ *   ENOTSUP - deferred destroy requested on a pool older than
+ *             SPA_VERSION_USERREFS
+ *   EEXIST  - (non-deferred) ds is a branch point
+ */
+static int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+       if (!dsl_dataset_is_snapshot(ds))
+               return (EINVAL);
+       if (dsl_dataset_long_held(ds))
+               return (EBUSY);
+
+       if (defer) {
+               /*
+                * Deferred destroy only needs pool support; user holds
+                * and clones do not block it.
+                * NOTE: deferred destroy is only supported on snapshots.
+                */
+               spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+               return (spa_version(spa) < SPA_VERSION_USERREFS ?
+                   ENOTSUP : 0);
+       }
+
+       /* An elevated user reference count pins the snapshot. */
+       if (ds->ds_userrefs > 0)
+               return (EBUSY);
+
+       /* A branch point (a snapshot with clones) can't be deleted. */
+       return (ds->ds_phys->ds_num_children > 1 ? EEXIST : 0);
+}
+
+/*
+ * Sync-task "check" callback for destroying a set of snapshots.
+ *
+ * Outside of syncing context this is a no-op that returns 0.  In syncing
+ * context every name in dsda_snaps is examined: names that pass are added
+ * to dsda_successful_snaps, names that fail are recorded in dsda_errlist
+ * with their errno, and names that do not exist are skipped.  Returns the
+ * first errno found in dsda_errlist, or 0 if the list is empty.
+ */
+static int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+       dmu_snapshots_destroy_arg_t *dsda = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *snap, *first_err;
+
+       if (!dmu_tx_is_syncing(tx))
+               return (0);
+
+       snap = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
+       while (snap != NULL) {
+               const char *snapname = nvpair_name(snap);
+               dsl_dataset_t *ds;
+               int err;
+
+               err = dsl_dataset_hold(dp, snapname, FTAG, &ds);
+
+               /*
+                * A snapshot that does not exist is silently ignored
+                * (it's "already destroyed"): neither list is updated.
+                */
+               if (err != ENOENT) {
+                       if (err == 0) {
+                               err = dsl_destroy_snapshot_check_impl(ds,
+                                   dsda->dsda_defer);
+                               dsl_dataset_rele(ds, FTAG);
+                       }
+
+                       if (err == 0) {
+                               fnvlist_add_boolean(
+                                   dsda->dsda_successful_snaps, snapname);
+                       } else {
+                               fnvlist_add_int32(dsda->dsda_errlist,
+                                   snapname, err);
+                       }
+               }
+
+               snap = nvlist_next_nvpair(dsda->dsda_snaps, snap);
+       }
+
+       /* Any entry in the error list fails the whole task. */
+       first_err = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
+       return (first_err != NULL ? fnvpair_value_int32(first_err) : 0);
+}
+
+struct process_old_arg {       /* state passed to process_old_cb() by process_old_deadlist() */
+       dsl_dataset_t *ds;      /* dataset whose deadlist receives the still-referenced blocks */
+       dsl_dataset_t *ds_prev; /* previous snapshot, or NULL */
+       boolean_t after_branch_point;   /* when set, ds_prev's unique bytes are not adjusted */
+       zio_t *pio;     /* root zio handed to dsl_free_sync() for the frees */
+       uint64_t used, comp, uncomp;    /* running totals of the blocks that were freed */
+};
+
+/*
+ * bpobj_iterate() callback used by process_old_deadlist().
+ *
+ * A block born after poa->ds's previous snapshot is freed and its sizes
+ * are added to the poa totals.  Any other block is moved onto poa->ds's
+ * deadlist and, when it was born after ds_prev's own previous snapshot
+ * and we are not after a branch point, is credited to ds_prev's unique
+ * bytes.  Always returns 0.
+ */
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       struct process_old_arg *poa = arg;
+       dsl_dataset_t *ds = poa->ds;
+       dsl_dataset_t *prev = poa->ds_prev;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+       if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+               /* Born after the previous snapshot: free it. */
+               poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+               poa->comp += BP_GET_PSIZE(bp);
+               poa->uncomp += BP_GET_UCSIZE(bp);
+               dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+               return (0);
+       }
+
+       /* Otherwise the block moves onto our deadlist. */
+       dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+       if (prev != NULL && !poa->after_branch_point &&
+           bp->blk_birth > prev->ds_phys->ds_prev_snap_txg) {
+               prev->ds_phys->ds_unique_bytes +=
+                   bp_get_dsize_sync(dp->dp_spa, bp);
+       }
+       return (0);
+}
+
+/*
+ * Old-format deadlist handling for snapshot destroy: walk ds_next's
+ * deadlist bpobj with process_old_cb(), charge the freed space against
+ * the dir's snapshot usage, then exchange the deadlist objects of ds and
+ * ds_next.  Both deadlists must be in the old format (asserted).
+ */
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+       struct process_old_arg poa = { 0 };
+       uint64_t our_dlobj, next_dlobj;
+
+       ASSERT(ds->ds_deadlist.dl_oldfmt);
+       ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+       poa.ds = ds;
+       poa.ds_prev = ds_prev;
+       poa.after_branch_point = after_branch_point;
+       poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+       /* Sort every block on next's deadlist into "free" or "keep". */
+       VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+           process_old_cb, &poa, tx));
+       VERIFY0(zio_wait(poa.pio));
+       ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+       /* The freed space no longer counts as snapshot usage. */
+       dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+           -poa.used, -poa.comp, -poa.uncomp, tx);
+
+       /* Exchange the two deadlist objects and reopen both deadlists. */
+       dsl_deadlist_close(&ds->ds_deadlist);
+       dsl_deadlist_close(&ds_next->ds_deadlist);
+       our_dlobj = ds->ds_phys->ds_deadlist_obj;
+       next_dlobj = ds_next->ds_phys->ds_deadlist_obj;
+       ds->ds_phys->ds_deadlist_obj = next_dlobj;
+       ds_next->ds_phys->ds_deadlist_obj = our_dlobj;
+       dsl_deadlist_open(&ds->ds_deadlist, mos, next_dlobj);
+       dsl_deadlist_open(&ds_next->ds_deadlist, mos, our_dlobj);
+}
+
+/*
+ * Remove the deadlist key "mintxg" from every clone listed in ds's dir
+ * whose origin txg is later than mintxg, recursing into each such clone.
+ */
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+       dsl_dir_t *dd = ds->ds_dir;
+       objset_t *mos = dd->dd_pool->dp_meta_objset;
+       zap_attribute_t za;
+       zap_cursor_t zc;
+
+       /*
+        * On the old version dd_clones doesn't exist, so the clones can't
+        * be found; dsl_deadlist_remove_key() is a no-op there, so
+        * nothing is lost by returning early.
+        */
+       if (dd->dd_phys->dd_clones == 0)
+               return;
+
+       zap_cursor_init(&zc, mos, dd->dd_phys->dd_clones);
+       while (zap_cursor_retrieve(&zc, &za) == 0) {
+               dsl_dataset_t *clone;
+
+               VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+                   za.za_first_integer, FTAG, &clone));
+               if (clone->ds_dir->dd_origin_txg > mintxg) {
+                       dsl_deadlist_remove_key(&clone->ds_deadlist,
+                           mintxg, tx);
+                       dsl_dataset_remove_clones_key(clone, mintxg, tx);
+               }
+               dsl_dataset_rele(clone, FTAG);
+
+               zap_cursor_advance(&zc);
+       }
+       zap_cursor_fini(&zc);
+}
+
+/*
+ * Destroy the snapshot "ds" in syncing context, or merely mark it for
+ * deferred destroy.
+ *
+ * If "defer" is set and the snapshot still has user references or more
+ * than one child, only DS_FLAG_DEFER_DESTROY is set and the function
+ * returns.  Otherwise the snapshot is unlinked from its neighbours, its
+ * deadlist is folded into the next dataset's, space accounting is
+ * adjusted, its name is removed from the head dataset's snapshot
+ * namespace and its MOS objects are freed.
+ *
+ * The pool config lock must be held as writer and ds must have no long
+ * holds (both asserted).  On the destroy path ds->ds_dir is released and
+ * set to NULL before returning.
+ */
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+#ifdef ZFS_DEBUG
+       int err;
+#endif
+       int after_branch_point = FALSE;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+       dsl_dataset_t *ds_prev = NULL;
+       uint64_t obj, old_unique, used = 0, comp = 0, uncomp = 0;
+       dsl_dataset_t *ds_next, *ds_head, *hds;
+
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+       ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+       ASSERT(refcount_is_zero(&ds->ds_longholds));
+
+       /* Deferred destroy of a still-referenced snapshot: just flag it. */
+       if (defer &&
+           (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+               ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+               spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
+               return;
+       }
+
+       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+
+       /* We need to log before removing it from the namespace. */
+       spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+       dsl_scan_ds_destroyed(ds, tx);
+
+       obj = ds->ds_object;
+
+       /* Unlink ds from the previous snapshot, if there is one. */
+       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+               ASSERT3P(ds->ds_prev, ==, NULL);
+               VERIFY0(dsl_dataset_hold_obj(dp,
+                   ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+               /* prev's "next" is someone else: ds sits after a branch */
+               after_branch_point =
+                   (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+               dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+               if (after_branch_point &&
+                   ds_prev->ds_phys->ds_next_clones_obj != 0) {
+                       dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+                       if (ds->ds_phys->ds_next_snap_obj != 0) {
+                               VERIFY0(zap_add_int(mos,
+                                   ds_prev->ds_phys->ds_next_clones_obj,
+                                   ds->ds_phys->ds_next_snap_obj, tx));
+                       }
+               }
+               if (!after_branch_point) {
+                       ds_prev->ds_phys->ds_next_snap_obj =
+                           ds->ds_phys->ds_next_snap_obj;
+               }
+       }
+
+       /* Point the next dataset's "prev" links past ds. */
+       VERIFY0(dsl_dataset_hold_obj(dp,
+           ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
+       ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+       old_unique = ds_next->ds_phys->ds_unique_bytes;
+
+       dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+       ds_next->ds_phys->ds_prev_snap_obj =
+           ds->ds_phys->ds_prev_snap_obj;
+       ds_next->ds_phys->ds_prev_snap_txg =
+           ds->ds_phys->ds_prev_snap_txg;
+       ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+           ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+       if (ds_next->ds_deadlist.dl_oldfmt) {
+               process_old_deadlist(ds, ds_prev, ds_next,
+                   after_branch_point, tx);
+       } else {
+               /* Adjust prev's unique space. */
+               if (ds_prev && !after_branch_point) {
+                       dsl_deadlist_space_range(&ds_next->ds_deadlist,
+                           ds_prev->ds_phys->ds_prev_snap_txg,
+                           ds->ds_phys->ds_prev_snap_txg,
+                           &used, &comp, &uncomp);
+                       ds_prev->ds_phys->ds_unique_bytes += used;
+               }
+
+               /* Adjust snapused. */
+               dsl_deadlist_space_range(&ds_next->ds_deadlist,
+                   ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                   &used, &comp, &uncomp);
+               dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+                   -used, -comp, -uncomp, tx);
+
+               /* Move blocks to be freed to pool's free list. */
+               dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+                   &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+                   tx);
+               dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+                   DD_USED_HEAD, used, comp, uncomp, tx);
+
+               /* Merge our deadlist into next's and free it. */
+               dsl_deadlist_merge(&ds_next->ds_deadlist,
+                   ds->ds_phys->ds_deadlist_obj, tx);
+       }
+       dsl_deadlist_close(&ds->ds_deadlist);
+       dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       ds->ds_phys->ds_deadlist_obj = 0;
+
+       /* Collapse range in clone heads */
+       dsl_dataset_remove_clones_key(ds,
+           ds->ds_phys->ds_creation_txg, tx);
+
+       if (dsl_dataset_is_snapshot(ds_next)) {
+               dsl_dataset_t *ds_nextnext;
+
+               /*
+                * Update next's unique to include blocks which
+                * were previously shared by only this snapshot
+                * and it.  Those blocks will be born after the
+                * prev snap and before this snap, and will have
+                * died after the next snap and before the one
+                * after that (ie. be on the snap after next's
+                * deadlist).
+                */
+               VERIFY0(dsl_dataset_hold_obj(dp,
+                   ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+               dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+                   ds->ds_phys->ds_prev_snap_txg,
+                   ds->ds_phys->ds_creation_txg,
+                   &used, &comp, &uncomp);
+               ds_next->ds_phys->ds_unique_bytes += used;
+               dsl_dataset_rele(ds_nextnext, FTAG);
+               ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+               /* Collapse range in this head. */
+               VERIFY0(dsl_dataset_hold_obj(dp,
+                   ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+               dsl_deadlist_remove_key(&hds->ds_deadlist,
+                   ds->ds_phys->ds_creation_txg, tx);
+               dsl_dataset_rele(hds, FTAG);
+
+       } else {
+               /*
+                * ds_next is not a snapshot and holds ds through its
+                * ds_prev pointer; drop that hold and re-point ds_prev
+                * at our previous snapshot, if any.
+                */
+               ASSERT3P(ds_next->ds_prev, ==, ds);
+               dsl_dataset_rele(ds_next->ds_prev, ds_next);
+               ds_next->ds_prev = NULL;
+               if (ds_prev) {
+                       VERIFY0(dsl_dataset_hold_obj(dp,
+                           ds->ds_phys->ds_prev_snap_obj,
+                           ds_next, &ds_next->ds_prev));
+               }
+
+               dsl_dataset_recalc_head_uniq(ds_next);
+
+               /*
+                * Reduce the amount of our unconsumed refreservation
+                * being charged to our parent by the amount of
+                * new unique data we have gained.
+                */
+               if (old_unique < ds_next->ds_reserved) {
+                       int64_t mrsdelta;
+                       uint64_t new_unique =
+                           ds_next->ds_phys->ds_unique_bytes;
+
+                       ASSERT(old_unique <= new_unique);
+                       mrsdelta = MIN(new_unique - old_unique,
+                           ds_next->ds_reserved - old_unique);
+                       dsl_dir_diduse_space(ds->ds_dir,
+                           DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+               }
+       }
+       dsl_dataset_rele(ds_next, FTAG);
+
+       /*
+        * This must be done after the dsl_traverse(), because it will
+        * re-open the objset.
+        */
+       if (ds->ds_objset) {
+               dmu_objset_evict(ds->ds_objset);
+               ds->ds_objset = NULL;
+       }
+
+       /* remove from snapshot namespace */
+       ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+       VERIFY0(dsl_dataset_hold_obj(dp,
+           ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+       VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+       {
+               uint64_t val;
+
+               err = dsl_dataset_snap_lookup(ds_head,
+                   ds->ds_snapname, &val);
+               ASSERT0(err);
+               ASSERT3U(val, ==, obj);
+       }
+#endif
+       VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+       dsl_dataset_rele(ds_head, FTAG);
+
+       if (ds_prev != NULL)
+               dsl_dataset_rele(ds_prev, FTAG);
+
+       spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+       if (ds->ds_phys->ds_next_clones_obj != 0) {
+               ASSERTV(uint64_t count);
+               /*
+                * The next_clones ZAP must be readable and empty.  This
+                * is spelled "0 == zap_count() && count == 0" rather
+                * than ASSERT0(zap_count() && count == 0): the latter is
+                * trivially satisfied whenever zap_count() succeeds and
+                * so never actually looks at count.
+                */
+               ASSERT(0 == zap_count(mos,
+                   ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+               VERIFY0(dmu_object_free(mos,
+                   ds->ds_phys->ds_next_clones_obj, tx));
+       }
+       if (ds->ds_phys->ds_props_obj != 0)
+               VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+       if (ds->ds_phys->ds_userrefs_obj != 0)
+               VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+       dsl_dir_rele(ds->ds_dir, ds);
+       ds->ds_dir = NULL;
+       /* obj was captured above; finally free the dataset object itself */
+       VERIFY0(dmu_object_free(mos, obj, tx));
+}
+
+/*
+ * Sync-task "sync" callback: destroy (or defer-destroy) every snapshot
+ * that the check callback recorded in dsda_successful_snaps.
+ */
+static void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+       dmu_snapshots_destroy_arg_t *dsda = arg;
+       nvlist_t *todo = dsda->dsda_successful_snaps;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *snap = nvlist_next_nvpair(todo, NULL);
+
+       while (snap != NULL) {
+               dsl_dataset_t *ds;
+
+               /* VERIFY0: failing to hold a checked snapshot is fatal */
+               VERIFY0(dsl_dataset_hold(dp, nvpair_name(snap), FTAG, &ds));
+               dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
+               dsl_dataset_rele(ds, FTAG);
+
+               snap = nvlist_next_nvpair(todo, snap);
+       }
+}
+
+/*
+ * Destroy the snapshots named in "snaps".  The full semantics are
+ * described in the comment above lzc_destroy_snaps(); in short:
+ *
+ *  - all of the snapshots must be in the same pool;
+ *  - snapshots that don't exist are silently ignored (treated as
+ *    "already deleted");
+ *  - on success every snapshot is destroyed and 0 is returned;
+ *  - on failure nothing is destroyed, errlist is filled in and an errno
+ *    is returned.
+ *
+ * An empty "snaps" list is a successful no-op.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+    nvlist_t *errlist)
+{
+       dmu_snapshots_destroy_arg_t dsda;
+       nvpair_t *first = nvlist_next_nvpair(snaps, NULL);
+       int error;
+
+       if (first == NULL)
+               return (0);
+
+       dsda.dsda_snaps = snaps;
+       VERIFY0(nvlist_alloc(&dsda.dsda_successful_snaps,
+           NV_UNIQUE_NAME, KM_PUSHPAGE));
+       dsda.dsda_defer = defer;
+       dsda.dsda_errlist = errlist;
+
+       /* The first snapshot's name is what is handed to dsl_sync_task(). */
+       error = dsl_sync_task(nvpair_name(first),
+           dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
+           &dsda, 0);
+
+       fnvlist_free(dsda.dsda_successful_snaps);
+       return (error);
+}
+
+/*
+ * Convenience wrapper: destroy the single snapshot "name" by building a
+ * one-entry list for dsl_destroy_snapshots_nvl().  The per-snapshot error
+ * list is discarded; only the overall errno is returned.
+ */
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+       nvlist_t *snaps;
+       nvlist_t *errors;
+       int rc;
+
+       VERIFY0(nvlist_alloc(&snaps, NV_UNIQUE_NAME, KM_PUSHPAGE));
+       VERIFY0(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_PUSHPAGE));
+       fnvlist_add_boolean(snaps, name);
+
+       rc = dsl_destroy_snapshots_nvl(snaps, defer, errors);
+
+       fnvlist_free(errors);
+       fnvlist_free(snaps);
+       return (rc);
+}
+
+struct killarg {       /* traverse_dataset() callback state for kill_blkptr() */
+       dsl_dataset_t *ds;      /* dataset whose blocks are being killed */
+       dmu_tx_t *tx;   /* transaction the frees are charged to */
+};
+
+/*
+ * traverse_dataset() callback: release one block of the dataset being
+ * destroyed.  Intent-log blocks are freed directly; every other block
+ * goes through dsl_dataset_block_kill().  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       struct killarg *ka = arg;
+       dmu_tx_t *tx = ka->tx;
+
+       if (bp == NULL)
+               return (0);
+
+       if (zb->zb_level != ZB_ZIL_LEVEL) {
+               ASSERT(zilog == NULL);
+               ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+               (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+               return (0);
+       }
+
+       /*
+        * It's a block in the intent log.  It has no
+        * accounting, so just free it.
+        */
+       ASSERT(zilog != NULL);
+       dsl_free(tx->tx_pool, tx->tx_txg, bp);
+       return (0);
+}
+
+/*
+ * Synchronous destroy path: traverse the dataset and release every block
+ * it points to via kill_blkptr().
+ */
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       uint64_t min_txg = ds->ds_phys->ds_prev_snap_txg;
+       struct killarg ka;
+
+       ka.tx = tx;
+       ka.ds = ds;
+
+       /*
+        * Free everything that we point to (that's born after
+        * the previous snapshot, if we are a clone)
+        *
+        * NB: this should be very quick, because we already
+        * freed all the objects in open context.
+        */
+       VERIFY0(traverse_dataset(ds, min_txg, TRAVERSE_POST,
+           kill_blkptr, &ka));
+       ASSERT(ds->ds_phys->ds_unique_bytes == 0 || !DS_UNIQUE_IS_ACCURATE(ds));
+}
+
+typedef struct dsl_destroy_head_arg {  /* argument for the head-destroy sync tasks */
+       const char *ddha_name;  /* name of the head dataset to destroy */
+} dsl_destroy_head_arg_t;
+
+/*
+ * Decide whether the head dataset "ds" may be destroyed.
+ *
+ * expected_holds is the number of long holds the caller itself accounts
+ * for; any other count means somebody else is using the dataset.
+ *
+ * Returns 0 if the destroy may proceed, otherwise:
+ *   EINVAL - ds is a snapshot
+ *   EBUSY  - unexpected long holds, ds has snapshots of its own, or the
+ *            previous snapshot that would be removed with it is long-held
+ *   EEXIST - ds has child filesystems
+ *   or the error returned by zap_count().
+ */
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+       dsl_dataset_t *prev = ds->ds_prev;
+       uint64_t nchildren;
+       objset_t *mos;
+       int error;
+
+       if (dsl_dataset_is_snapshot(ds))
+               return (EINVAL);
+
+       if (refcount_count(&ds->ds_longholds) != expected_holds)
+               return (EBUSY);
+
+       mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+       /*
+        * A head dataset that has snapshots can't be deleted.  (The one
+        * exception is when the only snapshots are from the branch we
+        * cloned from.)
+        */
+       if (prev != NULL && prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+               return (EBUSY);
+
+       /* Nor can one that still has child filesystems. */
+       error = zap_count(mos,
+           ds->ds_dir->dd_phys->dd_child_dir_zapobj, &nchildren);
+       if (error != 0)
+               return (error);
+       if (nchildren != 0)
+               return (EEXIST);
+
+       /*
+        * If destroying this clone would also remove its defer-destroyed
+        * previous snapshot, that snapshot must not be long-held.
+        */
+       if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(prev) &&
+           prev->ds_phys->ds_num_children == 2 &&
+           prev->ds_userrefs == 0 &&
+           !refcount_is_zero(&prev->ds_longholds))
+               return (EBUSY);
+
+       return (0);
+}
+
+/*
+ * Sync-task "check" callback for destroying a head dataset: look the
+ * dataset up by name and run dsl_destroy_head_check_impl() on it with no
+ * long holds expected.
+ */
+static int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_destroy_head_arg_t *ddha = arg;
+       dsl_dataset_t *ds;
+       int error;
+
+       error = dsl_dataset_hold(dmu_tx_pool(tx), ddha->ddha_name, FTAG, &ds);
+       if (error == 0) {
+               error = dsl_destroy_head_check_impl(ds, 0);
+               dsl_dataset_rele(ds, FTAG);
+       }
+       return (error);
+}
+
+/*
+ * Destroy the dsl_dir object "ddobj" in syncing context: drop its
+ * reservation, destroy its child-dir, props and delegation ZAPs, remove
+ * its entry from the parent's child-dir ZAP and free the object.  The
+ * dir's head dataset must already be gone and the pool config lock must
+ * be held as writer (both asserted).
+ */
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
+       dd_used_t type;
+       dsl_dir_t *dd;
+
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+       VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+       ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+
+       /*
+        * Remove our reservation. The impl() routine avoids setting the
+        * actual property, which would require the (already destroyed) ds.
+        */
+       dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+       /* By now the dir must account for no space at all. */
+       ASSERT0(dd->dd_phys->dd_used_bytes);
+       ASSERT0(dd->dd_phys->dd_reserved);
+       for (type = 0; type < DD_USED_NUM; type++) {
+               ASSERT0(dd->dd_phys->dd_used_breakdown[type]);
+       }
+
+       VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
+       VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+       VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+       VERIFY0(zap_remove(mos,
+           dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+
+       dsl_dir_rele(dd, FTAG);
+       VERIFY0(dmu_object_free(mos, ddobj, tx));
+}
+
+/*
+ * Destroy the head dataset "ds" in syncing context.
+ *
+ * The dataset's reservation is dropped, it is unlinked from the snapshot
+ * it was cloned from (if any), its deadlist is freed and its blocks are
+ * either released synchronously or, when the async_destroy feature is
+ * enabled, handed to the pool's bptree for background freeing.  Finally
+ * the dataset object and its dsl_dir are destroyed.  If the previous
+ * snapshot was marked for deferred destroy and this clone was the last
+ * thing keeping it alive, that snapshot is destroyed as well.
+ *
+ * The pool config lock must be held as writer (asserted).  ds->ds_prev
+ * and ds->ds_dir are released and set to NULL along the way.
+ */
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
+       zfeature_info_t *async_feat =
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
+       uint64_t dsobj, dirobj, prev_dsobj = 0;
+       boolean_t destroy_prev;
+       objset_t *os;
+
+       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+       ASSERT(ds->ds_prev == NULL ||
+           ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
+       ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+       /* We need to log before removing it from the namespace. */
+       spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+       /*
+        * Decide up front whether the previous snapshot goes too; the
+        * fields consulted here are modified or released further down.
+        */
+       destroy_prev = (dsl_dir_is_clone(ds->ds_dir) &&
+           DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+           ds->ds_prev->ds_phys->ds_num_children == 2 &&
+           ds->ds_prev->ds_userrefs == 0);
+
+       /* Remove our reservation */
+       if (ds->ds_reserved != 0) {
+               dsl_dataset_set_refreservation_sync_impl(ds,
+                   (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+                   0, tx);
+               ASSERT0(ds->ds_reserved);
+       }
+
+       dsl_scan_ds_destroyed(ds, tx);
+
+       dsobj = ds->ds_object;
+
+       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+               /* This is a clone: detach it from the previous snapshot. */
+               ASSERT(ds->ds_prev != NULL);
+               ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, dsobj);
+               ASSERT0(ds->ds_phys->ds_next_snap_obj);
+
+               dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+               if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+                       dsl_dataset_remove_from_next_clones(ds->ds_prev,
+                           dsobj, tx);
+               }
+
+               ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
+               ds->ds_prev->ds_phys->ds_num_children--;
+       }
+
+       /*
+        * Destroy the deadlist.  Unless it's a clone, the
+        * deadlist should be empty.  (If it's a clone, it's
+        * safe to ignore the deadlist contents.)
+        */
+       dsl_deadlist_close(&ds->ds_deadlist);
+       dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       ds->ds_phys->ds_deadlist_obj = 0;
+
+       VERIFY0(dmu_objset_from_ds(ds, &os));
+
+       if (spa_feature_is_enabled(dp->dp_spa, async_feat)) {
+               /*
+                * Move the bptree into the pool's list of trees to
+                * clean up and update space accounting information.
+                */
+               uint64_t used, comp, uncomp;
+
+               zil_destroy_sync(dmu_objset_zil(os), tx);
+
+               /* First use of the feature: create the pool's bptree. */
+               if (!spa_feature_is_active(dp->dp_spa, async_feat)) {
+                       spa_feature_incr(dp->dp_spa, async_feat, tx);
+                       dp->dp_bptree_obj = bptree_alloc(mos, tx);
+                       VERIFY0(zap_add(mos,
+                           DMU_POOL_DIRECTORY_OBJECT,
+                           DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+                           &dp->dp_bptree_obj, tx));
+               }
+
+               used = ds->ds_dir->dd_phys->dd_used_bytes;
+               comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
+               uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+
+               ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+                   ds->ds_phys->ds_unique_bytes == used);
+
+               /* Transfer the space from this dir to the free dir. */
+               bptree_add(mos, dp->dp_bptree_obj,
+                   &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+                   used, comp, uncomp, tx);
+               dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+                   -used, -comp, -uncomp, tx);
+               dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+                   used, comp, uncomp, tx);
+       } else {
+               old_synchronous_dataset_destroy(ds, tx);
+       }
+
+       if (ds->ds_prev != NULL) {
+               if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+                       VERIFY0(zap_remove_int(mos,
+                           ds->ds_prev->ds_dir->dd_phys->dd_clones,
+                           ds->ds_object, tx));
+               }
+               /* Remember the object number; the hold is dropped here. */
+               prev_dsobj = ds->ds_prev->ds_object;
+               dsl_dataset_rele(ds->ds_prev, ds);
+               ds->ds_prev = NULL;
+       }
+
+       /*
+        * This must be done after the dsl_traverse(), because it will
+        * re-open the objset.
+        */
+       if (ds->ds_objset) {
+               dmu_objset_evict(ds->ds_objset);
+               ds->ds_objset = NULL;
+       }
+
+       /* Erase the link in the dir */
+       dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+       ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+       dirobj = ds->ds_dir->dd_object;
+       ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
+       VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+
+       spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+       ASSERT0(ds->ds_phys->ds_next_clones_obj);
+       ASSERT0(ds->ds_phys->ds_props_obj);
+       ASSERT0(ds->ds_phys->ds_userrefs_obj);
+       dsl_dir_rele(ds->ds_dir, ds);
+       ds->ds_dir = NULL;
+       VERIFY0(dmu_object_free(mos, dsobj, tx));
+
+       dsl_dir_destroy_sync(dirobj, tx);
+
+       if (destroy_prev) {
+               dsl_dataset_t *prev;
+
+               VERIFY0(dsl_dataset_hold_obj(dp, prev_dsobj, FTAG, &prev));
+               dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+               dsl_dataset_rele(prev, FTAG);
+       }
+}
+
+/*
+ * Sync-task "sync" callback: look up the head dataset named in the
+ * argument and destroy it with dsl_destroy_head_sync_impl().
+ */
+static void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_destroy_head_arg_t *ddha = arg;
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), ddha->ddha_name, FTAG, &ds));
+
+       dsl_destroy_head_sync_impl(ds, tx);
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Sync-task "sync" callback for the first phase of a synchronous head
+ * destroy: flag the dataset as inconsistent on disk and log the start of
+ * the destroy.
+ */
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_destroy_head_arg_t *ddha = arg;
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), ddha->ddha_name, FTAG, &ds));
+
+       /* Mark it as inconsistent on-disk, in case we crash */
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+       spa_history_log_internal_ds(ds, "destroy begin", tx, "");
+
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Destroy the head dataset "name".
+ *
+ * When the pool has the async_destroy feature enabled this is a single
+ * sync task.  Otherwise the dataset is first marked inconsistent, its
+ * objects are freed from open context, and only then is the final
+ * destroy sync task issued.  If the dataset cannot be owned for the
+ * open-context pass, that pass is skipped and the final sync task is
+ * still attempted.
+ *
+ * Returns 0 on success or an errno from spa_open() or dsl_sync_task().
+ */
+int
+dsl_destroy_head(const char *name)
+{
+       dsl_destroy_head_arg_t ddha;
+       boolean_t async_enabled;
+       spa_t *spa;
+       int error;
+
+#ifdef _KERNEL
+       zfs_destroy_unmount_origin(name);
+#endif
+
+       error = spa_open(name, &spa, FTAG);
+       if (error != 0)
+               return (error);
+       async_enabled = spa_feature_is_enabled(spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+       spa_close(spa, FTAG);
+
+       ddha.ddha_name = name;
+
+       if (!async_enabled) {
+               objset_t *os;
+
+               error = dsl_sync_task(name, dsl_destroy_head_check,
+                   dsl_destroy_head_begin_sync, &ddha, 0);
+               if (error != 0)
+                       return (error);
+
+               /*
+                * Head deletion is processed in one txg on old pools;
+                * remove the objects from open context so that the txg sync
+                * is not too long.
+                */
+               error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+               if (error == 0) {
+                       uint64_t min_txg =
+                           dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+                       uint64_t object = 0;
+
+                       /* free from object 0 until dmu_object_next() fails */
+                       do {
+                               (void) dmu_free_object(os, object);
+                               error = dmu_object_next(os, &object, FALSE,
+                                   min_txg);
+                       } while (error == 0);
+
+                       /* sync out all frees */
+                       txg_wait_synced(dmu_objset_pool(os), 0);
+                       dmu_objset_disown(os, FTAG);
+               }
+       }
+
+       return (dsl_sync_task(name, dsl_destroy_head_check,
+           dsl_destroy_head_sync, &ddha, 0));
+}
+
+/*
+ * Destroy the dataset "dsname" if it is flagged as inconsistent.
+ *
+ * Note, this function is used as the callback for dmu_objset_find().  It
+ * always returns 0 so that the walk keeps finding and processing
+ * inconsistent datasets even when one of them cannot be held or
+ * destroyed.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+       boolean_t inconsistent;
+       objset_t *os;
+
+       if (dmu_objset_hold(dsname, FTAG, &os) != 0)
+               return (0);
+
+       /* Sample the flag, then drop the hold before destroying. */
+       inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+       dmu_objset_rele(os, FTAG);
+
+       if (inconsistent)
+               (void) dsl_destroy_head(dsname);
+       return (0);
+}
+
+
+#if defined(_KERNEL) && defined(HAVE_SPL)      /* kernel-module symbol exports */
+EXPORT_SYMBOL(dsl_destroy_head);
+EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);        /* NOTE(review): not defined in this file -- confirm the export belongs here */
+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
+EXPORT_SYMBOL(dsl_destroy_inconsistent);
+EXPORT_SYMBOL(dsl_dataset_user_release_tmp);   /* NOTE(review): not defined in this file -- confirm the export belongs here */
+EXPORT_SYMBOL(dsl_destroy_head_check_impl);
+#endif /* _KERNEL && HAVE_SPL */
index 45c73c363e57d93fae362386e71f9f2002f619fa..ccae3f2709f4aa1eb8edefdffc5a4bb0d5f8493b 100644 (file)
@@ -40,8 +40,6 @@
 #include "zfs_namecheck.h"
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
-    uint64_t value, dmu_tx_t *tx);
 
 /* ARGSUSED */
 static void
@@ -58,7 +56,7 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
        }
 
        if (dd->dd_parent)
-               dsl_dir_close(dd->dd_parent, dd);
+               dsl_dir_rele(dd->dd_parent, dd);
 
        spa_close(dd->dd_pool->dp_spa, dd);
 
@@ -72,18 +70,17 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
 }
 
 int
-dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **ddp)
 {
        dmu_buf_t *dbuf;
        dsl_dir_t *dd;
        int err;
 
-       ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-           dsl_pool_sync_context(dp));
+       ASSERT(dsl_pool_config_held(dp));
 
        err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
-       if (err)
+       if (err != 0)
                return (err);
        dd = dmu_buf_get_user(dbuf);
 #ifdef ZFS_DEBUG
@@ -110,9 +107,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                dsl_dir_snap_cmtime_update(dd);
 
                if (dd->dd_phys->dd_parent_obj) {
-                       err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+                       err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
                            NULL, dd, &dd->dd_parent);
-                       if (err)
+                       if (err != 0)
                                goto errout;
                        if (tail) {
 #ifdef ZFS_DEBUG
@@ -129,7 +126,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                                    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
                                    ddobj, 0, dd->dd_myname);
                        }
-                       if (err)
+                       if (err != 0)
                                goto errout;
                } else {
                        (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
@@ -146,7 +143,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                         */
                        err = dmu_bonus_hold(dp->dp_meta_objset,
                            dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
-                       if (err)
+                       if (err != 0)
                                goto errout;
                        origin_phys = origin_bonus->db_data;
                        dd->dd_origin_txg =
@@ -158,7 +155,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                    dsl_dir_evict);
                if (winner) {
                        if (dd->dd_parent)
-                               dsl_dir_close(dd->dd_parent, dd);
+                               dsl_dir_rele(dd->dd_parent, dd);
                        mutex_destroy(&dd->dd_lock);
                        kmem_free(dd, sizeof (dsl_dir_t));
                        dd = winner;
@@ -185,7 +182,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
 
 errout:
        if (dd->dd_parent)
-               dsl_dir_close(dd->dd_parent, dd);
+               dsl_dir_rele(dd->dd_parent, dd);
        mutex_destroy(&dd->dd_lock);
        kmem_free(dd, sizeof (dsl_dir_t));
        dmu_buf_rele(dbuf, tag);
@@ -193,7 +190,7 @@ errout:
 }
 
 void
-dsl_dir_close(dsl_dir_t *dd, void *tag)
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
 {
        dprintf_dd(dd, "%s\n", "");
        spa_close(dd->dd_pool->dp_spa, tag);
@@ -250,6 +247,7 @@ static int
 getcomponent(const char *path, char *component, const char **nextp)
 {
        char *p;
+
        if ((path == NULL) || (path[0] == '\0'))
                return (ENOENT);
        /* This would be a good place to reserve some namespace... */
@@ -272,10 +270,10 @@ getcomponent(const char *path, char *component, const char **nextp)
                (void) strcpy(component, path);
                p = NULL;
        } else if (p[0] == '/') {
-               if (p-path >= MAXNAMELEN)
+               if (p - path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
-               component[p-path] = '\0';
+               component[p - path] = '\0';
                p++;
        } else if (p[0] == '@') {
                /*
@@ -284,66 +282,57 @@ getcomponent(const char *path, char *component, const char **nextp)
                 */
                if (strchr(path, '/'))
                        return (EINVAL);
-               if (p-path >= MAXNAMELEN)
+               if (p - path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
-               component[p-path] = '\0';
+               component[p - path] = '\0';
        } else {
-               ASSERT(!"invalid p");
+               panic("invalid p=%p", (void *)p);
        }
        *nextp = p;
        return (0);
 }
 
 /*
- * same as dsl_dir_open, ignore the first component of name and use the
- * spa instead
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tailp.  The name must be in the specified dsl_pool_t.  This
+ * thread must hold the dp_config_rwlock for the pool.  Returns an error if
+ * the path is bogus, or if tailp==NULL and we couldn't parse the whole name.
+ * (*tailp)[0] == '@' means that the last component is a snapshot.
  */
 int
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
        char *buf;
-       const char *next, *nextnext = NULL;
+       const char *spaname, *next, *nextnext = NULL;
        int err;
        dsl_dir_t *dd;
-       dsl_pool_t *dp;
        uint64_t ddobj;
-       int openedspa = FALSE;
-
-       dprintf("%s\n", name);
 
        buf = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE);
        err = getcomponent(name, buf, &next);
-       if (err)
+       if (err != 0)
                goto error;
-       if (spa == NULL) {
-               err = spa_open(buf, &spa, FTAG);
-               if (err) {
-                       dprintf("spa_open(%s) failed\n", buf);
-                       goto error;
-               }
-               openedspa = TRUE;
 
-               /* XXX this assertion belongs in spa_open */
-               ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
+       /* Make sure the name is in the specified pool. */
+       spaname = spa_name(dp->dp_spa);
+       if (strcmp(buf, spaname) != 0) {
+               err = EINVAL;
+               goto error;
        }
 
-       dp = spa_get_dsl(spa);
+       ASSERT(dsl_pool_config_held(dp));
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
-       if (err) {
-               rw_exit(&dp->dp_config_rwlock);
-               if (openedspa)
-                       spa_close(spa, FTAG);
+       err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+       if (err != 0) {
                goto error;
        }
 
        while (next != NULL) {
                dsl_dir_t *child_ds;
                err = getcomponent(next, buf, &nextnext);
-               if (err)
+               if (err != 0)
                        break;
                ASSERT(next[0] != '\0');
                if (next[0] == '@')
@@ -354,25 +343,22 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
                err = zap_lookup(dp->dp_meta_objset,
                    dd->dd_phys->dd_child_dir_zapobj,
                    buf, sizeof (ddobj), 1, &ddobj);
-               if (err) {
+               if (err != 0) {
                        if (err == ENOENT)
                                err = 0;
                        break;
                }
 
-               err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
-               if (err)
+               err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
+               if (err != 0)
                        break;
-               dsl_dir_close(dd, tag);
+               dsl_dir_rele(dd, tag);
                dd = child_ds;
                next = nextnext;
        }
-       rw_exit(&dp->dp_config_rwlock);
 
-       if (err) {
-               dsl_dir_close(dd, tag);
-               if (openedspa)
-                       spa_close(spa, FTAG);
+       if (err != 0) {
+               dsl_dir_rele(dd, tag);
                goto error;
        }
 
@@ -383,32 +369,18 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
        if (next != NULL &&
            (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
                /* bad path name */
-               dsl_dir_close(dd, tag);
+               dsl_dir_rele(dd, tag);
                dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
                err = ENOENT;
        }
-       if (tailp)
+       if (tailp != NULL)
                *tailp = next;
-       if (openedspa)
-               spa_close(spa, FTAG);
        *ddp = dd;
 error:
        kmem_free(buf, MAXNAMELEN);
        return (err);
 }
 
-/*
- * Return the dsl_dir_t, and possibly the last component which couldn't
- * be found in *tail.  Return NULL if the path is bogus, or if
- * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
- * means that the last component is a snapshot.
- */
-int
-dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
-{
-       return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
-}
-
 uint64_t
 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
     dmu_tx_t *tx)
@@ -446,71 +418,6 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
        return (ddobj);
 }
 
-/* ARGSUSED */
-int
-dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dir_t *dd = arg1;
-       dsl_pool_t *dp = dd->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-       int err;
-       uint64_t count;
-
-       /*
-        * There should be exactly two holds, both from
-        * dsl_dataset_destroy: one on the dd directory, and one on its
-        * head ds.  If there are more holds, then a concurrent thread is
-        * performing a lookup inside this dir while we're trying to destroy
-        * it.  To minimize this possibility, we perform this check only
-        * in syncing context and fail the operation if we encounter
-        * additional holds.  The dp_config_rwlock ensures that nobody else
-        * opens it after we check.
-        */
-       if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
-               return (EBUSY);
-
-       err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
-       if (err)
-               return (err);
-       if (count != 0)
-               return (EEXIST);
-
-       return (0);
-}
-
-void
-dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
-       dsl_dir_t *dd = arg1;
-       objset_t *mos = dd->dd_pool->dp_meta_objset;
-       uint64_t obj;
-       dd_used_t t;
-
-       ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
-       ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
-
-       /*
-        * Remove our reservation. The impl() routine avoids setting the
-        * actual property, which would require the (already destroyed) ds.
-        */
-       dsl_dir_set_reservation_sync_impl(dd, 0, tx);
-
-       ASSERT0(dd->dd_phys->dd_used_bytes);
-       ASSERT0(dd->dd_phys->dd_reserved);
-       for (t = 0; t < DD_USED_NUM; t++)
-               ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
-
-       VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
-       VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
-       VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
-       VERIFY(0 == zap_remove(mos,
-           dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
-
-       obj = dd->dd_object;
-       dsl_dir_close(dd, tag);
-       VERIFY(0 == dmu_object_free(mos, obj, tx));
-}
-
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
@@ -546,18 +453,16 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
        }
        mutex_exit(&dd->dd_lock);
 
-       rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        if (dsl_dir_is_clone(dd)) {
                dsl_dataset_t *ds;
                char buf[MAXNAMELEN];
 
-               VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
+               VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
                    dd->dd_phys->dd_origin_obj, FTAG, &ds));
                dsl_dataset_name(ds, buf);
                dsl_dataset_rele(ds, FTAG);
                dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
        }
-       rw_exit(&dd->dd_pool->dp_config_rwlock);
 }
 
 void
@@ -567,7 +472,7 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 
        ASSERT(dd->dd_phys);
 
-       if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+       if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
                /* up the hold count until we can be written out */
                dmu_buf_add_ref(dd->dd_dbuf, dd);
        }
@@ -854,7 +759,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
                    FALSE, asize > usize, tr_list, tx, TRUE);
        }
 
-       if (err)
+       if (err != 0)
                dsl_dir_tempreserve_clear(tr_list, tx);
        else
                *tr_cookiep = tr_list;
@@ -1007,118 +912,123 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
                mutex_exit(&dd->dd_lock);
 }
 
+typedef struct dsl_dir_set_qr_arg {
+       const char *ddsqra_name;
+       zprop_source_t ddsqra_source;
+       uint64_t ddsqra_value;
+} dsl_dir_set_qr_arg_t;
+
 static int
-dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_dir_t *dd = ds->ds_dir;
-       dsl_prop_setarg_t *psa = arg2;
-       int err;
-       uint64_t towrite;
+       dsl_dir_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       int error;
+       uint64_t towrite, newval;
 
-       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-               return (err);
+       error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       error = dsl_prop_predict(ds->ds_dir, "quota",
+           ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (error);
+       }
 
-       if (psa->psa_effective_value == 0)
+       if (newval == 0) {
+               dsl_dataset_rele(ds, FTAG);
                return (0);
+       }
 
-       mutex_enter(&dd->dd_lock);
+       mutex_enter(&ds->ds_dir->dd_lock);
        /*
         * If we are doing the preliminary check in open context, and
         * there are pending changes, then don't fail it, since the
         * pending changes could under-estimate the amount of space to be
         * freed up.
         */
-       towrite = dsl_dir_space_towrite(dd);
+       towrite = dsl_dir_space_towrite(ds->ds_dir);
        if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-           (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
-           psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
-               err = ENOSPC;
+           (newval < ds->ds_dir->dd_phys->dd_reserved ||
+           newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
+               error = ENOSPC;
        }
-       mutex_exit(&dd->dd_lock);
-       return (err);
+       mutex_exit(&ds->ds_dir->dd_lock);
+       dsl_dataset_rele(ds, FTAG);
+       return (error);
 }
 
-extern dsl_syncfunc_t dsl_prop_set_sync;
-
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_dir_t *dd = ds->ds_dir;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t effective_value = psa->psa_effective_value;
+       dsl_dir_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       uint64_t newval;
 
-       dsl_prop_set_sync(ds, psa, tx);
-       DSL_PROP_CHECK_PREDICTION(dd, psa);
+       VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
-       dmu_buf_will_dirty(dd->dd_dbuf, tx);
+       dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
+           ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+           &ddsqra->ddsqra_value, tx);
 
-       mutex_enter(&dd->dd_lock);
-       dd->dd_phys->dd_quota = effective_value;
-       mutex_exit(&dd->dd_lock);
+       VERIFY0(dsl_prop_get_int_ds(ds,
+           zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
 
-       spa_history_log_internal_dd(dd, "set quota", tx,
-           "quota=%lld", (longlong_t)effective_value);
+       dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+       mutex_enter(&ds->ds_dir->dd_lock);
+       ds->ds_dir->dd_phys->dd_quota = newval;
+       mutex_exit(&ds->ds_dir->dd_lock);
+       dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
-       dsl_dir_t *dd;
-       dsl_dataset_t *ds;
-       dsl_prop_setarg_t psa;
-       int err;
-
-       dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
-
-       err = dsl_dataset_hold(ddname, FTAG, &ds);
-       if (err)
-               return (err);
-
-       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
-       if (err) {
-               dsl_dataset_rele(ds, FTAG);
-               return (err);
-       }
+       dsl_dir_set_qr_arg_t ddsqra;
 
-       ASSERT(ds->ds_dir == dd);
+       ddsqra.ddsqra_name = ddname;
+       ddsqra.ddsqra_source = source;
+       ddsqra.ddsqra_value = quota;
 
-       /*
-        * If someone removes a file, then tries to set the quota, we want to
-        * make sure the file freeing takes effect.
-        */
-       txg_wait_open(dd->dd_pool, 0);
-
-       err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-           dsl_dir_set_quota_sync, ds, &psa, 0);
-
-       dsl_dir_close(dd, FTAG);
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
+       return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
+           dsl_dir_set_quota_sync, &ddsqra, 0));
 }
 
 int
-dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_dir_t *dd = ds->ds_dir;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t effective_value;
-       uint64_t used, avail;
-       int err;
-
-       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-               return (err);
+       dsl_dir_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       dsl_dir_t *dd;
+       uint64_t newval, used, avail;
+       int error;
 
-       effective_value = psa->psa_effective_value;
+       error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+       if (error != 0)
+               return (error);
+       dd = ds->ds_dir;
 
        /*
         * If we are doing the preliminary check in open context, the
         * space estimates may be inaccurate.
         */
-       if (!dmu_tx_is_syncing(tx))
+       if (!dmu_tx_is_syncing(tx)) {
+               dsl_dataset_rele(ds, FTAG);
                return (0);
+       }
+
+       error = dsl_prop_predict(ds->ds_dir,
+           zfs_prop_to_name(ZFS_PROP_RESERVATION),
+           ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+       if (error != 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (error);
+       }
 
        mutex_enter(&dd->dd_lock);
        used = dd->dd_phys->dd_used_bytes;
@@ -1131,21 +1041,21 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
                avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
        }
 
-       if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
-               uint64_t delta = MAX(used, effective_value) -
+       if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
+               uint64_t delta = MAX(used, newval) -
                    MAX(used, dd->dd_phys->dd_reserved);
 
-               if (delta > avail)
-                       return (ENOSPC);
-               if (dd->dd_phys->dd_quota > 0 &&
-                   effective_value > dd->dd_phys->dd_quota)
-                       return (ENOSPC);
+               if (delta > avail ||
+                   (dd->dd_phys->dd_quota > 0 &&
+                   newval > dd->dd_phys->dd_quota))
+                       error = ENOSPC;
        }
 
-       return (0);
+       dsl_dataset_rele(ds, FTAG);
+       return (error);
 }
 
-static void
+void
 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 {
        uint64_t used;
@@ -1167,51 +1077,38 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 }
 
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_dir_t *dd = ds->ds_dir;
-       dsl_prop_setarg_t *psa = arg2;
-       uint64_t value = psa->psa_effective_value;
+       dsl_dir_set_qr_arg_t *ddsqra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       uint64_t newval;
 
-       dsl_prop_set_sync(ds, psa, tx);
-       DSL_PROP_CHECK_PREDICTION(dd, psa);
-       dsl_dir_set_reservation_sync_impl(dd, value, tx);
+       VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+       dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION),
+           ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+           &ddsqra->ddsqra_value, tx);
 
-       spa_history_log_internal_dd(dd, "set reservation", tx,
-           "reservation=%lld", (longlong_t)value);
+       VERIFY0(dsl_prop_get_int_ds(ds,
+           zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
+       dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
+       dsl_dataset_rele(ds, FTAG);
  }
 
 int
 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation)
 {
-       dsl_dir_t *dd;
-       dsl_dataset_t *ds;
-       dsl_prop_setarg_t psa;
-       int err;
-
-       dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
-
-       err = dsl_dataset_hold(ddname, FTAG, &ds);
-       if (err)
-               return (err);
-
-       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
-       if (err) {
-               dsl_dataset_rele(ds, FTAG);
-               return (err);
-       }
+       dsl_dir_set_qr_arg_t ddsqra;
 
-       ASSERT(ds->ds_dir == dd);
+       ddsqra.ddsqra_name = ddname;
+       ddsqra.ddsqra_source = source;
+       ddsqra.ddsqra_value = reservation;
 
-       err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
-           dsl_dir_set_reservation_sync, ds, &psa, 0);
-
-       dsl_dir_close(dd, FTAG);
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
+       return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
+           dsl_dir_set_reservation_sync, &ddsqra, 0));
 }
 
 static dsl_dir_t *
@@ -1243,79 +1140,123 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
        return (would_change(dd->dd_parent, delta, ancestor));
 }
 
-struct renamearg {
-       dsl_dir_t *newparent;
-       const char *mynewname;
-};
+typedef struct dsl_dir_rename_arg {
+       const char *ddra_oldname;
+       const char *ddra_newname;
+} dsl_dir_rename_arg_t;
 
+/* ARGSUSED */
 static int
-dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
-       dsl_dir_t *dd = arg1;
-       struct renamearg *ra = arg2;
-       dsl_pool_t *dp = dd->dd_pool;
-       objset_t *mos = dp->dp_meta_objset;
-       int err;
-       uint64_t val;
+       int *deltap = arg;
+       char namebuf[MAXNAMELEN];
 
-       /*
-        * There should only be one reference, from dmu_objset_rename().
-        * Fleeting holds are also possible (eg, from "zfs list" getting
-        * stats), but any that are present in open context will likely
-        * be gone by syncing context, so only fail from syncing
-        * context.
-        */
-       if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
-               return (EBUSY);
+       dsl_dataset_name(ds, namebuf);
+
+       if (strlen(namebuf) + *deltap >= MAXNAMELEN)
+               return (ENAMETOOLONG);
+       return (0);
+}
+
+static int
+dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_dir_rename_arg_t *ddra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dir_t *dd, *newparent;
+       const char *mynewname;
+       int error;
+       int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
 
-       /* check for existing name */
-       err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
-           ra->mynewname, 8, 1, &val);
-       if (err == 0)
+       /* target dir should exist */
+       error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
+       if (error != 0)
+               return (error);
+
+       /* new parent should exist */
+       error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
+           &newparent, &mynewname);
+       if (error != 0) {
+               dsl_dir_rele(dd, FTAG);
+               return (error);
+       }
+
+       /* can't rename to different pool */
+       if (dd->dd_pool != newparent->dd_pool) {
+               dsl_dir_rele(newparent, FTAG);
+               dsl_dir_rele(dd, FTAG);
+               return (ENXIO);
+       }
+
+       /* new name should not already exist */
+       if (mynewname == NULL) {
+               dsl_dir_rele(newparent, FTAG);
+               dsl_dir_rele(dd, FTAG);
                return (EEXIST);
-       if (err != ENOENT)
-               return (err);
+       }
+
+       /* if the name length is growing, validate child name lengths */
+       if (delta > 0) {
+               error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
+                   &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+               if (error != 0) {
+                       dsl_dir_rele(newparent, FTAG);
+                       dsl_dir_rele(dd, FTAG);
+                       return (error);
+               }
+       }
 
-       if (ra->newparent != dd->dd_parent) {
+       if (newparent != dd->dd_parent) {
                /* is there enough space? */
                uint64_t myspace =
                    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
 
                /* no rename into our descendant */
-               if (closest_common_ancestor(dd, ra->newparent) == dd)
+               if (closest_common_ancestor(dd, newparent) == dd) {
+                       dsl_dir_rele(newparent, FTAG);
+                       dsl_dir_rele(dd, FTAG);
                        return (EINVAL);
+               }
 
-               if ((err = dsl_dir_transfer_possible(dd->dd_parent,
-                   ra->newparent, myspace)))
-                       return (err);
+               error = dsl_dir_transfer_possible(dd->dd_parent,
+                   newparent, myspace);
+               if (error != 0) {
+                       dsl_dir_rele(newparent, FTAG);
+                       dsl_dir_rele(dd, FTAG);
+                       return (error);
+               }
        }
 
+       dsl_dir_rele(newparent, FTAG);
+       dsl_dir_rele(dd, FTAG);
        return (0);
 }
 
 static void
-dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       struct renamearg *ra = arg2;
-       dsl_pool_t *dp = dd->dd_pool;
+       dsl_dir_rename_arg_t *ddra = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dir_t *dd, *newparent;
+       const char *mynewname;
+       int error;
        objset_t *mos = dp->dp_meta_objset;
-       int err;
-       char namebuf[MAXNAMELEN];
 
-       ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
+       VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
+       VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
+           &mynewname));
 
        /* Log this before we change the name. */
-       dsl_dir_name(ra->newparent, namebuf);
        spa_history_log_internal_dd(dd, "rename", tx,
-           "-> %s/%s", namebuf, ra->mynewname);
+           "-> %s", ddra->ddra_newname);
 
-       if (ra->newparent != dd->dd_parent) {
+       if (newparent != dd->dd_parent) {
                dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
                    -dd->dd_phys->dd_used_bytes,
                    -dd->dd_phys->dd_compressed_bytes,
                    -dd->dd_phys->dd_uncompressed_bytes, tx);
-               dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
+               dsl_dir_diduse_space(newparent, DD_USED_CHILD,
                    dd->dd_phys->dd_used_bytes,
                    dd->dd_phys->dd_compressed_bytes,
                    dd->dd_phys->dd_uncompressed_bytes, tx);
@@ -1326,7 +1267,7 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 
                        dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
                            -unused_rsrv, 0, 0, tx);
-                       dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
+                       dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
                            unused_rsrv, 0, 0, tx);
                }
        }
@@ -1334,52 +1275,36 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
        /* remove from old parent zapobj */
-       err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+       error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
            dd->dd_myname, tx);
-       ASSERT0(err);
+       ASSERT0(error);
 
-       (void) strcpy(dd->dd_myname, ra->mynewname);
-       dsl_dir_close(dd->dd_parent, dd);
-       dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
-       VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
-           ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
+       (void) strcpy(dd->dd_myname, mynewname);
+       dsl_dir_rele(dd->dd_parent, dd);
+       dd->dd_phys->dd_parent_obj = newparent->dd_object;
+       VERIFY0(dsl_dir_hold_obj(dp,
+           newparent->dd_object, NULL, dd, &dd->dd_parent));
 
        /* add to new parent zapobj */
-       err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
-           dd->dd_myname, 8, 1, &dd->dd_object, tx);
-       ASSERT0(err);
+       VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
+           dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+       dsl_prop_notify_all(dd);
 
+       dsl_dir_rele(newparent, FTAG);
+       dsl_dir_rele(dd, FTAG);
 }
 
 int
-dsl_dir_rename(dsl_dir_t *dd, const char *newname)
+dsl_dir_rename(const char *oldname, const char *newname)
 {
-       struct renamearg ra;
-       int err;
+       dsl_dir_rename_arg_t ddra;
 
-       /* new parent should exist */
-       err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
-       if (err)
-               return (err);
+       ddra.ddra_oldname = oldname;
+       ddra.ddra_newname = newname;
 
-       /* can't rename to different pool */
-       if (dd->dd_pool != ra.newparent->dd_pool) {
-               err = ENXIO;
-               goto out;
-       }
-
-       /* new name should not already exist */
-       if (ra.mynewname == NULL) {
-               err = EEXIST;
-               goto out;
-       }
-
-       err = dsl_sync_task_do(dd->dd_pool,
-           dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
-
-out:
-       dsl_dir_close(ra.newparent, FTAG);
-       return (err);
+       return (dsl_sync_task(oldname,
+           dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
 }
 
 int
@@ -1424,6 +1349,4 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_dir_set_quota);
 EXPORT_SYMBOL(dsl_dir_set_reservation);
-EXPORT_SYMBOL(dsl_dir_open);
-EXPORT_SYMBOL(dsl_dir_close);
 #endif
index 7795d8045e92d71c148c7594e64c218ceb31e9e0..b59e056bfb57bbdd54526f87701d29500b417b49 100644 (file)
@@ -43,6 +43,7 @@
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
+#include <sys/dsl_userhold.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;                 /* 1/8th of physical memory */
@@ -264,7 +265,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
        if (err)
                return (err);
 
-       return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
+       return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 }
 
 static dsl_pool_t *
@@ -276,7 +277,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
        dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
        dp->dp_spa = spa;
        dp->dp_meta_rootbp = *bp;
-       rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+       rrw_init(&dp->dp_config_rwlock, B_TRUE);
        dp->dp_write_limit = zfs_write_limit_min;
        txg_init(dp, txg);
 
@@ -287,7 +288,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
        txg_list_create(&dp->dp_dirty_dirs,
            offsetof(dsl_dir_t, dd_dirty_link));
        txg_list_create(&dp->dp_sync_tasks,
-           offsetof(dsl_sync_task_group_t, dstg_node));
+           offsetof(dsl_sync_task_t, dst_node));
 
        mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -324,14 +325,14 @@ dsl_pool_open(dsl_pool_t *dp)
        dsl_dataset_t *ds;
        uint64_t obj;
 
-       rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+       rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
            &dp->dp_root_dir_obj);
        if (err)
                goto out;
 
-       err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+       err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
            NULL, dp, &dp->dp_root_dir);
        if (err)
                goto out;
@@ -352,7 +353,7 @@ dsl_pool_open(dsl_pool_t *dp)
                            &dp->dp_origin_snap);
                        dsl_dataset_rele(ds, FTAG);
                }
-               dsl_dir_close(dd, dp);
+               dsl_dir_rele(dd, dp);
                if (err)
                        goto out;
        }
@@ -367,7 +368,7 @@ dsl_pool_open(dsl_pool_t *dp)
                    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
                if (err)
                        goto out;
-               VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+               VERIFY0(bpobj_open(&dp->dp_free_bpobj,
                    dp->dp_meta_objset, obj));
        }
 
@@ -400,7 +401,7 @@ dsl_pool_open(dsl_pool_t *dp)
        err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
-       rw_exit(&dp->dp_config_rwlock);
+       rrw_exit(&dp->dp_config_rwlock, FTAG);
        return (err);
 }
 
@@ -415,13 +416,13 @@ dsl_pool_close(dsl_pool_t *dp)
         * and not a hold, so just drop that here.
         */
        if (dp->dp_origin_snap)
-               dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
+               dsl_dataset_rele(dp->dp_origin_snap, dp);
        if (dp->dp_mos_dir)
-               dsl_dir_close(dp->dp_mos_dir, dp);
+               dsl_dir_rele(dp->dp_mos_dir, dp);
        if (dp->dp_free_dir)
-               dsl_dir_close(dp->dp_free_dir, dp);
+               dsl_dir_rele(dp->dp_free_dir, dp);
        if (dp->dp_root_dir)
-               dsl_dir_close(dp->dp_root_dir, dp);
+               dsl_dir_rele(dp->dp_root_dir, dp);
 
        bpobj_close(&dp->dp_free_bpobj);
 
@@ -439,7 +440,7 @@ dsl_pool_close(dsl_pool_t *dp)
        dsl_scan_fini(dp);
        dsl_pool_tx_assign_destroy(dp);
        dsl_pool_txg_history_destroy(dp);
-       rw_destroy(&dp->dp_config_rwlock);
+       rrw_destroy(&dp->dp_config_rwlock);
        mutex_destroy(&dp->dp_lock);
        taskq_destroy(dp->dp_iput_taskq);
        if (dp->dp_blkstats)
@@ -457,6 +458,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        dsl_dataset_t *ds;
        uint64_t obj;
 
+       rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
        /* create and open the MOS (meta-objset) */
        dp->dp_meta_objset = dmu_objset_create_impl(spa,
            NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
@@ -467,30 +470,30 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        ASSERT0(err);
 
        /* Initialize scan structures */
-       VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+       VERIFY0(dsl_scan_init(dp, txg));
 
        /* create and open the root dir */
        dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
-       VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+       VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
            NULL, dp, &dp->dp_root_dir));
 
        /* create and open the meta-objset dir */
        (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
-       VERIFY(0 == dsl_pool_open_special_dir(dp,
+       VERIFY0(dsl_pool_open_special_dir(dp,
            MOS_DIR_NAME, &dp->dp_mos_dir));
 
        if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                /* create and open the free dir */
                (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
                    FREE_DIR_NAME, tx);
-               VERIFY(0 == dsl_pool_open_special_dir(dp,
+               VERIFY0(dsl_pool_open_special_dir(dp,
                    FREE_DIR_NAME, &dp->dp_free_dir));
 
                /* create and open the free_bplist */
                obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
                VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
-               VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+               VERIFY0(bpobj_open(&dp->dp_free_bpobj,
                    dp->dp_meta_objset, obj));
        }
 
@@ -501,7 +504,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
        /* create the root objset */
-       VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+       VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
        VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
            dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
 #ifdef _KERNEL
@@ -511,6 +514,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 
        dmu_tx_commit(tx);
 
+       rrw_exit(&dp->dp_config_rwlock, FTAG);
+
        return (dp);
 }
 
@@ -533,10 +538,7 @@ static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
        dsl_deadlist_t *dl = arg;
-       dsl_pool_t *dp = dmu_objset_pool(dl->dl_os);
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
        dsl_deadlist_insert(dl, bp, tx);
-       rw_exit(&dp->dp_config_rwlock);
        return (0);
 }
 
@@ -558,7 +560,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
        /*
         * We need to copy dp_space_towrite() before doing
-        * dsl_sync_task_group_sync(), because
+        * dsl_sync_task_sync(), because
         * dsl_dataset_snapshot_reserve_space() will increase
         * dp_space_towrite but not actually write anything.
         */
@@ -673,14 +675,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         */
        DTRACE_PROBE(pool_sync__3task);
        if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
-               dsl_sync_task_group_t *dstg;
+               dsl_sync_task_t *dst;
                /*
                 * No more sync tasks should have been added while we
                 * were syncing.
                 */
                ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-               while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg)))
-                       dsl_sync_task_group_sync(dstg, tx);
+               while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)))
+                       dsl_sync_task_sync(dst, tx);
        }
 
        dmu_tx_commit(tx);
@@ -857,14 +859,13 @@ dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static int
-upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
        dmu_tx_t *tx = arg;
        dsl_dataset_t *ds, *prev = NULL;
        int err;
-       dsl_pool_t *dp = spa_get_dsl(spa);
 
-       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
        if (err)
                return (err);
 
@@ -890,7 +891,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
                 * The $ORIGIN can't have any data, or the accounting
                 * will be wrong.
                 */
-               ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
+               ASSERT0(prev->ds_phys->ds_bp.blk_birth);
 
                /* The origin doesn't get attached to itself */
                if (ds->ds_object == prev->ds_object) {
@@ -910,13 +911,13 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 
                if (ds->ds_phys->ds_next_snap_obj == 0) {
                        ASSERT(ds->ds_prev == NULL);
-                       VERIFY(0 == dsl_dataset_hold_obj(dp,
+                       VERIFY0(dsl_dataset_hold_obj(dp,
                            ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
                }
        }
 
-       ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
-       ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
+       ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
+       ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
 
        if (prev->ds_phys->ds_next_clones_obj == 0) {
                dmu_buf_will_dirty(prev->ds_dbuf, tx);
@@ -924,7 +925,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
                    zap_create(dp->dp_meta_objset,
                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
        }
-       VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+       VERIFY0(zap_add_int(dp->dp_meta_objset,
            prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 
        dsl_dataset_rele(ds, FTAG);
@@ -939,25 +940,21 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dp->dp_origin_snap != NULL);
 
-       VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+       VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
            tx, DS_FIND_CHILDREN));
 }
 
 /* ARGSUSED */
 static int
-upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
        dmu_tx_t *tx = arg;
-       dsl_dataset_t *ds;
-       dsl_pool_t *dp = spa_get_dsl(spa);
        objset_t *mos = dp->dp_meta_objset;
 
-       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
-       if (ds->ds_dir->dd_phys->dd_origin_obj) {
+       if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
                dsl_dataset_t *origin;
 
-               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+               VERIFY0(dsl_dataset_hold_obj(dp,
                    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 
                if (origin->ds_dir->dd_phys->dd_clones == 0) {
@@ -966,13 +963,11 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
                            DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                }
 
-               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
-                   origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+               VERIFY0(zap_add_int(dp->dp_meta_objset,
+                   origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
 
                dsl_dataset_rele(origin, FTAG);
        }
-
-       dsl_dataset_rele(ds, FTAG);
        return (0);
 }
 
@@ -984,7 +979,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
        ASSERT(dmu_tx_is_syncing(tx));
 
        (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
-       VERIFY(0 == dsl_pool_open_special_dir(dp,
+       VERIFY0(dsl_pool_open_special_dir(dp,
            FREE_DIR_NAME, &dp->dp_free_dir));
 
        /*
@@ -994,12 +989,11 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
         */
        obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
            SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
-       VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+       VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
-       VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
-           dp->dp_meta_objset, obj));
+       VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
-       VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+       VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
            upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 }
 
@@ -1011,17 +1005,16 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dp->dp_origin_snap == NULL);
+       ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
        /* create the origin dir, ds, & snap-ds */
-       rw_enter(&dp->dp_config_rwlock, RW_WRITER);
        dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
            NULL, 0, kcred, tx);
-       VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-       dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
-       VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+       VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+       dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
+       VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
            dp, &dp->dp_origin_snap));
        dsl_dataset_rele(ds, FTAG);
-       rw_exit(&dp->dp_config_rwlock);
 }
 
 taskq_t *
@@ -1056,7 +1049,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
                *htag = '\0';
                ++htag;
                dsobj = strtonum(za.za_name, NULL);
-               (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
+               dsl_dataset_user_release_tmp(dp, dsobj, htag);
        }
        zap_cursor_fini(&zc);
 }
@@ -1078,7 +1071,7 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
-    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
        objset_t *mos = dp->dp_meta_objset;
        uint64_t zapobj = dp->dp_tmp_userrefs_obj;
@@ -1103,7 +1096,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 
        name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
        if (holding)
-               error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+               error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
        else
                error = zap_remove(mos, zapobj, name, tx);
        strfree(name);
@@ -1116,7 +1109,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
-    uint64_t *now, dmu_tx_t *tx)
+    uint64_t now, dmu_tx_t *tx)
 {
        return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 }
@@ -1128,10 +1121,113 @@ int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
-       return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+       return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
            tx, B_FALSE));
 }
 
+/*
+ * DSL Pool Configuration Lock
+ *
+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
+ * creation / destruction / rename / property setting).  It must be held for
+ * read to hold a dataset or dsl_dir.  I.e. you must call
+ * dsl_pool_config_enter() or dsl_pool_hold() before calling
+ * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
+ * must be held continuously until all datasets and dsl_dirs are released.
+ *
+ * The only exception to this rule is that if a "long hold" is placed on
+ * a dataset, then the dp_config_rwlock may be dropped while the dataset
+ * is still held.  The long hold will prevent the dataset from being
+ * destroyed -- the destroy will fail with EBUSY.  A long hold can be
+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
+ * (by calling dsl_{dataset,objset}_{try}own{_obj}).
+ *
+ * Legitimate long-holders (including owners) should be long-running, cancelable
+ * tasks that should cause "zfs destroy" to fail.  This includes DMU
+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
+ * "zfs send", and "zfs diff".  There are several other long-holders whose
+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
+ *
+ * The usual formula for long-holding would be:
+ * dsl_pool_hold()
+ * dsl_dataset_hold()
+ * ... perform checks ...
+ * dsl_dataset_long_hold()
+ * dsl_pool_rele()
+ * ... perform long-running task ...
+ * dsl_dataset_long_rele()
+ * dsl_dataset_rele()
+ *
+ * Note that when the long hold is released, the dataset is still held but
+ * the pool is not held.  The dataset may change arbitrarily during this time
+ * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
+ * dataset except release it.
+ *
+ * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
+ * or modifying operations.
+ *
+ * Modifying operations should generally use dsl_sync_task().  The synctask
+ * infrastructure enforces proper locking strategy with respect to the
+ * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
+ *
+ * Read-only operations will manually hold the pool, then the dataset, obtain
+ * information from the dataset, then release the pool and dataset.
+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool
+ * hold/rele.
+ */
+
+int
+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+{
+       spa_t *spa;
+       int error;
+
+       error = spa_open(name, &spa, tag);
+       if (error == 0) {
+               *dp = spa_get_dsl(spa);
+               dsl_pool_config_enter(*dp, tag);
+       }
+       return (error);
+}
+
+void
+dsl_pool_rele(dsl_pool_t *dp, void *tag)
+{
+       dsl_pool_config_exit(dp, tag);
+       spa_close(dp->dp_spa, tag);
+}
+
+void
+dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+{
+       /*
+        * We use a "reentrant" reader-writer lock, but not reentrantly.
+        *
+        * The rrwlock can (with the track_all flag) track all reading threads,
+        * which is very useful for debugging which code path failed to release
+        * the lock, and for verifying that the *current* thread does hold
+        * the lock.
+        *
+        * (A plain rwlock, by contrast, knows that N threads hold it for
+        * read but not *which* threads, so rw_held(RW_READER) returns TRUE
+        * if any thread holds it for read, even if this thread doesn't.)
+        */
+       ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+       rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+}
+
+void
+dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+{
+       rrw_exit(&dp->dp_config_rwlock, tag);
+}
+
+boolean_t
+dsl_pool_config_held(dsl_pool_t *dp)
+{
+       return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 module_param(zfs_no_write_throttle, int, 0644);
 MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling");
index 153420ccf5f1aa1c9d140227424b2d9b82657557..1d981a7eeaf8f6cf789cf59417a27b0b91161f58 100644 (file)
@@ -82,7 +82,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
        char *inheritstr;
        char *recvdstr;
 
-       ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+       ASSERT(dsl_pool_config_held(dd->dd_pool));
 
        if (setpoint)
                setpoint[0] = '\0';
@@ -97,8 +97,6 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
         * after this loop.
         */
        for (; dd != NULL; dd = dd->dd_parent) {
-               ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
-
                if (dd != target || snapshot) {
                        if (!inheritable)
                                break;
@@ -167,7 +165,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
        boolean_t snapshot;
        uint64_t zapobj;
 
-       ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
        inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
        snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
        zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
@@ -231,22 +229,16 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg)
 {
        dsl_dir_t *dd = ds->ds_dir;
-       dsl_pool_t *dp = dd->dd_pool;
        uint64_t value;
        dsl_prop_cb_record_t *cbr;
        int err;
-       int need_rwlock;
+       ASSERTV(dsl_pool_t *dp = dd->dd_pool);
 
-       need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock);
-       if (need_rwlock)
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
+       ASSERT(dsl_pool_config_held(dp));
 
-       err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL);
-       if (err != 0) {
-               if (need_rwlock)
-                       rw_exit(&dp->dp_config_rwlock);
+       err = dsl_prop_get_int_ds(ds, propname, &value);
+       if (err != 0)
                return (err);
-       }
 
        cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_PUSHPAGE);
        cbr->cbr_ds = ds;
@@ -259,9 +251,6 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
        mutex_exit(&dd->dd_lock);
 
        cbr->cbr_func(cbr->cbr_arg, value);
-
-       if (need_rwlock)
-               rw_exit(&dp->dp_config_rwlock);
        return (0);
 }
 
@@ -269,19 +258,18 @@ int
 dsl_prop_get(const char *dsname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint)
 {
-       dsl_dataset_t *ds;
-       int err;
+       objset_t *os;
+       int error;
 
-       err = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (err)
-               return (err);
+       error = dmu_objset_hold(dsname, FTAG, &os);
+       if (error != 0)
+               return (error);
 
-       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-       err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint);
-       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+       error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
+           intsz, numints, buf, setpoint);
 
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
+       dmu_objset_rele(os, FTAG);
+       return (error);
 }
 
 /*
@@ -299,17 +287,11 @@ dsl_prop_get_integer(const char *ddname, const char *propname,
        return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
 }
 
-void
-dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
-    zprop_source_t source, uint64_t *value)
+int
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
+    uint64_t *valuep)
 {
-       psa->psa_name = propname;
-       psa->psa_source = source;
-       psa->psa_intsz = 8;
-       psa->psa_numints = 1;
-       psa->psa_value = value;
-
-       psa->psa_effective_value = -1ULL;
+       return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
 }
 
 /*
@@ -323,11 +305,10 @@ dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
  * a property not handled by this function.
  */
 int
-dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+    zprop_source_t source, uint64_t value, uint64_t *newvalp)
 {
-       const char *propname = psa->psa_name;
        zfs_prop_t prop = zfs_name_to_prop(propname);
-       zprop_source_t source = psa->psa_source;
        objset_t *mos;
        uint64_t zapobj;
        uint64_t version;
@@ -359,36 +340,33 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
        switch ((int)source) {
        case ZPROP_SRC_NONE:
                /* Revert to the received value, if any. */
-               err = zap_lookup(mos, zapobj, recvdstr, 8, 1,
-                   &psa->psa_effective_value);
+               err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
                if (err == ENOENT)
-                       psa->psa_effective_value = 0;
+                       *newvalp = 0;
                break;
        case ZPROP_SRC_LOCAL:
-               psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+               *newvalp = value;
                break;
        case ZPROP_SRC_RECEIVED:
                /*
                 * If there's no local setting, then the new received value will
                 * be the effective value.
                 */
-               err = zap_lookup(mos, zapobj, propname, 8, 1,
-                   &psa->psa_effective_value);
+               err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
                if (err == ENOENT)
-                       psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+                       *newvalp = value;
                break;
        case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
                /*
                 * We're clearing the received value, so the local setting (if
                 * it exists) remains the effective value.
                 */
-               err = zap_lookup(mos, zapobj, propname, 8, 1,
-                   &psa->psa_effective_value);
+               err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
                if (err == ENOENT)
-                       psa->psa_effective_value = 0;
+                       *newvalp = 0;
                break;
        default:
-               cmn_err(CE_PANIC, "unexpected property source: %d", source);
+               panic("unexpected property source: %d", source);
        }
 
        strfree(recvdstr);
@@ -399,39 +377,6 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
        return (err);
 }
 
-#ifdef ZFS_DEBUG
-void
-dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
-{
-       zfs_prop_t prop = zfs_name_to_prop(psa->psa_name);
-       uint64_t intval;
-       char setpoint[MAXNAMELEN];
-       uint64_t version = spa_version(dd->dd_pool->dp_spa);
-       int err;
-
-       if (version < SPA_VERSION_RECVD_PROPS) {
-               switch (prop) {
-               case ZFS_PROP_QUOTA:
-               case ZFS_PROP_RESERVATION:
-                       return;
-               default:
-                       break;
-               }
-       }
-
-       err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval,
-           setpoint, B_FALSE);
-       if (err == 0 && intval != psa->psa_effective_value) {
-               cmn_err(CE_PANIC, "%s property, source: %x, "
-                   "predicted effective value: %llu, "
-                   "actual effective value: %llu (setpoint: %s)",
-                   psa->psa_name, psa->psa_source,
-                   (unsigned long long)psa->psa_effective_value,
-                   (unsigned long long)intval, setpoint);
-       }
-}
-#endif
-
 /*
  * Unregister this callback.  Return 0 on success, ENOENT if ddname is
  * invalid, ENOMSG if no matching callback registered.
@@ -466,25 +411,57 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
        return (0);
 }
 
-/*
- * Return the number of callbacks that are registered for this dataset.
- */
-int
-dsl_prop_numcb(dsl_dataset_t *ds)
+boolean_t
+dsl_prop_hascb(dsl_dataset_t *ds)
 {
        dsl_dir_t *dd = ds->ds_dir;
+       boolean_t rv = B_FALSE;
        dsl_prop_cb_record_t *cbr;
-       int num = 0;
 
        mutex_enter(&dd->dd_lock);
-       for (cbr = list_head(&dd->dd_prop_cbs);
-           cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-               if (cbr->cbr_ds == ds)
-                       num++;
+       for (cbr = list_head(&dd->dd_prop_cbs); cbr;
+           cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+               if (cbr->cbr_ds == ds) {
+                       rv = B_TRUE;
+                       break;
+               }
        }
        mutex_exit(&dd->dd_lock);
+       return (rv);
+}
 
-       return (num);
+/* ARGSUSED */
+static int
+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+       dsl_dir_t *dd = ds->ds_dir;
+       dsl_prop_cb_record_t *cbr;
+
+       mutex_enter(&dd->dd_lock);
+       for (cbr = list_head(&dd->dd_prop_cbs); cbr;
+           cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+               uint64_t value;
+
+               if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname,
+                   sizeof (value), 1, &value, NULL) == 0)
+                       cbr->cbr_func(cbr->cbr_arg, value);
+       }
+       mutex_exit(&dd->dd_lock);
+
+       return (0);
+}
+
+/*
+ * Notify all registered property callbacks of dd & its descendants with
+ * the current property values.  This is used when renaming the dir.
+ */
+void
+dsl_prop_notify_all(dsl_dir_t *dd)
+{
+       dsl_pool_t *dp = dd->dd_pool;
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+       (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
+           NULL, DS_FIND_CHILDREN);
 }
 
 static void
@@ -498,8 +475,8 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
        zap_attribute_t *za;
        int err;
 
-       ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-       err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+       ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+       err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
        if (err)
                return;
 
@@ -510,7 +487,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
                 */
                err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
                if (err == 0) {
-                       dsl_dir_close(dd, FTAG);
+                       dsl_dir_rele(dd, FTAG);
                        return;
                }
                ASSERT3U(err, ==, ENOENT);
@@ -545,26 +522,24 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
        }
        kmem_free(za, sizeof (zap_attribute_t));
        zap_cursor_fini(&zc);
-       dsl_dir_close(dd, FTAG);
+       dsl_dir_rele(dd, FTAG);
 }
 
 void
-dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
+    zprop_source_t source, int intsz, int numints, const void *value,
+    dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_prop_setarg_t *psa = arg2;
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
        uint64_t zapobj, intval, dummy;
        int isint;
        char valbuf[32];
-       char *valstr = NULL;
+       const char *valstr = NULL;
        char *inheritstr;
        char *recvdstr;
        char *tbuf = NULL;
        int err;
        uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-       const char *propname = psa->psa_name;
-       zprop_source_t source = psa->psa_source;
 
        isint = (dodefault(propname, 8, 1, &intval) == 0);
 
@@ -614,8 +589,8 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                 */
                err = zap_remove(mos, zapobj, inheritstr, tx);
                ASSERT(err == 0 || err == ENOENT);
-               VERIFY(0 == zap_update(mos, zapobj, propname,
-                   psa->psa_intsz, psa->psa_numints, psa->psa_value, tx));
+               VERIFY0(zap_update(mos, zapobj, propname,
+                   intsz, numints, value, tx));
                break;
        case ZPROP_SRC_INHERITED:
                /*
@@ -626,12 +601,10 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                err = zap_remove(mos, zapobj, propname, tx);
                ASSERT(err == 0 || err == ENOENT);
                if (version >= SPA_VERSION_RECVD_PROPS &&
-                   dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy,
-                   NULL) == 0) {
+                   dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
                        dummy = 0;
-                       err = zap_update(mos, zapobj, inheritstr,
-                           8, 1, &dummy, tx);
-                       ASSERT(err == 0);
+                       VERIFY0(zap_update(mos, zapobj, inheritstr,
+                           8, 1, &dummy, tx));
                }
                break;
        case ZPROP_SRC_RECEIVED:
@@ -639,7 +612,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                 * set propname$recvd -> value
                 */
                err = zap_update(mos, zapobj, recvdstr,
-                   psa->psa_intsz, psa->psa_numints, psa->psa_value, tx);
+                   intsz, numints, value, tx);
                ASSERT(err == 0);
                break;
        case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
@@ -669,7 +642,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
        strfree(recvdstr);
 
        if (isint) {
-               VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL));
+               VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
 
                if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
                        dsl_prop_cb_record_t *cbr;
@@ -696,7 +669,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                valstr = valbuf;
        } else {
                if (source == ZPROP_SRC_LOCAL) {
-                       valstr = (char *)psa->psa_value;
+                       valstr = value;
                } else {
                        tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_PUSHPAGE);
                        if (dsl_prop_get_ds(ds, propname, 1,
@@ -713,118 +686,73 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                kmem_free(tbuf, ZAP_MAXVALUELEN);
 }
 
-void
-dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+int
+dsl_prop_set_int(const char *dsname, const char *propname,
+    zprop_source_t source, uint64_t value)
 {
-       dsl_dataset_t *ds = arg1;
-       dsl_props_arg_t *pa = arg2;
-       nvlist_t *props = pa->pa_props;
-       dsl_prop_setarg_t psa;
-       nvpair_t *elem = NULL;
-
-       psa.psa_source = pa->pa_source;
-
-       while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
-               nvpair_t *pair = elem;
-
-               psa.psa_name = nvpair_name(pair);
+       nvlist_t *nvl = fnvlist_alloc();
+       int error;
 
-               if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
-                       /*
-                        * dsl_prop_get_all_impl() returns properties in this
-                        * format.
-                        */
-                       nvlist_t *attrs;
-                       VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
-                       VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
-                           &pair) == 0);
-               }
-
-               if (nvpair_type(pair) == DATA_TYPE_STRING) {
-                       VERIFY(nvpair_value_string(pair,
-                           (char **)&psa.psa_value) == 0);
-                       psa.psa_intsz = 1;
-                       psa.psa_numints = strlen(psa.psa_value) + 1;
-               } else {
-                       uint64_t intval;
-                       VERIFY(nvpair_value_uint64(pair, &intval) == 0);
-                       psa.psa_intsz = sizeof (intval);
-                       psa.psa_numints = 1;
-                       psa.psa_value = &intval;
-               }
-               dsl_prop_set_sync(ds, &psa, tx);
-       }
+       fnvlist_add_uint64(nvl, propname, value);
+       error = dsl_props_set(dsname, source, nvl);
+       fnvlist_free(nvl);
+       return (error);
 }
 
 int
-dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source,
-    int intsz, int numints, const void *buf)
+dsl_prop_set_string(const char *dsname, const char *propname,
+    zprop_source_t source, const char *value)
 {
-       dsl_dataset_t *ds;
-       uint64_t version;
-       int err;
-       dsl_prop_setarg_t psa;
-
-       /*
-        * We must do these checks before we get to the syncfunc, since
-        * it can't fail.
-        */
-       if (strlen(propname) >= ZAP_MAXNAMELEN)
-               return (ENAMETOOLONG);
-
-       err = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (err)
-               return (err);
-
-       version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-       if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
-           ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
-               dsl_dataset_rele(ds, FTAG);
-               return (E2BIG);
-       }
-       if (dsl_dataset_is_snapshot(ds) &&
-           version < SPA_VERSION_SNAP_PROPS) {
-               dsl_dataset_rele(ds, FTAG);
-               return (ENOTSUP);
-       }
+       nvlist_t *nvl = fnvlist_alloc();
+       int error;
 
-       psa.psa_name = propname;
-       psa.psa_source = source;
-       psa.psa_intsz = intsz;
-       psa.psa_numints = numints;
-       psa.psa_value = buf;
-       psa.psa_effective_value = -1ULL;
+       fnvlist_add_string(nvl, propname, value);
+       error = dsl_props_set(dsname, source, nvl);
+       fnvlist_free(nvl);
+       return (error);
+}
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           NULL, dsl_prop_set_sync, ds, &psa, 2);
+int
+dsl_prop_inherit(const char *dsname, const char *propname,
+    zprop_source_t source)
+{
+       nvlist_t *nvl = fnvlist_alloc();
+       int error;
 
-       dsl_dataset_rele(ds, FTAG);
-       return (err);
+       fnvlist_add_boolean(nvl, propname);
+       error = dsl_props_set(dsname, source, nvl);
+       fnvlist_free(nvl);
+       return (error);
 }
 
-int
-dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+typedef struct dsl_props_set_arg {
+       const char *dpsa_dsname;
+       zprop_source_t dpsa_source;
+       nvlist_t *dpsa_props;
+} dsl_props_set_arg_t;
+
+static int
+dsl_props_set_check(void *arg, dmu_tx_t *tx)
 {
+       dsl_props_set_arg_t *dpsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
        dsl_dataset_t *ds;
        uint64_t version;
        nvpair_t *elem = NULL;
-       dsl_props_arg_t pa;
        int err;
 
-       if ((err = dsl_dataset_hold(dsname, FTAG, &ds)))
+       err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
+       if (err != 0)
                return (err);
-       /*
-        * Do these checks before the syncfunc, since it can't fail.
-        */
+
        version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-       while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+       while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
                if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
                        dsl_dataset_rele(ds, FTAG);
                        return (ENAMETOOLONG);
                }
                if (nvpair_type(elem) == DATA_TYPE_STRING) {
-                       char *valstr;
-                       VERIFY(nvpair_value_string(elem, &valstr) == 0);
+                       char *valstr = fnvpair_value_string(elem);
                        if (strlen(valstr) >= (version <
                            SPA_VERSION_STMF_PROP ?
                            ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
@@ -834,20 +762,83 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
                }
        }
 
-       if (dsl_dataset_is_snapshot(ds) &&
-           version < SPA_VERSION_SNAP_PROPS) {
+       if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) {
                dsl_dataset_rele(ds, FTAG);
                return (ENOTSUP);
        }
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+void
+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
+    nvlist_t *props, dmu_tx_t *tx)
+{
+       nvpair_t *elem = NULL;
+
+       while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+               nvpair_t *pair = elem;
 
-       pa.pa_props = props;
-       pa.pa_source = source;
+               if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+                       /*
+                        * dsl_prop_get_all_impl() returns properties in this
+                        * format.
+                        */
+                       nvlist_t *attrs = fnvpair_value_nvlist(pair);
+                       pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+               }
+
+               if (nvpair_type(pair) == DATA_TYPE_STRING) {
+                       const char *value = fnvpair_value_string(pair);
+                       dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+                           source, 1, strlen(value) + 1, value, tx);
+               } else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
+                       uint64_t intval = fnvpair_value_uint64(pair);
+                       dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+                           source, sizeof (intval), 1, &intval, tx);
+               } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
+                       dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+                           source, 0, 0, NULL, tx);
+               } else {
+                       panic("invalid nvpair type");
+               }
+       }
+}
 
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           NULL, dsl_props_set_sync, ds, &pa, 2);
+static void
+dsl_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_props_set_arg_t *dpsa = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
 
+       VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
+       dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
        dsl_dataset_rele(ds, FTAG);
-       return (err);
+}
+
+/*
+ * All-or-nothing; if any prop can't be set, nothing will be modified.
+ */
+int
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+{
+       dsl_props_set_arg_t dpsa;
+       int nblks = 0;
+
+       dpsa.dpsa_dsname = dsname;
+       dpsa.dpsa_source = source;
+       dpsa.dpsa_props = props;
+
+       /*
+        * If the source includes NONE, then we will only be removing entries
+        * from the ZAP object.  In that case don't check for ENOSPC.
+        */
+       if ((source & ZPROP_SRC_NONE) == 0)
+               nblks = 2 * fnvlist_num_pairs(props);
+
+       return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
+           &dpsa, nblks));
 }
 
 typedef enum dsl_prop_getflags {
@@ -997,7 +988,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
        if (dsl_dataset_is_snapshot(ds))
                flags |= DSL_PROP_GET_SNAPSHOT;
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       ASSERT(dsl_pool_config_held(dp));
 
        if (ds->ds_phys->ds_props_obj != 0) {
                ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
@@ -1022,58 +1013,51 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
                        break;
        }
 out:
-       rw_exit(&dp->dp_config_rwlock);
        return (err);
 }
 
 boolean_t
-dsl_prop_get_hasrecvd(objset_t *os)
+dsl_prop_get_hasrecvd(const char *dsname)
 {
-       dsl_dataset_t *ds = os->os_dsl_dataset;
-       int rc;
        uint64_t dummy;
 
-       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-       rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL);
-       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
-       ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
-       return (rc == 0);
+       return (0 ==
+           dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
 }
 
-static void
-dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source)
+static int
+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
 {
-       dsl_dataset_t *ds = os->os_dsl_dataset;
-       uint64_t dummy = 0;
-       dsl_prop_setarg_t psa;
-
-       if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS)
-               return;
+       uint64_t version;
+       spa_t *spa;
+       int error = 0;
 
-       dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy);
+       VERIFY0(spa_open(dsname, &spa, FTAG));
+       version = spa_version(spa);
+       spa_close(spa, FTAG);
 
-       (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL,
-           dsl_prop_set_sync, ds, &psa, 2);
+       if (version >= SPA_VERSION_RECVD_PROPS)
+               error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
+       return (error);
 }
 
 /*
  * Call after successfully receiving properties to ensure that only the first
  * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
  */
-void
-dsl_prop_set_hasrecvd(objset_t *os)
+int
+dsl_prop_set_hasrecvd(const char *dsname)
 {
-       if (dsl_prop_get_hasrecvd(os)) {
-               ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
-               return;
-       }
-       dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL);
+       int error = 0;
+       if (!dsl_prop_get_hasrecvd(dsname))
+               error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
+       return (error);
 }
 
 void
-dsl_prop_unset_hasrecvd(objset_t *os)
+dsl_prop_unset_hasrecvd(const char *dsname)
 {
-       dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE);
+       VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
 }
 
 int
@@ -1083,16 +1067,25 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
 }
 
 int
-dsl_prop_get_received(objset_t *os, nvlist_t **nvp)
+dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
 {
+       objset_t *os;
+       int error;
+
        /*
         * Received properties are not distinguishable from local properties
         * until the dataset has received properties on or after
         * SPA_VERSION_RECVD_PROPS.
         */
-       dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ?
+       dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
            DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
-       return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags));
+
+       error = dmu_objset_hold(dsname, FTAG, &os);
+       if (error != 0)
+               return (error);
+       error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
+       dmu_objset_rele(os, FTAG);
+       return (error);
 }
 
 void
@@ -1138,8 +1131,6 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_prop_register);
 EXPORT_SYMBOL(dsl_prop_unregister);
-EXPORT_SYMBOL(dsl_prop_numcb);
-EXPORT_SYMBOL(dsl_prop_set);
 EXPORT_SYMBOL(dsl_prop_get);
 EXPORT_SYMBOL(dsl_prop_get_integer);
 EXPORT_SYMBOL(dsl_prop_get_all);
index 90ca7b2566067dc91b6aa92314d6d16134c32e2c..2e5034bdffdc01937772e4a149e198472b64f55e 100644 (file)
@@ -53,7 +53,7 @@
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
-static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
 int zfs_top_maxinflight = 32;          /* maximum I/Os per top-level */
@@ -150,9 +150,9 @@ dsl_scan_fini(dsl_pool_t *dp)
 
 /* ARGSUSED */
 static int
-dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_scan_t *scn = arg1;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
        if (scn->scn_phys.scn_state == DSS_SCANNING)
                return (EBUSY);
@@ -160,12 +160,11 @@ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
        return (0);
 }
 
-/* ARGSUSED */
 static void
-dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_scan_t *scn = arg1;
-       pool_scan_func_t *funcp = arg2;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+       pool_scan_func_t *funcp = arg;
        dmu_object_type_t ot = 0;
        dsl_pool_t *dp = scn->scn_dp;
        spa_t *spa = dp->dp_spa;
@@ -312,9 +311,9 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static int
-dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
-       dsl_scan_t *scn = arg1;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
        if (scn->scn_phys.scn_state != DSS_SCANNING)
                return (ENOENT);
@@ -323,9 +322,9 @@ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 {
-       dsl_scan_t *scn = arg1;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
        dsl_scan_done(scn, B_FALSE, tx);
        dsl_scan_sync_state(scn, tx);
@@ -334,12 +333,8 @@ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
-       boolean_t complete = B_FALSE;
-       int err;
-
-       err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
-           dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
-       return (err);
+       return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+           dsl_scan_cancel_sync, NULL, 3));
 }
 
 static void dsl_scan_visitbp(blkptr_t *bp,
@@ -375,7 +370,7 @@ dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
-       VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+       VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
            &scn->scn_phys, tx));
@@ -959,33 +954,33 @@ struct enqueue_clones_arg {
 
 /* ARGSUSED */
 static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
        struct enqueue_clones_arg *eca = arg;
        dsl_dataset_t *ds;
        int err;
-       dsl_pool_t *dp = spa->spa_dsl_pool;
        dsl_scan_t *scn = dp->dp_scan;
 
-       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
+               return (0);
+
+       err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
        if (err)
                return (err);
 
-       if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
-               while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
-                       dsl_dataset_t *prev;
-                       err = dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+       while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+               dsl_dataset_t *prev;
+               err = dsl_dataset_hold_obj(dp,
+                   ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
 
-                       dsl_dataset_rele(ds, FTAG);
-                       if (err)
-                               return (err);
-                       ds = prev;
-               }
-               VERIFY(zap_add_int_key(dp->dp_meta_objset,
-                   scn->scn_phys.scn_queue_obj, ds->ds_object,
-                   ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+               dsl_dataset_rele(ds, FTAG);
+               if (err)
+                       return (err);
+               ds = prev;
        }
+       VERIFY(zap_add_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds->ds_object,
+           ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
@@ -1075,17 +1070,17 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
                }
 
                if (usenext) {
-                       VERIFY(zap_join_key(dp->dp_meta_objset,
+                       VERIFY0(zap_join_key(dp->dp_meta_objset,
                            ds->ds_phys->ds_next_clones_obj,
                            scn->scn_phys.scn_queue_obj,
-                           ds->ds_phys->ds_creation_txg, tx) == 0);
+                           ds->ds_phys->ds_creation_txg, tx));
                } else {
                        struct enqueue_clones_arg eca;
                        eca.tx = tx;
                        eca.originobj = ds->ds_object;
 
-                       (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
-                           NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+                       VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+                           enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
                }
        }
 
@@ -1095,15 +1090,14 @@ out:
 
 /* ARGSUSED */
 static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
        dmu_tx_t *tx = arg;
        dsl_dataset_t *ds;
        int err;
-       dsl_pool_t *dp = spa->spa_dsl_pool;
        dsl_scan_t *scn = dp->dp_scan;
 
-       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
        if (err)
                return (err);
 
@@ -1261,8 +1255,8 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
                        return;
 
                if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-                       VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
-                           NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+                       VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+                           enqueue_cb, tx, DS_FIND_CHILDREN));
                } else {
                        dsl_scan_visitds(scn,
                            dp->dp_origin_snap->ds_object, tx);
@@ -1402,7 +1396,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                        func = POOL_SCAN_RESILVER;
                zfs_dbgmsg("restarting scan func=%u txg=%llu",
                    func, tx->tx_txg);
-               dsl_scan_setup_sync(scn, &func, tx);
+               dsl_scan_setup_sync(&func, tx);
        }
 
        if (!dsl_scan_active(scn) ||
@@ -1436,21 +1430,21 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                        err = bptree_iterate(dp->dp_meta_objset,
                            dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
                            scn, tx);
-                       VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
-                       if (err != 0)
-                               return;
-
-                       /* disable async destroy feature */
-                       spa_feature_decr(spa,
-                           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
-                       ASSERT(!spa_feature_is_active(spa,
-                           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
-                       VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
-                           DMU_POOL_DIRECTORY_OBJECT,
-                           DMU_POOL_BPTREE_OBJ, tx));
-                       VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
-                           dp->dp_bptree_obj, tx));
-                       dp->dp_bptree_obj = 0;
+                       VERIFY0(zio_wait(scn->scn_zio_root));
+
+                       if (err == 0) {
+                               zfeature_info_t *feat = &spa_feature_table
+                                   [SPA_FEATURE_ASYNC_DESTROY];
+                               /* finished; deactivate async destroy feature */
+                               spa_feature_decr(spa, feat, tx);
+                               ASSERT(!spa_feature_is_active(spa, feat));
+                               VERIFY0(zap_remove(dp->dp_meta_objset,
+                                   DMU_POOL_DIRECTORY_OBJECT,
+                                   DMU_POOL_BPTREE_OBJ, tx));
+                               VERIFY0(bptree_free(dp->dp_meta_objset,
+                                   dp->dp_bptree_obj, tx));
+                               dp->dp_bptree_obj = 0;
+                       }
                }
                if (scn->scn_visited_this_txg) {
                        zfs_dbgmsg("freed %llu blocks in %llums from "
@@ -1497,7 +1491,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 
        scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
            NULL, ZIO_FLAG_CANFAIL);
+       dsl_pool_config_enter(dp, FTAG);
        dsl_scan_visit(scn, tx);
+       dsl_pool_config_exit(dp, FTAG);
        (void) zio_wait(scn->scn_zio_root);
        scn->scn_zio_root = NULL;
 
@@ -1734,8 +1730,8 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
        spa->spa_scrub_reopen = B_FALSE;
        (void) spa_vdev_state_exit(spa, NULL, 0);
 
-       return (dsl_sync_task_do(dp, dsl_scan_setup_check,
-           dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+       return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+           dsl_scan_setup_sync, &func, 0));
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
index 2ed47fe0c95c58f565c6186ec23a97a8a3fdeeaf..6eb712314db0348c7ad90cdb06e5f3ce5a95fe18 100644 (file)
 
 /* ARGSUSED */
 static int
-dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
 {
        return (0);
 }
 
-dsl_sync_task_group_t *
-dsl_sync_task_group_create(dsl_pool_t *dp)
-{
-       dsl_sync_task_group_t *dstg;
-
-       dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
-       list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
-           offsetof(dsl_sync_task_t, dst_node));
-       dstg->dstg_pool = dp;
-
-       return (dstg);
-}
-
-void
-dsl_sync_task_create(dsl_sync_task_group_t *dstg,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-       dsl_sync_task_t *dst;
-
-       if (checkfunc == NULL)
-               checkfunc = dsl_null_checkfunc;
-       dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
-       dst->dst_checkfunc = checkfunc;
-       dst->dst_syncfunc = syncfunc;
-       dst->dst_arg1 = arg1;
-       dst->dst_arg2 = arg2;
-       list_insert_tail(&dstg->dstg_tasks, dst);
-
-       dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
-}
-
+/*
+ * Called from open context to perform a callback in syncing context.  Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail.  If it succeeds, it will be called again from
+ * syncing context.  The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called.  It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument.  Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset.  The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
 int
-dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified)
 {
+       spa_t *spa;
        dmu_tx_t *tx;
-       uint64_t txg;
-       dsl_sync_task_t *dst;
-
-top:
-       tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
-       VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
-
-       txg = dmu_tx_get_txg(tx);
+       int err;
+       dsl_sync_task_t dst = { { { NULL } } };
+       dsl_pool_t *dp;
 
-       /* Do a preliminary error check. */
-       dstg->dstg_err = 0;
-#ifdef ZFS_DEBUG
-       /*
-        * Only check half the time, otherwise, the sync-context
-        * check will almost never fail.
-        */
-       if (spa_get_random(2) == 0)
-               goto skip;
-#endif
-       rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
-       for (dst = list_head(&dstg->dstg_tasks); dst;
-           dst = list_next(&dstg->dstg_tasks, dst)) {
-               dst->dst_err =
-                   dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
-               if (dst->dst_err)
-                       dstg->dstg_err = dst->dst_err;
-       }
-       rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+       err = spa_open(pool, &spa, FTAG);
+       if (err != 0)
+               return (err);
+       dp = spa_get_dsl(spa);
 
-       if (dstg->dstg_err) {
+top:
+       tx = dmu_tx_create_dd(dp->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+       dst.dst_pool = dp;
+       dst.dst_txg = dmu_tx_get_txg(tx);
+       dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+       dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+       dst.dst_syncfunc = syncfunc;
+       dst.dst_arg = arg;
+       dst.dst_error = 0;
+       dst.dst_nowaiter = B_FALSE;
+
+       dsl_pool_config_enter(dp, FTAG);
+       err = dst.dst_checkfunc(arg, tx);
+       dsl_pool_config_exit(dp, FTAG);
+
+       if (err != 0) {
                dmu_tx_commit(tx);
-               return (dstg->dstg_err);
+               spa_close(spa, FTAG);
+               return (err);
        }
-#ifdef ZFS_DEBUG
-skip:
-#endif
 
-       /*
-        * We don't generally have many sync tasks, so pay the price of
-        * add_tail to get the tasks executed in the right order.
-        */
-       VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
-           dstg, txg));
+       VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
 
        dmu_tx_commit(tx);
 
-       txg_wait_synced(dstg->dstg_pool, txg);
+       txg_wait_synced(dp, dst.dst_txg);
 
-       if (dstg->dstg_err == EAGAIN) {
-               txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
+       if (dst.dst_error == EAGAIN) {
+               txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
                goto top;
        }
 
-       return (dstg->dstg_err);
+       spa_close(spa, FTAG);
+       return (dst.dst_error);
 }
 
 void
-dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, dmu_tx_t *tx)
 {
-       uint64_t txg;
+       dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
-       dstg->dstg_nowaiter = B_TRUE;
-       txg = dmu_tx_get_txg(tx);
-       /*
-        * We don't generally have many sync tasks, so pay the price of
-        * add_tail to get the tasks executed in the right order.
-        */
-       VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
-           dstg, txg));
-}
-
-void
-dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
-{
-       dsl_sync_task_t *dst;
+       dst->dst_pool = dp;
+       dst->dst_txg = dmu_tx_get_txg(tx);
+       dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+       dst->dst_checkfunc = dsl_null_checkfunc;
+       dst->dst_syncfunc = syncfunc;
+       dst->dst_arg = arg;
+       dst->dst_error = 0;
+       dst->dst_nowaiter = B_TRUE;
 
-       while ((dst = list_head(&dstg->dstg_tasks))) {
-               list_remove(&dstg->dstg_tasks, dst);
-               kmem_free(dst, sizeof (dsl_sync_task_t));
-       }
-       kmem_free(dstg, sizeof (dsl_sync_task_group_t));
+       VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
 }
 
+/*
+ * Called in syncing context to execute the synctask.
+ */
 void
-dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
 {
-       dsl_sync_task_t *dst;
-       dsl_pool_t *dp = dstg->dstg_pool;
+       dsl_pool_t *dp = dst->dst_pool;
        uint64_t quota, used;
 
-       ASSERT0(dstg->dstg_err);
+       ASSERT0(dst->dst_error);
 
        /*
         * Check for sufficient space.  We just check against what's
@@ -177,70 +154,24 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
            metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
        used = dp->dp_root_dir->dd_phys->dd_used_bytes;
        /* MOS space is triple-dittoed, so we multiply by 3. */
-       if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) {
-               dstg->dstg_err = ENOSPC;
+       if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+               dst->dst_error = ENOSPC;
+               if (dst->dst_nowaiter)
+                       kmem_free(dst, sizeof (*dst));
                return;
        }
 
        /*
-        * Check for errors by calling checkfuncs.
+        * Check for errors by calling checkfunc.
         */
-       rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-       for (dst = list_head(&dstg->dstg_tasks); dst;
-           dst = list_next(&dstg->dstg_tasks, dst)) {
-               dst->dst_err =
-                   dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
-               if (dst->dst_err)
-                       dstg->dstg_err = dst->dst_err;
-       }
-
-       if (dstg->dstg_err == 0) {
-               /*
-                * Execute sync tasks.
-                */
-               for (dst = list_head(&dstg->dstg_tasks); dst;
-                   dst = list_next(&dstg->dstg_tasks, dst)) {
-                       dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
-               }
-       }
-       rw_exit(&dp->dp_config_rwlock);
-
-       if (dstg->dstg_nowaiter)
-               dsl_sync_task_group_destroy(dstg);
-}
-
-int
-dsl_sync_task_do(dsl_pool_t *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-       dsl_sync_task_group_t *dstg;
-       int err;
-
-       ASSERT(spa_writeable(dp->dp_spa));
-
-       dstg = dsl_sync_task_group_create(dp);
-       dsl_sync_task_create(dstg, checkfunc, syncfunc,
-           arg1, arg2, blocks_modified);
-       err = dsl_sync_task_group_wait(dstg);
-       dsl_sync_task_group_destroy(dstg);
-       return (err);
-}
-
-void
-dsl_sync_task_do_nowait(dsl_pool_t *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx)
-{
-       dsl_sync_task_group_t *dstg;
-
-       dstg = dsl_sync_task_group_create(dp);
-       dsl_sync_task_create(dstg, checkfunc, syncfunc,
-           arg1, arg2, blocks_modified);
-       dsl_sync_task_group_nowait(dstg, tx);
+       rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+       dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+       if (dst->dst_error == 0)
+               dst->dst_syncfunc(dst->dst_arg, tx);
+       rrw_exit(&dp->dp_config_rwlock, FTAG);
+       if (dst->dst_nowaiter)
+               kmem_free(dst, sizeof (*dst));
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
-EXPORT_SYMBOL(dsl_sync_task_do);
-EXPORT_SYMBOL(dsl_sync_task_do_nowait);
 #endif
diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c
new file mode 100644 (file)
index 0000000..c8bc442
--- /dev/null
@@ -0,0 +1,537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+/*
+ * Argument bundle handed to the user-hold check and sync functions
+ * through dsl_sync_task().
+ */
+typedef struct dsl_dataset_user_hold_arg {
+       /* holds to place: snapname -> holdname (tag) */
+       nvlist_t *dduha_holds;
+       /* per-snapshot failures are recorded here: snapname -> error */
+       nvlist_t *dduha_errlist;
+       /* cleanup minor; nonzero means the holds are temporary */
+       minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * Check whether the user hold "htag" may be placed on "ds".
+ *
+ * Returns E2BIG if the tag is longer than MAXNAMELEN or, for a temporary
+ * hold, if the tag length plus MAX_TAG_PREFIX_LEN reaches MAXNAMELEN.
+ * When ds is non-NULL and already has a userrefs object, returns EEXIST
+ * if the tag is already present in it; any other zap_lookup() failure
+ * except ENOENT is returned unchanged.  Returns 0 otherwise.
+ *
+ * ds may be NULL, in which case only the length checks are performed.
+ *
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */
+int
+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
+    boolean_t temphold, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
+       int error = 0;
+
+       if (strlen(htag) > MAXNAMELEN)
+               return (E2BIG);
+       /* Tempholds have a more restricted length */
+       if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+               return (E2BIG);
+
+       /* tags must be unique (if ds already exists) */
+       if (ds != NULL) {
+               mutex_enter(&ds->ds_lock);
+               if (ds->ds_phys->ds_userrefs_obj != 0) {
+                       uint64_t value;
+                       error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
+                           htag, 8, 1, &value);
+                       /* found: tag already in use; ENOENT: tag is free */
+                       if (error == 0)
+                               error = EEXIST;
+                       else if (error == ENOENT)
+                               error = 0;
+               }
+               mutex_exit(&ds->ds_lock);
+       }
+
+       return (error);
+}
+
+/*
+ * Check function for dsl_dataset_user_hold().
+ *
+ * Returns ENOTSUP if the pool version predates SPA_VERSION_USERREFS.
+ * Otherwise every pair in dduha_holds is validated: the name must contain
+ * '@' (EINVAL if not), the value must be a string, the dataset must be
+ * holdable, and dsl_dataset_user_hold_check_one() must accept the tag.
+ * All pairs are examined even after a failure; each failure is recorded
+ * in dduha_errlist (when one was supplied) and the last error seen is
+ * returned, or 0 if every pair passed.
+ */
+static int
+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_hold_arg_t *dduha = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
+       int rv = 0;
+
+       if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
+               return (ENOTSUP);
+
+       for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+               int error = 0;
+               dsl_dataset_t *ds;
+               char *htag;
+
+               /* must be a snapshot */
+               if (strchr(nvpair_name(pair), '@') == NULL)
+                       error = EINVAL;
+
+               if (error == 0)
+                       error = nvpair_value_string(pair, &htag);
+               if (error == 0) {
+                       error = dsl_dataset_hold(dp,
+                           nvpair_name(pair), FTAG, &ds);
+               }
+               if (error == 0) {
+                       error = dsl_dataset_user_hold_check_one(ds, htag,
+                           dduha->dduha_minor != 0, tx);
+                       dsl_dataset_rele(ds, FTAG);
+               }
+
+               if (error != 0) {
+                       rv = error;
+                       /*
+                        * The error list is optional, exactly as in
+                        * dsl_dataset_user_release_check(); do not hand
+                        * a NULL list to fnvlist_add_int32().
+                        */
+                       if (dduha->dduha_errlist != NULL) {
+                               fnvlist_add_int32(dduha->dduha_errlist,
+                                   nvpair_name(pair), error);
+                       }
+               }
+       }
+       return (rv);
+}
+
+/*
+ * Place the user hold "htag" on "ds".
+ *
+ * Creates the dataset's userrefs ZAP object if this is its first hold,
+ * bumps the in-core ds_userrefs count and stores htag -> now in the ZAP.
+ * When minor is nonzero the hold is temporary: it is also registered with
+ * dsl_pool_user_hold() and an onexit cleanup callback is installed for
+ * that minor.  The hold is recorded in the pool history.
+ *
+ * Any failure of zap_add() or dsl_pool_user_hold() is fatal (VERIFY0).
+ */
+void
+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
+    minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t zapobj;
+
+       mutex_enter(&ds->ds_lock);
+       if (ds->ds_phys->ds_userrefs_obj == 0) {
+               /*
+                * This is the first user hold for this dataset.  Create
+                * the userrefs zap object.
+                */
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               zapobj = ds->ds_phys->ds_userrefs_obj =
+                   zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+       } else {
+               zapobj = ds->ds_phys->ds_userrefs_obj;
+       }
+       ds->ds_userrefs++;
+       mutex_exit(&ds->ds_lock);
+
+       VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+       if (minor != 0) {
+               VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
+                   htag, now, tx));
+               dsl_register_onexit_hold_cleanup(ds, htag, minor);
+       }
+
+       /*
+        * Cast explicitly so the argument matches %llu on every platform,
+        * as the release path does for its %lld.
+        */
+       spa_history_log_internal_ds(ds, "hold", tx,
+           "tag=%s temp=%d refs=%llu",
+           htag, minor != 0, (u_longlong_t)ds->ds_userrefs);
+}
+
+/*
+ * Sync function for dsl_dataset_user_hold(): place every hold listed in
+ * dduha_holds.  All holds placed by one call share the same timestamp.
+ */
+static void
+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_hold_arg_t *dduha = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
+       uint64_t now = gethrestime_sec();
+
+       for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+               dsl_dataset_t *ds;
+               /* a dataset that cannot be held at this point is fatal */
+               VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+               dsl_dataset_user_hold_sync_one(ds, fnvpair_value_string(pair),
+                   dduha->dduha_minor, now, tx);
+               dsl_dataset_rele(ds, FTAG);
+       }
+}
+
+/*
+ * Place user holds on a set of snapshots in a single sync task.
+ *
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ * if cleanup_minor is not 0, the holds will be temporary, cleaned up
+ * when the process exits.
+ *
+ * if any fails, all will fail.
+ *
+ * Returns 0 without doing anything when holds is empty; otherwise returns
+ * the result of dsl_sync_task().
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+       dsl_dataset_user_hold_arg_t dduha;
+       nvpair_t *pair;
+
+       pair = nvlist_next_nvpair(holds, NULL);
+       if (pair == NULL)
+               return (0);
+
+       dduha.dduha_holds = holds;
+       dduha.dduha_errlist = errlist;
+       dduha.dduha_minor = cleanup_minor;
+
+       /*
+        * The first snapshot name is what dsl_sync_task() is given to
+        * identify the pool (presumably all snapshots share one pool --
+        * TODO confirm against callers).
+        */
+       return (dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+           dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds)));
+}
+
+/*
+ * Argument bundle handed to the user-release check and sync functions
+ * through dsl_sync_task().
+ */
+typedef struct dsl_dataset_user_release_arg {
+       /* holds to release: snapname -> { holdname, ... } */
+       nvlist_t *ddura_holds;
+       /* snapshots the check function found must also be destroyed */
+       nvlist_t *ddura_todelete;
+       /* optional (may be NULL): snapname -> error */
+       nvlist_t *ddura_errlist;
+} dsl_dataset_user_release_arg_t;
+
+/*
+ * Check whether every hold named in "holds" can be released from "ds".
+ *
+ * Returns EINVAL if ds is not a snapshot, ESRCH if it has no userrefs
+ * object or one of the named holds is not present in it, any other
+ * zap_lookup() error unchanged, and EBUSY if releasing the holds would
+ * require destroying a snapshot that is long-held.  Returns 0 otherwise.
+ *
+ * *todelete is always written: B_TRUE only when the snapshot is marked
+ * for deferred destroy, has ds_num_children == 1, and the holds being
+ * released account for all of its user references.
+ */
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_t *ds,
+    nvlist_t *holds, boolean_t *todelete)
+{
+       uint64_t zapobj;
+       nvpair_t *pair;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       int error;
+       int numholds = 0;
+
+       *todelete = B_FALSE;
+
+       if (!dsl_dataset_is_snapshot(ds))
+               return (EINVAL);
+
+       zapobj = ds->ds_phys->ds_userrefs_obj;
+       if (zapobj == 0)
+               return (ESRCH);
+
+       for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(holds, pair)) {
+               /* Make sure the hold exists */
+               uint64_t tmp;
+               error = zap_lookup(mos, zapobj, nvpair_name(pair), 8, 1, &tmp);
+               if (error == ENOENT)
+                       error = ESRCH;
+               if (error != 0)
+                       return (error);
+               numholds++;
+       }
+
+       if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
+           ds->ds_userrefs == numholds) {
+               /* we need to destroy the snapshot as well */
+
+               if (dsl_dataset_long_held(ds))
+                       return (EBUSY);
+               *todelete = B_TRUE;
+       }
+       return (0);
+}
+
+/*
+ * Check function for dsl_dataset_user_release().
+ *
+ * The checks are only performed when the tx is syncing; for any other
+ * tx this returns 0 immediately.  Returns EINVAL at once if a pair's
+ * value is not an nvlist.  Otherwise every snapshot is examined even
+ * after a failure: each failure is recorded in ddura_errlist when one
+ * was supplied, snapshots that must also be destroyed are added to
+ * ddura_todelete, and the last error seen (or 0) is returned.
+ */
+static int
+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_release_arg_t *ddura = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
+       int rv = 0;
+
+       if (!dmu_tx_is_syncing(tx))
+               return (0);
+
+       for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+               const char *name = nvpair_name(pair);
+               int error;
+               dsl_dataset_t *ds;
+               nvlist_t *holds;
+
+               error = nvpair_value_nvlist(pair, &holds);
+               if (error != 0)
+                       return (EINVAL);
+
+               error = dsl_dataset_hold(dp, name, FTAG, &ds);
+               if (error == 0) {
+                       boolean_t deleteme;
+                       error = dsl_dataset_user_release_check_one(ds,
+                           holds, &deleteme);
+                       if (error == 0 && deleteme) {
+                               fnvlist_add_boolean(ddura->ddura_todelete,
+                                   name);
+                       }
+                       dsl_dataset_rele(ds, FTAG);
+               }
+               if (error != 0) {
+                       /* the error list is optional */
+                       if (ddura->ddura_errlist != NULL) {
+                               fnvlist_add_int32(ddura->ddura_errlist,
+                                   name, error);
+                       }
+                       rv = error;
+               }
+       }
+       return (rv);
+}
+
+/*
+ * Release every hold named in "holds" from "ds".
+ *
+ * For each hold: decrement the in-core ds_userrefs count, drop the
+ * pool-level record via dsl_pool_user_release(), remove the tag from the
+ * dataset's userrefs ZAP, and log the release in the pool history.
+ */
+static void
+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
+    dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t zapobj;
+       int error;
+       nvpair_t *pair;
+
+       for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(holds, pair)) {
+               ds->ds_userrefs--;
+               error = dsl_pool_user_release(dp, ds->ds_object,
+                   nvpair_name(pair), tx);
+               /*
+                * ENOENT is tolerated here (presumably because only
+                * temporary holds have a pool-level record -- TODO confirm).
+                */
+               VERIFY(error == 0 || error == ENOENT);
+               zapobj = ds->ds_phys->ds_userrefs_obj;
+               VERIFY0(zap_remove(mos, zapobj, nvpair_name(pair), tx));
+
+               spa_history_log_internal_ds(ds, "release", tx,
+                   "tag=%s refs=%lld", nvpair_name(pair),
+                   (longlong_t)ds->ds_userrefs);
+       }
+}
+
+/*
+ * Sync function for dsl_dataset_user_release(): release the requested
+ * holds from every snapshot in ddura_holds, then destroy each snapshot
+ * that the check function listed in ddura_todelete.
+ */
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_release_arg_t *ddura = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       nvpair_t *pair;
+
+       for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+               dsl_dataset_t *ds;
+
+               VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+               dsl_dataset_user_release_sync_one(ds,
+                   fnvpair_value_nvlist(pair), tx);
+               if (nvlist_exists(ddura->ddura_todelete,
+                   nvpair_name(pair))) {
+                       /* the conditions the check function relied on */
+                       ASSERT(ds->ds_userrefs == 0 &&
+                           ds->ds_phys->ds_num_children == 1 &&
+                           DS_IS_DEFER_DESTROY(ds));
+                       dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+               }
+               dsl_dataset_rele(ds, FTAG);
+       }
+}
+
+/*
+ * Release user holds from a set of snapshots in a single sync task.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * if any fails, all will fail.
+ *
+ * Returns 0 without doing anything when holds is empty; otherwise returns
+ * the result of dsl_sync_task().
+ */
+int
+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
+{
+       dsl_dataset_user_release_arg_t ddura;
+       nvpair_t *pair;
+       int error;
+
+       pair = nvlist_next_nvpair(holds, NULL);
+       if (pair == NULL)
+               return (0);
+
+       ddura.ddura_holds = holds;
+       ddura.ddura_errlist = errlist;
+       /* scratch list filled by the check function, read by the sync one */
+       ddura.ddura_todelete = fnvlist_alloc();
+
+       error = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_release_check,
+           dsl_dataset_user_release_sync, &ddura, fnvlist_num_pairs(holds));
+       fnvlist_free(ddura.ddura_todelete);
+       return (error);
+}
+
+/*
+ * Argument bundle for releasing a temporary hold from a single dataset
+ * that is identified by object number rather than by name.
+ */
+typedef struct dsl_dataset_user_release_tmp_arg {
+       /* object number of the dataset holding the tag */
+       uint64_t ddurta_dsobj;
+       /* set of hold tags to release */
+       nvlist_t *ddurta_holds;
+       /* set by the check function: the dataset must also be destroyed */
+       boolean_t ddurta_deleteme;
+} dsl_dataset_user_release_tmp_arg_t;
+
+/*
+ * Check function for dsl_dataset_user_release_tmp().
+ *
+ * Returns 0 immediately unless the tx is syncing.  Otherwise returns the
+ * error from holding the dataset by object number, or the result of
+ * dsl_dataset_user_release_check_one(), which also sets ddurta_deleteme.
+ */
+static int
+dsl_dataset_user_release_tmp_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_release_tmp_arg_t *ddurta = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       int error;
+
+       if (!dmu_tx_is_syncing(tx))
+               return (0);
+
+       error = dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds);
+       if (error)
+               return (error);
+
+       error = dsl_dataset_user_release_check_one(ds,
+           ddurta->ddurta_holds, &ddurta->ddurta_deleteme);
+       dsl_dataset_rele(ds, FTAG);
+       return (error);
+}
+
+/*
+ * Sync function for dsl_dataset_user_release_tmp(): release the holds
+ * from the dataset and, if the check function set ddurta_deleteme,
+ * destroy the snapshot as well.
+ */
+static void
+dsl_dataset_user_release_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_dataset_user_release_tmp_arg_t *ddurta = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds));
+       dsl_dataset_user_release_sync_one(ds, ddurta->ddurta_holds, tx);
+       if (ddurta->ddurta_deleteme) {
+               /* the conditions the check function relied on */
+               ASSERT(ds->ds_userrefs == 0 &&
+                   ds->ds_phys->ds_num_children == 1 &&
+                   DS_IS_DEFER_DESTROY(ds));
+               dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+       }
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Release the temporary user hold "htag" from the dataset with object
+ * number "dsobj" in pool "dp".
+ *
+ * Called at spa_load time to release a stale temporary user hold.
+ * Also called by the onexit code.
+ *
+ * In the kernel the snapshot is unmounted first.  The result of the sync
+ * task is deliberately ignored, so this is best-effort.
+ */
+void
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag)
+{
+       dsl_dataset_user_release_tmp_arg_t ddurta;
+
+#ifdef _KERNEL
+       dsl_dataset_t *ds;
+       int error;
+
+       /* Make sure it is not mounted. */
+       dsl_pool_config_enter(dp, FTAG);
+       error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (error == 0) {
+               char name[MAXNAMELEN];
+               dsl_dataset_name(ds, name);
+               dsl_dataset_rele(ds, FTAG);
+               /* the pool config is exited before the unmount is attempted */
+               dsl_pool_config_exit(dp, FTAG);
+               zfs_unmount_snap(name);
+       } else {
+               dsl_pool_config_exit(dp, FTAG);
+       }
+#endif
+
+       ddurta.ddurta_dsobj = dsobj;
+       /* one-element set holding just this tag */
+       ddurta.ddurta_holds = fnvlist_alloc();
+       fnvlist_add_boolean(ddurta.ddurta_holds, htag);
+
+       (void) dsl_sync_task(spa_name(dp->dp_spa),
+           dsl_dataset_user_release_tmp_check,
+           dsl_dataset_user_release_tmp_sync, &ddurta, 1);
+       fnvlist_free(ddurta.ddurta_holds);
+}
+
+/*
+ * State saved when a temporary hold is placed, so the onexit callback
+ * can find and release that hold later.
+ */
+typedef struct zfs_hold_cleanup_arg {
+       /* name of the pool the hold lives in */
+       char zhca_spaname[MAXNAMELEN];
+       /* load guid of that pool when the hold was registered */
+       uint64_t zhca_spa_load_guid;
+       /* object number of the held dataset */
+       uint64_t zhca_dsobj;
+       /* the hold tag */
+       char zhca_htag[MAXNAMELEN];
+} zfs_hold_cleanup_arg_t;
+
+/*
+ * Onexit callback registered by dsl_register_onexit_hold_cleanup().
+ *
+ * Releases the temporary hold described by "arg", provided the pool can
+ * still be opened by name and its load guid matches the one recorded at
+ * registration; otherwise only a debug message is logged.
+ *
+ * This callback owns "arg" (allocated by the registration function), so
+ * it is freed on every path, including the two early-return paths.
+ */
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+       zfs_hold_cleanup_arg_t *ca = arg;
+       spa_t *spa;
+       int error;
+
+       error = spa_open(ca->zhca_spaname, &spa, FTAG);
+       if (error != 0) {
+               zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s "
+                   "because pool is no longer loaded",
+                   ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag);
+               kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+               return;
+       }
+       if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
+               zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s "
+                   "because pool is no longer loaded (guid doesn't match)",
+                   ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag);
+               spa_close(spa, FTAG);
+               kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+               return;
+       }
+
+       dsl_dataset_user_release_tmp(spa_get_dsl(spa),
+           ca->zhca_dsobj, ca->zhca_htag);
+       kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+       spa_close(spa, FTAG);
+}
+
+/*
+ * Arrange for the temporary hold "htag" on "ds" to be released by
+ * dsl_dataset_user_release_onexit() via the onexit callback list of
+ * "minor".
+ *
+ * The pool name, pool load guid, dataset object number and tag are
+ * copied into a freshly allocated zfs_hold_cleanup_arg_t that is handed
+ * to the callback.  Failure to register the callback is fatal (VERIFY0).
+ */
+void
+dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+    minor_t minor)
+{
+       zfs_hold_cleanup_arg_t *ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+       spa_t *spa = dsl_dataset_get_spa(ds);
+       (void) strlcpy(ca->zhca_spaname, spa_name(spa),
+           sizeof (ca->zhca_spaname));
+       ca->zhca_spa_load_guid = spa_load_guid(spa);
+       ca->zhca_dsobj = ds->ds_object;
+       (void) strlcpy(ca->zhca_htag, htag, sizeof (ca->zhca_htag));
+       VERIFY0(zfs_onexit_add_cb(minor,
+           dsl_dataset_user_release_onexit, ca, NULL));
+}
+
+/*
+ * Add every user hold on dataset "dsname" to "nvl" as a uint64 pair of
+ * holdname -> the first integer stored for that tag in the userrefs ZAP
+ * (the "now" timestamp written when the hold was placed).
+ *
+ * Returns the error from dsl_pool_hold() or dsl_dataset_hold() if either
+ * fails, otherwise 0.  A dataset without a userrefs object leaves nvl
+ * untouched.  Iteration stops at the first zap_cursor_retrieve() failure
+ * and that failure is not reported to the caller.
+ */
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
+{
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
+       int err;
+
+       err = dsl_pool_hold(dsname, FTAG, &dp);
+       if (err != 0)
+               return (err);
+       err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+       if (err != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (err);
+       }
+
+       if (ds->ds_phys->ds_userrefs_obj != 0) {
+               zap_attribute_t *za;
+               zap_cursor_t zc;
+
+               /* heap-allocated rather than placed on the stack */
+               za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+               for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+                   ds->ds_phys->ds_userrefs_obj);
+                   zap_cursor_retrieve(&zc, za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       fnvlist_add_uint64(nvl, za->za_name,
+                           za->za_first_integer);
+               }
+               zap_cursor_fini(&zc);
+               kmem_free(za, sizeof (zap_attribute_t));
+       }
+       dsl_dataset_rele(ds, FTAG);
+       dsl_pool_rele(dp, FTAG);
+       return (0);
+}
index cd1b6ce730f4843bbe5d41f9f4b24f39f4aa48f5..f9cb8cead815f50cbfff61bd2f25d5c71513fa66 100644 (file)
@@ -1928,6 +1928,46 @@ void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
        spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
+/*
+ * Panic if space_map_find() returns a segment for the given offset and
+ * size in space map "sm".  The lookup is done under sm->sm_lock.
+ */
+static void
+checkmap(space_map_t *sm, uint64_t off, uint64_t size)
+{
+       space_seg_t *ss;
+       avl_index_t where;
+
+       mutex_enter(sm->sm_lock);
+       ss = space_map_find(sm, off, size, &where);
+       if (ss != NULL)
+               panic("freeing free block; ss=%p", (void *)ss);
+       mutex_exit(sm->sm_lock);
+}
+
+/*
+ * Debugging aid: verify that the block described by "bp" is not already
+ * free.  Does nothing unless ZFS_DEBUG_ZIO_FREE is set in zfs_flags.
+ *
+ * For every DVA in bp, the owning metaslab's ms_map (only if it is
+ * loaded), each of its TXG_SIZE freemaps and each of its TXG_DEFER_SIZE
+ * defermaps are searched for the DVA's offset and allocated size;
+ * checkmap() panics if any of them contains it.  The whole walk is done
+ * holding SCL_VDEV as reader.
+ */
+void
+metaslab_check_free(spa_t *spa, const blkptr_t *bp)
+{
+       int i, j;
+
+       if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+               return;
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+       for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+               uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
+               vdev_t *vd = vdev_lookup_top(spa, vdid);
+               uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
+               uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
+               /* the metaslab index is the offset shifted by vdev_ms_shift */
+               metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
+
+               if (ms->ms_map->sm_loaded)
+                       checkmap(ms->ms_map, off, size);
+
+               for (j = 0; j < TXG_SIZE; j++)
+                       checkmap(ms->ms_freemap[j], off, size);
+               for (j = 0; j < TXG_DEFER_SIZE; j++)
+                       checkmap(ms->ms_defermap[j], off, size);
+       }
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 module_param(metaslab_debug, int, 0644);
 MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees");
index e43807c8e3f408b8c85ab4d88a14c1cb3d89940b..49980efcc85d06e1797e200bff84510c79248946 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -32,7 +33,7 @@ int reference_tracking_enable = FALSE; /* runs out of memory too easily */
 #else
 int reference_tracking_enable = TRUE;
 #endif
-int reference_history = 4; /* tunable */
+int reference_history = 3; /* tunable */
 
 static kmem_cache_t *reference_cache;
 static kmem_cache_t *reference_history_cache;
@@ -64,6 +65,14 @@ refcount_create(refcount_t *rc)
            offsetof(reference_t, ref_link));
        rc->rc_count = 0;
        rc->rc_removed_count = 0;
+       rc->rc_tracked = reference_tracking_enable;
+}
+
+void
+refcount_create_untracked(refcount_t *rc)
+{
+       refcount_create(rc);
+       rc->rc_tracked = B_FALSE;
 }
 
 void
@@ -96,14 +105,12 @@ refcount_destroy(refcount_t *rc)
 int
 refcount_is_zero(refcount_t *rc)
 {
-       ASSERT(rc->rc_count >= 0);
        return (rc->rc_count == 0);
 }
 
 int64_t
 refcount_count(refcount_t *rc)
 {
-       ASSERT(rc->rc_count >= 0);
        return (rc->rc_count);
 }
 
@@ -113,14 +120,14 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
        reference_t *ref = NULL;
        int64_t count;
 
-       if (reference_tracking_enable) {
+       if (rc->rc_tracked) {
                ref = kmem_cache_alloc(reference_cache, KM_PUSHPAGE);
                ref->ref_holder = holder;
                ref->ref_number = number;
        }
        mutex_enter(&rc->rc_mtx);
        ASSERT(rc->rc_count >= 0);
-       if (reference_tracking_enable)
+       if (rc->rc_tracked)
                list_insert_head(&rc->rc_list, ref);
        rc->rc_count += number;
        count = rc->rc_count;
@@ -144,7 +151,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
        mutex_enter(&rc->rc_mtx);
        ASSERT(rc->rc_count >= number);
 
-       if (!reference_tracking_enable) {
+       if (!rc->rc_tracked) {
                rc->rc_count -= number;
                count = rc->rc_count;
                mutex_exit(&rc->rc_mtx);
@@ -161,7 +168,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
                                    KM_PUSHPAGE);
                                list_insert_head(&rc->rc_removed, ref);
                                rc->rc_removed_count++;
-                               if (rc->rc_removed_count >= reference_history) {
+                               if (rc->rc_removed_count > reference_history) {
                                        ref = list_tail(&rc->rc_removed);
                                        list_remove(&rc->rc_removed, ref);
                                        kmem_cache_free(reference_history_cache,
index 7f9290bd44c1e7d54483f63e03e5ee4c53ae870e..8e80166c7d14daae2f9e16528cd6f2f948022133 100644 (file)
@@ -75,8 +75,9 @@
 uint_t rrw_tsd_key;
 
 typedef struct rrw_node {
-       struct rrw_node *rn_next;
-       rrwlock_t       *rn_rrl;
+       struct rrw_node *rn_next;
+       rrwlock_t *rn_rrl;
+       void *rn_tag;
 } rrw_node_t;
 
 static rrw_node_t *
@@ -98,13 +99,14 @@ rrn_find(rrwlock_t *rrl)
  * Add a node to the head of the singly linked list.
  */
 static void
-rrn_add(rrwlock_t *rrl)
+rrn_add(rrwlock_t *rrl, void *tag)
 {
        rrw_node_t *rn;
 
        rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
        rn->rn_rrl = rrl;
        rn->rn_next = tsd_get(rrw_tsd_key);
+       rn->rn_tag = tag;
        VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
 }
 
@@ -113,7 +115,7 @@ rrn_add(rrwlock_t *rrl)
  * thread's list and return TRUE; otherwise return FALSE.
  */
 static boolean_t
-rrn_find_and_remove(rrwlock_t *rrl)
+rrn_find_and_remove(rrwlock_t *rrl, void *tag)
 {
        rrw_node_t *rn;
        rrw_node_t *prev = NULL;
@@ -122,7 +124,7 @@ rrn_find_and_remove(rrwlock_t *rrl)
                return (B_FALSE);
 
        for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
-               if (rn->rn_rrl == rrl) {
+               if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
                        if (prev)
                                prev->rn_next = rn->rn_next;
                        else
@@ -136,7 +138,7 @@ rrn_find_and_remove(rrwlock_t *rrl)
 }
 
 void
-rrw_init(rrwlock_t *rrl)
+rrw_init(rrwlock_t *rrl, boolean_t track_all)
 {
        mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
@@ -144,6 +146,7 @@ rrw_init(rrwlock_t *rrl)
        refcount_create(&rrl->rr_anon_rcount);
        refcount_create(&rrl->rr_linked_rcount);
        rrl->rr_writer_wanted = B_FALSE;
+       rrl->rr_track_all = track_all;
 }
 
 void
@@ -156,12 +159,13 @@ rrw_destroy(rrwlock_t *rrl)
        refcount_destroy(&rrl->rr_linked_rcount);
 }
 
-static void
+void
 rrw_enter_read(rrwlock_t *rrl, void *tag)
 {
        mutex_enter(&rrl->rr_lock);
 #if !defined(DEBUG) && defined(_KERNEL)
-       if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+       if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
+           !rrl->rr_track_all) {
                rrl->rr_anon_rcount.rc_count++;
                mutex_exit(&rrl->rr_lock);
                return;
@@ -171,14 +175,14 @@ rrw_enter_read(rrwlock_t *rrl, void *tag)
        ASSERT(rrl->rr_writer != curthread);
        ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
-       while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+       while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
            refcount_is_zero(&rrl->rr_anon_rcount) &&
            rrn_find(rrl) == NULL))
                cv_wait(&rrl->rr_cv, &rrl->rr_lock);
 
-       if (rrl->rr_writer_wanted) {
+       if (rrl->rr_writer_wanted || rrl->rr_track_all) {
                /* may or may not be a re-entrant enter */
-               rrn_add(rrl);
+               rrn_add(rrl, tag);
                (void) refcount_add(&rrl->rr_linked_rcount, tag);
        } else {
                (void) refcount_add(&rrl->rr_anon_rcount, tag);
@@ -187,7 +191,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag)
        mutex_exit(&rrl->rr_lock);
 }
 
-static void
+void
 rrw_enter_write(rrwlock_t *rrl)
 {
        mutex_enter(&rrl->rr_lock);
@@ -233,10 +237,12 @@ rrw_exit(rrwlock_t *rrl, void *tag)
 
        if (rrl->rr_writer == NULL) {
                int64_t count;
-               if (rrn_find_and_remove(rrl))
+               if (rrn_find_and_remove(rrl, tag)) {
                        count = refcount_remove(&rrl->rr_linked_rcount, tag);
-               else
+               } else {
+                       ASSERT(!rrl->rr_track_all);
                        count = refcount_remove(&rrl->rr_anon_rcount, tag);
+               }
                if (count == 0)
                        cv_broadcast(&rrl->rr_cv);
        } else {
@@ -249,6 +255,11 @@ rrw_exit(rrwlock_t *rrl, void *tag)
        mutex_exit(&rrl->rr_lock);
 }
 
+/*
+ * If the lock was created with track_all, rrw_held(RW_READER) will return
+ * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
+ * return B_TRUE if any thread has the lock for reader.
+ */
 boolean_t
 rrw_held(rrwlock_t *rrl, krw_t rw)
 {
@@ -259,7 +270,7 @@ rrw_held(rrwlock_t *rrl, krw_t rw)
                held = (rrl->rr_writer == curthread);
        } else {
                held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
-                   !refcount_is_zero(&rrl->rr_linked_rcount));
+                   rrn_find(rrl) != NULL);
        }
        mutex_exit(&rrl->rr_lock);
 
index 581cf4b0daf308e0811f13328062a7925bc539fb..bad6123aa0288a051105f84155e3ed8ef4ef1350 100644 (file)
@@ -1019,10 +1019,10 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
        sa_attr_type_t *tb;
        int error;
 
-       mutex_enter(&os->os_lock);
+       mutex_enter(&os->os_user_ptr_lock);
        if (os->os_sa) {
                mutex_enter(&os->os_sa->sa_lock);
-               mutex_exit(&os->os_lock);
+               mutex_exit(&os->os_user_ptr_lock);
                tb = os->os_sa->sa_user_table;
                mutex_exit(&os->os_sa->sa_lock);
                *user_table = tb;
@@ -1035,7 +1035,7 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
 
        os->os_sa = sa;
        mutex_enter(&sa->sa_lock);
-       mutex_exit(&os->os_lock);
+       mutex_exit(&os->os_user_ptr_lock);
        avl_create(&sa->sa_layout_num_tree, layout_num_compare,
            sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
        avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
index 7c37ca426d4a67fbaf468a60393c2b7734cf0f63..fcb1711a211ba8d234d2e7b37e07cdb6eb3907f7 100644 (file)
@@ -64,6 +64,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_destroy.h>
 #include <sys/zvol.h>
 
 #ifdef _KERNEL
@@ -131,10 +132,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 };
 
-static dsl_syncfunc_t spa_sync_version;
-static dsl_syncfunc_t spa_sync_props;
-static dsl_checkfunc_t spa_change_guid_check;
-static dsl_syncfunc_t spa_change_guid_sync;
+static void spa_sync_version(void *arg, dmu_tx_t *tx);
+static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -329,10 +328,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                                dsl_dataset_t *ds = NULL;
 
                                dp = spa_get_dsl(spa);
-                               rw_enter(&dp->dp_config_rwlock, RW_READER);
+                               dsl_pool_config_enter(dp, FTAG);
                                if ((err = dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &ds))) {
-                                       rw_exit(&dp->dp_config_rwlock);
+                                       dsl_pool_config_exit(dp, FTAG);
                                        break;
                                }
 
@@ -341,7 +340,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                                    KM_PUSHPAGE);
                                dsl_dataset_name(ds, strval);
                                dsl_dataset_rele(ds, FTAG);
-                               rw_exit(&dp->dp_config_rwlock);
+                               dsl_pool_config_exit(dp, FTAG);
                        } else {
                                strval = NULL;
                                intval = za.za_first_integer;
@@ -495,9 +494,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = ENOTSUP;
-                               } else if ((error = dsl_prop_get_integer(strval,
+                               } else if ((error =
+                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-                                   &compress, NULL)) == 0 &&
+                                   &compress)) == 0 &&
                                    !BOOTFS_COMPRESS_VALID(compress)) {
                                        error = ENOTSUP;
                                } else {
@@ -661,8 +661,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
                         * read object, the features for write object, or the
                         * feature descriptions object.
                         */
-                       error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
-                           spa_sync_version, spa, &ver, 6);
+                       error = dsl_sync_task(spa->spa_name, NULL,
+                           spa_sync_version, &ver, 6);
                        if (error)
                                return (error);
                        continue;
@@ -673,8 +673,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
        }
 
        if (need_sync) {
-               return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
-                   spa, nvp, 6));
+               return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
+                   nvp, 6));
        }
 
        return (0);
@@ -696,12 +696,12 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 
 /*ARGSUSED*/
 static int
-spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
+spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;
-       ASSERTV(uint64_t *newguid = arg2);
+       ASSERTV(uint64_t *newguid = arg);
 
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_state = rvd->vdev_state;
@@ -716,10 +716,10 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       uint64_t *newguid = arg2;
+       uint64_t *newguid = arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        uint64_t oldguid;
        vdev_t *rvd = spa->spa_root_vdev;
 
@@ -753,8 +753,8 @@ spa_change_guid(spa_t *spa)
        mutex_enter(&spa_namespace_lock);
        guid = spa_generate_guid(NULL);
 
-       error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
-           spa_change_guid_sync, spa, &guid, 5);
+       error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
+           spa_change_guid_sync, &guid, 5);
 
        if (error == 0) {
                spa_config_sync(spa, B_FALSE, B_TRUE);
@@ -1729,23 +1729,24 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
 /*
  * Check for missing log devices
  */
-static int
+static boolean_t
 spa_check_logs(spa_t *spa)
 {
+       boolean_t rv = B_FALSE;
+
        switch (spa->spa_log_state) {
        default:
                break;
        case SPA_LOG_MISSING:
                /* need to recheck in case slog has been restored */
        case SPA_LOG_UNKNOWN:
-               if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
-                   DS_FIND_CHILDREN)) {
+               rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
+                   NULL, DS_FIND_CHILDREN) != 0);
+               if (rv)
                        spa_set_log_state(spa, SPA_LOG_MISSING);
-                       return (1);
-               }
                break;
        }
-       return (0);
+       return (rv);
 }
 
 static boolean_t
@@ -1793,11 +1794,11 @@ spa_activate_log(spa_t *spa)
 int
 spa_offline_log(spa_t *spa)
 {
-       int error = 0;
-
-       if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-           NULL, DS_FIND_CHILDREN)) == 0) {
+       int error;
 
+       error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+           NULL, DS_FIND_CHILDREN);
+       if (error == 0) {
                /*
                 * We successfully offlined the log device, sync out the
                 * current txg so that the "stubby" block can be removed
@@ -3610,7 +3611,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 
        if (props != NULL) {
                spa_configfile_set(spa, props, B_FALSE);
-               spa_sync_props(spa, props, tx);
+               spa_sync_props(props, tx);
        }
 
        dmu_tx_commit(tx);
@@ -3844,7 +3845,7 @@ out:
  * Import a non-root pool into the system.
  */
 int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
        spa_t *spa;
        char *altroot = NULL;
@@ -5878,10 +5879,11 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 }
 
 static void
-spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
+spa_sync_version(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       uint64_t version = *(uint64_t *)arg2;
+       uint64_t *versionp = arg;
+       uint64_t version = *versionp;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
        /*
         * Setting the version is special cased when first creating the pool.
@@ -5900,11 +5902,11 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
  * Set zpool properties.
  */
 static void
-spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
+spa_sync_props(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
+       nvlist_t *nvp = arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        objset_t *mos = spa->spa_meta_objset;
-       nvlist_t *nvp = arg2;
        nvpair_t *elem = NULL;
 
        mutex_enter(&spa->spa_props_lock);
@@ -6056,6 +6058,8 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 
        ASSERT(spa->spa_sync_pass == 1);
 
+       rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
        if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
            spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
                dsl_pool_create_origin(dp, tx);
@@ -6081,6 +6085,7 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
            spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
                spa_feature_create_zap_objects(spa, tx);
        }
+       rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 /*
index 79d48620c9e1815f880c109d9711f664a5b6f1d1..bbcd697e00a8444e85ae76f8fa27b81eb208a761 100644 (file)
@@ -197,10 +197,10 @@ spa_history_zone(void)
  */
 /*ARGSUSED*/
 static void
-spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t           *spa = arg1;
-       nvlist_t        *nvl = arg2;
+       nvlist_t        *nvl = arg;
+       spa_t           *spa = dmu_tx_pool(tx)->dp_spa;
        objset_t        *mos = spa->spa_meta_objset;
        dmu_buf_t       *dbp;
        spa_history_phys_t *shpp;
@@ -222,7 +222,7 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
         * Get the offset of where we need to write via the bonus buffer.
         * Update the offset when the write completes.
         */
-       VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+       VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
        shpp = dbp->db_data;
 
        dmu_buf_will_dirty(dbp, tx);
@@ -326,8 +326,8 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
        fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
 
        /* Kick this off asynchronously; errors are ignored. */
-       dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
-           spa_history_log_sync, spa, nvarg, 0, tx);
+       dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
+           nvarg, 0, tx);
        dmu_tx_commit(tx);
 
        /* spa_history_log_sync will free nvl */
@@ -465,10 +465,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
        fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
 
        if (dmu_tx_is_syncing(tx)) {
-               spa_history_log_sync(spa, nvl, tx);
+               spa_history_log_sync(nvl, tx);
        } else {
-               dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
-                   spa_history_log_sync, spa, nvl, 0, tx);
+               dsl_sync_task_nowait(spa_get_dsl(spa),
+                   spa_history_log_sync, nvl, 0, tx);
        }
        /* spa_history_log_sync() will free nvl */
 }
@@ -544,17 +544,11 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
 void
 spa_history_log_version(spa_t *spa, const char *operation)
 {
-#ifdef _KERNEL
-       uint64_t current_vers = spa_version(spa);
-
        spa_history_log_internal(spa, operation, NULL,
            "pool version %llu; software version %llu/%d; uts %s %s %s %s",
-           (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
+           (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
            utsname.nodename, utsname.release, utsname.version,
            utsname.machine);
-       cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", operation,
-           (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
-#endif
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
index 0ca9f3a7a622a89bf7fd49fc2ef876e806a089f3..a5e13b5fb3cfa47c1bba0806e23ba8457a55104d 100644 (file)
@@ -268,7 +268,7 @@ spa_config_lock_init(spa_t *spa)
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
                cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
-               refcount_create(&scl->scl_count);
+               refcount_create_untracked(&scl->scl_count);
                scl->scl_writer = NULL;
                scl->scl_write_wanted = 0;
        }
@@ -326,6 +326,8 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
        int wlocks_held = 0;
        int i;
 
+       ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
+
        for (i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                if (scl->scl_writer == curthread)
@@ -406,27 +408,22 @@ spa_lookup(const char *name)
        static spa_t search;    /* spa_t is large; don't allocate on stack */
        spa_t *spa;
        avl_index_t where;
-       char c = 0;
        char *cp;
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+       (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
+
        /*
         * If it's a full dataset name, figure out the pool name and
         * just use that.
         */
-       cp = strpbrk(name, "/@");
-       if (cp) {
-               c = *cp;
+       cp = strpbrk(search.spa_name, "/@");
+       if (cp != NULL)
                *cp = '\0';
-       }
 
-       (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
        spa = avl_find(&spa_namespace_avl, &search, &where);
 
-       if (cp)
-               *cp = c;
-
        return (spa);
 }
 
@@ -539,6 +536,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
                    KM_SLEEP) == 0);
        }
 
+       spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
+
        return (spa);
 }
 
index a031f3a20e6a5b74b7f5648d24d0b07781084f39..2cf1d2a18407fb5ba7d9da0240c4c48a91925dcc 100644 (file)
@@ -102,7 +102,7 @@ void
 space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
 {
        avl_index_t where;
-       space_seg_t ssearch, *ss_before, *ss_after, *ss;
+       space_seg_t *ss_before, *ss_after, *ss;
        uint64_t end = start + size;
        int merge_before, merge_after;
 
@@ -115,11 +115,8 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
        VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
        VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 
-       ssearch.ss_start = start;
-       ssearch.ss_end = end;
-       ss = avl_find(&sm->sm_root, &ssearch, &where);
-
-       if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
+       ss = space_map_find(sm, start, size, &where);
+       if (ss != NULL) {
                zfs_panic_recover("zfs: allocating allocated segment"
                    "(offset=%llu size=%llu)\n",
                    (longlong_t)start, (longlong_t)size);
@@ -171,19 +168,12 @@ void
 space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 {
        avl_index_t where;
-       space_seg_t ssearch, *ss, *newseg;
+       space_seg_t *ss, *newseg;
        uint64_t end = start + size;
        int left_over, right_over;
 
-       ASSERT(MUTEX_HELD(sm->sm_lock));
        VERIFY(!sm->sm_condensing);
-       VERIFY(size != 0);
-       VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-       VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-       ssearch.ss_start = start;
-       ssearch.ss_end = end;
-       ss = avl_find(&sm->sm_root, &ssearch, &where);
+       ss = space_map_find(sm, start, size, &where);
 
        /* Make sure we completely overlap with someone */
        if (ss == NULL) {
@@ -226,12 +216,11 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
        sm->sm_space -= size;
 }
 
-boolean_t
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+space_seg_t *
+space_map_find(space_map_t *sm, uint64_t start, uint64_t size,
+    avl_index_t *wherep)
 {
-       avl_index_t where;
        space_seg_t ssearch, *ss;
-       uint64_t end = start + size;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
        VERIFY(size != 0);
@@ -239,10 +228,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
        VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 
        ssearch.ss_start = start;
-       ssearch.ss_end = end;
-       ss = avl_find(&sm->sm_root, &ssearch, &where);
+       ssearch.ss_end = start + size;
+       ss = avl_find(&sm->sm_root, &ssearch, wherep);
+
+       if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size)
+               return (ss);
+       return (NULL);
+}
+
+boolean_t
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+       avl_index_t where;
 
-       return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+       return (space_map_find(sm, start, size, &where) != 0);
 }
 
 void
index 7c820af4f8b3e8f6f6b070c2d2bba6745d015e4d..b3e537f459c4b1822cda3afc4996f4ed33ad03b3 100644 (file)
@@ -659,6 +659,8 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
 {
        tx_state_t *tx = &dp->dp_tx;
 
+       ASSERT(!dsl_pool_config_held(dp));
+
        mutex_enter(&tx->tx_sync_lock);
        ASSERT(tx->tx_threads == 2);
        if (txg == 0)
@@ -682,6 +684,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 {
        tx_state_t *tx = &dp->dp_tx;
 
+       ASSERT(!dsl_pool_config_held(dp));
+
        mutex_enter(&tx->tx_sync_lock);
        ASSERT(tx->tx_threads == 2);
        if (txg == 0)
@@ -747,42 +751,43 @@ txg_list_empty(txg_list_t *tl, uint64_t txg)
 }
 
 /*
- * Add an entry to the list.
- * Returns 0 if it's a new entry, 1 if it's already there.
+ * Add an entry to the list (unless it's already on the list).
+ * Returns B_TRUE if it was actually added.
  */
-int
+boolean_t
 txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
 {
        int t = txg & TXG_MASK;
        txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-       int already_on_list;
+       boolean_t add;
 
        mutex_enter(&tl->tl_lock);
-       already_on_list = tn->tn_member[t];
-       if (!already_on_list) {
+       add = (tn->tn_member[t] == 0);
+       if (add) {
                tn->tn_member[t] = 1;
                tn->tn_next[t] = tl->tl_head[t];
                tl->tl_head[t] = tn;
        }
        mutex_exit(&tl->tl_lock);
 
-       return (already_on_list);
+       return (add);
 }
 
 /*
- * Add an entry to the end of the list (walks list to find end).
- * Returns 0 if it's a new entry, 1 if it's already there.
+ * Add an entry to the end of the list, unless it's already on the list.
+ * (walks list to find end)
+ * Returns B_TRUE if it was actually added.
  */
-int
+boolean_t
 txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
 {
        int t = txg & TXG_MASK;
        txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-       int already_on_list;
+       boolean_t add;
 
        mutex_enter(&tl->tl_lock);
-       already_on_list = tn->tn_member[t];
-       if (!already_on_list) {
+       add = (tn->tn_member[t] == 0);
+       if (add) {
                txg_node_t **tp;
 
                for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
@@ -794,7 +799,7 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
        }
        mutex_exit(&tl->tl_lock);
 
-       return (already_on_list);
+       return (add);
 }
 
 /*
@@ -845,13 +850,13 @@ txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
        return (NULL);
 }
 
-int
+boolean_t
 txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
 {
        int t = txg & TXG_MASK;
        txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 
-       return (tn->tn_member[t]);
+       return (tn->tn_member[t] != 0);
 }
 
 /*
index a03e1c6948ea1493b540af04ab52a7ba50af12c1..3cf0089ecef0aa2c069a96b35c495c5280a65e9b 100644 (file)
@@ -80,6 +80,7 @@
 #include <sys/zfs_vnops.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
+#include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
 #include <sys/zpl.h>
@@ -488,13 +489,13 @@ zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
  */
 /*ARGSUSED*/
 int
-zfsctl_snapdir_rename(struct inode *sdip, char *sname,
-    struct inode *tdip, char *tname, cred_t *cr, int flags)
+zfsctl_snapdir_rename(struct inode *sdip, char *snm,
+    struct inode *tdip, char *tnm, cred_t *cr, int flags)
 {
        zfs_sb_t *zsb = ITOZSB(sdip);
        zfs_snapentry_t search, *sep;
        avl_index_t where;
-       char *to, *from, *real;
+       char *to, *from, *real, *fsname;
        int error;
 
        ZFS_ENTER(zsb);
@@ -502,23 +503,26 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname,
        to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
        from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
        real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 
        if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
-               error = dmu_snapshot_realname(zsb->z_os, sname, real,
+               error = dmu_snapshot_realname(zsb->z_os, snm, real,
                    MAXNAMELEN, NULL);
                if (error == 0) {
-                       sname = real;
+                       snm = real;
                } else if (error != ENOTSUP) {
                        goto out;
                }
        }
 
-       error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from);
-       if (!error)
-               error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to);
-       if (!error)
+       dmu_objset_name(zsb->z_os, fsname);
+
+       error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from);
+       if (error == 0)
+               error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to);
+       if (error == 0)
                error = zfs_secpolicy_rename_perms(from, to, cr);
-       if (error)
+       if (error != 0)
                goto out;
 
        /*
@@ -532,21 +536,21 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname,
        /*
         * No-op when names are identical.
         */
-       if (strcmp(sname, tname) == 0) {
+       if (strcmp(snm, tnm) == 0) {
                error = 0;
                goto out;
        }
 
        mutex_enter(&zsb->z_ctldir_lock);
 
-       error = dmu_objset_rename(from, to, B_FALSE);
+       error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
        if (error)
                goto out_unlock;
 
-       search.se_name = (char *)sname;
+       search.se_name = (char *)snm;
        sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
        if (sep)
-               zfsctl_rename_snap(zsb, sep, tname);
+               zfsctl_rename_snap(zsb, sep, tnm);
 
 out_unlock:
        mutex_exit(&zsb->z_ctldir_lock);
@@ -554,6 +558,7 @@ out:
        kmem_free(from, MAXNAMELEN);
        kmem_free(to, MAXNAMELEN);
        kmem_free(real, MAXNAMELEN);
+       kmem_free(fsname, MAXNAMELEN);
 
        ZFS_EXIT(zsb);
 
@@ -588,14 +593,14 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
        }
 
        error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
-       if (!error)
+       if (error == 0)
                error = zfs_secpolicy_destroy_perms(snapname, cr);
-       if (error)
+       if (error != 0)
                goto out;
 
        error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
        if ((error == 0) || (error == ENOENT))
-               error = dmu_objset_destroy(snapname, B_FALSE);
+               error = dsl_destroy_snapshot(snapname, B_FALSE);
 out:
        kmem_free(snapname, MAXNAMELEN);
        kmem_free(real, MAXNAMELEN);
@@ -628,12 +633,12 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
        dmu_objset_name(zsb->z_os, dsname);
 
        error = zfs_secpolicy_snapshot_perms(dsname, cr);
-       if (error)
+       if (error != 0)
                goto out;
 
        if (error == 0) {
                error = dmu_objset_snapshot_one(dsname, dirname);
-               if (error)
+               if (error != 0)
                        goto out;
 
                error = zfsctl_snapdir_lookup(dip, dirname, ipp,
index e64d6a1f04ebec7493e595f5a16877fb1a63f19d..acc54e5a7830d488148373540370edb3833d1def 100644 (file)
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
 #include <sys/sunldi.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sharefs/share.h>
-#include <sys/dmu_objset.h>
 #include <sys/fm/util.h>
 
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 
 #include <linux/miscdevice.h>
@@ -242,11 +245,7 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
 int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
 static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
 
-static int zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature);
-static int zfs_prop_activate_feature_check(void *arg1, void *arg2,
-    dmu_tx_t *tx);
-static void zfs_prop_activate_feature_sync(void *arg1, void *arg2,
-    dmu_tx_t *tx);
+static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature);
 
 static void
 history_str_free(char *buf)
@@ -430,49 +429,48 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
 {
        uint64_t zoned;
 
-       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-       if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) {
-               rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+       if (dsl_prop_get_int_ds(ds, "zoned", &zoned))
                return (ENOENT);
-       }
-       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 
        return (zfs_dozonecheck_impl(dataset, zoned, cr));
 }
 
 static int
-zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+    const char *perm, cred_t *cr)
 {
        int error;
-       dsl_dataset_t *ds;
-
-       error = dsl_dataset_hold(name, FTAG, &ds);
-       if (error != 0)
-               return (error);
 
        error = zfs_dozonecheck_ds(name, ds, cr);
        if (error == 0) {
                error = secpolicy_zfs(cr);
-               if (error)
+               if (error != 0)
                        error = dsl_deleg_access_impl(ds, perm, cr);
        }
-
-       dsl_dataset_rele(ds, FTAG);
        return (error);
 }
 
 static int
-zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
-    const char *perm, cred_t *cr)
+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
 {
        int error;
+       dsl_dataset_t *ds;
+       dsl_pool_t *dp;
 
-       error = zfs_dozonecheck_ds(name, ds, cr);
-       if (error == 0) {
-               error = secpolicy_zfs(cr);
-               if (error)
-                       error = dsl_deleg_access_impl(ds, perm, cr);
+       error = dsl_pool_hold(name, FTAG, &dp);
+       if (error != 0)
+               return (error);
+
+       error = dsl_dataset_hold(dp, name, FTAG, &ds);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
        }
+
+       error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
+
+       dsl_dataset_rele(ds, FTAG);
+       dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
@@ -495,7 +493,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
        /* First get the existing dataset label. */
        error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
            1, sizeof (ds_hexsl), &ds_hexsl, NULL);
-       if (error)
+       if (error != 0)
                return (EPERM);
 
        if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
@@ -545,7 +543,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
                 */
                error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
                    setsl_tag, &os);
-               if (error)
+               if (error != 0)
                        return (EPERM);
 
                dmu_objset_disown(os, setsl_tag);
@@ -638,7 +636,7 @@ zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
        int error;
 
        error = zfs_dozonecheck(zc->zc_name, cr);
-       if (error)
+       if (error != 0)
                return (error);
 
        /*
@@ -660,7 +658,6 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 static int
 zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-       spa_t *spa;
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        char *cp;
@@ -673,23 +670,22 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
        cp = strchr(zc->zc_name, '@');
        if (cp == NULL)
                return (EINVAL);
-       error = spa_open(zc->zc_name, &spa, FTAG);
-       if (error)
+       error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+       if (error != 0)
                return (error);
 
-       dp = spa_get_dsl(spa);
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
        error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
-       rw_exit(&dp->dp_config_rwlock);
-       spa_close(spa, FTAG);
-       if (error)
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
                return (error);
+       }
 
        dsl_dataset_name(ds, zc->zc_name);
 
        error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
            ZFS_DELEG_PERM_SEND, cr);
        dsl_dataset_rele(ds, FTAG);
+       dsl_pool_rele(dp, FTAG);
 
        return (error);
 }
@@ -820,12 +816,21 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
                return (EINVAL);
        for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
            pair = nextpair) {
+               dsl_pool_t *dp;
                dsl_dataset_t *ds;
 
+               error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp);
+               if (error != 0)
+                       break;
                nextpair = nvlist_next_nvpair(snaps, pair);
-               error = dsl_dataset_hold(nvpair_name(pair), FTAG, &ds);
-               if (error == 0) {
+               error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds);
+               if (error == 0)
                        dsl_dataset_rele(ds, FTAG);
+               dsl_pool_rele(dp, FTAG);
+
+               if (error == 0) {
+                       error = zfs_secpolicy_destroy_perms(nvpair_name(pair),
+                           cr);
                } else if (error == ENOENT) {
                        /*
                         * Ignore any snapshots that don't exist (we consider
@@ -837,11 +842,7 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
                         */
                        fnvlist_remove_nvpair(snaps, pair);
                        error = 0;
-                       continue;
-               } else {
-                       break;
                }
-               error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
                if (error != 0)
                        break;
        }
@@ -889,41 +890,47 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 static int
 zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-       char    parentname[MAXNAMELEN];
-       objset_t *clone;
+       dsl_pool_t *dp;
+       dsl_dataset_t *clone;
        int error;
 
        error = zfs_secpolicy_write_perms(zc->zc_name,
            ZFS_DELEG_PERM_PROMOTE, cr);
-       if (error)
+       if (error != 0)
+               return (error);
+
+       error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+       if (error != 0)
                return (error);
 
-       error = dmu_objset_hold(zc->zc_name, FTAG, &clone);
+       error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
 
        if (error == 0) {
-               dsl_dataset_t *pclone = NULL;
+               char parentname[MAXNAMELEN];
+               dsl_dataset_t *origin = NULL;
                dsl_dir_t *dd;
-               dd = clone->os_dsl_dataset->ds_dir;
+               dd = clone->ds_dir;
 
-               rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
                error = dsl_dataset_hold_obj(dd->dd_pool,
-                   dd->dd_phys->dd_origin_obj, FTAG, &pclone);
-               rw_exit(&dd->dd_pool->dp_config_rwlock);
-               if (error) {
-                       dmu_objset_rele(clone, FTAG);
+                   dd->dd_phys->dd_origin_obj, FTAG, &origin);
+               if (error != 0) {
+                       dsl_dataset_rele(clone, FTAG);
+                       dsl_pool_rele(dp, FTAG);
                        return (error);
                }
 
-               error = zfs_secpolicy_write_perms(zc->zc_name,
+               error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
                    ZFS_DELEG_PERM_MOUNT, cr);
 
-               dsl_dataset_name(pclone, parentname);
-               dmu_objset_rele(clone, FTAG);
-               dsl_dataset_rele(pclone, FTAG);
-               if (error == 0)
-                       error = zfs_secpolicy_write_perms(parentname,
+               dsl_dataset_name(origin, parentname);
+               if (error == 0) {
+                       error = zfs_secpolicy_write_perms_ds(parentname, origin,
                            ZFS_DELEG_PERM_PROMOTE, cr);
+               }
+               dsl_dataset_rele(clone, FTAG);
+               dsl_dataset_rele(origin, FTAG);
        }
+       dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
@@ -1132,16 +1139,47 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 static int
 zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-       return (zfs_secpolicy_write_perms(zc->zc_name,
-           ZFS_DELEG_PERM_HOLD, cr));
+       nvpair_t *pair;
+       nvlist_t *holds;
+       int error;
+
+       error = nvlist_lookup_nvlist(innvl, "holds", &holds);
+       if (error != 0)
+               return (EINVAL);
+
+       for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(holds, pair)) {
+               char fsname[MAXNAMELEN];
+               error = dmu_fsname(nvpair_name(pair), fsname);
+               if (error != 0)
+                       return (error);
+               error = zfs_secpolicy_write_perms(fsname,
+                   ZFS_DELEG_PERM_HOLD, cr);
+               if (error != 0)
+                       return (error);
+       }
+       return (0);
 }
 
 /* ARGSUSED */
 static int
 zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-       return (zfs_secpolicy_write_perms(zc->zc_name,
-           ZFS_DELEG_PERM_RELEASE, cr));
+       nvpair_t *pair;
+       int error;
+
+       for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(innvl, pair)) {
+               char fsname[MAXNAMELEN];
+               error = dmu_fsname(nvpair_name(pair), fsname);
+               if (error != 0)
+                       return (error);
+               error = zfs_secpolicy_write_perms(fsname,
+                   ZFS_DELEG_PERM_RELEASE, cr);
+               if (error != 0)
+                       return (error);
+       }
+       return (0);
 }
 
 /*
@@ -1162,11 +1200,11 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
                return (0);
 
        error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
-       if (!error)
+       if (error == 0)
                error = zfs_secpolicy_hold(zc, innvl, cr);
-       if (!error)
+       if (error == 0)
                error = zfs_secpolicy_release(zc, innvl, cr);
-       if (!error)
+       if (error == 0)
                error = zfs_secpolicy_destroy(zc, innvl, cr);
        return (error);
 }
@@ -1276,7 +1314,7 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp)
        int error;
 
        error = dmu_objset_hold(dsname, FTAG, &os);
-       if (error)
+       if (error != 0)
                return (error);
        if (dmu_objset_type(os) != DMU_OST_ZFS) {
                dmu_objset_rele(os, FTAG);
@@ -1379,7 +1417,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
                VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
                error = zfs_fill_zplprops_root(version, rootprops,
                    zplprops, NULL);
-               if (error)
+               if (error != 0)
                        goto pool_props_bad;
        }
 
@@ -1652,12 +1690,7 @@ zfs_ioc_pool_reguid(zfs_cmd_t *zc)
 static int
 zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
 {
-       int error;
-
-       if ((error = dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value)))
-               return (error);
-
-       return (0);
+       return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
 }
 
 /*
@@ -1974,15 +2007,14 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
-       objset_t *os = NULL;
+       objset_t *os;
        int error;
 
-       if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)))
-               return (error);
-
-       error = zfs_ioc_objset_stats_impl(zc, os);
-
-       dmu_objset_rele(os, FTAG);
+       error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+       if (error == 0) {
+               error = zfs_ioc_objset_stats_impl(zc, os);
+               dmu_objset_rele(os, FTAG);
+       }
 
        return (error);
 }
@@ -2003,30 +2035,23 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
 static int
 zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
 {
-       objset_t *os = NULL;
-       int error;
+       int error = 0;
        nvlist_t *nv;
 
-       if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)))
-               return (error);
-
        /*
         * Without this check, we would return local property values if the
         * caller has not already received properties on or after
         * SPA_VERSION_RECVD_PROPS.
         */
-       if (!dsl_prop_get_hasrecvd(os)) {
-               dmu_objset_rele(os, FTAG);
+       if (!dsl_prop_get_hasrecvd(zc->zc_name))
                return (ENOTSUP);
-       }
 
        if (zc->zc_nvlist_dst != 0 &&
-           (error = dsl_prop_get_received(os, &nv)) == 0) {
+           (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
                error = put_nvlist(zc, nv);
                nvlist_free(nv);
        }
 
-       dmu_objset_rele(os, FTAG);
        return (error);
 }
 
@@ -2141,20 +2166,6 @@ top:
                (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
        p = zc->zc_name + strlen(zc->zc_name);
 
-       /*
-        * Pre-fetch the datasets.  dmu_objset_prefetch() always returns 0
-        * but is not declared void because its called by dmu_objset_find().
-        */
-       if (zc->zc_cookie == 0) {
-               uint64_t cookie = 0;
-               int len = sizeof (zc->zc_name) - (p - zc->zc_name);
-
-               while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
-                       if (!dataset_name_hidden(zc->zc_name))
-                               (void) dmu_objset_prefetch(zc->zc_name, NULL);
-               }
-       }
-
        do {
                error = dmu_dir_list_next(os,
                    sizeof (zc->zc_name) - (p - zc->zc_name), p,
@@ -2197,14 +2208,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
        objset_t *os;
        int error;
 
-top:
-       if (zc->zc_cookie == 0 && !zc->zc_simple)
-               (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
-                   NULL, DS_FIND_SNAPSHOTS);
-
        error = dmu_objset_hold(zc->zc_name, FTAG, &os);
-       if (error)
+       if (error != 0) {
                return (error == ENOENT ? ESRCH : error);
+       }
 
        /*
         * A dataset name of maximum length cannot have any snapshots,
@@ -2224,24 +2231,8 @@ top:
                dsl_dataset_t *ds;
                dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
 
-               /*
-                * Since we probably don't have a hold on this snapshot,
-                * it's possible that the objsetid could have been destroyed
-                * and reused for a new objset. It's OK if this happens during
-                * a zfs send operation, since the new createtxg will be
-                * beyond the range we're interested in.
-                */
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
                error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
-               rw_exit(&dp->dp_config_rwlock);
-               if (error) {
-                       if (error == ENOENT) {
-                               /* Racing with destroy, get the next one. */
-                               *strchr(zc->zc_name, '@') = '\0';
-                               dmu_objset_rele(os, FTAG);
-                               goto top;
-                       }
-               } else {
+               if (error == 0) {
                        objset_t *ossnap;
 
                        error = dmu_objset_from_ds(ds, &ossnap);
@@ -2255,7 +2246,7 @@ top:
 
        dmu_objset_rele(os, FTAG);
        /* if we failed, undo the @ that we tacked on to zc_name */
-       if (error)
+       if (error != 0)
                *strchr(zc->zc_name, '@') = '\0';
        return (error);
 }
@@ -2345,13 +2336,13 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
                err = dsl_dir_set_quota(dsname, source, intval);
                break;
        case ZFS_PROP_REFQUOTA:
-               err = dsl_dataset_set_quota(dsname, source, intval);
+               err = dsl_dataset_set_refquota(dsname, source, intval);
                break;
        case ZFS_PROP_RESERVATION:
                err = dsl_dir_set_reservation(dsname, source, intval);
                break;
        case ZFS_PROP_REFRESERVATION:
-               err = dsl_dataset_set_reservation(dsname, source, intval);
+               err = dsl_dataset_set_refreservation(dsname, source, intval);
                break;
        case ZFS_PROP_VOLSIZE:
                err = zvol_set_volsize(dsname, intval);
@@ -2386,19 +2377,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
                        zfeature_info_t *feature =
                            &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS];
                        spa_t *spa;
-                       dsl_pool_t *dp;
 
                        if ((err = spa_open(dsname, &spa, FTAG)) != 0)
                                return (err);
 
-                       dp = spa->spa_dsl_pool;
-
                        /*
                         * Setting the LZ4 compression algorithm activates
                         * the feature.
                         */
                        if (!spa_feature_is_active(spa, feature)) {
-                               if ((err = zfs_prop_activate_feature(dp,
+                               if ((err = zfs_prop_activate_feature(spa,
                                    feature)) != 0) {
                                        spa_close(spa, FTAG);
                                        return (err);
@@ -2557,12 +2545,12 @@ retry:
 
                        if (nvpair_type(propval) == DATA_TYPE_STRING) {
                                strval = fnvpair_value_string(propval);
-                               err = dsl_prop_set(dsname, propname, source, 1,
-                                   strlen(strval) + 1, strval);
+                               err = dsl_prop_set_string(dsname, propname,
+                                   source, strval);
                        } else {
                                intval = fnvpair_value_uint64(propval);
-                               err = dsl_prop_set(dsname, propname, source, 8,
-                                   1, &intval);
+                               err = dsl_prop_set_int(dsname, propname, source,
+                                   intval);
                        }
 
                        if (err != 0) {
@@ -2628,7 +2616,7 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
 }
 
 static int
-clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
+clear_received_props(const char *dsname, nvlist_t *props,
     nvlist_t *skipped)
 {
        int err = 0;
@@ -2640,8 +2628,8 @@ clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
                 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
                 */
                zprop_source_t flags = (ZPROP_SRC_NONE |
-                   (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
-               err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
+                   (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
+               err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
        }
        nvlist_free(cleared_props);
        return (err);
@@ -2673,22 +2661,19 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
 
        if (received) {
                nvlist_t *origprops;
-               objset_t *os;
-
-               if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
-                       if (dsl_prop_get_received(os, &origprops) == 0) {
-                               (void) clear_received_props(os,
-                                   zc->zc_name, origprops, nvl);
-                               nvlist_free(origprops);
-                       }
 
-                       dsl_prop_set_hasrecvd(os);
-                       dmu_objset_rele(os, FTAG);
+               if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
+                       (void) clear_received_props(zc->zc_name,
+                           origprops, nvl);
+                       nvlist_free(origprops);
                }
+
+               error = dsl_prop_set_hasrecvd(zc->zc_name);
        }
 
        errors = fnvlist_alloc();
-       error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
+       if (error == 0)
+               error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
 
        if (zc->zc_nvlist_dst != 0 && errors != NULL) {
                (void) put_nvlist(zc, errors);
@@ -2771,7 +2756,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
        }
 
        /* property name has been validated by zfs_secpolicy_inherit_prop() */
-       return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
+       return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
 }
 
 static int
@@ -2907,7 +2892,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
         */
 
        error = secpolicy_zfs(CRED());
-       if (error) {
+       if (error != 0) {
                if (zc->zc_perm_action == B_FALSE) {
                        error = dsl_deleg_can_allow(zc->zc_name,
                            fsaclnv, CRED());
@@ -3214,7 +3199,7 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
                error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
                    nvprops, outnvl);
                if (error != 0)
-                       (void) dmu_objset_destroy(fsname, B_FALSE);
+                       (void) dsl_destroy_head(fsname);
        }
        return (error);
 }
@@ -3234,7 +3219,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
        int error = 0;
        nvlist_t *nvprops = NULL;
        char *origin_name;
-       dsl_dataset_t *origin;
 
        if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
                return (EINVAL);
@@ -3246,14 +3230,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 
        if (dataset_namecheck(origin_name, NULL, NULL) != 0)
                return (EINVAL);
-
-       error = dsl_dataset_hold(origin_name, FTAG, &origin);
-       if (error)
-               return (error);
-
-       error = dmu_objset_clone(fsname, origin, 0);
-       dsl_dataset_rele(origin, FTAG);
-       if (error)
+       error = dmu_objset_clone(fsname, origin_name);
+       if (error != 0)
                return (error);
 
        /*
@@ -3263,7 +3241,7 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
                error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
                    nvprops, outnvl);
                if (error != 0)
-                       (void) dmu_objset_destroy(fsname, B_FALSE);
+                       (void) dsl_destroy_head(fsname);
        }
        return (error);
 }
@@ -3275,7 +3253,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
  * }
  *
  * outnvl: snapshot -> error code (int32)
- *
  */
 static int
 zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3325,7 +3302,7 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
                }
        }
 
-       error = dmu_objset_snapshot(snaps, props, outnvl);
+       error = dsl_dataset_snapshot(snaps, props, outnvl);
        return (error);
 }
 
@@ -3371,43 +3348,74 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 }
 
 /*
- * inputs:
- * name                dataset name, or when 'arg == NULL' the full snapshot name
- * arg         short snapshot name (i.e. part after the '@')
+ * Force-unmount the snapshot whose full name ("dataset@snap") is given in
+ * 'snapname'.  A name that contains no '@' is ignored.
+ *
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort.  Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
  */
-/* ARGSUSED */
-int
-zfs_unmount_snap(const char *name, void *arg)
+void
+zfs_unmount_snap(const char *snapname)
 {
        zfs_sb_t *zsb = NULL;
        char *dsname;
-       char *snapname;
        char *fullname;
        char *ptr;
-       int error;
 
-       if ((ptr = strchr(name, '@')) == NULL)
-               return (0);
+       if ((ptr = strchr(snapname, '@')) == NULL)
+               return;
 
-       dsname = strdup(name);
-       dsname[ptr - name] = '\0';
-       snapname = strdup(ptr + 1);
-       fullname = kmem_asprintf("%s@%s", dsname, snapname);
-       error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE);
-       if (error == 0) {
-               error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE);
+       dsname = strdup(snapname);
+       dsname[ptr - snapname] = '\0';
+       /* Private copy of the full name for the unmount; freed below. */
+       fullname = strdup(snapname);
+       if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) {
+               ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os)));
+               (void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE);
                zfs_sb_rele(zsb, FTAG);
-
-               /* Allow ENOENT for consistency with upstream */
-               if (error == ENOENT)
-                       error = 0;
        }
 
        strfree(dsname);
-       strfree(snapname);
        strfree(fullname);
 
-       return (error);
+       return;
+}
+
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+       zfs_unmount_snap(snapname);
+       return (0);
+}
+
+/*
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted.  This routine will do that unmount
+ * if necessary.
+ */
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+       int error;
+       objset_t *os;
+       dsl_dataset_t *ds;
+
+       error = dmu_objset_hold(fsname, FTAG, &os);
+       if (error != 0)
+               return;
+       ds = dmu_objset_ds(os);
+       if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+               char originname[MAXNAMELEN];
+               dsl_dataset_name(ds->ds_prev, originname);
+               dmu_objset_rele(os, FTAG);
+               zfs_unmount_snap(originname);
+       } else {
+               dmu_objset_rele(os, FTAG);
+       }
 }
 
 /*
@@ -3442,15 +3447,11 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
                    (name[poollen] != '/' && name[poollen] != '@'))
                        return (EXDEV);
 
-               /*
-                * Ignore failures to unmount; dmu_snapshots_destroy_nvl()
-                * will deal with this gracefully (by filling in outnvl).
-                */
-               (void) zfs_unmount_snap(name, NULL);
+               zfs_unmount_snap(name);
                (void) zvol_remove_minor(name);
        }
 
-       return (dmu_snapshots_destroy_nvl(snaps, defer, outnvl));
+       return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
 }
 
 /*
@@ -3465,13 +3466,13 @@ static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
        int err;
-       if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
-               err = zfs_unmount_snap(zc->zc_name, NULL);
-               if (err)
-                       return (err);
-       }
+       if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS)
+               zfs_unmount_snap(zc->zc_name);
 
-       err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
+       if (strchr(zc->zc_name, '@'))
+               err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+       else
+               err = dsl_destroy_head(zc->zc_name);
        if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
                (void) zvol_remove_minor(zc->zc_name);
        return (err);
@@ -3486,79 +3487,35 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 static int
 zfs_ioc_rollback(zfs_cmd_t *zc)
 {
-       dsl_dataset_t *ds, *clone;
-       int error;
        zfs_sb_t *zsb;
-       char *clone_name;
-
-       error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
-       if (error)
-               return (error);
-
-       /* must not be a snapshot */
-       if (dsl_dataset_is_snapshot(ds)) {
-               dsl_dataset_rele(ds, FTAG);
-               return (EINVAL);
-       }
-
-       /* must have a most recent snapshot */
-       if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
-               dsl_dataset_rele(ds, FTAG);
-               return (EINVAL);
-       }
-
-       /*
-        * Create clone of most recent snapshot.
-        */
-       clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
-       error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
-       if (error)
-               goto out;
-
-       error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone);
-       if (error)
-               goto out;
+       int error;
 
-       /*
-        * Do clone swap.
-        */
        if (get_zfs_sb(zc->zc_name, &zsb) == 0) {
                error = zfs_suspend_fs(zsb);
                if (error == 0) {
                        int resume_err;
 
-                       if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
-                               error = dsl_dataset_clone_swap(clone, ds,
-                                   B_TRUE);
-                               dsl_dataset_disown(ds, FTAG);
-                               ds = NULL;
-                       } else {
-                               error = EBUSY;
-                       }
+                       error = dsl_dataset_rollback(zc->zc_name);
                        resume_err = zfs_resume_fs(zsb, zc->zc_name);
                        error = error ? error : resume_err;
                }
                deactivate_super(zsb->z_sb);
        } else {
-               if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
-                       error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
-                       dsl_dataset_disown(ds, FTAG);
-                       ds = NULL;
-               } else {
-                       error = EBUSY;
-               }
+               error = dsl_dataset_rollback(zc->zc_name);
        }
+       return (error);
+}
 
-       /*
-        * Destroy clone (which also closes it).
-        */
-       (void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+static int
+recursive_unmount(const char *fsname, void *arg)
+{
+       const char *snapname = arg;
+       char *fullname;
 
-out:
-       strfree(clone_name);
-       if (ds)
-               dsl_dataset_rele(ds, FTAG);
-       return (error);
+       fullname = kmem_asprintf("%s@%s", fsname, snapname);
+       zfs_unmount_snap(fullname);
+       strfree(fullname);
+       return (0);
 }
 
 /*
@@ -3573,6 +3530,7 @@ static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
        boolean_t recursive = zc->zc_cookie & 1;
+       char *at;
        int err;
 
        zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
@@ -3580,25 +3538,29 @@ zfs_ioc_rename(zfs_cmd_t *zc)
            strchr(zc->zc_value, '%'))
                return (EINVAL);
 
-       /*
-        * Unmount snapshot unless we're doing a recursive rename,
-        * in which case the dataset code figures out which snapshots
-        * to unmount.
-        */
-       if (!recursive && strchr(zc->zc_name, '@') != NULL &&
-           zc->zc_objset_type == DMU_OST_ZFS) {
-               err = zfs_unmount_snap(zc->zc_name, NULL);
-               if (err)
-                       return (err);
-       }
-
-       err = dmu_objset_rename(zc->zc_name, zc->zc_value, recursive);
-       if ((err == 0) && (zc->zc_objset_type == DMU_OST_ZVOL)) {
-               (void) zvol_remove_minor(zc->zc_name);
-               (void) zvol_create_minor(zc->zc_value);
+       at = strchr(zc->zc_name, '@');
+       if (at != NULL) {
+               /* snaps must be in same fs */
+               if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
+                       return (EXDEV);
+               *at = '\0';
+               if (zc->zc_objset_type == DMU_OST_ZFS) {
+                       int error = dmu_objset_find(zc->zc_name,
+                           recursive_unmount, at + 1,
+                           recursive ? DS_FIND_CHILDREN : 0);
+                       if (error != 0)
+                               return (error);
+               }
+               return (dsl_dataset_rename_snapshot(zc->zc_name,
+                   at + 1, strchr(zc->zc_value, '@') + 1, recursive));
+       } else {
+               err = dsl_dir_rename(zc->zc_name, zc->zc_value);
+               if (!err && zc->zc_objset_type == DMU_OST_ZVOL) {
+                       (void) zvol_remove_minor(zc->zc_name);
+                       (void) zvol_create_minor(zc->zc_value);
+               }
+               return (err);
        }
-
-       return (err);
 }
 
 static int
@@ -3743,36 +3705,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
        return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
 }
 
-/*
- * Activates a feature on a pool in response to a property setting. This
- * creates a new sync task which modifies the pool to reflect the feature
- * as being active.
- */
-static int
-zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature)
-{
-       int err;
-
-       /* EBUSY here indicates that the feature is already active */
-       err = dsl_sync_task_do(dp, zfs_prop_activate_feature_check,
-           zfs_prop_activate_feature_sync, dp->dp_spa, feature, 2);
-
-       if (err != 0 && err != EBUSY)
-               return (err);
-       else
-               return (0);
-}
-
 /*
  * Checks for a race condition to make sure we don't increment a feature flag
  * multiple times.
  */
-/*ARGSUSED*/
 static int
-zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx)
+zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       zfeature_info_t *feature = arg2;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       zfeature_info_t *feature = arg;
 
        if (!spa_feature_is_active(spa, feature))
                return (0);
@@ -3785,14 +3726,35 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx)
  * zfs_prop_activate_feature.
  */
 static void
-zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
 {
-       spa_t *spa = arg1;
-       zfeature_info_t *feature = arg2;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       zfeature_info_t *feature = arg;
 
        spa_feature_incr(spa, feature, tx);
 }
 
+/*
+ * Activates a feature on a pool in response to a property setting. This
+ * creates a new sync task which modifies the pool to reflect the feature
+ * as being active.
+ */
+static int
+zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature)
+{
+       int err;
+
+       /* EBUSY here indicates that the feature is already active */
+       err = dsl_sync_task(spa_name(spa),
+           zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
+           feature, 2);
+
+       if (err != 0 && err != EBUSY)
+               return (err);
+       else
+               return (0);
+}
+
 /*
  * Removes properties from the given props list that fail permission checks
  * needed to clear them and to restore them in case of a receive error. For each
@@ -3947,7 +3909,6 @@ static int
 zfs_ioc_recv(zfs_cmd_t *zc)
 {
        file_t *fp;
-       objset_t *os;
        dmu_recv_cookie_t drc;
        boolean_t force = (boolean_t)zc->zc_guid;
        int fd;
@@ -3957,7 +3918,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        offset_t off;
        nvlist_t *props = NULL; /* sent properties */
        nvlist_t *origprops = NULL; /* existing properties */
-       objset_t *origin = NULL;
+       char *origin = NULL;
        char *tosnap;
        char tofs[ZFS_MAXNAMELEN];
        boolean_t first_recvd_props = B_FALSE;
@@ -3985,18 +3946,31 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 
        VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-       if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
-               if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
-                   !dsl_prop_get_hasrecvd(os)) {
+       if (zc->zc_string[0])
+               origin = zc->zc_string;
+
+       error = dmu_recv_begin(tofs, tosnap,
+           &zc->zc_begin_record, force, origin, &drc);
+       if (error != 0)
+               goto out;
+
+       /*
+        * Set properties before we receive the stream so that they are applied
+        * to the new data. Note that we must call dmu_recv_stream() if
+        * dmu_recv_begin() succeeds.
+        */
+       if (props != NULL && !drc.drc_newfs) {
+               if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+                   SPA_VERSION_RECVD_PROPS &&
+                   !dsl_prop_get_hasrecvd(tofs))
                        first_recvd_props = B_TRUE;
-               }
 
                /*
                 * If new received properties are supplied, they are to
                 * completely replace the existing received properties, so stash
                 * away the existing ones.
                 */
-               if (dsl_prop_get_received(os, &origprops) == 0) {
+               if (dsl_prop_get_received(tofs, &origprops) == 0) {
                        nvlist_t *errlist = NULL;
                        /*
                         * Don't bother writing a property if its value won't
@@ -4008,53 +3982,25 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                         */
                        if (!first_recvd_props)
                                props_reduce(props, origprops);
-                       if (zfs_check_clearable(tofs, origprops,
-                           &errlist) != 0)
+                       if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
                                (void) nvlist_merge(errors, errlist, 0);
                        nvlist_free(errlist);
-               }
 
-               dmu_objset_rele(os, FTAG);
-       }
-
-       if (zc->zc_string[0]) {
-               error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
-               if (error)
-                       goto out;
-       }
-
-       error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
-           &zc->zc_begin_record, force, origin, &drc);
-       if (origin)
-               dmu_objset_rele(origin, FTAG);
-       if (error)
-               goto out;
-
-       /*
-        * Set properties before we receive the stream so that they are applied
-        * to the new data. Note that we must call dmu_recv_stream() if
-        * dmu_recv_begin() succeeds.
-        */
-       if (props) {
-               if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
-                       if (drc.drc_newfs) {
-                               if (spa_version(os->os_spa) >=
-                                   SPA_VERSION_RECVD_PROPS)
-                                       first_recvd_props = B_TRUE;
-                       } else if (origprops != NULL) {
-                               if (clear_received_props(os, tofs, origprops,
-                                   first_recvd_props ? NULL : props) != 0)
-                                       zc->zc_obj |= ZPROP_ERR_NOCLEAR;
-                       } else {
+                       if (clear_received_props(tofs, origprops,
+                           first_recvd_props ? NULL : props) != 0)
                                zc->zc_obj |= ZPROP_ERR_NOCLEAR;
-                       }
-                       dsl_prop_set_hasrecvd(os);
-               } else if (!drc.drc_newfs) {
+               } else {
                        zc->zc_obj |= ZPROP_ERR_NOCLEAR;
                }
+       }
+
+       if (props != NULL) {
+               props_error = dsl_prop_set_hasrecvd(tofs);
 
-               (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
-                   props, errors);
+               if (props_error == 0) {
+                       (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+                           props, errors);
+               }
        }
 
        if (zc->zc_nvlist_dst_size != 0 &&
@@ -4106,22 +4052,16 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        /*
         * On error, restore the original props.
         */
-       if (error && props) {
-               if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
-                       if (clear_received_props(os, tofs, props, NULL) != 0) {
-                               /*
-                                * We failed to clear the received properties.
-                                * Since we may have left a $recvd value on the
-                                * system, we can't clear the $hasrecvd flag.
-                                */
-                               zc->zc_obj |= ZPROP_ERR_NORESTORE;
-                       } else if (first_recvd_props) {
-                               dsl_prop_unset_hasrecvd(os);
-                       }
-                       dmu_objset_rele(os, FTAG);
-               } else if (!drc.drc_newfs) {
-                       /* We failed to clear the received properties. */
+       if (error != 0 && props != NULL && !drc.drc_newfs) {
+               if (clear_received_props(tofs, props, NULL) != 0) {
+                       /*
+                        * We failed to clear the received properties.
+                        * Since we may have left a $recvd value on the
+                        * system, we can't clear the $hasrecvd flag.
+                        */
                        zc->zc_obj |= ZPROP_ERR_NORESTORE;
+               } else if (first_recvd_props) {
+                       dsl_prop_unset_hasrecvd(tofs);
                }
 
                if (origprops == NULL && !drc.drc_newfs) {
@@ -4173,100 +4113,75 @@ out:
 static int
 zfs_ioc_send(zfs_cmd_t *zc)
 {
-       objset_t *fromsnap = NULL;
-       objset_t *tosnap;
        int error;
        offset_t off;
-       dsl_dataset_t *ds;
-       dsl_dataset_t *dsfrom = NULL;
-       spa_t *spa;
-       dsl_pool_t *dp;
        boolean_t estimate = (zc->zc_guid != 0);
 
-       error = spa_open(zc->zc_name, &spa, FTAG);
-       if (error)
-               return (error);
+       if (zc->zc_obj != 0) {
+               dsl_pool_t *dp;
+               dsl_dataset_t *tosnap;
 
-       dp = spa_get_dsl(spa);
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
-       rw_exit(&dp->dp_config_rwlock);
-       spa_close(spa, FTAG);
-       if (error)
-               return (error);
-
-       error = dmu_objset_from_ds(ds, &tosnap);
-       if (error) {
-               dsl_dataset_rele(ds, FTAG);
-               return (error);
-       }
-
-       if (zc->zc_fromobj != 0) {
-               rw_enter(&dp->dp_config_rwlock, RW_READER);
-               error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
-               rw_exit(&dp->dp_config_rwlock);
-               if (error) {
-                       dsl_dataset_rele(ds, FTAG);
+               error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+               if (error != 0)
                        return (error);
-               }
-               error = dmu_objset_from_ds(dsfrom, &fromsnap);
-               if (error) {
-                       dsl_dataset_rele(dsfrom, FTAG);
-                       dsl_dataset_rele(ds, FTAG);
+
+               error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+               if (error != 0) {
+                       dsl_pool_rele(dp, FTAG);
                        return (error);
                }
+
+               if (dsl_dir_is_clone(tosnap->ds_dir))
+                       zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj;
+               dsl_dataset_rele(tosnap, FTAG);
+               dsl_pool_rele(dp, FTAG);
        }
 
-       if (zc->zc_obj) {
-               dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       if (estimate) {
+               dsl_pool_t *dp;
+               dsl_dataset_t *tosnap;
+               dsl_dataset_t *fromsnap = NULL;
+
+               error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+               if (error != 0)
+                       return (error);
 
-               if (fromsnap != NULL) {
-                       dsl_dataset_rele(dsfrom, FTAG);
-                       dsl_dataset_rele(ds, FTAG);
-                       return (EINVAL);
+               error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+               if (error != 0) {
+                       dsl_pool_rele(dp, FTAG);
+                       return (error);
                }
 
-               if (dsl_dir_is_clone(ds->ds_dir)) {
-                       rw_enter(&dp->dp_config_rwlock, RW_READER);
-                       error = dsl_dataset_hold_obj(dp,
-                           ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &dsfrom);
-                       rw_exit(&dp->dp_config_rwlock);
-                       if (error) {
-                               dsl_dataset_rele(ds, FTAG);
-                               return (error);
-                       }
-                       error = dmu_objset_from_ds(dsfrom, &fromsnap);
-                       if (error) {
-                               dsl_dataset_rele(dsfrom, FTAG);
-                               dsl_dataset_rele(ds, FTAG);
+               if (zc->zc_fromobj != 0) {
+                       error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+                           FTAG, &fromsnap);
+                       if (error != 0) {
+                               dsl_dataset_rele(tosnap, FTAG);
+                               dsl_pool_rele(dp, FTAG);
                                return (error);
                        }
                }
-       }
 
-       if (estimate) {
                error = dmu_send_estimate(tosnap, fromsnap,
                    &zc->zc_objset_type);
+
+               if (fromsnap != NULL)
+                       dsl_dataset_rele(fromsnap, FTAG);
+               dsl_dataset_rele(tosnap, FTAG);
+               dsl_pool_rele(dp, FTAG);
        } else {
                file_t *fp = getf(zc->zc_cookie);
-               if (fp == NULL) {
-                       dsl_dataset_rele(ds, FTAG);
-                       if (dsfrom)
-                               dsl_dataset_rele(dsfrom, FTAG);
+               if (fp == NULL)
                        return (EBADF);
-               }
 
                off = fp->f_offset;
-               error = dmu_send(tosnap, fromsnap,
-                   zc->zc_cookie, fp->f_vnode, &off);
+               error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+                   zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
 
                if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                        fp->f_offset = off;
                releasef(zc->zc_cookie);
        }
-       if (dsfrom)
-               dsl_dataset_rele(dsfrom, FTAG);
-       dsl_dataset_rele(ds, FTAG);
        return (error);
 }
 
@@ -4281,13 +4196,21 @@ zfs_ioc_send(zfs_cmd_t *zc)
 static int
 zfs_ioc_send_progress(zfs_cmd_t *zc)
 {
+       dsl_pool_t *dp;
        dsl_dataset_t *ds;
        dmu_sendarg_t *dsp = NULL;
        int error;
 
-       if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0)
+       error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+       if (error != 0)
                return (error);
 
+       error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
+
        mutex_enter(&ds->ds_sendstream_lock);
 
        /*
@@ -4311,6 +4234,7 @@ zfs_ioc_send_progress(zfs_cmd_t *zc)
 
        mutex_exit(&ds->ds_sendstream_lock);
        dsl_dataset_rele(ds, FTAG);
+       dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
@@ -4417,7 +4341,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
                }
        }
 
-       if (error)
+       if (error != 0)
                return (error);
 
        spa_vdev_state_enter(spa, SCL_NONE);
@@ -4455,7 +4379,7 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc)
        int error;
 
        error = spa_open(zc->zc_name, &spa, FTAG);
-       if (error)
+       if (error != 0)
                return (error);
 
        spa_vdev_state_enter(spa, SCL_NONE);
@@ -4495,7 +4419,7 @@ zfs_ioc_promote(zfs_cmd_t *zc)
        if (cp)
                *cp = '\0';
        (void) dmu_objset_find(zc->zc_value,
-           zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
+           zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
        return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
@@ -4521,7 +4445,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
                return (EINVAL);
 
        error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE);
-       if (error)
+       if (error != 0)
                return (error);
 
        error = zfs_userspace_one(zsb,
@@ -4554,7 +4478,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
                return (ENOMEM);
 
        error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE);
-       if (error)
+       if (error != 0)
                return (error);
 
        buf = vmem_alloc(bufsize, KM_SLEEP);
@@ -4604,7 +4528,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
        } else {
                /* XXX kind of reading contents without owning */
                error = dmu_objset_hold(zc->zc_name, FTAG, &os);
-               if (error)
+               if (error != 0)
                        return (error);
 
                error = dmu_objset_userspace_upgrade(os);
@@ -4639,7 +4563,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc)
        int error;
 
        error = dmu_objset_hold(zc->zc_name, FTAG, &os);
-       if (error)
+       if (error != 0)
                return (error);
 
        error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
@@ -4662,25 +4586,26 @@ static int
 zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
 {
        char *snap_name;
+       char *hold_name;
        int error;
+       minor_t minor;
 
-       snap_name = kmem_asprintf("%s@%s-%016llx", zc->zc_name, zc->zc_value,
-           (u_longlong_t)ddi_get_lbolt64());
-
-       if (strlen(snap_name) >= MAXPATHLEN) {
-               strfree(snap_name);
-               return (E2BIG);
-       }
-
-       error = dmu_objset_snapshot_tmp(snap_name, "%temp", zc->zc_cleanup_fd);
-       if (error != 0) {
-               strfree(snap_name);
+       error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+       if (error != 0)
                return (error);
-       }
 
-       (void) strcpy(zc->zc_value, strchr(snap_name, '@') + 1);
+       snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+           (u_longlong_t)ddi_get_lbolt64());
+       hold_name = kmem_asprintf("%%%s", zc->zc_value);
+
+       error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+           hold_name);
+       if (error == 0)
+               (void) strcpy(zc->zc_value, snap_name);
        strfree(snap_name);
-       return (0);
+       strfree(hold_name);
+       zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+       return (error);
 }
 
 /*
@@ -4695,39 +4620,22 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
 static int
 zfs_ioc_diff(zfs_cmd_t *zc)
 {
-       objset_t *fromsnap;
-       objset_t *tosnap;
        file_t *fp;
        offset_t off;
        int error;
 
-       error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
-       if (error)
-               return (error);
-
-       error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap);
-       if (error) {
-               dmu_objset_rele(tosnap, FTAG);
-               return (error);
-       }
-
        fp = getf(zc->zc_cookie);
-       if (fp == NULL) {
-               dmu_objset_rele(fromsnap, FTAG);
-               dmu_objset_rele(tosnap, FTAG);
+       if (fp == NULL)
                return (EBADF);
-       }
 
        off = fp->f_offset;
 
-       error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off);
+       error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
 
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
        releasef(zc->zc_cookie);
 
-       dmu_objset_rele(fromsnap, FTAG);
-       dmu_objset_rele(tosnap, FTAG);
        return (error);
 }
 
@@ -4799,13 +4707,13 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
                    ZFS_SHARES_DIR);
                dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
                error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
+               if (error != 0) {
                        dmu_tx_abort(tx);
                } else {
                        error = zfs_create_share_dir(zsb, tx);
                        dmu_tx_commit(tx);
                }
-               if (error) {
+               if (error != 0) {
                        mutex_exit(&zsb->z_lock);
                        VN_RELE(vp);
                        ZFS_EXIT(zsb);
@@ -4886,124 +4794,82 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 }
 
 /*
- * inputs:
- * zc_name             name of filesystem
- * zc_value            short name of snap
- * zc_string           user-supplied tag for this hold
- * zc_cookie           recursive flag
- * zc_temphold         set if hold is temporary
- * zc_cleanup_fd       cleanup-on-exit file descriptor for calling process
- * zc_sendobj          if non-zero, the objid for zc_name@zc_value
- * zc_createtxg                if zc_sendobj is non-zero, snap must have zc_createtxg
+ * innvl: {
+ *     "holds" -> { snapname -> holdname (string), ... }
+ *     (optional) "cleanup_fd" -> fd (int32)
+ * }
  *
- * outputs:            none
+ * outnvl: {
+ *     snapname -> error value (int32)
+ *     ...
+ * }
  */
+/* ARGSUSED */
 static int
-zfs_ioc_hold(zfs_cmd_t *zc)
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 {
-       boolean_t recursive = zc->zc_cookie;
-       spa_t *spa;
-       dsl_pool_t *dp;
-       dsl_dataset_t *ds;
+       nvlist_t *holds;
+       int cleanup_fd = -1;
        int error;
        minor_t minor = 0;
 
-       if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-               return (EINVAL);
-
-       if (zc->zc_sendobj == 0) {
-               return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
-                   zc->zc_string, recursive, zc->zc_temphold,
-                   zc->zc_cleanup_fd));
-       }
-
-       if (recursive)
+       error = nvlist_lookup_nvlist(args, "holds", &holds);
+       if (error != 0)
                return (EINVAL);
 
-       error = spa_open(zc->zc_name, &spa, FTAG);
-       if (error)
-               return (error);
-
-       dp = spa_get_dsl(spa);
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
-       rw_exit(&dp->dp_config_rwlock);
-       spa_close(spa, FTAG);
-       if (error)
-               return (error);
-
-       /*
-        * Until we have a hold on this snapshot, it's possible that
-        * zc_sendobj could've been destroyed and reused as part
-        * of a later txg.  Make sure we're looking at the right object.
-        */
-       if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
-               dsl_dataset_rele(ds, FTAG);
-               return (ENOENT);
-       }
-
-       if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
-               error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
-               if (error) {
-                       dsl_dataset_rele(ds, FTAG);
+       if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+               error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+               if (error != 0)
                        return (error);
-               }
-       }
-
-       error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
-           zc->zc_temphold);
-       if (minor != 0) {
-               if (error == 0) {
-                       dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
-                           minor);
-               }
-               zfs_onexit_fd_rele(zc->zc_cleanup_fd);
        }
-       dsl_dataset_rele(ds, FTAG);
 
+       error = dsl_dataset_user_hold(holds, minor, errlist);
+       if (minor != 0)
+               zfs_onexit_fd_rele(cleanup_fd);
        return (error);
 }
 
 /*
- * inputs:
- * zc_name     name of dataset from which we're releasing a user hold
- * zc_value    short name of snap
- * zc_string   user-supplied tag for this hold
- * zc_cookie   recursive flag
+ * innvl is not used.
  *
- * outputs:    none
+ * outnvl: {
+ *    holdname -> time added (uint64 seconds since epoch)
+ *    ...
+ * }
  */
+/* ARGSUSED */
 static int
-zfs_ioc_release(zfs_cmd_t *zc)
+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
 {
-       boolean_t recursive = zc->zc_cookie;
-
-       if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-               return (EINVAL);
-
-       return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
-           zc->zc_string, recursive));
+       return (dsl_dataset_get_holds(snapname, outnvl));
 }
 
 /*
- * inputs:
- * zc_name             name of filesystem
+ * innvl: {
+ *     snapname -> { holdname, ... }
+ *     ...
+ * }
  *
- * outputs:
- * zc_nvlist_src{_size}        nvlist of snapshot holds
+ * outnvl: {
+ *     snapname -> error value (int32)
+ *     ...
+ * }
  */
+/* ARGSUSED */
 static int
-zfs_ioc_get_holds(zfs_cmd_t *zc)
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
 {
-       nvlist_t *nvp;
-       int error;
+       nvpair_t *pair;
 
-       if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
-               error = put_nvlist(zc, nvp);
-               nvlist_free(nvp);
-       }
+       /*
+        * The release may cause the snapshot to be destroyed; make sure it
+        * is not mounted.
+        */
+       for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+           pair = nvlist_next_nvpair(holds, pair))
+               zfs_unmount_snap(nvpair_name(pair));
 
-       return (error);
+       return (dsl_dataset_user_release(holds, errlist));
 }
 
 /*
@@ -5044,7 +4910,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc)
                        break;
 
                error = zfs_zevent_wait(ze);
-               if (error)
+               if (error != 0)
                        break;
        } while (1);
 
@@ -5082,14 +4948,21 @@ static int
 zfs_ioc_space_written(zfs_cmd_t *zc)
 {
        int error;
+       dsl_pool_t *dp;
        dsl_dataset_t *new, *old;
 
-       error = dsl_dataset_hold(zc->zc_name, FTAG, &new);
+       error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
        if (error != 0)
                return (error);
-       error = dsl_dataset_hold(zc->zc_value, FTAG, &old);
+       error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
+       error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
        if (error != 0) {
                dsl_dataset_rele(new, FTAG);
+               dsl_pool_rele(dp, FTAG);
                return (error);
        }
 
@@ -5097,6 +4970,7 @@ zfs_ioc_space_written(zfs_cmd_t *zc)
            &zc->zc_objset_type, &zc->zc_perm_action);
        dsl_dataset_rele(old, FTAG);
        dsl_dataset_rele(new, FTAG);
+       dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
@@ -5115,6 +4989,7 @@ static int
 zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 {
        int error;
+       dsl_pool_t *dp;
        dsl_dataset_t *new, *old;
        char *firstsnap;
        uint64_t used, comp, uncomp;
@@ -5122,18 +4997,26 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
        if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
                return (EINVAL);
 
-       error = dsl_dataset_hold(lastsnap, FTAG, &new);
+       error = dsl_pool_hold(lastsnap, FTAG, &dp);
        if (error != 0)
                return (error);
-       error = dsl_dataset_hold(firstsnap, FTAG, &old);
+
+       error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
+       error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
        if (error != 0) {
                dsl_dataset_rele(new, FTAG);
+               dsl_pool_rele(dp, FTAG);
                return (error);
        }
 
        error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
        dsl_dataset_rele(old, FTAG);
        dsl_dataset_rele(new, FTAG);
+       dsl_pool_rele(dp, FTAG);
        fnvlist_add_uint64(outnvl, "used", used);
        fnvlist_add_uint64(outnvl, "compressed", comp);
        fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
@@ -5152,49 +5035,28 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 static int
 zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
-       objset_t *fromsnap = NULL;
-       objset_t *tosnap;
        int error;
        offset_t off;
-       char *fromname;
+       char *fromname = NULL;
        int fd;
+       file_t *fp;
 
        error = nvlist_lookup_int32(innvl, "fd", &fd);
        if (error != 0)
                return (EINVAL);
 
-       error = dmu_objset_hold(snapname, FTAG, &tosnap);
-       if (error)
-               return (error);
+       (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
-       error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
-       if (error == 0) {
-               error = dmu_objset_hold(fromname, FTAG, &fromsnap);
-               if (error) {
-                       dmu_objset_rele(tosnap, FTAG);
-                       return (error);
-               }
-       }
-
-       {
-       file_t *fp = getf(fd);
-       if (fp == NULL) {
-               dmu_objset_rele(tosnap, FTAG);
-               if (fromsnap != NULL)
-                       dmu_objset_rele(fromsnap, FTAG);
+       if ((fp = getf(fd)) == NULL)
                return (EBADF);
-       }
 
        off = fp->f_offset;
-       error = dmu_send(tosnap, fromsnap, fd, fp->f_vnode, &off);
+       error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
 
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
-       }
+
        releasef(fd);
-       if (fromsnap != NULL)
-               dmu_objset_rele(fromsnap, FTAG);
-       dmu_objset_rele(tosnap, FTAG);
        return (error);
 }
 
@@ -5213,21 +5075,29 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 static int
 zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
-       objset_t *fromsnap = NULL;
-       objset_t *tosnap;
+       dsl_pool_t *dp;
+       dsl_dataset_t *fromsnap = NULL;
+       dsl_dataset_t *tosnap;
        int error;
        char *fromname;
        uint64_t space;
 
-       error = dmu_objset_hold(snapname, FTAG, &tosnap);
-       if (error)
+       error = dsl_pool_hold(snapname, FTAG, &dp);
+       if (error != 0)
                return (error);
 
+       error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+       if (error != 0) {
+               dsl_pool_rele(dp, FTAG);
+               return (error);
+       }
+
        error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
        if (error == 0) {
-               error = dmu_objset_hold(fromname, FTAG, &fromsnap);
-               if (error) {
-                       dmu_objset_rele(tosnap, FTAG);
+               error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+               if (error != 0) {
+                       dsl_dataset_rele(tosnap, FTAG);
+                       dsl_pool_rele(dp, FTAG);
                        return (error);
                }
        }
@@ -5236,8 +5106,9 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
        fnvlist_add_uint64(outnvl, "space", space);
 
        if (fromsnap != NULL)
-               dmu_objset_rele(fromsnap, FTAG);
-       dmu_objset_rele(tosnap, FTAG);
+               dsl_dataset_rele(fromsnap, FTAG);
+       dsl_dataset_rele(tosnap, FTAG);
+       dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
@@ -5382,6 +5253,17 @@ zfs_ioctl_init(void)
            zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
            POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
 
+       zfs_ioctl_register("hold", ZFS_IOC_HOLD,
+           zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+       zfs_ioctl_register("release", ZFS_IOC_RELEASE,
+           zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+       zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
+           zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
+           POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
        /* IOCTLS that use the legacy function signature */
 
        zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
@@ -5459,8 +5341,6 @@ zfs_ioctl_init(void)
 
        zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
            zfs_ioc_space_written);
-       zfs_ioctl_register_dataset_read(ZFS_IOC_GET_HOLDS,
-           zfs_ioc_get_holds);
        zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
            zfs_ioc_objset_recvd_props);
        zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
@@ -5503,10 +5383,6 @@ zfs_ioctl_init(void)
            zfs_secpolicy_recv);
        zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
            zfs_secpolicy_promote);
-       zfs_ioctl_register_dataset_modify(ZFS_IOC_HOLD, zfs_ioc_hold,
-           zfs_secpolicy_hold);
-       zfs_ioctl_register_dataset_modify(ZFS_IOC_RELEASE, zfs_ioc_release,
-           zfs_secpolicy_release);
        zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
            zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
        zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
@@ -5866,7 +5742,7 @@ zfs_attach(void)
            offsetof(zfsdev_state_t, zs_next));
 
        error = misc_register(&zfs_misc);
-        if (error) {
+        if (error != 0) {
                printk(KERN_INFO "ZFS: misc_register() failed %d\n", error);
                return (error);
        }
@@ -5880,7 +5756,7 @@ zfs_detach(void)
        int error;
 
        error = misc_deregister(&zfs_misc);
-       if (error)
+       if (error != 0)
                printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error);
 
        mutex_destroy(&zfsdev_state_lock);
index 8fee441b14e57e835b8450f0a1bfb5030311cdd9..eeac0391cb05cbfb08d919a370fb98de90ea4f00 100644 (file)
@@ -248,28 +248,31 @@ zfs_register_callbacks(zfs_sb_t *zsb)
         * overboard...
         */
        ds = dmu_objset_ds(os);
+       dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
        error = dsl_prop_register(ds,
-           "atime", atime_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "xattr", xattr_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "recordsize", blksz_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "readonly", readonly_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "devices", devices_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "setuid", setuid_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "exec", exec_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "snapdir", snapdir_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "aclinherit", acl_inherit_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+           zsb);
        error = error ? error : dsl_prop_register(ds,
-           "vscan", vscan_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb);
        error = error ? error : dsl_prop_register(ds,
-           "nbmand", nbmand_changed_cb, zsb);
+           zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb);
+       dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
        if (error)
                goto unregister;
 
@@ -284,18 +287,28 @@ unregister:
         * registered, but this is OK; it will simply return ENOMSG,
         * which we will ignore.
         */
-       (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
-           zsb);
-       (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb);
-       (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
+           atime_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
+           xattr_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+           blksz_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY),
+           readonly_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES),
+           devices_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID),
+           setuid_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC),
+           exec_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR),
+           snapdir_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT),
+           acl_inherit_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN),
+           vscan_changed_cb, zsb);
+       (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND),
+           nbmand_changed_cb, zsb);
 
        return (error);
 }
@@ -305,8 +318,6 @@ static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
-       int error = 0;
-
        /*
         * Is it a valid type of object to track?
         */
@@ -363,7 +374,7 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
                        *groupp = BSWAP_64(*groupp);
                }
        }
-       return (error);
+       return (0);
 }
 
 static void
@@ -726,7 +737,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
        mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zsb->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
-       rrw_init(&zsb->z_teardown_lock);
+       rrw_init(&zsb->z_teardown_lock, B_FALSE);
        rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1138,7 +1149,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
        if (dsl_dataset_is_dirty(dmu_objset_ds(zsb->z_os)) &&
            !zfs_is_readonly(zsb))
                txg_wait_synced(dmu_objset_pool(zsb->z_os), 0);
-       (void) dmu_objset_evict_dbufs(zsb->z_os);
+       dmu_objset_evict_dbufs(zsb->z_os);
 
        return (0);
 }
index c1796937b568389806c04a969fdaf796e698cb83..d59c92c093f4c57e7d57322867513cc2f80bd22a 100644 (file)
@@ -257,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                        }
                }
 
-               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+               VERIFY(arc_buf_remove_ref(abuf, &abuf));
        }
 
        return (error);
@@ -356,7 +356,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
                        break;
 
                error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
-               if (error)
+               if (error != 0)
                        break;
 
                for (lrp = lrbuf; lrp < end; lrp += reclen) {
@@ -492,7 +492,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg)
        if (dsl_dataset_is_snapshot(ds))
                panic("dirtying snapshot!");
 
-       if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) {
+       if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
                /* up the hold count until we can be written out */
                dmu_buf_add_ref(ds->ds_dbuf, zilog);
        }
@@ -658,8 +658,8 @@ zil_claim(const char *osname, void *txarg)
        objset_t *os;
        int error;
 
-       error = dmu_objset_hold(osname, FTAG, &os);
-       if (error) {
+       error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+       if (error != 0) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }
@@ -672,7 +672,7 @@ zil_claim(const char *osname, void *txarg)
                        zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
                BP_ZERO(&zh->zh_log);
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
-               dmu_objset_rele(os, FTAG);
+               dmu_objset_disown(os, FTAG);
                return (0);
        }
 
@@ -697,7 +697,7 @@ zil_claim(const char *osname, void *txarg)
        }
 
        ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
-       dmu_objset_rele(os, FTAG);
+       dmu_objset_disown(os, FTAG);
        return (0);
 }
 
@@ -717,7 +717,7 @@ zil_check_log_chain(const char *osname, void *tx)
        ASSERT(tx == NULL);
 
        error = dmu_objset_hold(osname, FTAG, &os);
-       if (error) {
+       if (error != 0) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }
@@ -1014,7 +1014,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 
        BP_ZERO(bp);
        use_slog = USE_SLOG(zilog);
-       error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog));
+       error = zio_alloc_zil(spa, txg, bp, zil_blksz,
+           USE_SLOG(zilog));
        if (use_slog)
        {
                ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
@@ -1025,7 +1026,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
                ZIL_STAT_BUMP(zil_itx_metaslab_normal_count);
                ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused);
        }
-       if (!error) {
+       if (error == 0) {
                ASSERT3U(bp->blk_birth, ==, txg);
                bp->blk_cksum = lwb->lwb_blk.blk_cksum;
                bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
@@ -1145,7 +1146,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                                txg_wait_synced(zilog->zl_dmu_pool, txg);
                                return (lwb);
                        }
-                       if (error) {
+                       if (error != 0) {
                                ASSERT(error == ENOENT || error == EEXIST ||
                                    error == EALREADY);
                                return (lwb);
@@ -1807,6 +1808,9 @@ zil_free(zilog_t *zilog)
 
        zilog->zl_stop_sync = 1;
 
+       ASSERT0(zilog->zl_suspend);
+       ASSERT0(zilog->zl_suspending);
+
        ASSERT(list_is_empty(&zilog->zl_lwb_list));
        list_destroy(&zilog->zl_lwb_list);
 
@@ -1905,32 +1909,100 @@ zil_close(zilog_t *zilog)
        mutex_exit(&zilog->zl_lock);
 }
 
+static char *suspend_tag = "zil suspending";
+
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
- * We suspend the log briefly when taking a snapshot so that the snapshot
- * contains all the data it's supposed to, and has an empty intent log.
+ * On old version pools, we suspend the log briefly when taking a
+ * snapshot so that it will have an empty intent log.
+ *
+ * Long holds are not really intended to be used the way we do here --
+ * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
+ * could fail.  Therefore we take pains to only put a long hold if it is
+ * actually necessary.  Fortunately, it will only be necessary if the
+ * objset is currently mounted (or the ZVOL equivalent).  In that case it
+ * will already have a long hold, so we are not really making things any worse.
+ *
+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
+ * zvol_state_t), and use their mechanism to prevent their hold from being
+ * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
+ * very little gain.
+ *
+ * If cookiep == NULL, this does both the suspend & resume.
+ * Otherwise, it returns with the dataset "long held", and the cookie
+ * should be passed into zil_resume().
  */
 int
-zil_suspend(zilog_t *zilog)
+zil_suspend(const char *osname, void **cookiep)
 {
-       const zil_header_t *zh = zilog->zl_header;
+       objset_t *os;
+       zilog_t *zilog;
+       const zil_header_t *zh;
+       int error;
+
+       error = dmu_objset_hold(osname, suspend_tag, &os);
+       if (error != 0)
+               return (error);
+       zilog = dmu_objset_zil(os);
 
        mutex_enter(&zilog->zl_lock);
+       zh = zilog->zl_header;
+
        if (zh->zh_flags & ZIL_REPLAY_NEEDED) {         /* unplayed log */
                mutex_exit(&zilog->zl_lock);
+               dmu_objset_rele(os, suspend_tag);
                return (EBUSY);
        }
-       if (zilog->zl_suspend++ != 0) {
+
+       /*
+        * Don't put a long hold in the cases where we can avoid it.  This
+        * is when there is no cookie so we are doing a suspend & resume
+        * (i.e. called from zil_vdev_offline()), and there's nothing to do
+        * for the suspend because it's already suspended, or there's no ZIL.
+        */
+       if (cookiep == NULL && !zilog->zl_suspending &&
+           (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
+               mutex_exit(&zilog->zl_lock);
+               dmu_objset_rele(os, suspend_tag);
+               return (0);
+       }
+
+       dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
+       dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
+
+       zilog->zl_suspend++;
+
+       if (zilog->zl_suspend > 1) {
                /*
-                * Someone else already began a suspend.
+                * Someone else is already suspending it.
                 * Just wait for them to finish.
                 */
+
                while (zilog->zl_suspending)
                        cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
                mutex_exit(&zilog->zl_lock);
+
+               if (cookiep == NULL)
+                       zil_resume(os);
+               else
+                       *cookiep = os;
+               return (0);
+       }
+
+       /*
+        * If there is no pointer to an on-disk block, this ZIL must not
+        * be active (e.g. filesystem not mounted), so there's nothing
+        * to clean up.
+        */
+       if (BP_IS_HOLE(&zh->zh_log)) {
+               ASSERT(cookiep != NULL); /* fast path already handled */
+
+               *cookiep = os;
+               mutex_exit(&zilog->zl_lock);
                return (0);
        }
+
        zilog->zl_suspending = B_TRUE;
        mutex_exit(&zilog->zl_lock);
 
@@ -1943,16 +2015,25 @@ zil_suspend(zilog_t *zilog)
        cv_broadcast(&zilog->zl_cv_suspend);
        mutex_exit(&zilog->zl_lock);
 
+       if (cookiep == NULL)
+               zil_resume(os);
+       else
+               *cookiep = os;
        return (0);
 }
 
 void
-zil_resume(zilog_t *zilog)
+zil_resume(void *cookie)
 {
+       objset_t *os = cookie;
+       zilog_t *zilog = dmu_objset_zil(os);
+
        mutex_enter(&zilog->zl_lock);
        ASSERT(zilog->zl_suspend != 0);
        zilog->zl_suspend--;
        mutex_exit(&zilog->zl_lock);
+       dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+       dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 typedef struct zil_replay_arg {
@@ -2025,7 +2106,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
        if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
                error = zil_read_log_data(zilog, (lr_write_t *)lr,
                    zr->zr_lr + reclen);
-               if (error)
+               if (error != 0)
                        return (zil_replay_error(zilog, lr, error));
        }
 
@@ -2046,7 +2127,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
         * is updated if we are in replay mode.
         */
        error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
-       if (error) {
+       if (error != 0) {
                /*
                 * The DMU's dnode layer doesn't see removes until the txg
                 * commits, so a subsequent claim can spuriously fail with
@@ -2056,7 +2137,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
                 */
                txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
                error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
-               if (error)
+               if (error != 0)
                        return (zil_replay_error(zilog, lr, error));
        }
        return (0);
@@ -2128,21 +2209,12 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 int
 zil_vdev_offline(const char *osname, void *arg)
 {
-       objset_t *os;
-       zilog_t *zilog;
        int error;
 
-       error = dmu_objset_hold(osname, FTAG, &os);
-       if (error)
-               return (error);
-
-       zilog = dmu_objset_zil(os);
-       if (zil_suspend(zilog) != 0)
-               error = EEXIST;
-       else
-               zil_resume(zilog);
-       dmu_objset_rele(os, FTAG);
-       return (error);
+       error = zil_suspend(osname, NULL);
+       if (error != 0)
+               return (EEXIST);
+       return (0);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
index ccefaf8ac1cb82eb541bac20c60c1a07dc5d22e3..0e2b463ac9e2309f1c8a4d71d23aef5e8c6f35ea 100644 (file)
@@ -767,6 +767,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
+       metaslab_check_free(spa, bp);
        bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 }
 
@@ -785,6 +786,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 
        arc_freed(spa, bp);
 
+       metaslab_check_free(spa, bp);
+
        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
            NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
            NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -2060,7 +2063,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
                                    bcmp(abuf->b_data, zio->io_orig_data,
                                    zio->io_orig_size) != 0)
                                        error = EEXIST;
-                               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+                               VERIFY(arc_buf_remove_ref(abuf, &abuf));
                        }
 
                        ddt_enter(ddt);
@@ -2656,8 +2659,9 @@ zio_vdev_io_assess(zio_t *zio)
         * set vdev_cant_write so that we stop trying to allocate from it.
         */
        if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
-           vd != NULL && !vd->vdev_ops->vdev_op_leaf)
+           vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
                vd->vdev_cant_write = B_TRUE;
+       }
 
        if (zio->io_error)
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
index b516156372448a286f8e68c761314048d8334ac4..f52d8bbc142a001fb7ee34f82dfaa1904978d142 100644 (file)
@@ -315,6 +315,13 @@ zvol_set_volsize(const char *name, uint64_t volsize)
        uint64_t readonly;
        int error;
 
+       error = dsl_prop_get_integer(name,
+           zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+       if (error != 0)
+               return (error);
+       if (readonly)
+               return (EROFS);
+
        mutex_enter(&zvol_state_lock);
 
        zv = zvol_find_by_name(name);
@@ -1459,8 +1466,7 @@ zvol_remove_minor(const char *name)
 }
 
 static int
-zvol_create_minors_cb(spa_t *spa, uint64_t dsobj,
-                     const char *dsname, void *arg)
+zvol_create_minors_cb(const char *dsname, void *arg)
 {
        if (strchr(dsname, '/') == NULL)
                return 0;
@@ -1474,7 +1480,7 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj,
  * for all available pools.
  */
 int
-zvol_create_minors(const char *pool)
+zvol_create_minors(char *pool)
 {
        spa_t *spa = NULL;
        int error = 0;
@@ -1484,13 +1490,12 @@ zvol_create_minors(const char *pool)
 
        mutex_enter(&zvol_state_lock);
        if (pool) {
-               error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb,
+               error = dmu_objset_find(pool, zvol_create_minors_cb,
                    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
        } else {
                mutex_enter(&spa_namespace_lock);
                while ((spa = spa_next(spa)) != NULL) {
-                       error = dmu_objset_find_spa(NULL,
-                           spa_name(spa), zvol_create_minors_cb, NULL,
+                       error = dmu_objset_find(spa_name(spa), zvol_create_minors_cb, NULL,
                            DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
                        if (error)
                                break;
index 53cc77bd9762445f3029bc7ba7c5628e43c8503d..be56b771d34c3c8045ccce645fcf1ecb3ac2d30d 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
+#include <sys/dsl_destroy.h>
 #include <linux/cdev.h>
 #include "zpios-internal.h"
 
@@ -224,9 +225,9 @@ zpios_dmu_setup(run_args_t *run_args)
        run_args->os = os;
 out_destroy:
        if (rc) {
-               rc2 = dmu_objset_destroy(name, B_FALSE);
+               rc2 = dsl_destroy_head(name);
                if (rc2)
-                       zpios_print(run_args->file, "Error dmu_objset_destroy"
+                       zpios_print(run_args->file, "Error dsl_destroy_head"
                                    "(%s, ...) failed: %d\n", name, rc2);
        }
 out:
@@ -395,9 +396,9 @@ zpios_remove_objset(run_args_t *run_args)
        dmu_objset_disown(run_args->os, zpios_tag);
 
        if (run_args->flags & DMU_REMOVE) {
-               rc = dmu_objset_destroy(name, B_FALSE);
+               rc = dsl_destroy_head(name);
                if (rc)
-                       zpios_print(run_args->file, "Error dmu_objset_destroy"
+                       zpios_print(run_args->file, "Error dsl_destroy_head"
                                    "(%s, ...) failed: %d\n", name, rc);
        }