module/zfs/dsl_dataset.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 #include <sys/dmu_objset.h>
  26 #include <sys/dsl_dataset.h>
  27 #include <sys/dsl_dir.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_synctask.h>
  30 #include <sys/dmu_traverse.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/arc.h>
  33 #include <sys/zio.h>
  34 #include <sys/zap.h>
  35 #include <sys/unique.h>
  36 #include <sys/zfs_context.h>
  37 #include <sys/zfs_ioctl.h>
  38 #include <sys/spa.h>
  39 #include <sys/zfs_znode.h>
  40 #include <sys/zfs_onexit.h>
  41 #include <sys/zvol.h>
  42 #include <sys/dsl_scan.h>
  43 #include <sys/dsl_deadlist.h>
  44
/*
 * Sentinel owner value: DSL_DATASET_IS_DESTROYED() compares ds_owner
 * against this pointer, so only its address matters, not its contents.
 */
static char *dsl_reaper = "the grim reaper";

/* Forward declarations of static check/sync callbacks. */
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  50
  51 #define SWITCH64(x, y) \
  52         { \
  53                 uint64_t __tmp = (x); \
  54                 (x) = (y); \
  55                 (y) = __tmp; \
  56         }
  57
/* NOTE(review): not referenced in the visible part of this file. */
#define DS_REF_MAX      (1ULL << 62)

/* NOTE(review): not referenced in the visible part of this file. */
#define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE

/* True once the dataset's owner has been set to the dsl_reaper sentinel. */
#define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
  63
  64
  65 /*
  66  * Figure out how much of this delta should be propogated to the dsl_dir
  67  * layer.  If there's a refreservation, that space has already been
  68  * partially accounted for in our ancestors.
  69  */
  70 static int64_t
  71 parent_delta(dsl_dataset_t *ds, int64_t delta)
  72 {
  73         uint64_t old_bytes, new_bytes;
  74
  75         if (ds->ds_reserved == 0)
  76                 return (delta);
  77
  78         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
  79         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
  80
  81         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  82         return (new_bytes - old_bytes);
  83 }
  84
  85 void
  86 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  87 {
  88         int used, compressed, uncompressed;
  89         int64_t delta;
  90
  91         used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  92         compressed = BP_GET_PSIZE(bp);
  93         uncompressed = BP_GET_UCSIZE(bp);
  94
  95         dprintf_bp(bp, "ds=%p", ds);
  96
  97         ASSERT(dmu_tx_is_syncing(tx));
  98         /* It could have been compressed away to nothing */
  99         if (BP_IS_HOLE(bp))
 100                 return;
 101         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 102         ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
 103         if (ds == NULL) {
 104                 /*
 105                  * Account for the meta-objset space in its placeholder
 106                  * dsl_dir.
 107                  */
 108                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 109                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 110                     used, compressed, uncompressed, tx);
 111                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 112                 return;
 113         }
 114         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 115
 116         mutex_enter(&ds->ds_dir->dd_lock);
 117         mutex_enter(&ds->ds_lock);
 118         delta = parent_delta(ds, used);
 119         ds->ds_phys->ds_used_bytes += used;
 120         ds->ds_phys->ds_compressed_bytes += compressed;
 121         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 122         ds->ds_phys->ds_unique_bytes += used;
 123         mutex_exit(&ds->ds_lock);
 124         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 125             compressed, uncompressed, tx);
 126         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 127             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 128         mutex_exit(&ds->ds_dir->dd_lock);
 129 }
 130
 131 int
 132 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 133     boolean_t async)
 134 {
 135         int used, compressed, uncompressed;
 136
 137         if (BP_IS_HOLE(bp))
 138                 return (0);
 139
 140         ASSERT(dmu_tx_is_syncing(tx));
 141         ASSERT(bp->blk_birth <= tx->tx_txg);
 142
 143         used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 144         compressed = BP_GET_PSIZE(bp);
 145         uncompressed = BP_GET_UCSIZE(bp);
 146
 147         ASSERT(used > 0);
 148         if (ds == NULL) {
 149                 /*
 150                  * Account for the meta-objset space in its placeholder
 151                  * dataset.
 152                  */
 153                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 154
 155                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 156                     -used, -compressed, -uncompressed, tx);
 157                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 158                 return (used);
 159         }
 160         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 161
 162         ASSERT(!dsl_dataset_is_snapshot(ds));
 163         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 164
 165         if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 166                 int64_t delta;
 167
 168                 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 169                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 170
 171                 mutex_enter(&ds->ds_dir->dd_lock);
 172                 mutex_enter(&ds->ds_lock);
 173                 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
 174                     !DS_UNIQUE_IS_ACCURATE(ds));
 175                 delta = parent_delta(ds, -used);
 176                 ds->ds_phys->ds_unique_bytes -= used;
 177                 mutex_exit(&ds->ds_lock);
 178                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 179                     delta, -compressed, -uncompressed, tx);
 180                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 181                     DD_USED_REFRSRV, DD_USED_HEAD, tx);
 182                 mutex_exit(&ds->ds_dir->dd_lock);
 183         } else {
 184                 dprintf_bp(bp, "putting on dead list: %s", "");
 185                 if (async) {
 186                         /*
 187                          * We are here as part of zio's write done callback,
 188                          * which means we're a zio interrupt thread.  We can't
 189                          * call dsl_deadlist_insert() now because it may block
 190                          * waiting for I/O.  Instead, put bp on the deferred
 191                          * queue and let dsl_pool_sync() finish the job.
 192                          */
 193                         bplist_append(&ds->ds_pending_deadlist, bp);
 194                 } else {
 195                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 196                 }
 197                 ASSERT3U(ds->ds_prev->ds_object, ==,
 198                     ds->ds_phys->ds_prev_snap_obj);
 199                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 200                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 201                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 202                     ds->ds_object && bp->blk_birth >
 203                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 204                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 205                         mutex_enter(&ds->ds_prev->ds_lock);
 206                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
 207                         mutex_exit(&ds->ds_prev->ds_lock);
 208                 }
 209                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 210                         dsl_dir_transfer_space(ds->ds_dir, used,
 211                             DD_USED_HEAD, DD_USED_SNAP, tx);
 212                 }
 213         }
 214         mutex_enter(&ds->ds_lock);
 215         ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 216         ds->ds_phys->ds_used_bytes -= used;
 217         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 218         ds->ds_phys->ds_compressed_bytes -= compressed;
 219         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 220         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 221         mutex_exit(&ds->ds_lock);
 222
 223         return (used);
 224 }
 225
 226 uint64_t
 227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 228 {
 229         uint64_t trysnap = 0;
 230
 231         if (ds == NULL)
 232                 return (0);
 233         /*
 234          * The snapshot creation could fail, but that would cause an
 235          * incorrect FALSE return, which would only result in an
 236          * overestimation of the amount of space that an operation would
 237          * consume, which is OK.
 238          *
 239          * There's also a small window where we could miss a pending
 240          * snapshot, because we could set the sync task in the quiescing
 241          * phase.  So this should only be used as a guess.
 242          */
 243         if (ds->ds_trysnap_txg >
 244             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 245                 trysnap = ds->ds_trysnap_txg;
 246         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 247 }
 248
 249 boolean_t
 250 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
 251     uint64_t blk_birth)
 252 {
 253         if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
 254                 return (B_FALSE);
 255
 256         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 257
 258         return (B_TRUE);
 259 }
 260
 261 /* ARGSUSED */
 262 static void
 263 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 264 {
 265         dsl_dataset_t *ds = dsv;
 266
 267         ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 268
 269         unique_remove(ds->ds_fsid_guid);
 270
 271         if (ds->ds_objset != NULL)
 272                 dmu_objset_evict(ds->ds_objset);
 273
 274         if (ds->ds_prev) {
 275                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 276                 ds->ds_prev = NULL;
 277         }
 278
 279         bplist_destroy(&ds->ds_pending_deadlist);
 280         if (db != NULL) {
 281                 dsl_deadlist_close(&ds->ds_deadlist);
 282         } else {
 283                 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
 284                 ASSERT(!ds->ds_deadlist.dl_oldfmt);
 285         }
 286         if (ds->ds_dir)
 287                 dsl_dir_close(ds->ds_dir, ds);
 288
 289         ASSERT(!list_link_active(&ds->ds_synced_link));
 290
 291         mutex_destroy(&ds->ds_lock);
 292         mutex_destroy(&ds->ds_recvlock);
 293         mutex_destroy(&ds->ds_opening_lock);
 294         rw_destroy(&ds->ds_rwlock);
 295         cv_destroy(&ds->ds_exclusive_cv);
 296
 297         kmem_free(ds, sizeof (dsl_dataset_t));
 298 }
 299
 300 static int
 301 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 302 {
 303         dsl_dataset_phys_t *headphys;
 304         int err;
 305         dmu_buf_t *headdbuf;
 306         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 307         objset_t *mos = dp->dp_meta_objset;
 308
 309         if (ds->ds_snapname[0])
 310                 return (0);
 311         if (ds->ds_phys->ds_next_snap_obj == 0)
 312                 return (0);
 313
 314         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 315             FTAG, &headdbuf);
 316         if (err)
 317                 return (err);
 318         headphys = headdbuf->db_data;
 319         err = zap_value_search(dp->dp_meta_objset,
 320             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 321         dmu_buf_rele(headdbuf, FTAG);
 322         return (err);
 323 }
 324
 325 static int
 326 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 327 {
 328         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 329         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 330         matchtype_t mt;
 331         int err;
 332
 333         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 334                 mt = MT_FIRST;
 335         else
 336                 mt = MT_EXACT;
 337
 338         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 339             value, mt, NULL, 0, NULL);
 340         if (err == ENOTSUP && mt == MT_FIRST)
 341                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
 342         return (err);
 343 }
 344
 345 static int
 346 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
 347 {
 348         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 349         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 350         matchtype_t mt;
 351         int err;
 352
 353         dsl_dir_snap_cmtime_update(ds->ds_dir);
 354
 355         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 356                 mt = MT_FIRST;
 357         else
 358                 mt = MT_EXACT;
 359
 360         err = zap_remove_norm(mos, snapobj, name, mt, tx);
 361         if (err == ENOTSUP && mt == MT_FIRST)
 362                 err = zap_remove(mos, snapobj, name, tx);
 363         return (err);
 364 }
 365
 366 static int
 367 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 368     dsl_dataset_t **dsp)
 369 {
 370         objset_t *mos = dp->dp_meta_objset;
 371         dmu_buf_t *dbuf;
 372         dsl_dataset_t *ds;
 373         int err;
 374         dmu_object_info_t doi;
 375
 376         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 377             dsl_pool_sync_context(dp));
 378
 379         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 380         if (err)
 381                 return (err);
 382
 383         /* Make sure dsobj has the correct object type. */
 384         dmu_object_info_from_db(dbuf, &doi);
 385         if (doi.doi_type != DMU_OT_DSL_DATASET)
 386                 return (EINVAL);
 387
 388         ds = dmu_buf_get_user(dbuf);
 389         if (ds == NULL) {
 390                 dsl_dataset_t *winner;
 391
 392                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 393                 ds->ds_dbuf = dbuf;
 394                 ds->ds_object = dsobj;
 395                 ds->ds_phys = dbuf->db_data;
 396
 397                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 398                 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 399                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 400                 rw_init(&ds->ds_rwlock, 0, 0, 0);
 401                 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 402
 403                 bplist_create(&ds->ds_pending_deadlist);
 404                 dsl_deadlist_open(&ds->ds_deadlist,
 405                     mos, ds->ds_phys->ds_deadlist_obj);
 406
 407                 if (err == 0) {
 408                         err = dsl_dir_open_obj(dp,
 409                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 410                 }
 411                 if (err) {
 412                         mutex_destroy(&ds->ds_lock);
 413                         mutex_destroy(&ds->ds_recvlock);
 414                         mutex_destroy(&ds->ds_opening_lock);
 415                         rw_destroy(&ds->ds_rwlock);
 416                         cv_destroy(&ds->ds_exclusive_cv);
 417                         bplist_destroy(&ds->ds_pending_deadlist);
 418                         dsl_deadlist_close(&ds->ds_deadlist);
 419                         kmem_free(ds, sizeof (dsl_dataset_t));
 420                         dmu_buf_rele(dbuf, tag);
 421                         return (err);
 422                 }
 423
 424                 if (!dsl_dataset_is_snapshot(ds)) {
 425                         ds->ds_snapname[0] = '\0';
 426                         if (ds->ds_phys->ds_prev_snap_obj) {
 427                                 err = dsl_dataset_get_ref(dp,
 428                                     ds->ds_phys->ds_prev_snap_obj,
 429                                     ds, &ds->ds_prev);
 430                         }
 431                 } else {
 432                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 433                                 err = dsl_dataset_get_snapname(ds);
 434                         if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
 435                                 err = zap_count(
 436                                     ds->ds_dir->dd_pool->dp_meta_objset,
 437                                     ds->ds_phys->ds_userrefs_obj,
 438                                     &ds->ds_userrefs);
 439                         }
 440                 }
 441
 442                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 443                         /*
 444                          * In sync context, we're called with either no lock
 445                          * or with the write lock.  If we're not syncing,
 446                          * we're always called with the read lock held.
 447                          */
 448                         boolean_t need_lock =
 449                             !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
 450                             dsl_pool_sync_context(dp);
 451
 452                         if (need_lock)
 453                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 454
 455                         err = dsl_prop_get_ds(ds,
 456                             "refreservation", sizeof (uint64_t), 1,
 457                             &ds->ds_reserved, NULL);
 458                         if (err == 0) {
 459                                 err = dsl_prop_get_ds(ds,
 460                                     "refquota", sizeof (uint64_t), 1,
 461                                     &ds->ds_quota, NULL);
 462                         }
 463
 464                         if (need_lock)
 465                                 rw_exit(&dp->dp_config_rwlock);
 466                 } else {
 467                         ds->ds_reserved = ds->ds_quota = 0;
 468                 }
 469
 470                 if (err == 0) {
 471                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 472                             dsl_dataset_evict);
 473                 }
 474                 if (err || winner) {
 475                         bplist_destroy(&ds->ds_pending_deadlist);
 476                         dsl_deadlist_close(&ds->ds_deadlist);
 477                         if (ds->ds_prev)
 478                                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 479                         dsl_dir_close(ds->ds_dir, ds);
 480                         mutex_destroy(&ds->ds_lock);
 481                         mutex_destroy(&ds->ds_recvlock);
 482                         mutex_destroy(&ds->ds_opening_lock);
 483                         rw_destroy(&ds->ds_rwlock);
 484                         cv_destroy(&ds->ds_exclusive_cv);
 485                         kmem_free(ds, sizeof (dsl_dataset_t));
 486                         if (err) {
 487                                 dmu_buf_rele(dbuf, tag);
 488                                 return (err);
 489                         }
 490                         ds = winner;
 491                 } else {
 492                         ds->ds_fsid_guid =
 493                             unique_insert(ds->ds_phys->ds_fsid_guid);
 494                 }
 495         }
 496         ASSERT3P(ds->ds_dbuf, ==, dbuf);
 497         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 498         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 499             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 500             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 501         mutex_enter(&ds->ds_lock);
 502         if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 503                 mutex_exit(&ds->ds_lock);
 504                 dmu_buf_rele(ds->ds_dbuf, tag);
 505                 return (ENOENT);
 506         }
 507         mutex_exit(&ds->ds_lock);
 508         *dsp = ds;
 509         return (0);
 510 }
 511
 512 static int
 513 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
 514 {
 515         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 516
 517         /*
 518          * In syncing context we don't want the rwlock lock: there
 519          * may be an existing writer waiting for sync phase to
 520          * finish.  We don't need to worry about such writers, since
 521          * sync phase is single-threaded, so the writer can't be
 522          * doing anything while we are active.
 523          */
 524         if (dsl_pool_sync_context(dp)) {
 525                 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 526                 return (0);
 527         }
 528
 529         /*
 530          * Normal users will hold the ds_rwlock as a READER until they
 531          * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
 532          * drop their READER lock after they set the ds_owner field.
 533          *
 534          * If the dataset is being destroyed, the destroy thread will
 535          * obtain a WRITER lock for exclusive access after it's done its
 536          * open-context work and then change the ds_owner to
 537          * dsl_reaper once destruction is assured.  So threads
 538          * may block here temporarily, until the "destructability" of
 539          * the dataset is determined.
 540          */
 541         ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
 542         mutex_enter(&ds->ds_lock);
 543         while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
 544                 rw_exit(&dp->dp_config_rwlock);
 545                 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
 546                 if (DSL_DATASET_IS_DESTROYED(ds)) {
 547                         mutex_exit(&ds->ds_lock);
 548                         dsl_dataset_drop_ref(ds, tag);
 549                         rw_enter(&dp->dp_config_rwlock, RW_READER);
 550                         return (ENOENT);
 551                 }
 552                 /*
 553                  * The dp_config_rwlock lives above the ds_lock. And
 554                  * we need to check DSL_DATASET_IS_DESTROYED() while
 555                  * holding the ds_lock, so we have to drop and reacquire
 556                  * the ds_lock here.
 557                  */
 558                 mutex_exit(&ds->ds_lock);
 559                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 560                 mutex_enter(&ds->ds_lock);
 561         }
 562         mutex_exit(&ds->ds_lock);
 563         return (0);
 564 }
 565
 566 int
 567 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 568     dsl_dataset_t **dsp)
 569 {
 570         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
 571
 572         if (err)
 573                 return (err);
 574         return (dsl_dataset_hold_ref(*dsp, tag));
 575 }
 576
 577 int
 578 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
 579     void *tag, dsl_dataset_t **dsp)
 580 {
 581         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 582         if (err)
 583                 return (err);
 584         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 585                 dsl_dataset_rele(*dsp, tag);
 586                 *dsp = NULL;
 587                 return (EBUSY);
 588         }
 589         return (0);
 590 }
 591
 592 int
 593 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
 594 {
 595         dsl_dir_t *dd;
 596         dsl_pool_t *dp;
 597         const char *snapname;
 598         uint64_t obj;
 599         int err = 0;
 600
 601         err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
 602         if (err)
 603                 return (err);
 604
 605         dp = dd->dd_pool;
 606         obj = dd->dd_phys->dd_head_dataset_obj;
 607         rw_enter(&dp->dp_config_rwlock, RW_READER);
 608         if (obj)
 609                 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
 610         else
 611                 err = ENOENT;
 612         if (err)
 613                 goto out;
 614
 615         err = dsl_dataset_hold_ref(*dsp, tag);
 616
 617         /* we may be looking for a snapshot */
 618         if (err == 0 && snapname != NULL) {
 619                 dsl_dataset_t *ds = NULL;
 620
 621                 if (*snapname++ != '@') {
 622                         dsl_dataset_rele(*dsp, tag);
 623                         err = ENOENT;
 624                         goto out;
 625                 }
 626
 627                 dprintf("looking for snapshot '%s'\n", snapname);
 628                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
 629                 if (err == 0)
 630                         err = dsl_dataset_get_ref(dp, obj, tag, &ds);
 631                 dsl_dataset_rele(*dsp, tag);
 632
 633                 ASSERT3U((err == 0), ==, (ds != NULL));
 634
 635                 if (ds) {
 636                         mutex_enter(&ds->ds_lock);
 637                         if (ds->ds_snapname[0] == 0)
 638                                 (void) strlcpy(ds->ds_snapname, snapname,
 639                                     sizeof (ds->ds_snapname));
 640                         mutex_exit(&ds->ds_lock);
 641                         err = dsl_dataset_hold_ref(ds, tag);
 642                         *dsp = err ? NULL : ds;
 643                 }
 644         }
 645 out:
 646         rw_exit(&dp->dp_config_rwlock);
 647         dsl_dir_close(dd, FTAG);
 648         return (err);
 649 }
 650
 651 int
 652 dsl_dataset_own(const char *name, boolean_t inconsistentok,
 653     void *tag, dsl_dataset_t **dsp)
 654 {
 655         int err = dsl_dataset_hold(name, tag, dsp);
 656         if (err)
 657                 return (err);
 658         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 659                 dsl_dataset_rele(*dsp, tag);
 660                 return (EBUSY);
 661         }
 662         return (0);
 663 }
 664
 665 void
 666 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 667 {
 668         if (ds == NULL) {
 669                 (void) strcpy(name, "mos");
 670         } else {
 671                 dsl_dir_name(ds->ds_dir, name);
 672                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 673                 if (ds->ds_snapname[0]) {
 674                         (void) strcat(name, "@");
 675                         /*
 676                          * We use a "recursive" mutex so that we
 677                          * can call dprintf_ds() with ds_lock held.
 678                          */
 679                         if (!MUTEX_HELD(&ds->ds_lock)) {
 680                                 mutex_enter(&ds->ds_lock);
 681                                 (void) strcat(name, ds->ds_snapname);
 682                                 mutex_exit(&ds->ds_lock);
 683                         } else {
 684                                 (void) strcat(name, ds->ds_snapname);
 685                         }
 686                 }
 687         }
 688 }
 689
 690 static int
 691 dsl_dataset_namelen(dsl_dataset_t *ds)
 692 {
 693         int result;
 694
 695         if (ds == NULL) {
 696                 result = 3;     /* "mos" */
 697         } else {
 698                 result = dsl_dir_namelen(ds->ds_dir);
 699                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 700                 if (ds->ds_snapname[0]) {
 701                         ++result;       /* adding one for the @-sign */
 702                         if (!MUTEX_HELD(&ds->ds_lock)) {
 703                                 mutex_enter(&ds->ds_lock);
 704                                 result += strlen(ds->ds_snapname);
 705                                 mutex_exit(&ds->ds_lock);
 706                         } else {
 707                                 result += strlen(ds->ds_snapname);
 708                         }
 709                 }
 710         }
 711
 712         return (result);
 713 }
 714
 715 void
 716 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
 717 {
 718         dmu_buf_rele(ds->ds_dbuf, tag);
 719 }
 720
 721 void
 722 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 723 {
 724         if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 725                 rw_exit(&ds->ds_rwlock);
 726         }
 727         dsl_dataset_drop_ref(ds, tag);
 728 }
 729
 730 void
 731 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 732 {
 733         ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
 734             (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 735
 736         mutex_enter(&ds->ds_lock);
 737         ds->ds_owner = NULL;
 738         if (RW_WRITE_HELD(&ds->ds_rwlock)) {
 739                 rw_exit(&ds->ds_rwlock);
 740                 cv_broadcast(&ds->ds_exclusive_cv);
 741         }
 742         mutex_exit(&ds->ds_lock);
 743         if (ds->ds_dbuf)
 744                 dsl_dataset_drop_ref(ds, tag);
 745         else
 746                 dsl_dataset_evict(NULL, ds);
 747 }
 748
 749 boolean_t
 750 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 751 {
 752         boolean_t gotit = FALSE;
 753
 754         mutex_enter(&ds->ds_lock);
 755         if (ds->ds_owner == NULL &&
 756             (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
 757                 ds->ds_owner = tag;
 758                 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
 759                         rw_exit(&ds->ds_rwlock);
 760                 gotit = TRUE;
 761         }
 762         mutex_exit(&ds->ds_lock);
 763         return (gotit);
 764 }
 765
 766 void
 767 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
 768 {
 769         ASSERT3P(owner, ==, ds->ds_owner);
 770         if (!RW_WRITE_HELD(&ds->ds_rwlock))
 771                 rw_enter(&ds->ds_rwlock, RW_WRITER);
 772 }
 773
 774 uint64_t
 775 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 776     uint64_t flags, dmu_tx_t *tx)
 777 {
 778         dsl_pool_t *dp = dd->dd_pool;
 779         dmu_buf_t *dbuf;
 780         dsl_dataset_phys_t *dsphys;
 781         uint64_t dsobj;
 782         objset_t *mos = dp->dp_meta_objset;
 783
 784         if (origin == NULL)
 785                 origin = dp->dp_origin_snap;
 786
 787         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 788         ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 789         ASSERT(dmu_tx_is_syncing(tx));
 790         ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 791
 792         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 793             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 794         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 795         dmu_buf_will_dirty(dbuf, tx);
 796         dsphys = dbuf->db_data;
 797         bzero(dsphys, sizeof (dsl_dataset_phys_t));
 798         dsphys->ds_dir_obj = dd->dd_object;
 799         dsphys->ds_flags = flags;
 800         dsphys->ds_fsid_guid = unique_create();
 801         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 802             sizeof (dsphys->ds_guid));
 803         dsphys->ds_snapnames_zapobj =
 804             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 805             DMU_OT_NONE, 0, tx);
 806         dsphys->ds_creation_time = gethrestime_sec();
 807         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 808
 809         if (origin == NULL) {
 810                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 811         } else {
 812                 dsl_dataset_t *ohds;
 813
 814                 dsphys->ds_prev_snap_obj = origin->ds_object;
 815                 dsphys->ds_prev_snap_txg =
 816                     origin->ds_phys->ds_creation_txg;
 817                 dsphys->ds_used_bytes =
 818                     origin->ds_phys->ds_used_bytes;
 819                 dsphys->ds_compressed_bytes =
 820                     origin->ds_phys->ds_compressed_bytes;
 821                 dsphys->ds_uncompressed_bytes =
 822                     origin->ds_phys->ds_uncompressed_bytes;
 823                 dsphys->ds_bp = origin->ds_phys->ds_bp;
 824                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
 825
 826                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
 827                 origin->ds_phys->ds_num_children++;
 828
 829                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 830                     origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 831                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 832                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 833                 dsl_dataset_rele(ohds, FTAG);
 834
 835                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 836                         if (origin->ds_phys->ds_next_clones_obj == 0) {
 837                                 origin->ds_phys->ds_next_clones_obj =
 838                                     zap_create(mos,
 839                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 840                         }
 841                         VERIFY(0 == zap_add_int(mos,
 842                             origin->ds_phys->ds_next_clones_obj,
 843                             dsobj, tx));
 844                 }
 845
 846                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
 847                 dd->dd_phys->dd_origin_obj = origin->ds_object;
 848                 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 849                         if (origin->ds_dir->dd_phys->dd_clones == 0) {
 850                                 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 851                                 origin->ds_dir->dd_phys->dd_clones =
 852                                     zap_create(mos,
 853                                     DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 854                         }
 855                         VERIFY3U(0, ==, zap_add_int(mos,
 856                             origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 857                 }
 858         }
 859
 860         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 861                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 862
 863         dmu_buf_rele(dbuf, FTAG);
 864
 865         dmu_buf_will_dirty(dd->dd_dbuf, tx);
 866         dd->dd_phys->dd_head_dataset_obj = dsobj;
 867
 868         return (dsobj);
 869 }
 870
 871 uint64_t
 872 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 873     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 874 {
 875         dsl_pool_t *dp = pdd->dd_pool;
 876         uint64_t dsobj, ddobj;
 877         dsl_dir_t *dd;
 878
 879         ASSERT(lastname[0] != '@');
 880
 881         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 882         VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 883
 884         dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 885
 886         dsl_deleg_set_create_perms(dd, tx, cr);
 887
 888         dsl_dir_close(dd, FTAG);
 889
 890         /*
 891          * If we are creating a clone, make sure we zero out any stale
 892          * data from the origin snapshots zil header.
 893          */
 894         if (origin != NULL) {
 895                 dsl_dataset_t *ds;
 896                 objset_t *os;
 897
 898                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 899                 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
 900                 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 901                 dsl_dataset_dirty(ds, tx);
 902                 dsl_dataset_rele(ds, FTAG);
 903         }
 904
 905         return (dsobj);
 906 }
 907
/*
 * Argument handed to dsl_snapshot_destroy_one() for every dataset visited
 * on behalf of dsl_snapshots_destroy().
 */
struct destroyarg {
        /* task group the destroy tasks are added to; also the owner tag */
        dsl_sync_task_group_t *dstg;
        /* snapshot name (the part after '@') to destroy in each dataset */
        char *snapname;
        /* buffer that receives the dataset name when owning fails */
        char *failed;
        /* copied into each dsl_ds_destroyarg's 'defer' field */
        boolean_t defer;
};
 914
 915 static int
 916 dsl_snapshot_destroy_one(const char *name, void *arg)
 917 {
 918         struct destroyarg *da = arg;
 919         dsl_dataset_t *ds;
 920         int err;
 921         char *dsname;
 922
 923         dsname = kmem_asprintf("%s@%s", name, da->snapname);
 924         err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
 925         strfree(dsname);
 926         if (err == 0) {
 927                 struct dsl_ds_destroyarg *dsda;
 928
 929                 dsl_dataset_make_exclusive(ds, da->dstg);
 930                 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
 931                 dsda->ds = ds;
 932                 dsda->defer = da->defer;
 933                 dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
 934                     dsl_dataset_destroy_sync, dsda, da->dstg, 0);
 935         } else if (err == ENOENT) {
 936                 err = 0;
 937         } else {
 938                 (void) strcpy(da->failed, name);
 939         }
 940         return (err);
 941 }
 942
 943 /*
 944  * Destroy 'snapname' in all descendants of 'fsname'.
 945  */
 946 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
 947 int
 948 dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
 949 {
 950         int err;
 951         struct destroyarg da;
 952         dsl_sync_task_t *dst;
 953         spa_t *spa;
 954
 955         err = spa_open(fsname, &spa, FTAG);
 956         if (err)
 957                 return (err);
 958         da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 959         da.snapname = snapname;
 960         da.failed = fsname;
 961         da.defer = defer;
 962
 963         err = dmu_objset_find(fsname,
 964             dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
 965
 966         if (err == 0)
 967                 err = dsl_sync_task_group_wait(da.dstg);
 968
 969         for (dst = list_head(&da.dstg->dstg_tasks); dst;
 970             dst = list_next(&da.dstg->dstg_tasks, dst)) {
 971                 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
 972                 dsl_dataset_t *ds = dsda->ds;
 973
 974                 /*
 975                  * Return the file system name that triggered the error
 976                  */
 977                 if (dst->dst_err) {
 978                         dsl_dataset_name(ds, fsname);
 979                         *strchr(fsname, '@') = '\0';
 980                 }
 981                 ASSERT3P(dsda->rm_origin, ==, NULL);
 982                 dsl_dataset_disown(ds, da.dstg);
 983                 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
 984         }
 985
 986         dsl_sync_task_group_destroy(da.dstg);
 987         spa_close(spa, FTAG);
 988         return (err);
 989 }
 990
 991 static boolean_t
 992 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
 993 {
 994         boolean_t might_destroy = B_FALSE;
 995
 996         mutex_enter(&ds->ds_lock);
 997         if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
 998             DS_IS_DEFER_DESTROY(ds))
 999                 might_destroy = B_TRUE;
1000         mutex_exit(&ds->ds_lock);
1001
1002         return (might_destroy);
1003 }
1004
1005 /*
1006  * If we're removing a clone, and these three conditions are true:
1007  *      1) the clone's origin has no other children
1008  *      2) the clone's origin has no user references
1009  *      3) the clone's origin has been marked for deferred destruction
1010  * Then, prepare to remove the origin as part of this sync task group.
1011  */
1012 static int
1013 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
1014 {
1015         dsl_dataset_t *ds = dsda->ds;
1016         dsl_dataset_t *origin = ds->ds_prev;
1017
1018         if (dsl_dataset_might_destroy_origin(origin)) {
1019                 char *name;
1020                 int namelen;
1021                 int error;
1022
1023                 namelen = dsl_dataset_namelen(origin) + 1;
1024                 name = kmem_alloc(namelen, KM_SLEEP);
1025                 dsl_dataset_name(origin, name);
1026 #ifdef _KERNEL
1027                 error = zfs_unmount_snap(name, NULL);
1028                 if (error) {
1029                         kmem_free(name, namelen);
1030                         return (error);
1031                 }
1032 #endif
1033                 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1034                 kmem_free(name, namelen);
1035                 if (error)
1036                         return (error);
1037                 dsda->rm_origin = origin;
1038                 dsl_dataset_make_exclusive(origin, tag);
1039         }
1040
1041         return (0);
1042 }
1043
/*
 * Destroy the dataset 'ds'.
 *
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 *
 * Snapshots are destroyed with a single sync task; 'defer' is passed
 * through to that task.  For a head dataset 'defer' is rejected with
 * EINVAL, and destruction proceeds in stages:
 *      1) a sync task checks for errors and marks the dataset inconsistent;
 *      2) its objects are freed in open context;
 *      3) the ZIL is destroyed and outstanding I/O is synced out;
 *      4) a sync task group destroys the dataset and its dsl_dir, and is
 *         retried for as long as the check function sets need_prep.
 *
 * Returns 0 on success or an errno value.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
        int err;
        dsl_sync_task_group_t *dstg;
        objset_t *os;
        dsl_dir_t *dd;
        uint64_t obj;
        struct dsl_ds_destroyarg dsda = { 0 };
        /*
         * Stand-in dataset handed to the dsl_dir destroy task; only its
         * ds_dir and ds_object fields are filled in below.
         * NOTE(review): presumably used because 'ds' itself may no longer
         * be usable once dsl_dataset_destroy_sync has run -- confirm.
         */
        dsl_dataset_t dummy_ds = { 0 };

        dsda.ds = ds;

        if (dsl_dataset_is_snapshot(ds)) {
                /* Destroying a snapshot is simpler */
                dsl_dataset_make_exclusive(ds, tag);

                dsda.defer = defer;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
                    &dsda, tag, 0);
                ASSERT3P(dsda.rm_origin, ==, NULL);
                goto out;
        } else if (defer) {
                /* Deferred destroy is not accepted for non-snapshots */
                err = EINVAL;
                goto out;
        }

        dd = ds->ds_dir;
        dummy_ds.ds_dir = dd;
        dummy_ds.ds_object = ds->ds_object;

        /*
         * Check for errors and mark this ds as inconsistent, in
         * case we crash while freeing the objects.
         */
        err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
            dsl_dataset_destroy_begin_sync, ds, NULL, 0);
        if (err)
                goto out;

        err = dmu_objset_from_ds(ds, &os);
        if (err)
                goto out;

        /*
         * remove the objects in open context, so that we won't
         * have too much to do in syncing context.
         *
         * The loop runs until dmu_object_next() returns non-zero; ESRCH
         * is the expected terminating value and anything else is
         * treated as a failure below.
         */
        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
            ds->ds_phys->ds_prev_snap_txg)) {
                /*
                 * Ignore errors, if there is not enough disk space
                 * we will deal with it in dsl_dataset_destroy_sync().
                 */
                (void) dmu_free_object(os, obj);
        }
        if (err != ESRCH)
                goto out;

        /*
         * Only the ZIL knows how to free log blocks.
         */
        zil_destroy(dmu_objset_zil(os), B_FALSE);

        /*
         * Sync out all in-flight IO.
         */
        txg_wait_synced(dd->dd_pool, 0);

        /*
         * If we managed to free all the objects in open
         * context, the user space accounting should be zero.
         */
        if (ds->ds_phys->ds_bp.blk_fill == 0 &&
            dmu_objset_userused_enabled(os)) {
                uint64_t count;

                ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
                    count == 0);
                ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
                    count == 0);
        }

        /*
         * Take our own FTAG hold on the dsl_dir, under the pool config
         * lock; 'dd' is overwritten with the result of the open.
         */
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
        rw_exit(&dd->dd_pool->dp_config_rwlock);

        if (err)
                goto out;

        /*
         * Blow away the dsl_dir + head dataset.
         */
        dsl_dataset_make_exclusive(ds, tag);
        /*
         * If we're removing a clone, we might also need to remove its
         * origin.
         */
        do {
                /* The check function sets need_prep to request a retry */
                dsda.need_prep = B_FALSE;
                if (dsl_dir_is_clone(dd)) {
                        err = dsl_dataset_origin_rm_prep(&dsda, tag);
                        if (err) {
                                dsl_dir_close(dd, FTAG);
                                goto out;
                        }
                }

                /* Destroy the dataset and its dsl_dir in one task group */
                dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
                dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
                    dsl_dataset_destroy_sync, &dsda, tag, 0);
                dsl_sync_task_create(dstg, dsl_dir_destroy_check,
                    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
                err = dsl_sync_task_group_wait(dstg);
                dsl_sync_task_group_destroy(dstg);

                /*
                 * We could be racing against 'zfs release' or 'zfs destroy -d'
                 * on the origin snap, in which case we can get EBUSY if we
                 * needed to destroy the origin snap but were not ready to
                 * do so.
                 */
                if (dsda.need_prep) {
                        ASSERT(err == EBUSY);
                        ASSERT(dsl_dir_is_clone(dd));
                        ASSERT(dsda.rm_origin == NULL);
                }
        } while (dsda.need_prep);

        /* Drop the ownership of the origin taken during prep, if any */
        if (dsda.rm_origin != NULL)
                dsl_dataset_disown(dsda.rm_origin, tag);

        /* if it is successful, dsl_dir_destroy_sync will close the dd */
        if (err)
                dsl_dir_close(dd, FTAG);
out:
        /* Every path, successful or not, gives up ownership of ds */
        dsl_dataset_disown(ds, tag);
        return (err);
}
1188
1189 blkptr_t *
1190 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1191 {
1192         return (&ds->ds_phys->ds_bp);
1193 }
1194
1195 void
1196 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1197 {
1198         ASSERT(dmu_tx_is_syncing(tx));
1199         /* If it's the meta-objset, set dp_meta_rootbp */
1200         if (ds == NULL) {
1201                 tx->tx_pool->dp_meta_rootbp = *bp;
1202         } else {
1203                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1204                 ds->ds_phys->ds_bp = *bp;
1205         }
1206 }
1207
1208 spa_t *
1209 dsl_dataset_get_spa(dsl_dataset_t *ds)
1210 {
1211         return (ds->ds_dir->dd_pool->dp_spa);
1212 }
1213
1214 void
1215 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1216 {
1217         dsl_pool_t *dp;
1218
1219         if (ds == NULL) /* this is the meta-objset */
1220                 return;
1221
1222         ASSERT(ds->ds_objset != NULL);
1223
1224         if (ds->ds_phys->ds_next_snap_obj != 0)
1225                 panic("dirtying snapshot!");
1226
1227         dp = ds->ds_dir->dd_pool;
1228
1229         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1230                 /* up the hold count until we can be written out */
1231                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1232         }
1233 }
1234
1235 /*
1236  * The unique space in the head dataset can be calculated by subtracting
1237  * the space used in the most recent snapshot, that is still being used
1238  * in this file system, from the space currently in use.  To figure out
1239  * the space in the most recent snapshot still in use, we need to take
1240  * the total space used in the snapshot and subtract out the space that
1241  * has been freed up since the snapshot was taken.
1242  */
1243 static void
1244 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1245 {
1246         uint64_t mrs_used;
1247         uint64_t dlused, dlcomp, dluncomp;
1248
1249         ASSERT(!dsl_dataset_is_snapshot(ds));
1250
1251         if (ds->ds_phys->ds_prev_snap_obj != 0)
1252                 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
1253         else
1254                 mrs_used = 0;
1255
1256         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1257
1258         ASSERT3U(dlused, <=, mrs_used);
1259         ds->ds_phys->ds_unique_bytes =
1260             ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
1261
1262         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1263             SPA_VERSION_UNIQUE_ACCURATE)
1264                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1265 }
1266
/*
 * Argument for the kill_blkptr() callback.
 */
struct killarg {
        dsl_dataset_t *ds;      /* dataset whose blocks are being killed */
        dmu_tx_t *tx;           /* transaction the frees are charged to */
};
1271
1272 /* ARGSUSED */
1273 static int
1274 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1275     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1276 {
1277         struct killarg *ka = arg;
1278         dmu_tx_t *tx = ka->tx;
1279
1280         if (bp == NULL)
1281                 return (0);
1282
1283         if (zb->zb_level == ZB_ZIL_LEVEL) {
1284                 ASSERT(zilog != NULL);
1285                 /*
1286                  * It's a block in the intent log.  It has no
1287                  * accounting, so just free it.
1288                  */
1289                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1290         } else {
1291                 ASSERT(zilog == NULL);
1292                 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1293                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1294         }
1295
1296         return (0);
1297 }
1298
1299 /* ARGSUSED */
1300 static int
1301 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1302 {
1303         dsl_dataset_t *ds = arg1;
1304         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1305         uint64_t count;
1306         int err;
1307
1308         /*
1309          * Can't delete a head dataset if there are snapshots of it.
1310          * (Except if the only snapshots are from the branch we cloned
1311          * from.)
1312          */
1313         if (ds->ds_prev != NULL &&
1314             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1315                 return (EBUSY);
1316
1317         /*
1318          * This is really a dsl_dir thing, but check it here so that
1319          * we'll be less likely to leave this dataset inconsistent &
1320          * nearly destroyed.
1321          */
1322         err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1323         if (err)
1324                 return (err);
1325         if (count != 0)
1326                 return (EEXIST);
1327
1328         return (0);
1329 }
1330
1331 /* ARGSUSED */
1332 static void
1333 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1334 {
1335         dsl_dataset_t *ds = arg1;
1336         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1337
1338         /* Mark it as inconsistent on-disk, in case we crash */
1339         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1340         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1341
1342         spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
1343             "dataset = %llu", ds->ds_object);
1344 }
1345
1346 static int
1347 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1348     dmu_tx_t *tx)
1349 {
1350         dsl_dataset_t *ds = dsda->ds;
1351         dsl_dataset_t *ds_prev = ds->ds_prev;
1352
1353         if (dsl_dataset_might_destroy_origin(ds_prev)) {
1354                 struct dsl_ds_destroyarg ndsda = {0};
1355
1356                 /*
1357                  * If we're not prepared to remove the origin, don't remove
1358                  * the clone either.
1359                  */
1360                 if (dsda->rm_origin == NULL) {
1361                         dsda->need_prep = B_TRUE;
1362                         return (EBUSY);
1363                 }
1364
1365                 ndsda.ds = ds_prev;
1366                 ndsda.is_origin_rm = B_TRUE;
1367                 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1368         }
1369
1370         /*
1371          * If we're not going to remove the origin after all,
1372          * undo the open context setup.
1373          */
1374         if (dsda->rm_origin != NULL) {
1375                 dsl_dataset_disown(dsda->rm_origin, tag);
1376                 dsda->rm_origin = NULL;
1377         }
1378
1379         return (0);
1380 }
1381
1382 /*
1383  * If you add new checks here, you may need to add
1384  * additional checks to the "temporary" case in
1385  * snapshot_check() in dmu_objset.c.
1386  */
1387 /* ARGSUSED */
1388 int
1389 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1390 {
1391         struct dsl_ds_destroyarg *dsda = arg1;
1392         dsl_dataset_t *ds = dsda->ds;
1393
1394         /* we have an owner hold, so noone else can destroy us */
1395         ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1396
1397         /*
1398          * Only allow deferred destroy on pools that support it.
1399          * NOTE: deferred destroy is only supported on snapshots.
1400          */
1401         if (dsda->defer) {
1402                 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1403                     SPA_VERSION_USERREFS)
1404                         return (ENOTSUP);
1405                 ASSERT(dsl_dataset_is_snapshot(ds));
1406                 return (0);
1407         }
1408
1409         /*
1410          * Can't delete a head dataset if there are snapshots of it.
1411          * (Except if the only snapshots are from the branch we cloned
1412          * from.)
1413          */
1414         if (ds->ds_prev != NULL &&
1415             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1416                 return (EBUSY);
1417
1418         /*
1419          * If we made changes this txg, traverse_dsl_dataset won't find
1420          * them.  Try again.
1421          */
1422         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1423                 return (EAGAIN);
1424
1425         if (dsl_dataset_is_snapshot(ds)) {
1426                 /*
1427                  * If this snapshot has an elevated user reference count,
1428                  * we can't destroy it yet.
1429                  */
1430                 if (ds->ds_userrefs > 0 && !dsda->releasing)
1431                         return (EBUSY);
1432
1433                 mutex_enter(&ds->ds_lock);
1434                 /*
1435                  * Can't delete a branch point. However, if we're destroying
1436                  * a clone and removing its origin due to it having a user
1437                  * hold count of 0 and having been marked for deferred destroy,
1438                  * it's OK for the origin to have a single clone.
1439                  */
1440                 if (ds->ds_phys->ds_num_children >
1441                     (dsda->is_origin_rm ? 2 : 1)) {
1442                         mutex_exit(&ds->ds_lock);
1443                         return (EEXIST);
1444                 }
1445                 mutex_exit(&ds->ds_lock);
1446         } else if (dsl_dir_is_clone(ds->ds_dir)) {
1447                 return (dsl_dataset_origin_check(dsda, arg2, tx));
1448         }
1449
1450         /* XXX we should do some i/o error checking... */
1451         return (0);
1452 }
1453
/*
 * State shared between dsl_dataset_drain_refs(), which waits, and
 * dsl_dataset_refs_gone(), which wakes the waiter up.
 */
struct refsarg {
        kmutex_t lock;          /* held while 'gone' is set and while waiting */
        boolean_t gone;         /* set to TRUE by dsl_dataset_refs_gone() */
        kcondvar_t cv;          /* signalled once 'gone' has been set */
};
1459
1460 /* ARGSUSED */
1461 static void
1462 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1463 {
1464         struct refsarg *arg = argv;
1465
1466         mutex_enter(&arg->lock);
1467         arg->gone = TRUE;
1468         cv_signal(&arg->cv);
1469         mutex_exit(&arg->lock);
1470 }
1471
1472 static void
1473 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1474 {
1475         struct refsarg arg;
1476
1477         mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1478         cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1479         arg.gone = FALSE;
1480         (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1481             dsl_dataset_refs_gone);
1482         dmu_buf_rele(ds->ds_dbuf, tag);
1483         mutex_enter(&arg.lock);
1484         while (!arg.gone)
1485                 cv_wait(&arg.cv, &arg.lock);
1486         ASSERT(arg.gone);
1487         mutex_exit(&arg.lock);
1488         ds->ds_dbuf = NULL;
1489         ds->ds_phys = NULL;
1490         mutex_destroy(&arg.lock);
1491         cv_destroy(&arg.cv);
1492 }
1493
/*
 * Remove the entry for object 'obj' from the ds_next_clones_obj ZAP of
 * 'ds'.  A missing entry (ENOENT) is tolerated; any other failure to
 * remove it is fatal.
 */
static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
        uint64_t count;
        int err;

        ASSERT(ds->ds_phys->ds_num_children >= 2);
        err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
        /*
         * The err should not be ENOENT, but a bug in a previous version
         * of the code could cause upgrade_clones_cb() to not set
         * ds_next_snap_obj when it should, leading to a missing entry.
         * If we knew that the pool was created after
         * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
         * ENOENT.  However, at least we can check that we don't have
         * too many entries in the next_clones_obj even after failing to
         * remove this one.
         */
        if (err != ENOENT) {
                VERIFY3U(err, ==, 0);
        }
        /*
         * Debug-only sanity check: the ZAP may hold at most
         * ds_num_children - 2 entries once 'obj' has been dealt with.
         */
        ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
            &count));
        ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
1520
1521 static void
1522 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1523 {
1524         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1525         zap_cursor_t zc;
1526         zap_attribute_t za;
1527
1528         /*
1529          * If it is the old version, dd_clones doesn't exist so we can't
1530          * find the clones, but deadlist_remove_key() is a no-op so it
1531          * doesn't matter.
1532          */
1533         if (ds->ds_dir->dd_phys->dd_clones == 0)
1534                 return;
1535
1536         for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1537             zap_cursor_retrieve(&zc, &za) == 0;
1538             zap_cursor_advance(&zc)) {
1539                 dsl_dataset_t *clone;
1540
1541                 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1542                     za.za_first_integer, FTAG, &clone));
1543                 if (clone->ds_dir->dd_origin_txg > mintxg) {
1544                         dsl_deadlist_remove_key(&clone->ds_deadlist,
1545                             mintxg, tx);
1546                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
1547                 }
1548                 dsl_dataset_rele(clone, FTAG);
1549         }
1550         zap_cursor_fini(&zc);
1551 }
1552
/*
 * State for process_old_cb(), set up by process_old_deadlist().
 */
struct process_old_arg {
        /* dataset whose ds_prev_snap_txg and deadlist the callback uses */
        dsl_dataset_t *ds;
        /* may be NULL; otherwise its ds_unique_bytes may be increased */
        dsl_dataset_t *ds_prev;
        /* when set, ds_prev's unique bytes are left untouched */
        boolean_t after_branch_point;
        /* zio passed to dsl_free_sync() for every freed block */
        zio_t *pio;
        /* running totals of the blocks freed by the callback */
        uint64_t used, comp, uncomp;
};
1560
1561 static int
1562 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1563 {
1564         struct process_old_arg *poa = arg;
1565         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1566
1567         if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1568                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1569                 if (poa->ds_prev && !poa->after_branch_point &&
1570                     bp->blk_birth >
1571                     poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1572                         poa->ds_prev->ds_phys->ds_unique_bytes +=
1573                             bp_get_dsize_sync(dp->dp_spa, bp);
1574                 }
1575         } else {
1576                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1577                 poa->comp += BP_GET_PSIZE(bp);
1578                 poa->uncomp += BP_GET_UCSIZE(bp);
1579                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1580         }
1581         return (0);
1582 }
1583
1584 static void
1585 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1586     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1587 {
1588         struct process_old_arg poa = { 0 };
1589         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1590         objset_t *mos = dp->dp_meta_objset;
1591
1592         ASSERT(ds->ds_deadlist.dl_oldfmt);
1593         ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1594
1595         poa.ds = ds;
1596         poa.ds_prev = ds_prev;
1597         poa.after_branch_point = after_branch_point;
1598         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1599         VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1600             process_old_cb, &poa, tx));
1601         VERIFY3U(zio_wait(poa.pio), ==, 0);
1602         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1603
1604         /* change snapused */
1605         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1606             -poa.used, -poa.comp, -poa.uncomp, tx);
1607
1608         /* swap next's deadlist to our deadlist */
1609         dsl_deadlist_close(&ds->ds_deadlist);
1610         dsl_deadlist_close(&ds_next->ds_deadlist);
1611         SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1612             ds->ds_phys->ds_deadlist_obj);
1613         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1614         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1615             ds_next->ds_phys->ds_deadlist_obj);
1616 }
1617
/*
 * Sync-context half of destroying a dataset.
 *
 * arg1 is a struct dsl_ds_destroyarg: dsda->ds is the dataset to destroy,
 * dsda->defer requests a deferred destroy, and dsda->rm_origin, if set, is
 * destroyed by a recursive call once ds is gone.  tag is handed to
 * dsl_dataset_drain_refs() and to that recursive call.
 *
 * A deferred destroy of a dataset that still has user references or more
 * than one child only sets DS_FLAG_DEFER_DESTROY and returns.  Otherwise
 * the refreservation is dropped, ds is unlinked from its previous snapshot,
 * and then either (snapshot) its deadlist and space accounting are folded
 * into the neighbouring datasets, or (head) every block born after the
 * previous snapshot is freed.  Finally the name/head link and the
 * dataset's objects in the MOS are removed.
 */
1618 void
1619 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1620 {
1621         struct dsl_ds_destroyarg *dsda = arg1;
1622         dsl_dataset_t *ds = dsda->ds;
1623         int err;
1624         int after_branch_point = FALSE;
1625         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1626         objset_t *mos = dp->dp_meta_objset;
1627         dsl_dataset_t *ds_prev = NULL;
1628         boolean_t wont_destroy;
1629         uint64_t obj;
1630
             /*
              * A deferred destroy is postponed for as long as the dataset
              * has user references or more than one child.
              */
1631         wont_destroy = (dsda->defer &&
1632             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1633
1634         ASSERT(ds->ds_owner || wont_destroy);
1635         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1636         ASSERT(ds->ds_prev == NULL ||
1637             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1638         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1639
1640         if (wont_destroy) {
1641                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1642                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1643                 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1644                 return;
1645         }
1646
1647         /* signal any waiters that this dataset is going away */
1648         mutex_enter(&ds->ds_lock);
1649         ds->ds_owner = dsl_reaper;
1650         cv_broadcast(&ds->ds_exclusive_cv);
1651         mutex_exit(&ds->ds_lock);
1652
1653         /* Remove our reservation */
1654         if (ds->ds_reserved != 0) {
1655                 dsl_prop_setarg_t psa;
1656                 uint64_t value = 0;
1657
1658                 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1659                     (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1660                     &value);
1661                 psa.psa_effective_value = 0;    /* predict default value */
1662
1663                 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1664                 ASSERT3U(ds->ds_reserved, ==, 0);
1665         }
1666
1667         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1668
1669         dsl_scan_ds_destroyed(ds, tx);
1670
1671         obj = ds->ds_object;
1672
             /*
              * Detach from the previous snapshot, if there is one.
              * after_branch_point is set when ds is not that snapshot's
              * next snapshot (its ds_next_snap_obj is not obj).
              */
1673         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1674                 if (ds->ds_prev) {
1675                         ds_prev = ds->ds_prev;
1676                 } else {
1677                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1678                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1679                 }
1680                 after_branch_point =
1681                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1682
1683                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1684                 if (after_branch_point &&
1685                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1686                         remove_from_next_clones(ds_prev, obj, tx);
1687                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1688                                 VERIFY(0 == zap_add_int(mos,
1689                                     ds_prev->ds_phys->ds_next_clones_obj,
1690                                     ds->ds_phys->ds_next_snap_obj, tx));
1691                         }
1692                 }
1693                 if (after_branch_point &&
1694                     ds->ds_phys->ds_next_snap_obj == 0) {
1695                         /* This clone is toast. */
1696                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1697                         ds_prev->ds_phys->ds_num_children--;
1698
1699                         /*
1700                          * If the clone's origin has no other clones, no
1701                          * user holds, and has been marked for deferred
1702                          * deletion, then we should have done the necessary
1703                          * destroy setup for it.
1704                          */
1705                         if (ds_prev->ds_phys->ds_num_children == 1 &&
1706                             ds_prev->ds_userrefs == 0 &&
1707                             DS_IS_DEFER_DESTROY(ds_prev)) {
1708                                 ASSERT3P(dsda->rm_origin, !=, NULL);
1709                         } else {
1710                                 ASSERT3P(dsda->rm_origin, ==, NULL);
1711                         }
1712                 } else if (!after_branch_point) {
1713                         ds_prev->ds_phys->ds_next_snap_obj =
1714                             ds->ds_phys->ds_next_snap_obj;
1715                 }
1716         }
1717
             /*
              * Snapshot case: splice ds out from between ds_prev and ds_next
              * and fix up the deadlists and space accounting.
              */
1718         if (dsl_dataset_is_snapshot(ds)) {
1719                 dsl_dataset_t *ds_next;
1720                 uint64_t old_unique;
1721                 uint64_t used = 0, comp = 0, uncomp = 0;
1722
1723                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1724                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1725                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1726
1727                 old_unique = ds_next->ds_phys->ds_unique_bytes;
1728
1729                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1730                 ds_next->ds_phys->ds_prev_snap_obj =
1731                     ds->ds_phys->ds_prev_snap_obj;
1732                 ds_next->ds_phys->ds_prev_snap_txg =
1733                     ds->ds_phys->ds_prev_snap_txg;
1734                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1735                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1736
1737
1738                 if (ds_next->ds_deadlist.dl_oldfmt) {
1739                         process_old_deadlist(ds, ds_prev, ds_next,
1740                             after_branch_point, tx);
1741                 } else {
1742                         /* Adjust prev's unique space. */
1743                         if (ds_prev && !after_branch_point) {
1744                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1745                                     ds_prev->ds_phys->ds_prev_snap_txg,
1746                                     ds->ds_phys->ds_prev_snap_txg,
1747                                     &used, &comp, &uncomp);
1748                                 ds_prev->ds_phys->ds_unique_bytes += used;
1749                         }
1750
1751                         /* Adjust snapused. */
1752                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1753                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1754                             &used, &comp, &uncomp);
1755                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1756                             -used, -comp, -uncomp, tx);
1757
1758                         /* Move blocks to be freed to pool's free list. */
1759                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1760                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1761                             tx);
1762                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1763                             DD_USED_HEAD, used, comp, uncomp, tx);
1764                         dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
1765
1766                         /* Merge our deadlist into next's and free it. */
1767                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1768                             ds->ds_phys->ds_deadlist_obj, tx);
1769                 }
1770                 dsl_deadlist_close(&ds->ds_deadlist);
1771                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1772
1773                 /* Collapse range in clone heads */
1774                 dsl_dataset_remove_clones_key(ds,
1775                     ds->ds_phys->ds_creation_txg, tx);
1776
1777                 if (dsl_dataset_is_snapshot(ds_next)) {
1778                         dsl_dataset_t *ds_nextnext;
1779                         dsl_dataset_t *hds;
1780
1781                         /*
1782                          * Update next's unique to include blocks which
1783                          * were previously shared by only this snapshot
1784                          * and it.  Those blocks will be born after the
1785                          * prev snap and before this snap, and will have
1786                          * died after the next snap and before the one
1787                          * after that (ie. be on the snap after next's
1788                          * deadlist).
1789                          */
1790                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1791                             ds_next->ds_phys->ds_next_snap_obj,
1792                             FTAG, &ds_nextnext));
1793                         dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1794                             ds->ds_phys->ds_prev_snap_txg,
1795                             ds->ds_phys->ds_creation_txg,
1796                             &used, &comp, &uncomp);
1797                         ds_next->ds_phys->ds_unique_bytes += used;
1798                         dsl_dataset_rele(ds_nextnext, FTAG);
1799                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1800
1801                         /* Collapse range in this head. */
1802                         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1803                             ds->ds_dir->dd_phys->dd_head_dataset_obj,
1804                             FTAG, &hds));
1805                         dsl_deadlist_remove_key(&hds->ds_deadlist,
1806                             ds->ds_phys->ds_creation_txg, tx);
1807                         dsl_dataset_rele(hds, FTAG);
1808
1809                 } else {
                             /*
                              * ds_next is not a snapshot: repoint its ds_prev
                              * reference from ds to ds_prev (if any).
                              */
1810                         ASSERT3P(ds_next->ds_prev, ==, ds);
1811                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1812                         ds_next->ds_prev = NULL;
1813                         if (ds_prev) {
1814                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1815                                     ds->ds_phys->ds_prev_snap_obj,
1816                                     ds_next, &ds_next->ds_prev));
1817                         }
1818
1819                         dsl_dataset_recalc_head_uniq(ds_next);
1820
1821                         /*
1822                          * Reduce the amount of our unconsumed refreservation
1823                          * being charged to our parent by the amount of
1824                          * new unique data we have gained.
1825                          */
1826                         if (old_unique < ds_next->ds_reserved) {
1827                                 int64_t mrsdelta;
1828                                 uint64_t new_unique =
1829                                     ds_next->ds_phys->ds_unique_bytes;
1830
1831                                 ASSERT(old_unique <= new_unique);
1832                                 mrsdelta = MIN(new_unique - old_unique,
1833                                     ds_next->ds_reserved - old_unique);
1834                                 dsl_dir_diduse_space(ds->ds_dir,
1835                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1836                         }
1837                 }
1838                 dsl_dataset_rele(ds_next, FTAG);
1839         } else {
1840                 /*
1841                  * There's no next snapshot, so this is a head dataset.
1842                  * Destroy the deadlist.  Unless it's a clone, the
1843                  * deadlist should be empty.  (If it's a clone, it's
1844                  * safe to ignore the deadlist contents.)
1845                  */
1846                 struct killarg ka;
1847
1848                 dsl_deadlist_close(&ds->ds_deadlist);
1849                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1850                 ds->ds_phys->ds_deadlist_obj = 0;
1851
1852                 /*
1853                  * Free everything that we point to (that's born after
1854                  * the previous snapshot, if we are a clone)
1855                  *
1856                  * NB: this should be very quick, because we already
1857                  * freed all the objects in open context.
1858                  */
1859                 ka.ds = ds;
1860                 ka.tx = tx;
1861                 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1862                     TRAVERSE_POST, kill_blkptr, &ka);
1863                 ASSERT3U(err, ==, 0);
1864                 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1865                     ds->ds_phys->ds_unique_bytes == 0);
1866
1867                 if (ds->ds_prev != NULL) {
1868                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1869                                 VERIFY3U(0, ==, zap_remove_int(mos,
1870                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1871                                     ds->ds_object, tx));
1872                         }
1873                         dsl_dataset_rele(ds->ds_prev, ds);
1874                         ds->ds_prev = ds_prev = NULL;
1875                 }
1876         }
1877
1878         /*
1879          * This must be done after the dsl_traverse(), because it will
1880          * re-open the objset.
1881          */
1882         if (ds->ds_objset) {
1883                 dmu_objset_evict(ds->ds_objset);
1884                 ds->ds_objset = NULL;
1885         }
1886
1887         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1888                 /* Erase the link in the dir */
1889                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1890                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1891                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1892                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1893                 ASSERT(err == 0);
1894         } else {
1895                 /* remove from snapshot namespace */
1896                 dsl_dataset_t *ds_head;
1897                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1898                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1899                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1900                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1901 #ifdef ZFS_DEBUG
1902                 {
1903                         uint64_t val;
1904
1905                         err = dsl_dataset_snap_lookup(ds_head,
1906                             ds->ds_snapname, &val);
1907                         ASSERT3U(err, ==, 0);
1908                         ASSERT3U(val, ==, obj);
1909                 }
1910 #endif
1911                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1912                 ASSERT(err == 0);
1913                 dsl_dataset_rele(ds_head, FTAG);
1914         }
1915
             /*
              * Release the hold taken on ds_prev above.  When ds_prev was
              * simply borrowed from ds->ds_prev no hold was taken here.
              */
1916         if (ds_prev && ds->ds_prev != ds_prev)
1917                 dsl_dataset_rele(ds_prev, FTAG);
1918
1919         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1920         spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
1921             "dataset = %llu", ds->ds_object);
1922
             /* Free the auxiliary MOS objects that hang off the dataset. */
1923         if (ds->ds_phys->ds_next_clones_obj != 0) {
1924                 uint64_t count;
1925                 ASSERT(0 == zap_count(mos,
1926                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1927                 VERIFY(0 == dmu_object_free(mos,
1928                     ds->ds_phys->ds_next_clones_obj, tx));
1929         }
1930         if (ds->ds_phys->ds_props_obj != 0)
1931                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1932         if (ds->ds_phys->ds_userrefs_obj != 0)
1933                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
             /* Close the dir, drain references, then free the dataset object. */
1934         dsl_dir_close(ds->ds_dir, ds);
1935         ds->ds_dir = NULL;
1936         dsl_dataset_drain_refs(ds, tag);
1937         VERIFY(0 == dmu_object_free(mos, obj, tx));
1938
1939         if (dsda->rm_origin) {
1940                 /*
1941                  * Remove the origin of the clone we just destroyed.
1942                  */
1943                 struct dsl_ds_destroyarg ndsda = {0}; /* no defer, no rm_origin */
1944
1945                 ndsda.ds = dsda->rm_origin;
1946                 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1947         }
1948 }
1949
1950 static int
1951 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1952 {
1953         uint64_t needed, avail;
1954
1955         /*
1956          * In syncing context, make sure there is room outside of ds's
1957          * refreservation for the unique blocks a new snapshot would take
1958          * over.  Returns ENOSPC if there is not, 0 otherwise.
1959          */
1960         if (dmu_tx_is_syncing(tx)) {
1961                 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1962
1963                 /* The snapshot takes min(unique, refreservation). */
1964                 needed = ds->ds_phys->ds_unique_bytes;
1965                 if (needed > ds->ds_reserved)
1966                         needed = ds->ds_reserved;
1967                 avail = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1968                 if (needed > avail)
1969                         return (ENOSPC);
1970
1971                 /* Expose the charge to later checks in this group. */
1972                 if (needed != 0)
1973                         dsl_dir_willuse_space(ds->ds_dir, needed, tx);
1974         }
1975         return (0);
1976 }
1977
1978 int
1979 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
1980 {
1981         dsl_dataset_t *ds = arg1;
1982         const char *snapname = arg2;
1983         uint64_t existing_obj;
1984         int error;
1985
1986         /*
1987          * Check function for taking snapshot "snapname" (arg2) of the
1988          * dataset arg1.  On success the txg is recorded in ds_trysnap_txg.
1989          */
1990
1991         /* Only one snapshot per txg; EAGAIN asks for a retry. */
1992         if (tx->tx_txg <= ds->ds_phys->ds_prev_snap_txg)
1993                 return (EAGAIN);
1994
1995         /* The name must not already exist; ENOENT is the good case. */
1996         error = dsl_dataset_snap_lookup(ds, snapname, &existing_obj);
1997         switch (error) {
1998         case ENOENT:
1999                 break;
2000         case 0:
2001                 return (EEXIST);
2002         default:
2003                 return (error);
2004         }
2005
2006         /* "<dataset>@<snapname>" has to fit in MAXNAMELEN. */
2007         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2008                 return (ENAMETOOLONG);
2009
2010         /* Finally, make sure the space it needs can be reserved. */
2011         error = dsl_dataset_snapshot_reserve_space(ds, tx);
2012         if (error == 0)
2013                 ds->ds_trysnap_txg = tx->tx_txg;
2014         return (error);
2015 }
2016
/*
 * Sync function that creates snapshot "snapname" (arg2) of dataset ds
 * (arg1).  A new dataset object is allocated in the MOS and initialized
 * from ds's current phys (block pointer, space counters, flags, deadlist
 * object); it is linked in between ds's previous snapshot and ds.  ds then
 * gets a new deadlist object, its unique bytes are reset to zero, and the
 * name is added to ds's snapshot-name zap.  Must be called with
 * dp_config_rwlock held as writer (asserted below).
 */
2017 void
2018 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2019 {
2020         dsl_dataset_t *ds = arg1;
2021         const char *snapname = arg2;
2022         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2023         dmu_buf_t *dbuf;
2024         dsl_dataset_phys_t *dsphys;
2025         uint64_t dsobj, crtxg;
2026         objset_t *mos = dp->dp_meta_objset;
2027         int err;
2028
2029         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2030
2031         /*
2032          * The origin's ds_creation_txg has to be < TXG_INITIAL
2033          */
2034         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2035                 crtxg = 1;
2036         else
2037                 crtxg = tx->tx_txg;
2038
             /*
              * Allocate the snapshot's dataset object and fill in its phys,
              * mostly by copying the head's current state.
              */
2039         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2040             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2041         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2042         dmu_buf_will_dirty(dbuf, tx);
2043         dsphys = dbuf->db_data;
2044         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2045         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2046         dsphys->ds_fsid_guid = unique_create();
2047         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2048             sizeof (dsphys->ds_guid));
2049         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2050         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2051         dsphys->ds_next_snap_obj = ds->ds_object;
2052         dsphys->ds_num_children = 1;
2053         dsphys->ds_creation_time = gethrestime_sec();
2054         dsphys->ds_creation_txg = crtxg;
2055         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2056         dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
2057         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2058         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2059         dsphys->ds_flags = ds->ds_phys->ds_flags;
2060         dsphys->ds_bp = ds->ds_phys->ds_bp;
2061         dmu_buf_rele(dbuf, FTAG);
2062
             /*
              * Make the previous snapshot refer to the new snapshot instead
              * of ds: either through its ds_next_snap_obj, or, if ds is not
              * its next snapshot, through its next-clones object when it
              * has one.
              */
2063         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2064         if (ds->ds_prev) {
2065                 uint64_t next_clones_obj =
2066                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2067                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2068                     ds->ds_object ||
2069                     ds->ds_prev->ds_phys->ds_num_children > 1);
2070                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2071                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2072                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2073                             ds->ds_prev->ds_phys->ds_creation_txg);
2074                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2075                 } else if (next_clones_obj != 0) {
2076                         remove_from_next_clones(ds->ds_prev,
2077                             dsphys->ds_next_snap_obj, tx);
2078                         VERIFY3U(0, ==, zap_add_int(mos,
2079                             next_clones_obj, dsobj, tx));
2080                 }
2081         }
2082
2083         /*
2084          * If we have a reference-reservation on this dataset, we will
2085          * need to increase the amount of refreservation being charged
2086          * since our unique space is going to zero.
2087          */
2088         if (ds->ds_reserved) {
2089                 int64_t delta;
2090                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2091                 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2092                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2093                     delta, 0, 0, tx);
2094         }
2095
             /*
              * The snapshot keeps the old deadlist object (copied into
              * dsphys above); ds switches to the object returned by
              * dsl_deadlist_clone() and reopens its in-core deadlist on it.
              */
2096         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2097         zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2098             ds->ds_dir->dd_myname, snapname, dsobj,
2099             ds->ds_phys->ds_prev_snap_txg);
2100         ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2101             UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2102         dsl_deadlist_close(&ds->ds_deadlist);
2103         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2104         dsl_deadlist_add_key(&ds->ds_deadlist,
2105             ds->ds_phys->ds_prev_snap_txg, tx);
2106
             /* The new snapshot becomes ds's previous snapshot. */
2107         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2108         ds->ds_phys->ds_prev_snap_obj = dsobj;
2109         ds->ds_phys->ds_prev_snap_txg = crtxg;
2110         ds->ds_phys->ds_unique_bytes = 0;
2111         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2112                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2113
2114         err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2115             snapname, 8, 1, &dsobj, tx);
2116         ASSERT(err == 0);
2117
             /* Swap the in-core ds_prev reference over to the new snapshot. */
2118         if (ds->ds_prev)
2119                 dsl_dataset_drop_ref(ds->ds_prev, ds);
2120         VERIFY(0 == dsl_dataset_get_ref(dp,
2121             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2122
2123         dsl_scan_ds_snapshotted(ds, tx);
2124
2125         dsl_dir_snap_cmtime_update(ds->ds_dir);
2126
2127         spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
2128             "dataset = %llu", dsobj);
2129 }
2130
2131 void
2132 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2133 {
2134         objset_t *os = ds->ds_objset;
2135
2136         /* Syncing context only; ds must have an open objset. */
2137         ASSERT(dmu_tx_is_syncing(tx));
2138         ASSERT(ds->ds_objset != NULL);
2139         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2140
2141         /* Write out the in-core fsid guid; it may have changed at open. */
2142         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2143         ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2144
2145         dsl_dir_dirty(ds->ds_dir, tx);
2146         dmu_objset_sync(os, zio, tx);
2147 }
2148
2149 void
2150 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2151 {
2152         dsl_dataset_phys_t *phys = ds->ds_phys;
2153         uint64_t referenced, available, usedobjs, availobjs, ratio;
2154
2155         /*
2156          * Add this dataset's statistics to nv: first the dsl_dir's, then
2157          * the per-dataset properties.  For a snapshot, "used" and the
2158          * compression ratio are then overridden with its own numbers.
2159          */
2160         dsl_dir_stats(ds->ds_dir, nv);
2161
2162         dsl_dataset_space(ds, &referenced, &available, &usedobjs, &availobjs);
2163         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, available);
2164         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, referenced);
2165
2166         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2167             phys->ds_creation_time);
2168         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2169             phys->ds_creation_txg);
2170         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota);
2171         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2172             ds->ds_reserved);
2173         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, phys->ds_guid);
2174         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2175             phys->ds_unique_bytes);
2176         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object);
2177         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs);
2178         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2179             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2180
2181         /* Only a snapshot (a dataset with a next snap) is overridden. */
2182         if (phys->ds_next_snap_obj == 0)
2183                 return;
2184
2185         ratio = 100;
2186         if (phys->ds_compressed_bytes != 0)
2187                 ratio = phys->ds_uncompressed_bytes * 100 /
2188                     phys->ds_compressed_bytes;
2189         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, phys->ds_unique_bytes);
2190         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2191 }
2192
2193 void
2194 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2195 {
2196         dsl_dataset_phys_t *phys = ds->ds_phys;
2197         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2198
2199         /* Fill in *stat from ds; only snapshots report a clone count. */
2200         stat->dds_creation_txg = phys->ds_creation_txg;
2201         stat->dds_inconsistent = phys->ds_flags & DS_FLAG_INCONSISTENT;
2202         stat->dds_guid = phys->ds_guid;
2203         stat->dds_is_snapshot = (phys->ds_next_snap_obj ? B_TRUE : B_FALSE);
2204         stat->dds_num_clones =
2205             (phys->ds_next_snap_obj ? phys->ds_num_children - 1 : 0);
2206
2207         /* The origin's name is looked up under the pool config lock. */
2208         rw_enter(&dp->dp_config_rwlock, RW_READER);
2209         if (!dsl_dir_is_clone(ds->ds_dir)) {
2210                 stat->dds_origin[0] = '\0';
2211         } else {
2212                 dsl_dataset_t *ods;
2213
2214                 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2215                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2216                 dsl_dataset_name(ods, stat->dds_origin);
2217                 dsl_dataset_drop_ref(ods, FTAG);
2218         }
2219         rw_exit(&dp->dp_config_rwlock);
2220 }
2221
/*
 * Return the in-core fsid guid (ds->ds_fsid_guid) of the dataset.
 */
2222 uint64_t
2223 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2224 {
2225         return (ds->ds_fsid_guid);
2226 }
2227
2228 void
2229 dsl_dataset_space(dsl_dataset_t *ds,
2230     uint64_t *refdbytesp, uint64_t *availbytesp,
2231     uint64_t *usedobjsp, uint64_t *availobjsp)
2232 {
2233         uint64_t quota = ds->ds_quota;
2234         uint64_t unique = ds->ds_phys->ds_unique_bytes;
2235
2236         /* Referenced bytes, plus what the dsl_dir says is available. */
2237         *refdbytesp = ds->ds_phys->ds_used_bytes;
2238         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2239         /* Unused refreservation is also available to this dataset. */
2240         if (unique < ds->ds_reserved)
2241                 *availbytesp += ds->ds_reserved - unique;
2242         /* A refquota, if set, caps what remains available. */
2243         if (quota != 0 && *refdbytesp >= quota)
2244                 *availbytesp = 0;
2245         else if (quota != 0 && quota - *refdbytesp < *availbytesp)
2246                 *availbytesp = quota - *refdbytesp;
2247         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2248         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2249 }
2250
2251 boolean_t
2252 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2253 {
2254         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2255         dsl_dataset_t *prev = ds->ds_prev;
2256         objset_t *os, *os_prev;
2257
2258         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2259             dsl_pool_sync_context(dp));
2260         /* Unmodified if there is no previous snapshot, or nothing newer. */
2261         if (prev == NULL)
2262                 return (B_FALSE);
2263         if (ds->ds_phys->ds_bp.blk_birth <= prev->ds_phys->ds_creation_txg)
2264                 return (B_FALSE);
2265
2266         /*
2267          * The head's ZIL may have been reset, which alone should not
2268          * count as a modification: compare only the meta dnodes.  If
2269          * either objset cannot be obtained, report it as modified.
2270          */
2271         if (dmu_objset_from_ds(ds, &os) != 0 ||
2272             dmu_objset_from_ds(prev, &os_prev) != 0)
2273                 return (B_TRUE);
2274         return (bcmp(&os->os_phys->os_meta_dnode,
2275             &os_prev->os_phys->os_meta_dnode,
2276             sizeof (os->os_phys->os_meta_dnode)) != 0);
2277 }
2278
2279 /* ARGSUSED */
2280 static int
2281 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2282 {
2283         dsl_dataset_t *ds = arg1;
2284         char *newsnapname = arg2;
2285         dsl_dataset_t *head;
2286         uint64_t snapobj;
2287         int lookup_err, error;
2288
2289         /*
2290          * Check function for renaming snapshot ds (arg1) to newsnapname
2291          * (arg2): fails with EEXIST if the head dataset already has a
2292          * snapshot of that name, or ENAMETOOLONG if the name cannot fit.
2293          */
2294         error = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2295             ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &head);
2296         if (error != 0)
2297                 return (error);
2298         lookup_err = dsl_dataset_snap_lookup(head, newsnapname, &snapobj);
2299         dsl_dataset_rele(head, FTAG);
2300
2301         /* The length check wins over whatever the lookup reported. */
2302         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2303                 return (ENAMETOOLONG);
2304
2305         /* Found means the name is taken; ENOENT means it is free. */
2306         if (lookup_err == 0)
2307                 return (EEXIST);
2308         return (lookup_err == ENOENT ? 0 : lookup_err);
2309 }
2310
/*
 * Sync function for renaming snapshot ds (arg1) to newsnapname (arg2).
 * The old name's entry is removed from the head dataset's snapshot-name
 * zap, the in-core ds_snapname is rewritten under ds_lock, and a new zap
 * entry mapping the new name to the same object is added.  The new name
 * is copied with strlcpy() bounded by the size of ds_snapname, so an
 * over-long name can never write past the buffer.
 */
2311 static void
2312 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2313 {
2314         dsl_dataset_t *ds = arg1;
2315         const char *newsnapname = arg2;
2316         dsl_dir_t *dd = ds->ds_dir;
2317         objset_t *mos = dd->dd_pool->dp_meta_objset;
2318         dsl_dataset_t *hds;
2319         int err;
2320
2321         ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2322         VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2323             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2324         VERIFY(0 == dsl_dataset_get_snapname(ds));
2325         /* Drop the old name's entry from the head's snapnames zap. */
2326         err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2327         ASSERT3U(err, ==, 0);
2328         /* Bounded copy, so ds_snapname can never be overrun. */
2329         mutex_enter(&ds->ds_lock);
2330         (void) strlcpy(ds->ds_snapname, newsnapname,
2331             sizeof (ds->ds_snapname));
2332         mutex_exit(&ds->ds_lock);
2333         err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2334             ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2335         ASSERT3U(err, ==, 0);
2336         spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
2337             "dataset = %llu", ds->ds_object);
2338         dsl_dataset_rele(hds, FTAG);
2339 }
2340
/*
 * State shared by the per-dataset callbacks of a recursive snapshot rename.
 */
2341 struct renamesnaparg {
2342         dsl_sync_task_group_t *dstg;   /* group the rename tasks join; also the hold tag */
2343         char failed[MAXPATHLEN];       /* name of the snapshot most recently processed */
2344         char *oldsnap;                 /* old snapshot name (text after the '@') */
2345         char *newsnap;                 /* new snapshot name (text after the '@') */
2346 };
2347
2348 static int
2349 dsl_snapshot_rename_one(const char *name, void *arg)
2350 {
2351         struct renamesnaparg *ra = arg;
2352         dsl_dataset_t *ds = NULL;
2353         char *fullname;
2354         int error;
2355
2356         /*
2357          * Per-dataset callback: queue a rename task for "name@oldsnap"
2358          * on ra->dstg.  A snapshot that does not exist (ENOENT) is not
2359          * an error.  ra->failed records the snapshot being processed.
2360          */
2361         fullname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2362         (void) strlcpy(ra->failed, fullname, sizeof (ra->failed));
2363
2364         /* The parent is unchanged, so "from" and "to" are the same. */
2365         error = zfs_secpolicy_rename_perms(fullname, fullname, CRED());
2366         if (error == 0) {
2367 #ifdef _KERNEL
2368                 /* A snapshot being renamed has to be unmounted first. */
2369                 (void) zfs_unmount_snap(fullname, NULL);
2370 #endif
2371                 error = dsl_dataset_hold(fullname, ra->dstg, &ds);
2372         }
2373         strfree(fullname);
2374
2375         if (error == ENOENT)
2376                 return (0);
2377         if (error != 0)
2378                 return (error);
2379
2380         dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2381             dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2382
2383         return (0);
2384 }
2385
2386 static int
2387 dsl_recursive_rename(char *oldname, const char *newname)
2388 {
2389         int err;
2390         struct renamesnaparg *ra;
2391         dsl_sync_task_t *dst;
2392         spa_t *spa;
2393         char *cp, *fsname = spa_strdup(oldname);
2394         int len = strlen(oldname) + 1;
2395
2396         /* truncate the snapshot name to get the fsname */
2397         cp = strchr(fsname, '@');
2398         *cp = '\0';
2399
2400         err = spa_open(fsname, &spa, FTAG);
2401         if (err) {
2402                 kmem_free(fsname, len);
2403                 return (err);
2404         }
2405         ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2406         ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2407
2408         ra->oldsnap = strchr(oldname, '@') + 1;
2409         ra->newsnap = strchr(newname, '@') + 1;
2410         *ra->failed = '\0';
2411
2412         err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2413             DS_FIND_CHILDREN);
2414         kmem_free(fsname, len);
2415
2416         if (err == 0) {
2417                 err = dsl_sync_task_group_wait(ra->dstg);
2418         }
2419
2420         for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2421             dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2422                 dsl_dataset_t *ds = dst->dst_arg1;
2423                 if (dst->dst_err) {
2424                         dsl_dir_name(ds->ds_dir, ra->failed);
2425                         (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2426                         (void) strlcat(ra->failed, ra->newsnap,
2427                             sizeof (ra->failed));
2428                 }
2429                 dsl_dataset_rele(ds, ra->dstg);
2430         }
2431
2432         if (err)
2433                 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2434
2435         dsl_sync_task_group_destroy(ra->dstg);
2436         kmem_free(ra, sizeof (struct renamesnaparg));
2437         spa_close(spa, FTAG);
2438         return (err);
2439 }
2440
2441 static int
2442 dsl_valid_rename(const char *oldname, void *arg)
2443 {
2444         int delta = *(int *)arg;
2445
2446         if (strlen(oldname) + delta >= MAXNAMELEN)
2447                 return (ENAMETOOLONG);
2448
2449         return (0);
2450 }
2451
2452 #pragma weak dmu_objset_rename = dsl_dataset_rename
2453 int
2454 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2455 {
2456         dsl_dir_t *dd;
2457         dsl_dataset_t *ds;
2458         const char *tail;
2459         int err;
2460
2461         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2462         if (err)
2463                 return (err);
2464
2465         if (tail == NULL) {
2466                 int delta = strlen(newname) - strlen(oldname);
2467
2468                 /* if we're growing, validate child name lengths */
2469                 if (delta > 0)
2470                         err = dmu_objset_find(oldname, dsl_valid_rename,
2471                             &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2472
2473                 if (err == 0)
2474                         err = dsl_dir_rename(dd, newname);
2475                 dsl_dir_close(dd, FTAG);
2476                 return (err);
2477         }
2478
2479         if (tail[0] != '@') {
2480                 /* the name ended in a nonexistent component */
2481                 dsl_dir_close(dd, FTAG);
2482                 return (ENOENT);
2483         }
2484
2485         dsl_dir_close(dd, FTAG);
2486
2487         /* new name must be snapshot in same filesystem */
2488         tail = strchr(newname, '@');
2489         if (tail == NULL)
2490                 return (EINVAL);
2491         tail++;
2492         if (strncmp(oldname, newname, tail - newname) != 0)
2493                 return (EXDEV);
2494
2495         if (recursive) {
2496                 err = dsl_recursive_rename(oldname, newname);
2497         } else {
2498                 err = dsl_dataset_hold(oldname, FTAG, &ds);
2499                 if (err)
2500                         return (err);
2501
2502                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2503                     dsl_dataset_snapshot_rename_check,
2504                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2505
2506                 dsl_dataset_rele(ds, FTAG);
2507         }
2508
2509         return (err);
2510 }
2511
/* One entry of a snapshot list built by snaplist_make(). */
struct promotenode {
        list_node_t link;       /* linkage within the owning list_t */
        dsl_dataset_t *ds;      /* dataset held or owned for this entry */
};
2516
2517 struct promotearg {
2518         list_t shared_snaps, origin_snaps, clone_snaps;
2519         dsl_dataset_t *origin_origin;
2520         uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2521         char *err_ds;
2522 };
2523
/* Forward declarations of snapshot-list helpers. */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static boolean_t snaplist_unstable(list_t *l);
2526
/*
 * Check function for the promote sync task.
 *
 * arg1 is the head dataset of the clone being promoted; arg2 is the
 * struct promotearg filled in by dsl_dataset_promote().
 *
 * Outside syncing context only the "is this a clone" test is performed.
 * In syncing context the function also:
 *   - rejects datasets flagged DS_FLAG_NOPROMOTE (EXDEV);
 *   - computes pa->unique, the origin snapshot's new unique space;
 *   - verifies that no snapshot being moved has a name already present
 *     under the clone, and accumulates pa->used/comp/uncomp;
 *   - asks dsl_dir_transfer_possible() whether pa->used can be moved;
 *   - computes pa->cloneusedsnap and pa->originusedsnap for dirs that
 *     carry DD_FLAG_USED_BREAKDOWN.
 *
 * Returns 0 or an errno value.  When a snapshot-name lookup stops the
 * check, pa->err_ds is pointed at that snapshot's ds_snapname.
 */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        /* head of shared_snaps is the clone's origin snapshot */
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;
        uint64_t unused;

        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
                return (EINVAL);

        /* Since this is so expensive, don't do the preliminary check */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
                return (EXDEV);

        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        dsl_deadlist_space_range(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
            &pa->unique, &unused, &unused);

        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
         * to used for each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
         * So a sequence would look like:
         * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
         * Which simplifies to:
         * uN + kN + kN-1 + ... + k1 + k0
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
        pa->used = origin_ds->ds_phys->ds_used_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;

                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0) {
                        /* the name already exists under the clone */
                        err = EEXIST;
                        goto out;
                }
                /* ENOENT is the expected answer; anything else is fatal */
                if (err != ENOENT)
                        goto out;

                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;

                dsl_deadlist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
        }

        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
                pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }

        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
            pa->used);
        if (err)
                return (err);

        /*
         * Compute the amounts of space that will be used by snapshots
         * after the promotion (for both origin and clone).  For each,
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
        if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;

                /*
                 * Note, typically this will not be a clone of a clone,
                 * so dd_origin_txg will be < TXG_INITIAL, so
                 * these snaplist_space() -> dsl_deadlist_space_range()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);

                err = snaplist_space(&pa->clone_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&pa->origin_snaps,
                    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
                if (err)
                        return (err);
        }

        return (0);
out:
        /* only reached from the name-check loop, so snap is that entry */
        pa->err_ds =  snap->ds->ds_snapname;
        return (err);
}
2655
/*
 * Sync function for the promote task.
 *
 * arg1 is the head dataset of the clone being promoted; arg2 is the
 * struct promotearg whose lists were built by dsl_dataset_promote() and
 * whose space figures were computed by dsl_dataset_promote_check().
 *
 * In order, this function:
 *   - repoints the origin snapshot's next-snapshot link at the oldest
 *     entry of clone_snaps and fixes up its next-clones ZAP;
 *   - exchanges the origin relationship between the clone's dir (dd)
 *     and the origin's dir (odd);
 *   - updates the dd_clones ZAP objects on pools at or above
 *     SPA_VERSION_DIR_CLONES;
 *   - moves every snapshot in shared_snaps into the clone's dir;
 *   - adjusts the space accounting of both dirs and stores the origin's
 *     new unique byte count;
 *   - logs a history record.
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        /* head of shared_snaps is the clone's origin snapshot */
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        dsl_dataset_t *origin_head;
        dsl_dir_t *dd = hds->ds_dir;
        dsl_pool_t *dp = hds->ds_dir->dd_pool;
        dsl_dir_t *odd = NULL;
        uint64_t oldnext_obj;
        int64_t delta;

        ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

        /* head of origin_snaps is the origin dir's head dataset */
        snap = list_head(&pa->origin_snaps);
        origin_head = snap->ds;

        /*
         * We need to explicitly open odd, since origin_ds's dd will be
         * changing.
         */
        VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
            NULL, FTAG, &odd));

        /* change origin's next snap */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
        oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

        /* change the origin's next clone */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
                remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
        }

        /* change origin */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
        dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
        dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
        odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
        origin_head->ds_dir->dd_origin_txg =
            origin_ds->ds_phys->ds_creation_txg;

        /* change dd_clone entries */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                /*
                 * NOTE(review): pa->origin_origin is dereferenced here
                 * without a NULL check, although dsl_dataset_promote()
                 * leaves it NULL when the origin's dir has no origin.
                 * Presumably every dir has an origin on pools at this
                 * version -- confirm.
                 */
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    odd->dd_phys->dd_clones, hds->ds_object, tx));
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    hds->ds_object, tx));

                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    origin_head->ds_object, tx));
                /* create the clone dir's dd_clones ZAP on first use */
                if (dd->dd_phys->dd_clones == 0) {
                        dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
                            DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                }
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    dd->dd_phys->dd_clones, origin_head->ds_object, tx));

        }

        /* move snapshots to this dir */
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                dsl_dataset_t *ds = snap->ds;

                /* unregister props as dsl_dir is changing */
                if (ds->ds_objset) {
                        dmu_objset_evict(ds->ds_objset);
                        ds->ds_objset = NULL;
                }
                /* move snap name entry */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                VERIFY(0 == dsl_dataset_snap_remove(origin_head,
                    ds->ds_snapname, tx));
                VERIFY(0 == zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));

                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                ds->ds_phys->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
                /* swap the in-core dir reference, tagged with ds itself */
                dsl_dir_close(ds->ds_dir, ds);
                VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));

                /* move any clone references */
                if (ds->ds_phys->ds_next_clones_obj &&
                    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        zap_cursor_t zc;
                        zap_attribute_t za;

                        for (zap_cursor_init(&zc, dp->dp_meta_objset,
                            ds->ds_phys->ds_next_clones_obj);
                            zap_cursor_retrieve(&zc, &za) == 0;
                            zap_cursor_advance(&zc)) {
                                dsl_dataset_t *cnds;
                                uint64_t o;

                                if (za.za_first_integer == oldnext_obj) {
                                        /*
                                         * We've already moved the
                                         * origin's reference.
                                         */
                                        continue;
                                }

                                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &cnds));
                                /* dd_clones is keyed by the clone's head */
                                o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;

                                VERIFY3U(zap_remove_int(dp->dp_meta_objset,
                                    odd->dd_phys->dd_clones, o, tx), ==, 0);
                                VERIFY3U(zap_add_int(dp->dp_meta_objset,
                                    dd->dd_phys->dd_clones, o, tx), ==, 0);
                                dsl_dataset_rele(cnds, FTAG);
                        }
                        zap_cursor_fini(&zc);
                }

                ASSERT3U(dsl_prop_numcb(ds), ==, 0);
        }

        /*
         * Change space accounting.
         * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
         * both be valid, or both be 0 (resulting in delta == 0).  This
         * is true for each of {clone,origin} independently.
         */

        /* clone's dir: snapshot space grows, the rest is charged to head */
        delta = pa->cloneusedsnap -
            dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, >=, 0);
        ASSERT3U(pa->used, >=, delta);
        dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(dd, DD_USED_HEAD,
            pa->used - delta, pa->comp, pa->uncomp, tx);

        /* origin's dir: the mirror image, with a non-positive delta */
        delta = pa->originusedsnap -
            odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, <=, 0);
        ASSERT3U(pa->used, >=, -delta);
        dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(odd, DD_USED_HEAD,
            -pa->used - delta, -pa->comp, -pa->uncomp, tx);

        origin_ds->ds_phys->ds_unique_bytes = pa->unique;

        /* log history record */
        spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
            "dataset = %llu", hds->ds_object);

        dsl_dir_close(odd, FTAG);
}
2822
/* Tag under which snaplist_make() holds or owns the datasets it lists. */
static char *snaplist_tag = "snaplist";
2824 /*
2825  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2826  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2827  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2828  * snapshots back to this dataset's origin.
2829  */
2830 static int
2831 snaplist_make(dsl_pool_t *dp, boolean_t own,
2832     uint64_t first_obj, uint64_t last_obj, list_t *l)
2833 {
2834         uint64_t obj = last_obj;
2835
2836         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2837
2838         list_create(l, sizeof (struct promotenode),
2839             offsetof(struct promotenode, link));
2840
2841         while (obj != first_obj) {
2842                 dsl_dataset_t *ds;
2843                 struct promotenode *snap;
2844                 int err;
2845
2846                 if (own) {
2847                         err = dsl_dataset_own_obj(dp, obj,
2848                             0, snaplist_tag, &ds);
2849                         if (err == 0)
2850                                 dsl_dataset_make_exclusive(ds, snaplist_tag);
2851                 } else {
2852                         err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2853                 }
2854                 if (err == ENOENT) {
2855                         /* lost race with snapshot destroy */
2856                         struct promotenode *last = list_tail(l);
2857                         ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2858                         obj = last->ds->ds_phys->ds_prev_snap_obj;
2859                         continue;
2860                 } else if (err) {
2861                         return (err);
2862                 }
2863
2864                 if (first_obj == 0)
2865                         first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2866
2867                 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2868                 snap->ds = ds;
2869                 list_insert_tail(l, snap);
2870                 obj = ds->ds_phys->ds_prev_snap_obj;
2871         }
2872
2873         return (0);
2874 }
2875
2876 static int
2877 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2878 {
2879         struct promotenode *snap;
2880
2881         *spacep = 0;
2882         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2883                 uint64_t used, comp, uncomp;
2884                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2885                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
2886                 *spacep += used;
2887         }
2888         return (0);
2889 }
2890
2891 static void
2892 snaplist_destroy(list_t *l, boolean_t own)
2893 {
2894         struct promotenode *snap;
2895
2896         if (!l || !list_link_active(&l->list_head))
2897                 return;
2898
2899         while ((snap = list_tail(l)) != NULL) {
2900                 list_remove(l, snap);
2901                 if (own)
2902                         dsl_dataset_disown(snap->ds, snaplist_tag);
2903                 else
2904                         dsl_dataset_rele(snap->ds, snaplist_tag);
2905                 kmem_free(snap, sizeof (struct promotenode));
2906         }
2907         list_destroy(l);
2908 }
2909
2910 /*
2911  * Promote a clone.  Nomenclature note:
2912  * "clone" or "cds": the original clone which is being promoted
2913  * "origin" or "ods": the snapshot which is originally clone's origin
2914  * "origin head" or "ohds": the dataset which is the head
2915  * (filesystem/volume) for the origin
2916  * "origin origin": the origin of the origin's filesystem (typically
2917  * NULL, indicating that the clone is not a clone of a clone).
2918  */
2919 int
2920 dsl_dataset_promote(const char *name, char *conflsnap)
2921 {
2922         dsl_dataset_t *ds;
2923         dsl_dir_t *dd;
2924         dsl_pool_t *dp;
2925         dmu_object_info_t doi;
2926         struct promotearg pa = { 0 };
2927         struct promotenode *snap;
2928         int err;
2929
2930         err = dsl_dataset_hold(name, FTAG, &ds);
2931         if (err)
2932                 return (err);
2933         dd = ds->ds_dir;
2934         dp = dd->dd_pool;
2935
2936         err = dmu_object_info(dp->dp_meta_objset,
2937             ds->ds_phys->ds_snapnames_zapobj, &doi);
2938         if (err) {
2939                 dsl_dataset_rele(ds, FTAG);
2940                 return (err);
2941         }
2942
2943         if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
2944                 dsl_dataset_rele(ds, FTAG);
2945                 return (EINVAL);
2946         }
2947
2948         /*
2949          * We are going to inherit all the snapshots taken before our
2950          * origin (i.e., our new origin will be our parent's origin).
2951          * Take ownership of them so that we can rename them into our
2952          * namespace.
2953          */
2954         rw_enter(&dp->dp_config_rwlock, RW_READER);
2955
2956         err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
2957             &pa.shared_snaps);
2958         if (err != 0)
2959                 goto out;
2960
2961         err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
2962         if (err != 0)
2963                 goto out;
2964
2965         snap = list_head(&pa.shared_snaps);
2966         ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
2967         err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
2968             snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
2969         if (err != 0)
2970                 goto out;
2971
2972         if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
2973                 err = dsl_dataset_hold_obj(dp,
2974                     snap->ds->ds_dir->dd_phys->dd_origin_obj,
2975                     FTAG, &pa.origin_origin);
2976                 if (err != 0)
2977                         goto out;
2978         }
2979
2980 out:
2981         rw_exit(&dp->dp_config_rwlock);
2982
2983         /*
2984          * Add in 128x the snapnames zapobj size, since we will be moving
2985          * a bunch of snapnames to the promoted ds, and dirtying their
2986          * bonus buffers.
2987          */
2988         if (err == 0) {
2989                 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
2990                     dsl_dataset_promote_sync, ds, &pa,
2991                     2 + 2 * doi.doi_physical_blocks_512);
2992                 if (err && pa.err_ds && conflsnap)
2993                         (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
2994         }
2995
2996         snaplist_destroy(&pa.shared_snaps, B_TRUE);
2997         snaplist_destroy(&pa.clone_snaps, B_FALSE);
2998         snaplist_destroy(&pa.origin_snaps, B_FALSE);
2999         if (pa.origin_origin)
3000                 dsl_dataset_rele(pa.origin_origin, FTAG);
3001         dsl_dataset_rele(ds, FTAG);
3002         return (err);
3003 }
3004
/* Arguments shared by the clone-swap check and sync functions. */
struct cloneswaparg {
        dsl_dataset_t *cds; /* clone dataset */
        dsl_dataset_t *ohds; /* origin's head dataset */
        boolean_t force; /* skip the "ohds modified since last snapshot" test */
        int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
3011
3012 /* ARGSUSED */
3013 static int
3014 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3015 {
3016         struct cloneswaparg *csa = arg1;
3017
3018         /* they should both be heads */
3019         if (dsl_dataset_is_snapshot(csa->cds) ||
3020             dsl_dataset_is_snapshot(csa->ohds))
3021                 return (EINVAL);
3022
3023         /* the branch point should be just before them */
3024         if (csa->cds->ds_prev != csa->ohds->ds_prev)
3025                 return (EINVAL);
3026
3027         /* cds should be the clone (unless they are unrelated) */
3028         if (csa->cds->ds_prev != NULL &&
3029             csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3030             csa->ohds->ds_object !=
3031             csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3032                 return (EINVAL);
3033
3034         /* the clone should be a child of the origin */
3035         if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3036                 return (EINVAL);
3037
3038         /* ohds shouldn't be modified unless 'force' */
3039         if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3040                 return (ETXTBSY);
3041
3042         /* adjust amount of any unconsumed refreservation */
3043         csa->unused_refres_delta =
3044             (int64_t)MIN(csa->ohds->ds_reserved,
3045             csa->ohds->ds_phys->ds_unique_bytes) -
3046             (int64_t)MIN(csa->ohds->ds_reserved,
3047             csa->cds->ds_phys->ds_unique_bytes);
3048
3049         if (csa->unused_refres_delta > 0 &&
3050             csa->unused_refres_delta >
3051             dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3052                 return (ENOSPC);
3053
3054         if (csa->ohds->ds_quota != 0 &&
3055             csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3056                 return (EDQUOT);
3057
3058         return (0);
3059 }
3060
3061 /* ARGSUSED */
3062 static void
3063 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3064 {
3065         struct cloneswaparg *csa = arg1;
3066         dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3067
3068         ASSERT(csa->cds->ds_reserved == 0);
3069         ASSERT(csa->ohds->ds_quota == 0 ||
3070             csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3071
3072         dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3073         dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3074
3075         if (csa->cds->ds_objset != NULL) {
3076                 dmu_objset_evict(csa->cds->ds_objset);
3077                 csa->cds->ds_objset = NULL;
3078         }
3079
3080         if (csa->ohds->ds_objset != NULL) {
3081                 dmu_objset_evict(csa->ohds->ds_objset);
3082                 csa->ohds->ds_objset = NULL;
3083         }
3084
3085         /*
3086          * Reset origin's unique bytes, if it exists.
3087          */
3088         if (csa->cds->ds_prev) {
3089                 dsl_dataset_t *origin = csa->cds->ds_prev;
3090                 uint64_t comp, uncomp;
3091
3092                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3093                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3094                     origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3095                     &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3096         }
3097
3098         /* swap blkptrs */
3099         {
3100                 blkptr_t tmp;
3101                 tmp = csa->ohds->ds_phys->ds_bp;
3102                 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3103                 csa->cds->ds_phys->ds_bp = tmp;
3104         }
3105
3106         /* set dd_*_bytes */
3107         {
3108                 int64_t dused, dcomp, duncomp;
3109                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3110                 uint64_t odl_used, odl_comp, odl_uncomp;
3111
3112                 ASSERT3U(csa->cds->ds_dir->dd_phys->
3113                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3114
3115                 dsl_deadlist_space(&csa->cds->ds_deadlist,
3116                     &cdl_used, &cdl_comp, &cdl_uncomp);
3117                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3118                     &odl_used, &odl_comp, &odl_uncomp);
3119
3120                 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
3121                     (csa->ohds->ds_phys->ds_used_bytes + odl_used);
3122                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3123                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3124                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3125                     cdl_uncomp -
3126                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3127
3128                 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3129                     dused, dcomp, duncomp, tx);
3130                 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3131                     -dused, -dcomp, -duncomp, tx);
3132
3133                 /*
3134                  * The difference in the space used by snapshots is the
3135                  * difference in snapshot space due to the head's
3136                  * deadlist (since that's the only thing that's
3137                  * changing that affects the snapused).
3138                  */
3139                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3140                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3141                     &cdl_used, &cdl_comp, &cdl_uncomp);
3142                 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3143                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3144                     &odl_used, &odl_comp, &odl_uncomp);
3145                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3146                     DD_USED_HEAD, DD_USED_SNAP, tx);
3147         }
3148
3149         /* swap ds_*_bytes */
3150         SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
3151             csa->cds->ds_phys->ds_used_bytes);
3152         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3153             csa->cds->ds_phys->ds_compressed_bytes);
3154         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3155             csa->cds->ds_phys->ds_uncompressed_bytes);
3156         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3157             csa->cds->ds_phys->ds_unique_bytes);
3158
3159         /* apply any parent delta for change in unconsumed refreservation */
3160         dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3161             csa->unused_refres_delta, 0, 0, tx);
3162
3163         /*
3164          * Swap deadlists.
3165          */
3166         dsl_deadlist_close(&csa->cds->ds_deadlist);
3167         dsl_deadlist_close(&csa->ohds->ds_deadlist);
3168         SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3169             csa->cds->ds_phys->ds_deadlist_obj);
3170         dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3171             csa->cds->ds_phys->ds_deadlist_obj);
3172         dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3173             csa->ohds->ds_phys->ds_deadlist_obj);
3174
3175         dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3176 }
3177
3178 /*
3179  * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
3180  * recv" into an existing fs to swizzle the file system to the new
3181  * version, and by "zfs rollback".  Can also be used to swap two
3182  * independent head datasets if neither has any snapshots.
3183  */
3184 int
3185 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3186     boolean_t force)
3187 {
3188         struct cloneswaparg csa;
3189         int error;
3190
3191         ASSERT(clone->ds_owner);
3192         ASSERT(origin_head->ds_owner);
3193 retry:
3194         /*
3195          * Need exclusive access for the swap. If we're swapping these
3196          * datasets back after an error, we already hold the locks.
3197          */
3198         if (!RW_WRITE_HELD(&clone->ds_rwlock))
3199                 rw_enter(&clone->ds_rwlock, RW_WRITER);
3200         if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3201             !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3202                 rw_exit(&clone->ds_rwlock);
3203                 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3204                 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3205                         rw_exit(&origin_head->ds_rwlock);
3206                         goto retry;
3207                 }
3208         }
3209         csa.cds = clone;
3210         csa.ohds = origin_head;
3211         csa.force = force;
3212         error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3213             dsl_dataset_clone_swap_check,
3214             dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3215         return (error);
3216 }
3217
3218 /*
3219  * Given a pool name and a dataset object number in that pool,
3220  * return the name of that dataset.
3221  */
3222 int
3223 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3224 {
3225         spa_t *spa;
3226         dsl_pool_t *dp;
3227         dsl_dataset_t *ds;
3228         int error;
3229
3230         if ((error = spa_open(pname, &spa, FTAG)) != 0)
3231                 return (error);
3232         dp = spa_get_dsl(spa);
3233         rw_enter(&dp->dp_config_rwlock, RW_READER);
3234         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3235                 dsl_dataset_name(ds, buf);
3236                 dsl_dataset_rele(ds, FTAG);
3237         }
3238         rw_exit(&dp->dp_config_rwlock);
3239         spa_close(spa, FTAG);
3240
3241         return (error);
3242 }
3243
3244 int
3245 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3246     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3247 {
3248         int error = 0;
3249
3250         ASSERT3S(asize, >, 0);
3251
3252         /*
3253          * *ref_rsrv is the portion of asize that will come from any
3254          * unconsumed refreservation space.
3255          */
3256         *ref_rsrv = 0;
3257
3258         mutex_enter(&ds->ds_lock);
3259         /*
3260          * Make a space adjustment for reserved bytes.
3261          */
3262         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3263                 ASSERT3U(*used, >=,
3264                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3265                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3266                 *ref_rsrv =
3267                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3268         }
3269
3270         if (!check_quota || ds->ds_quota == 0) {
3271                 mutex_exit(&ds->ds_lock);
3272                 return (0);
3273         }
3274         /*
3275          * If they are requesting more space, and our current estimate
3276          * is over quota, they get to try again unless the actual
3277          * on-disk is over quota and there are no pending changes (which
3278          * may free up space for us).
3279          */
3280         if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
3281                 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
3282                         error = ERESTART;
3283                 else
3284                         error = EDQUOT;
3285         }
3286         mutex_exit(&ds->ds_lock);
3287
3288         return (error);
3289 }
3290
3291 /* ARGSUSED */
3292 static int
3293 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3294 {
3295         dsl_dataset_t *ds = arg1;
3296         dsl_prop_setarg_t *psa = arg2;
3297         int err;
3298
3299         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3300                 return (ENOTSUP);
3301
3302         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3303                 return (err);
3304
3305         if (psa->psa_effective_value == 0)
3306                 return (0);
3307
3308         if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
3309             psa->psa_effective_value < ds->ds_reserved)
3310                 return (ENOSPC);
3311
3312         return (0);
3313 }
3314
3315 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3316
3317 void
3318 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3319 {
3320         dsl_dataset_t *ds = arg1;
3321         dsl_prop_setarg_t *psa = arg2;
3322         uint64_t effective_value = psa->psa_effective_value;
3323
3324         dsl_prop_set_sync(ds, psa, tx);
3325         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3326
3327         if (ds->ds_quota != effective_value) {
3328                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3329                 ds->ds_quota = effective_value;
3330
3331                 spa_history_log_internal(LOG_DS_REFQUOTA,
3332                     ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
3333                     (longlong_t)ds->ds_quota, ds->ds_object);
3334         }
3335 }
3336
3337 int
3338 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3339 {
3340         dsl_dataset_t *ds;
3341         dsl_prop_setarg_t psa;
3342         int err;
3343
3344         dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3345
3346         err = dsl_dataset_hold(dsname, FTAG, &ds);
3347         if (err)
3348                 return (err);
3349
3350         /*
3351          * If someone removes a file, then tries to set the quota, we
3352          * want to make sure the file freeing takes effect.
3353          */
3354         txg_wait_open(ds->ds_dir->dd_pool, 0);
3355
3356         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3357             dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3358             ds, &psa, 0);
3359
3360         dsl_dataset_rele(ds, FTAG);
3361         return (err);
3362 }
3363
3364 static int
3365 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3366 {
3367         dsl_dataset_t *ds = arg1;
3368         dsl_prop_setarg_t *psa = arg2;
3369         uint64_t effective_value;
3370         uint64_t unique;
3371         int err;
3372
3373         if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3374             SPA_VERSION_REFRESERVATION)
3375                 return (ENOTSUP);
3376
3377         if (dsl_dataset_is_snapshot(ds))
3378                 return (EINVAL);
3379
3380         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3381                 return (err);
3382
3383         effective_value = psa->psa_effective_value;
3384
3385         /*
3386          * If we are doing the preliminary check in open context, the
3387          * space estimates may be inaccurate.
3388          */
3389         if (!dmu_tx_is_syncing(tx))
3390                 return (0);
3391
3392         mutex_enter(&ds->ds_lock);
3393         if (!DS_UNIQUE_IS_ACCURATE(ds))
3394                 dsl_dataset_recalc_head_uniq(ds);
3395         unique = ds->ds_phys->ds_unique_bytes;
3396         mutex_exit(&ds->ds_lock);
3397
3398         if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3399                 uint64_t delta = MAX(unique, effective_value) -
3400                     MAX(unique, ds->ds_reserved);
3401
3402                 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3403                         return (ENOSPC);
3404                 if (ds->ds_quota > 0 &&
3405                     effective_value > ds->ds_quota)
3406                         return (ENOSPC);
3407         }
3408
3409         return (0);
3410 }
3411
3412 static void
3413 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3414 {
3415         dsl_dataset_t *ds = arg1;
3416         dsl_prop_setarg_t *psa = arg2;
3417         uint64_t effective_value = psa->psa_effective_value;
3418         uint64_t unique;
3419         int64_t delta;
3420
3421         dsl_prop_set_sync(ds, psa, tx);
3422         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3423
3424         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3425
3426         mutex_enter(&ds->ds_dir->dd_lock);
3427         mutex_enter(&ds->ds_lock);
3428         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3429         unique = ds->ds_phys->ds_unique_bytes;
3430         delta = MAX(0, (int64_t)(effective_value - unique)) -
3431             MAX(0, (int64_t)(ds->ds_reserved - unique));
3432         ds->ds_reserved = effective_value;
3433         mutex_exit(&ds->ds_lock);
3434
3435         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3436         mutex_exit(&ds->ds_dir->dd_lock);
3437
3438         spa_history_log_internal(LOG_DS_REFRESERV,
3439             ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
3440             (longlong_t)effective_value, ds->ds_object);
3441 }
3442
3443 int
3444 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3445     uint64_t reservation)
3446 {
3447         dsl_dataset_t *ds;
3448         dsl_prop_setarg_t psa;
3449         int err;
3450
3451         dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3452             &reservation);
3453
3454         err = dsl_dataset_hold(dsname, FTAG, &ds);
3455         if (err)
3456                 return (err);
3457
3458         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3459             dsl_dataset_set_reservation_check,
3460             dsl_dataset_set_reservation_sync, ds, &psa, 0);
3461
3462         dsl_dataset_rele(ds, FTAG);
3463         return (err);
3464 }
3465
3466 typedef struct zfs_hold_cleanup_arg {
3467         dsl_pool_t *dp;
3468         uint64_t dsobj;
3469         char htag[MAXNAMELEN];
3470 } zfs_hold_cleanup_arg_t;
3471
3472 static void
3473 dsl_dataset_user_release_onexit(void *arg)
3474 {
3475         zfs_hold_cleanup_arg_t *ca = arg;
3476
3477         (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3478             B_TRUE);
3479         kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3480 }
3481
3482 void
3483 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3484     minor_t minor)
3485 {
3486         zfs_hold_cleanup_arg_t *ca;
3487
3488         ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3489         ca->dp = ds->ds_dir->dd_pool;
3490         ca->dsobj = ds->ds_object;
3491         (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3492         VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3493             dsl_dataset_user_release_onexit, ca, NULL));
3494 }
3495
3496 /*
3497  * If you add new checks here, you may need to add
3498  * additional checks to the "temporary" case in
3499  * snapshot_check() in dmu_objset.c.
3500  */
3501 static int
3502 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3503 {
3504         dsl_dataset_t *ds = arg1;
3505         struct dsl_ds_holdarg *ha = arg2;
3506         char *htag = ha->htag;
3507         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3508         int error = 0;
3509
3510         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3511                 return (ENOTSUP);
3512
3513         if (!dsl_dataset_is_snapshot(ds))
3514                 return (EINVAL);
3515
3516         /* tags must be unique */
3517         mutex_enter(&ds->ds_lock);
3518         if (ds->ds_phys->ds_userrefs_obj) {
3519                 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3520                     8, 1, tx);
3521                 if (error == 0)
3522                         error = EEXIST;
3523                 else if (error == ENOENT)
3524                         error = 0;
3525         }
3526         mutex_exit(&ds->ds_lock);
3527
3528         if (error == 0 && ha->temphold &&
3529             strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3530                 error = E2BIG;
3531
3532         return (error);
3533 }
3534
3535 void
3536 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3537 {
3538         dsl_dataset_t *ds = arg1;
3539         struct dsl_ds_holdarg *ha = arg2;
3540         char *htag = ha->htag;
3541         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3542         objset_t *mos = dp->dp_meta_objset;
3543         uint64_t now = gethrestime_sec();
3544         uint64_t zapobj;
3545
3546         mutex_enter(&ds->ds_lock);
3547         if (ds->ds_phys->ds_userrefs_obj == 0) {
3548                 /*
3549                  * This is the first user hold for this dataset.  Create
3550                  * the userrefs zap object.
3551                  */
3552                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3553                 zapobj = ds->ds_phys->ds_userrefs_obj =
3554                     zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3555         } else {
3556                 zapobj = ds->ds_phys->ds_userrefs_obj;
3557         }
3558         ds->ds_userrefs++;
3559         mutex_exit(&ds->ds_lock);
3560
3561         VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3562
3563         if (ha->temphold) {
3564                 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3565                     htag, &now, tx));
3566         }
3567
3568         spa_history_log_internal(LOG_DS_USER_HOLD,
3569             dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
3570             (int)ha->temphold, ds->ds_object);
3571 }
3572
3573 static int
3574 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3575 {
3576         struct dsl_ds_holdarg *ha = arg;
3577         dsl_dataset_t *ds;
3578         int error;
3579         char *name;
3580
3581         /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3582         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3583         error = dsl_dataset_hold(name, ha->dstg, &ds);
3584         strfree(name);
3585         if (error == 0) {
3586                 ha->gotone = B_TRUE;
3587                 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3588                     dsl_dataset_user_hold_sync, ds, ha, 0);
3589         } else if (error == ENOENT && ha->recursive) {
3590                 error = 0;
3591         } else {
3592                 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3593         }
3594         return (error);
3595 }
3596
3597 int
3598 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3599     boolean_t temphold)
3600 {
3601         struct dsl_ds_holdarg *ha;
3602         int error;
3603
3604         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3605         ha->htag = htag;
3606         ha->temphold = temphold;
3607         error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3608             dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3609             ds, ha, 0);
3610         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3611
3612         return (error);
3613 }
3614
3615 int
3616 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3617     boolean_t recursive, boolean_t temphold, int cleanup_fd)
3618 {
3619         struct dsl_ds_holdarg *ha;
3620         dsl_sync_task_t *dst;
3621         spa_t *spa;
3622         int error;
3623         minor_t minor = 0;
3624
3625         if (cleanup_fd != -1) {
3626                 /* Currently we only support cleanup-on-exit of tempholds. */
3627                 if (!temphold)
3628                         return (EINVAL);
3629                 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3630                 if (error)
3631                         return (error);
3632         }
3633
3634         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3635
3636         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3637
3638         error = spa_open(dsname, &spa, FTAG);
3639         if (error) {
3640                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3641                 if (cleanup_fd != -1)
3642                         zfs_onexit_fd_rele(cleanup_fd);
3643                 return (error);
3644         }
3645
3646         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3647         ha->htag = htag;
3648         ha->snapname = snapname;
3649         ha->recursive = recursive;
3650         ha->temphold = temphold;
3651
3652         if (recursive) {
3653                 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3654                     ha, DS_FIND_CHILDREN);
3655         } else {
3656                 error = dsl_dataset_user_hold_one(dsname, ha);
3657         }
3658         if (error == 0)
3659                 error = dsl_sync_task_group_wait(ha->dstg);
3660
3661         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3662             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3663                 dsl_dataset_t *ds = dst->dst_arg1;
3664
3665                 if (dst->dst_err) {
3666                         dsl_dataset_name(ds, ha->failed);
3667                         *strchr(ha->failed, '@') = '\0';
3668                 } else if (error == 0 && minor != 0 && temphold) {
3669                         /*
3670                          * If this hold is to be released upon process exit,
3671                          * register that action now.
3672                          */
3673                         dsl_register_onexit_hold_cleanup(ds, htag, minor);
3674                 }
3675                 dsl_dataset_rele(ds, ha->dstg);
3676         }
3677
3678         if (error == 0 && recursive && !ha->gotone)
3679                 error = ENOENT;
3680
3681         if (error)
3682                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3683
3684         dsl_sync_task_group_destroy(ha->dstg);
3685
3686         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3687         spa_close(spa, FTAG);
3688         if (cleanup_fd != -1)
3689                 zfs_onexit_fd_rele(cleanup_fd);
3690         return (error);
3691 }
3692
3693 struct dsl_ds_releasearg {
3694         dsl_dataset_t *ds;
3695         const char *htag;
3696         boolean_t own;          /* do we own or just hold ds? */
3697 };
3698
3699 static int
3700 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3701     boolean_t *might_destroy)
3702 {
3703         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3704         uint64_t zapobj;
3705         uint64_t tmp;
3706         int error;
3707
3708         *might_destroy = B_FALSE;
3709
3710         mutex_enter(&ds->ds_lock);
3711         zapobj = ds->ds_phys->ds_userrefs_obj;
3712         if (zapobj == 0) {
3713                 /* The tag can't possibly exist */
3714                 mutex_exit(&ds->ds_lock);
3715                 return (ESRCH);
3716         }
3717
3718         /* Make sure the tag exists */
3719         error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3720         if (error) {
3721                 mutex_exit(&ds->ds_lock);
3722                 if (error == ENOENT)
3723                         error = ESRCH;
3724                 return (error);
3725         }
3726
3727         if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3728             DS_IS_DEFER_DESTROY(ds))
3729                 *might_destroy = B_TRUE;
3730
3731         mutex_exit(&ds->ds_lock);
3732         return (0);
3733 }
3734
3735 static int
3736 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3737 {
3738         struct dsl_ds_releasearg *ra = arg1;
3739         dsl_dataset_t *ds = ra->ds;
3740         boolean_t might_destroy;
3741         int error;
3742
3743         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3744                 return (ENOTSUP);
3745
3746         error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3747         if (error)
3748                 return (error);
3749
3750         if (might_destroy) {
3751                 struct dsl_ds_destroyarg dsda = {0};
3752
3753                 if (dmu_tx_is_syncing(tx)) {
3754                         /*
3755                          * If we're not prepared to remove the snapshot,
3756                          * we can't allow the release to happen right now.
3757                          */
3758                         if (!ra->own)
3759                                 return (EBUSY);
3760                 }
3761                 dsda.ds = ds;
3762                 dsda.releasing = B_TRUE;
3763                 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3764         }
3765
3766         return (0);
3767 }
3768
3769 static void
3770 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3771 {
3772         struct dsl_ds_releasearg *ra = arg1;
3773         dsl_dataset_t *ds = ra->ds;
3774         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3775         objset_t *mos = dp->dp_meta_objset;
3776         uint64_t zapobj;
3777         uint64_t dsobj = ds->ds_object;
3778         uint64_t refs;
3779         int error;
3780
3781         mutex_enter(&ds->ds_lock);
3782         ds->ds_userrefs--;
3783         refs = ds->ds_userrefs;
3784         mutex_exit(&ds->ds_lock);
3785         error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3786         VERIFY(error == 0 || error == ENOENT);
3787         zapobj = ds->ds_phys->ds_userrefs_obj;
3788         VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3789         if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3790             DS_IS_DEFER_DESTROY(ds)) {
3791                 struct dsl_ds_destroyarg dsda = {0};
3792
3793                 ASSERT(ra->own);
3794                 dsda.ds = ds;
3795                 dsda.releasing = B_TRUE;
3796                 /* We already did the destroy_check */
3797                 dsl_dataset_destroy_sync(&dsda, tag, tx);
3798         }
3799
3800         spa_history_log_internal(LOG_DS_USER_RELEASE,
3801             dp->dp_spa, tx, "<%s> %lld dataset = %llu",
3802             ra->htag, (longlong_t)refs, dsobj);
3803 }
3804
3805 static int
3806 dsl_dataset_user_release_one(const char *dsname, void *arg)
3807 {
3808         struct dsl_ds_holdarg *ha = arg;
3809         struct dsl_ds_releasearg *ra;
3810         dsl_dataset_t *ds;
3811         int error;
3812         void *dtag = ha->dstg;
3813         char *name;
3814         boolean_t own = B_FALSE;
3815         boolean_t might_destroy;
3816
3817         /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3818         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3819         error = dsl_dataset_hold(name, dtag, &ds);
3820         strfree(name);
3821         if (error == ENOENT && ha->recursive)
3822                 return (0);
3823         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3824         if (error)
3825                 return (error);
3826
3827         ha->gotone = B_TRUE;
3828
3829         ASSERT(dsl_dataset_is_snapshot(ds));
3830
3831         error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3832         if (error) {
3833                 dsl_dataset_rele(ds, dtag);
3834                 return (error);
3835         }
3836
3837         if (might_destroy) {
3838 #ifdef _KERNEL
3839                 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3840                 error = zfs_unmount_snap(name, NULL);
3841                 strfree(name);
3842                 if (error) {
3843                         dsl_dataset_rele(ds, dtag);
3844                         return (error);
3845                 }
3846 #endif
3847                 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3848                         dsl_dataset_rele(ds, dtag);
3849                         return (EBUSY);
3850                 } else {
3851                         own = B_TRUE;
3852                         dsl_dataset_make_exclusive(ds, dtag);
3853                 }
3854         }
3855
3856         ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3857         ra->ds = ds;
3858         ra->htag = ha->htag;
3859         ra->own = own;
3860         dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3861             dsl_dataset_user_release_sync, ra, dtag, 0);
3862
3863         return (0);
3864 }
3865
3866 int
3867 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3868     boolean_t recursive)
3869 {
3870         struct dsl_ds_holdarg *ha;
3871         dsl_sync_task_t *dst;
3872         spa_t *spa;
3873         int error;
3874
3875 top:
3876         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3877
3878         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3879
3880         error = spa_open(dsname, &spa, FTAG);
3881         if (error) {
3882                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3883                 return (error);
3884         }
3885
3886         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3887         ha->htag = htag;
3888         ha->snapname = snapname;
3889         ha->recursive = recursive;
3890         if (recursive) {
3891                 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
3892                     ha, DS_FIND_CHILDREN);
3893         } else {
3894                 error = dsl_dataset_user_release_one(dsname, ha);
3895         }
3896         if (error == 0)
3897                 error = dsl_sync_task_group_wait(ha->dstg);
3898
3899         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3900             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3901                 struct dsl_ds_releasearg *ra = dst->dst_arg1;
3902                 dsl_dataset_t *ds = ra->ds;
3903
3904                 if (dst->dst_err)
3905                         dsl_dataset_name(ds, ha->failed);
3906
3907                 if (ra->own)
3908                         dsl_dataset_disown(ds, ha->dstg);
3909                 else
3910                         dsl_dataset_rele(ds, ha->dstg);
3911
3912                 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
3913         }
3914
3915         if (error == 0 && recursive && !ha->gotone)
3916                 error = ENOENT;
3917
3918         if (error && error != EBUSY)
3919                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3920
3921         dsl_sync_task_group_destroy(ha->dstg);
3922         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3923         spa_close(spa, FTAG);
3924
3925         /*
3926          * We can get EBUSY if we were racing with deferred destroy and
3927          * dsl_dataset_user_release_check() hadn't done the necessary
3928          * open context setup.  We can also get EBUSY if we're racing
3929          * with destroy and that thread is the ds_owner.  Either way
3930          * the busy condition should be transient, and we should retry
3931          * the release operation.
3932          */
3933         if (error == EBUSY)
3934                 goto top;
3935
3936         return (error);
3937 }
3938
3939 /*
3940  * Called at spa_load time (with retry == B_FALSE) to release a stale
3941  * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
3942  */
3943 int
3944 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
3945     boolean_t retry)
3946 {
3947         dsl_dataset_t *ds;
3948         char *snap;
3949         char *name;
3950         int namelen;
3951         int error;
3952
3953         do {
3954                 rw_enter(&dp->dp_config_rwlock, RW_READER);
3955                 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
3956                 rw_exit(&dp->dp_config_rwlock);
3957                 if (error)
3958                         return (error);
3959                 namelen = dsl_dataset_namelen(ds)+1;
3960                 name = kmem_alloc(namelen, KM_SLEEP);
3961                 dsl_dataset_name(ds, name);
3962                 dsl_dataset_rele(ds, FTAG);
3963
3964                 snap = strchr(name, '@');
3965                 *snap = '\0';
3966                 ++snap;
3967                 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
3968                 kmem_free(name, namelen);
3969
3970                 /*
3971                  * The object can't have been destroyed because we have a hold,
3972                  * but it might have been renamed, resulting in ENOENT.  Retry
3973                  * if we've been requested to do so.
3974                  *
3975                  * It would be nice if we could use the dsobj all the way
3976                  * through and avoid ENOENT entirely.  But we might need to
3977                  * unmount the snapshot, and there's currently no way to lookup
3978                  * a vfsp using a ZFS object id.
3979                  */
3980         } while ((error == ENOENT) && retry);
3981
3982         return (error);
3983 }
3984
3985 int
3986 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
3987 {
3988         dsl_dataset_t *ds;
3989         int err;
3990
3991         err = dsl_dataset_hold(dsname, FTAG, &ds);
3992         if (err)
3993                 return (err);
3994
3995         VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
3996         if (ds->ds_phys->ds_userrefs_obj != 0) {
3997                 zap_attribute_t *za;
3998                 zap_cursor_t zc;
3999
4000                 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4001                 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4002                     ds->ds_phys->ds_userrefs_obj);
4003                     zap_cursor_retrieve(&zc, za) == 0;
4004                     zap_cursor_advance(&zc)) {
4005                         VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4006                             za->za_first_integer));
4007                 }
4008                 zap_cursor_fini(&zc);
4009                 kmem_free(za, sizeof (zap_attribute_t));
4010         }
4011         dsl_dataset_rele(ds, FTAG);
4012         return (0);
4013 }
4014
4015 /*
4016  * Note, this fuction is used as the callback for dmu_objset_find().  We
4017  * always return 0 so that we will continue to find and process
4018  * inconsistent datasets, even if we encounter an error trying to
4019  * process one of them.
4020  */
4021 /* ARGSUSED */
4022 int
4023 dsl_destroy_inconsistent(const char *dsname, void *arg)
4024 {
4025         dsl_dataset_t *ds;
4026
4027         if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4028                 if (DS_IS_INCONSISTENT(ds))
4029                         (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4030                 else
4031                         dsl_dataset_disown(ds, FTAG);
4032         }
4033         return (0);
4034 }