4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/dmu_objset.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_dir.h>
29 #include <sys/dsl_prop.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dmu_traverse.h>
32 #include <sys/dmu_tx.h>
36 #include <sys/unique.h>
37 #include <sys/zfs_context.h>
38 #include <sys/zfs_ioctl.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/sunddi.h>
44 static char *dsl_reaper
= "the grim reaper";
46 static dsl_checkfunc_t dsl_dataset_destroy_begin_check
;
47 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync
;
48 static dsl_checkfunc_t dsl_dataset_rollback_check
;
49 static dsl_syncfunc_t dsl_dataset_rollback_sync
;
50 static dsl_syncfunc_t dsl_dataset_set_reservation_sync
;
52 #define DS_REF_MAX (1ULL << 62)
54 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
56 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
60 * Figure out how much of this delta should be propogated to the dsl_dir
61 * layer. If there's a refreservation, that space has already been
62 * partially accounted for in our ancestors.
65 parent_delta(dsl_dataset_t
*ds
, int64_t delta
)
67 uint64_t old_bytes
, new_bytes
;
69 if (ds
->ds_reserved
== 0)
72 old_bytes
= MAX(ds
->ds_phys
->ds_unique_bytes
, ds
->ds_reserved
);
73 new_bytes
= MAX(ds
->ds_phys
->ds_unique_bytes
+ delta
, ds
->ds_reserved
);
75 ASSERT3U(ABS((int64_t)(new_bytes
- old_bytes
)), <=, ABS(delta
));
76 return (new_bytes
- old_bytes
);
80 dsl_dataset_block_born(dsl_dataset_t
*ds
, blkptr_t
*bp
, dmu_tx_t
*tx
)
82 int used
= bp_get_dasize(tx
->tx_pool
->dp_spa
, bp
);
83 int compressed
= BP_GET_PSIZE(bp
);
84 int uncompressed
= BP_GET_UCSIZE(bp
);
87 dprintf_bp(bp
, "born, ds=%p\n", ds
);
89 ASSERT(dmu_tx_is_syncing(tx
));
90 /* It could have been compressed away to nothing */
93 ASSERT(BP_GET_TYPE(bp
) != DMU_OT_NONE
);
94 ASSERT3U(BP_GET_TYPE(bp
), <, DMU_OT_NUMTYPES
);
97 * Account for the meta-objset space in its placeholder
100 ASSERT3U(compressed
, ==, uncompressed
); /* it's all metadata */
101 dsl_dir_diduse_space(tx
->tx_pool
->dp_mos_dir
, DD_USED_HEAD
,
102 used
, compressed
, uncompressed
, tx
);
103 dsl_dir_dirty(tx
->tx_pool
->dp_mos_dir
, tx
);
106 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
107 mutex_enter(&ds
->ds_dir
->dd_lock
);
108 mutex_enter(&ds
->ds_lock
);
109 delta
= parent_delta(ds
, used
);
110 ds
->ds_phys
->ds_used_bytes
+= used
;
111 ds
->ds_phys
->ds_compressed_bytes
+= compressed
;
112 ds
->ds_phys
->ds_uncompressed_bytes
+= uncompressed
;
113 ds
->ds_phys
->ds_unique_bytes
+= used
;
114 mutex_exit(&ds
->ds_lock
);
115 dsl_dir_diduse_space(ds
->ds_dir
, DD_USED_HEAD
, delta
,
116 compressed
, uncompressed
, tx
);
117 dsl_dir_transfer_space(ds
->ds_dir
, used
- delta
,
118 DD_USED_REFRSRV
, DD_USED_HEAD
, tx
);
119 mutex_exit(&ds
->ds_dir
->dd_lock
);
123 dsl_dataset_block_kill(dsl_dataset_t
*ds
, blkptr_t
*bp
, zio_t
*pio
,
126 int used
= bp_get_dasize(tx
->tx_pool
->dp_spa
, bp
);
127 int compressed
= BP_GET_PSIZE(bp
);
128 int uncompressed
= BP_GET_UCSIZE(bp
);
131 ASSERT(dmu_tx_is_syncing(tx
));
132 /* No block pointer => nothing to free */
140 * Account for the meta-objset space in its placeholder
143 err
= dsl_free(pio
, tx
->tx_pool
,
144 tx
->tx_txg
, bp
, NULL
, NULL
, ARC_NOWAIT
);
147 dsl_dir_diduse_space(tx
->tx_pool
->dp_mos_dir
, DD_USED_HEAD
,
148 -used
, -compressed
, -uncompressed
, tx
);
149 dsl_dir_dirty(tx
->tx_pool
->dp_mos_dir
, tx
);
152 ASSERT3P(tx
->tx_pool
, ==, ds
->ds_dir
->dd_pool
);
154 ASSERT(!dsl_dataset_is_snapshot(ds
));
155 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
157 if (bp
->blk_birth
> ds
->ds_phys
->ds_prev_snap_txg
) {
161 dprintf_bp(bp
, "freeing: %s", "");
162 err
= dsl_free(pio
, tx
->tx_pool
,
163 tx
->tx_txg
, bp
, NULL
, NULL
, ARC_NOWAIT
);
166 mutex_enter(&ds
->ds_dir
->dd_lock
);
167 mutex_enter(&ds
->ds_lock
);
168 ASSERT(ds
->ds_phys
->ds_unique_bytes
>= used
||
169 !DS_UNIQUE_IS_ACCURATE(ds
));
170 delta
= parent_delta(ds
, -used
);
171 ds
->ds_phys
->ds_unique_bytes
-= used
;
172 mutex_exit(&ds
->ds_lock
);
173 dsl_dir_diduse_space(ds
->ds_dir
, DD_USED_HEAD
,
174 delta
, -compressed
, -uncompressed
, tx
);
175 dsl_dir_transfer_space(ds
->ds_dir
, -used
- delta
,
176 DD_USED_REFRSRV
, DD_USED_HEAD
, tx
);
177 mutex_exit(&ds
->ds_dir
->dd_lock
);
179 dprintf_bp(bp
, "putting on dead list: %s", "");
180 VERIFY(0 == bplist_enqueue(&ds
->ds_deadlist
, bp
, tx
));
181 ASSERT3U(ds
->ds_prev
->ds_object
, ==,
182 ds
->ds_phys
->ds_prev_snap_obj
);
183 ASSERT(ds
->ds_prev
->ds_phys
->ds_num_children
> 0);
184 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
185 if (ds
->ds_prev
->ds_phys
->ds_next_snap_obj
==
186 ds
->ds_object
&& bp
->blk_birth
>
187 ds
->ds_prev
->ds_phys
->ds_prev_snap_txg
) {
188 dmu_buf_will_dirty(ds
->ds_prev
->ds_dbuf
, tx
);
189 mutex_enter(&ds
->ds_prev
->ds_lock
);
190 ds
->ds_prev
->ds_phys
->ds_unique_bytes
+= used
;
191 mutex_exit(&ds
->ds_prev
->ds_lock
);
193 if (bp
->blk_birth
> ds
->ds_origin_txg
) {
194 dsl_dir_transfer_space(ds
->ds_dir
, used
,
195 DD_USED_HEAD
, DD_USED_SNAP
, tx
);
198 mutex_enter(&ds
->ds_lock
);
199 ASSERT3U(ds
->ds_phys
->ds_used_bytes
, >=, used
);
200 ds
->ds_phys
->ds_used_bytes
-= used
;
201 ASSERT3U(ds
->ds_phys
->ds_compressed_bytes
, >=, compressed
);
202 ds
->ds_phys
->ds_compressed_bytes
-= compressed
;
203 ASSERT3U(ds
->ds_phys
->ds_uncompressed_bytes
, >=, uncompressed
);
204 ds
->ds_phys
->ds_uncompressed_bytes
-= uncompressed
;
205 mutex_exit(&ds
->ds_lock
);
211 dsl_dataset_prev_snap_txg(dsl_dataset_t
*ds
)
213 uint64_t trysnap
= 0;
218 * The snapshot creation could fail, but that would cause an
219 * incorrect FALSE return, which would only result in an
220 * overestimation of the amount of space that an operation would
221 * consume, which is OK.
223 * There's also a small window where we could miss a pending
224 * snapshot, because we could set the sync task in the quiescing
225 * phase. So this should only be used as a guess.
227 if (ds
->ds_trysnap_txg
>
228 spa_last_synced_txg(ds
->ds_dir
->dd_pool
->dp_spa
))
229 trysnap
= ds
->ds_trysnap_txg
;
230 return (MAX(ds
->ds_phys
->ds_prev_snap_txg
, trysnap
));
234 dsl_dataset_block_freeable(dsl_dataset_t
*ds
, uint64_t blk_birth
)
236 return (blk_birth
> dsl_dataset_prev_snap_txg(ds
));
241 dsl_dataset_evict(dmu_buf_t
*db
, void *dsv
)
243 dsl_dataset_t
*ds
= dsv
;
245 ASSERT(ds
->ds_owner
== NULL
|| DSL_DATASET_IS_DESTROYED(ds
));
247 dprintf_ds(ds
, "evicting %s\n", "");
249 unique_remove(ds
->ds_fsid_guid
);
251 if (ds
->ds_user_ptr
!= NULL
)
252 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
255 dsl_dataset_drop_ref(ds
->ds_prev
, ds
);
259 bplist_close(&ds
->ds_deadlist
);
261 dsl_dir_close(ds
->ds_dir
, ds
);
263 ASSERT(!list_link_active(&ds
->ds_synced_link
));
265 mutex_destroy(&ds
->ds_lock
);
266 mutex_destroy(&ds
->ds_recvlock
);
267 mutex_destroy(&ds
->ds_opening_lock
);
268 mutex_destroy(&ds
->ds_deadlist
.bpl_lock
);
269 rw_destroy(&ds
->ds_rwlock
);
270 cv_destroy(&ds
->ds_exclusive_cv
);
272 kmem_free(ds
, sizeof (dsl_dataset_t
));
276 dsl_dataset_get_snapname(dsl_dataset_t
*ds
)
278 dsl_dataset_phys_t
*headphys
;
281 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
282 objset_t
*mos
= dp
->dp_meta_objset
;
284 if (ds
->ds_snapname
[0])
286 if (ds
->ds_phys
->ds_next_snap_obj
== 0)
289 err
= dmu_bonus_hold(mos
, ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
,
293 headphys
= headdbuf
->db_data
;
294 err
= zap_value_search(dp
->dp_meta_objset
,
295 headphys
->ds_snapnames_zapobj
, ds
->ds_object
, 0, ds
->ds_snapname
);
296 dmu_buf_rele(headdbuf
, FTAG
);
301 dsl_dataset_snap_lookup(dsl_dataset_t
*ds
, const char *name
, uint64_t *value
)
303 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
304 uint64_t snapobj
= ds
->ds_phys
->ds_snapnames_zapobj
;
308 if (ds
->ds_phys
->ds_flags
& DS_FLAG_CI_DATASET
)
313 err
= zap_lookup_norm(mos
, snapobj
, name
, 8, 1,
314 value
, mt
, NULL
, 0, NULL
);
315 if (err
== ENOTSUP
&& mt
== MT_FIRST
)
316 err
= zap_lookup(mos
, snapobj
, name
, 8, 1, value
);
321 dsl_dataset_snap_remove(dsl_dataset_t
*ds
, char *name
, dmu_tx_t
*tx
)
323 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
324 uint64_t snapobj
= ds
->ds_phys
->ds_snapnames_zapobj
;
328 if (ds
->ds_phys
->ds_flags
& DS_FLAG_CI_DATASET
)
333 err
= zap_remove_norm(mos
, snapobj
, name
, mt
, tx
);
334 if (err
== ENOTSUP
&& mt
== MT_FIRST
)
335 err
= zap_remove(mos
, snapobj
, name
, tx
);
340 dsl_dataset_get_ref(dsl_pool_t
*dp
, uint64_t dsobj
, void *tag
,
343 objset_t
*mos
= dp
->dp_meta_objset
;
348 ASSERT(RW_LOCK_HELD(&dp
->dp_config_rwlock
) ||
349 dsl_pool_sync_context(dp
));
351 err
= dmu_bonus_hold(mos
, dsobj
, tag
, &dbuf
);
354 ds
= dmu_buf_get_user(dbuf
);
356 dsl_dataset_t
*winner
;
358 ds
= kmem_zalloc(sizeof (dsl_dataset_t
), KM_SLEEP
);
360 ds
->ds_object
= dsobj
;
361 ds
->ds_phys
= dbuf
->db_data
;
363 mutex_init(&ds
->ds_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
364 mutex_init(&ds
->ds_recvlock
, NULL
, MUTEX_DEFAULT
, NULL
);
365 mutex_init(&ds
->ds_opening_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
366 mutex_init(&ds
->ds_deadlist
.bpl_lock
, NULL
, MUTEX_DEFAULT
,
368 rw_init(&ds
->ds_rwlock
, 0, 0, 0);
369 cv_init(&ds
->ds_exclusive_cv
, NULL
, CV_DEFAULT
, NULL
);
371 err
= bplist_open(&ds
->ds_deadlist
,
372 mos
, ds
->ds_phys
->ds_deadlist_obj
);
374 err
= dsl_dir_open_obj(dp
,
375 ds
->ds_phys
->ds_dir_obj
, NULL
, ds
, &ds
->ds_dir
);
379 * we don't really need to close the blist if we
382 mutex_destroy(&ds
->ds_lock
);
383 mutex_destroy(&ds
->ds_recvlock
);
384 mutex_destroy(&ds
->ds_opening_lock
);
385 mutex_destroy(&ds
->ds_deadlist
.bpl_lock
);
386 rw_destroy(&ds
->ds_rwlock
);
387 cv_destroy(&ds
->ds_exclusive_cv
);
388 kmem_free(ds
, sizeof (dsl_dataset_t
));
389 dmu_buf_rele(dbuf
, tag
);
393 if (!dsl_dataset_is_snapshot(ds
)) {
394 ds
->ds_snapname
[0] = '\0';
395 if (ds
->ds_phys
->ds_prev_snap_obj
) {
396 err
= dsl_dataset_get_ref(dp
,
397 ds
->ds_phys
->ds_prev_snap_obj
,
401 if (err
== 0 && dsl_dir_is_clone(ds
->ds_dir
)) {
402 dsl_dataset_t
*origin
;
404 err
= dsl_dataset_hold_obj(dp
,
405 ds
->ds_dir
->dd_phys
->dd_origin_obj
,
409 origin
->ds_phys
->ds_creation_txg
;
410 dsl_dataset_rele(origin
, FTAG
);
414 if (zfs_flags
& ZFS_DEBUG_SNAPNAMES
)
415 err
= dsl_dataset_get_snapname(ds
);
416 if (err
== 0 && ds
->ds_phys
->ds_userrefs_obj
!= 0) {
418 ds
->ds_dir
->dd_pool
->dp_meta_objset
,
419 ds
->ds_phys
->ds_userrefs_obj
,
424 if (err
== 0 && !dsl_dataset_is_snapshot(ds
)) {
426 * In sync context, we're called with either no lock
427 * or with the write lock. If we're not syncing,
428 * we're always called with the read lock held.
430 boolean_t need_lock
=
431 !RW_WRITE_HELD(&dp
->dp_config_rwlock
) &&
432 dsl_pool_sync_context(dp
);
435 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
437 err
= dsl_prop_get_ds(ds
,
438 "refreservation", sizeof (uint64_t), 1,
439 &ds
->ds_reserved
, NULL
);
441 err
= dsl_prop_get_ds(ds
,
442 "refquota", sizeof (uint64_t), 1,
443 &ds
->ds_quota
, NULL
);
447 rw_exit(&dp
->dp_config_rwlock
);
449 ds
->ds_reserved
= ds
->ds_quota
= 0;
453 winner
= dmu_buf_set_user_ie(dbuf
, ds
, &ds
->ds_phys
,
457 bplist_close(&ds
->ds_deadlist
);
459 dsl_dataset_drop_ref(ds
->ds_prev
, ds
);
460 dsl_dir_close(ds
->ds_dir
, ds
);
461 mutex_destroy(&ds
->ds_lock
);
462 mutex_destroy(&ds
->ds_recvlock
);
463 mutex_destroy(&ds
->ds_opening_lock
);
464 mutex_destroy(&ds
->ds_deadlist
.bpl_lock
);
465 rw_destroy(&ds
->ds_rwlock
);
466 cv_destroy(&ds
->ds_exclusive_cv
);
467 kmem_free(ds
, sizeof (dsl_dataset_t
));
469 dmu_buf_rele(dbuf
, tag
);
475 unique_insert(ds
->ds_phys
->ds_fsid_guid
);
478 ASSERT3P(ds
->ds_dbuf
, ==, dbuf
);
479 ASSERT3P(ds
->ds_phys
, ==, dbuf
->db_data
);
480 ASSERT(ds
->ds_phys
->ds_prev_snap_obj
!= 0 ||
481 spa_version(dp
->dp_spa
) < SPA_VERSION_ORIGIN
||
482 dp
->dp_origin_snap
== NULL
|| ds
== dp
->dp_origin_snap
);
483 mutex_enter(&ds
->ds_lock
);
484 if (!dsl_pool_sync_context(dp
) && DSL_DATASET_IS_DESTROYED(ds
)) {
485 mutex_exit(&ds
->ds_lock
);
486 dmu_buf_rele(ds
->ds_dbuf
, tag
);
489 mutex_exit(&ds
->ds_lock
);
495 dsl_dataset_hold_ref(dsl_dataset_t
*ds
, void *tag
)
497 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
500 * In syncing context we don't want the rwlock lock: there
501 * may be an existing writer waiting for sync phase to
502 * finish. We don't need to worry about such writers, since
503 * sync phase is single-threaded, so the writer can't be
504 * doing anything while we are active.
506 if (dsl_pool_sync_context(dp
)) {
507 ASSERT(!DSL_DATASET_IS_DESTROYED(ds
));
512 * Normal users will hold the ds_rwlock as a READER until they
513 * are finished (i.e., call dsl_dataset_rele()). "Owners" will
514 * drop their READER lock after they set the ds_owner field.
516 * If the dataset is being destroyed, the destroy thread will
517 * obtain a WRITER lock for exclusive access after it's done its
518 * open-context work and then change the ds_owner to
519 * dsl_reaper once destruction is assured. So threads
520 * may block here temporarily, until the "destructability" of
521 * the dataset is determined.
523 ASSERT(!RW_WRITE_HELD(&dp
->dp_config_rwlock
));
524 mutex_enter(&ds
->ds_lock
);
525 while (!rw_tryenter(&ds
->ds_rwlock
, RW_READER
)) {
526 rw_exit(&dp
->dp_config_rwlock
);
527 cv_wait(&ds
->ds_exclusive_cv
, &ds
->ds_lock
);
528 if (DSL_DATASET_IS_DESTROYED(ds
)) {
529 mutex_exit(&ds
->ds_lock
);
530 dsl_dataset_drop_ref(ds
, tag
);
531 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
534 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
536 mutex_exit(&ds
->ds_lock
);
541 dsl_dataset_hold_obj(dsl_pool_t
*dp
, uint64_t dsobj
, void *tag
,
544 int err
= dsl_dataset_get_ref(dp
, dsobj
, tag
, dsp
);
548 return (dsl_dataset_hold_ref(*dsp
, tag
));
552 dsl_dataset_own_obj(dsl_pool_t
*dp
, uint64_t dsobj
, int flags
, void *owner
,
555 int err
= dsl_dataset_hold_obj(dp
, dsobj
, owner
, dsp
);
557 ASSERT(DS_MODE_TYPE(flags
) != DS_MODE_USER
);
561 if (!dsl_dataset_tryown(*dsp
, DS_MODE_IS_INCONSISTENT(flags
), owner
)) {
562 dsl_dataset_rele(*dsp
, owner
);
570 dsl_dataset_hold(const char *name
, void *tag
, dsl_dataset_t
**dsp
)
574 const char *snapname
;
578 err
= dsl_dir_open_spa(NULL
, name
, FTAG
, &dd
, &snapname
);
583 obj
= dd
->dd_phys
->dd_head_dataset_obj
;
584 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
586 err
= dsl_dataset_get_ref(dp
, obj
, tag
, dsp
);
592 err
= dsl_dataset_hold_ref(*dsp
, tag
);
594 /* we may be looking for a snapshot */
595 if (err
== 0 && snapname
!= NULL
) {
596 dsl_dataset_t
*ds
= NULL
;
598 if (*snapname
++ != '@') {
599 dsl_dataset_rele(*dsp
, tag
);
604 dprintf("looking for snapshot '%s'\n", snapname
);
605 err
= dsl_dataset_snap_lookup(*dsp
, snapname
, &obj
);
607 err
= dsl_dataset_get_ref(dp
, obj
, tag
, &ds
);
608 dsl_dataset_rele(*dsp
, tag
);
610 ASSERT3U((err
== 0), ==, (ds
!= NULL
));
613 mutex_enter(&ds
->ds_lock
);
614 if (ds
->ds_snapname
[0] == 0)
615 (void) strlcpy(ds
->ds_snapname
, snapname
,
616 sizeof (ds
->ds_snapname
));
617 mutex_exit(&ds
->ds_lock
);
618 err
= dsl_dataset_hold_ref(ds
, tag
);
619 *dsp
= err
? NULL
: ds
;
623 rw_exit(&dp
->dp_config_rwlock
);
624 dsl_dir_close(dd
, FTAG
);
629 dsl_dataset_own(const char *name
, int flags
, void *owner
, dsl_dataset_t
**dsp
)
631 int err
= dsl_dataset_hold(name
, owner
, dsp
);
634 if ((*dsp
)->ds_phys
->ds_num_children
> 0 &&
635 !DS_MODE_IS_READONLY(flags
)) {
636 dsl_dataset_rele(*dsp
, owner
);
639 if (!dsl_dataset_tryown(*dsp
, DS_MODE_IS_INCONSISTENT(flags
), owner
)) {
640 dsl_dataset_rele(*dsp
, owner
);
647 dsl_dataset_name(dsl_dataset_t
*ds
, char *name
)
650 (void) strcpy(name
, "mos");
652 dsl_dir_name(ds
->ds_dir
, name
);
653 VERIFY(0 == dsl_dataset_get_snapname(ds
));
654 if (ds
->ds_snapname
[0]) {
655 (void) strcat(name
, "@");
657 * We use a "recursive" mutex so that we
658 * can call dprintf_ds() with ds_lock held.
660 if (!MUTEX_HELD(&ds
->ds_lock
)) {
661 mutex_enter(&ds
->ds_lock
);
662 (void) strcat(name
, ds
->ds_snapname
);
663 mutex_exit(&ds
->ds_lock
);
665 (void) strcat(name
, ds
->ds_snapname
);
672 dsl_dataset_namelen(dsl_dataset_t
*ds
)
677 result
= 3; /* "mos" */
679 result
= dsl_dir_namelen(ds
->ds_dir
);
680 VERIFY(0 == dsl_dataset_get_snapname(ds
));
681 if (ds
->ds_snapname
[0]) {
682 ++result
; /* adding one for the @-sign */
683 if (!MUTEX_HELD(&ds
->ds_lock
)) {
684 mutex_enter(&ds
->ds_lock
);
685 result
+= strlen(ds
->ds_snapname
);
686 mutex_exit(&ds
->ds_lock
);
688 result
+= strlen(ds
->ds_snapname
);
697 dsl_dataset_drop_ref(dsl_dataset_t
*ds
, void *tag
)
699 dmu_buf_rele(ds
->ds_dbuf
, tag
);
703 dsl_dataset_rele(dsl_dataset_t
*ds
, void *tag
)
705 if (!dsl_pool_sync_context(ds
->ds_dir
->dd_pool
)) {
706 rw_exit(&ds
->ds_rwlock
);
708 dsl_dataset_drop_ref(ds
, tag
);
712 dsl_dataset_disown(dsl_dataset_t
*ds
, void *owner
)
714 ASSERT((ds
->ds_owner
== owner
&& ds
->ds_dbuf
) ||
715 (DSL_DATASET_IS_DESTROYED(ds
) && ds
->ds_dbuf
== NULL
));
717 mutex_enter(&ds
->ds_lock
);
719 if (RW_WRITE_HELD(&ds
->ds_rwlock
)) {
720 rw_exit(&ds
->ds_rwlock
);
721 cv_broadcast(&ds
->ds_exclusive_cv
);
723 mutex_exit(&ds
->ds_lock
);
725 dsl_dataset_drop_ref(ds
, owner
);
727 dsl_dataset_evict(ds
->ds_dbuf
, ds
);
731 dsl_dataset_tryown(dsl_dataset_t
*ds
, boolean_t inconsistentok
, void *owner
)
733 boolean_t gotit
= FALSE
;
735 mutex_enter(&ds
->ds_lock
);
736 if (ds
->ds_owner
== NULL
&&
737 (!DS_IS_INCONSISTENT(ds
) || inconsistentok
)) {
738 ds
->ds_owner
= owner
;
739 if (!dsl_pool_sync_context(ds
->ds_dir
->dd_pool
))
740 rw_exit(&ds
->ds_rwlock
);
743 mutex_exit(&ds
->ds_lock
);
748 dsl_dataset_make_exclusive(dsl_dataset_t
*ds
, void *owner
)
750 ASSERT3P(owner
, ==, ds
->ds_owner
);
751 if (!RW_WRITE_HELD(&ds
->ds_rwlock
))
752 rw_enter(&ds
->ds_rwlock
, RW_WRITER
);
756 dsl_dataset_create_sync_dd(dsl_dir_t
*dd
, dsl_dataset_t
*origin
,
757 uint64_t flags
, dmu_tx_t
*tx
)
759 dsl_pool_t
*dp
= dd
->dd_pool
;
761 dsl_dataset_phys_t
*dsphys
;
763 objset_t
*mos
= dp
->dp_meta_objset
;
766 origin
= dp
->dp_origin_snap
;
768 ASSERT(origin
== NULL
|| origin
->ds_dir
->dd_pool
== dp
);
769 ASSERT(origin
== NULL
|| origin
->ds_phys
->ds_num_children
> 0);
770 ASSERT(dmu_tx_is_syncing(tx
));
771 ASSERT(dd
->dd_phys
->dd_head_dataset_obj
== 0);
773 dsobj
= dmu_object_alloc(mos
, DMU_OT_DSL_DATASET
, 0,
774 DMU_OT_DSL_DATASET
, sizeof (dsl_dataset_phys_t
), tx
);
775 VERIFY(0 == dmu_bonus_hold(mos
, dsobj
, FTAG
, &dbuf
));
776 dmu_buf_will_dirty(dbuf
, tx
);
777 dsphys
= dbuf
->db_data
;
778 bzero(dsphys
, sizeof (dsl_dataset_phys_t
));
779 dsphys
->ds_dir_obj
= dd
->dd_object
;
780 dsphys
->ds_flags
= flags
;
781 dsphys
->ds_fsid_guid
= unique_create();
782 (void) random_get_pseudo_bytes((void*)&dsphys
->ds_guid
,
783 sizeof (dsphys
->ds_guid
));
784 dsphys
->ds_snapnames_zapobj
=
785 zap_create_norm(mos
, U8_TEXTPREP_TOUPPER
, DMU_OT_DSL_DS_SNAP_MAP
,
787 dsphys
->ds_creation_time
= gethrestime_sec();
788 dsphys
->ds_creation_txg
= tx
->tx_txg
== TXG_INITIAL
? 1 : tx
->tx_txg
;
789 dsphys
->ds_deadlist_obj
=
790 bplist_create(mos
, DSL_DEADLIST_BLOCKSIZE
, tx
);
793 dsphys
->ds_prev_snap_obj
= origin
->ds_object
;
794 dsphys
->ds_prev_snap_txg
=
795 origin
->ds_phys
->ds_creation_txg
;
796 dsphys
->ds_used_bytes
=
797 origin
->ds_phys
->ds_used_bytes
;
798 dsphys
->ds_compressed_bytes
=
799 origin
->ds_phys
->ds_compressed_bytes
;
800 dsphys
->ds_uncompressed_bytes
=
801 origin
->ds_phys
->ds_uncompressed_bytes
;
802 dsphys
->ds_bp
= origin
->ds_phys
->ds_bp
;
803 dsphys
->ds_flags
|= origin
->ds_phys
->ds_flags
;
805 dmu_buf_will_dirty(origin
->ds_dbuf
, tx
);
806 origin
->ds_phys
->ds_num_children
++;
808 if (spa_version(dp
->dp_spa
) >= SPA_VERSION_NEXT_CLONES
) {
809 if (origin
->ds_phys
->ds_next_clones_obj
== 0) {
810 origin
->ds_phys
->ds_next_clones_obj
=
812 DMU_OT_NEXT_CLONES
, DMU_OT_NONE
, 0, tx
);
814 VERIFY(0 == zap_add_int(mos
,
815 origin
->ds_phys
->ds_next_clones_obj
,
819 dmu_buf_will_dirty(dd
->dd_dbuf
, tx
);
820 dd
->dd_phys
->dd_origin_obj
= origin
->ds_object
;
823 if (spa_version(dp
->dp_spa
) >= SPA_VERSION_UNIQUE_ACCURATE
)
824 dsphys
->ds_flags
|= DS_FLAG_UNIQUE_ACCURATE
;
826 dmu_buf_rele(dbuf
, FTAG
);
828 dmu_buf_will_dirty(dd
->dd_dbuf
, tx
);
829 dd
->dd_phys
->dd_head_dataset_obj
= dsobj
;
835 dsl_dataset_create_sync(dsl_dir_t
*pdd
, const char *lastname
,
836 dsl_dataset_t
*origin
, uint64_t flags
, cred_t
*cr
, dmu_tx_t
*tx
)
838 dsl_pool_t
*dp
= pdd
->dd_pool
;
839 uint64_t dsobj
, ddobj
;
842 ASSERT(lastname
[0] != '@');
844 ddobj
= dsl_dir_create_sync(dp
, pdd
, lastname
, tx
);
845 VERIFY(0 == dsl_dir_open_obj(dp
, ddobj
, lastname
, FTAG
, &dd
));
847 dsobj
= dsl_dataset_create_sync_dd(dd
, origin
, flags
, tx
);
849 dsl_deleg_set_create_perms(dd
, tx
, cr
);
851 dsl_dir_close(dd
, FTAG
);
857 dsl_sync_task_group_t
*dstg
;
864 dsl_snapshot_destroy_one(char *name
, void *arg
)
866 struct destroyarg
*da
= arg
;
872 /* alloc a buffer to hold name@snapname, plus the terminating NULL */
873 buflen
= strlen(name
) + strlen(da
->snapname
) + 2;
874 dsname
= kmem_alloc(buflen
, KM_SLEEP
);
875 (void) snprintf(dsname
, buflen
, "%s@%s", name
, da
->snapname
);
876 err
= dsl_dataset_own(dsname
, DS_MODE_READONLY
| DS_MODE_INCONSISTENT
,
878 kmem_free(dsname
, buflen
);
880 struct dsl_ds_destroyarg
*dsda
;
882 dsl_dataset_make_exclusive(ds
, da
->dstg
);
883 if (ds
->ds_user_ptr
) {
884 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
885 ds
->ds_user_ptr
= NULL
;
887 dsda
= kmem_zalloc(sizeof (struct dsl_ds_destroyarg
), KM_SLEEP
);
889 dsda
->defer
= da
->defer
;
890 dsl_sync_task_create(da
->dstg
, dsl_dataset_destroy_check
,
891 dsl_dataset_destroy_sync
, dsda
, da
->dstg
, 0);
892 } else if (err
== ENOENT
) {
895 (void) strcpy(da
->failed
, name
);
901 * Destroy 'snapname' in all descendants of 'fsname'.
903 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
905 dsl_snapshots_destroy(char *fsname
, char *snapname
, boolean_t defer
)
908 struct destroyarg da
;
909 dsl_sync_task_t
*dst
;
912 err
= spa_open(fsname
, &spa
, FTAG
);
915 da
.dstg
= dsl_sync_task_group_create(spa_get_dsl(spa
));
916 da
.snapname
= snapname
;
920 err
= dmu_objset_find(fsname
,
921 dsl_snapshot_destroy_one
, &da
, DS_FIND_CHILDREN
);
924 err
= dsl_sync_task_group_wait(da
.dstg
);
926 for (dst
= list_head(&da
.dstg
->dstg_tasks
); dst
;
927 dst
= list_next(&da
.dstg
->dstg_tasks
, dst
)) {
928 struct dsl_ds_destroyarg
*dsda
= dst
->dst_arg1
;
929 dsl_dataset_t
*ds
= dsda
->ds
;
932 * Return the file system name that triggered the error
935 dsl_dataset_name(ds
, fsname
);
936 *strchr(fsname
, '@') = '\0';
938 ASSERT3P(dsda
->rm_origin
, ==, NULL
);
939 dsl_dataset_disown(ds
, da
.dstg
);
940 kmem_free(dsda
, sizeof (struct dsl_ds_destroyarg
));
943 dsl_sync_task_group_destroy(da
.dstg
);
944 spa_close(spa
, FTAG
);
949 dsl_dataset_might_destroy_origin(dsl_dataset_t
*ds
)
951 boolean_t might_destroy
= B_FALSE
;
953 mutex_enter(&ds
->ds_lock
);
954 if (ds
->ds_phys
->ds_num_children
== 2 && ds
->ds_userrefs
== 0 &&
955 DS_IS_DEFER_DESTROY(ds
))
956 might_destroy
= B_TRUE
;
957 mutex_exit(&ds
->ds_lock
);
959 return (might_destroy
);
/*
 * If the dataset is a zvol, remove its minor node before destruction.
 * NOTE(review): reconstructed as kernel-only — zvol_remove_minor() exists
 * only in the kernel; confirm the original #ifdef placement.
 */
#ifdef _KERNEL
static int
dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name)
{
	int error;
	objset_t *os;

	error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
	if (error)
		return (error);

	if (dmu_objset_type(os) == DMU_OST_ZVOL)
		error = zvol_remove_minor(name);
	dmu_objset_close(os);

	return (error);
}
#endif
982 * If we're removing a clone, and these three conditions are true:
983 * 1) the clone's origin has no other children
984 * 2) the clone's origin has no user references
985 * 3) the clone's origin has been marked for deferred destruction
986 * Then, prepare to remove the origin as part of this sync task group.
989 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg
*dsda
, void *tag
)
991 dsl_dataset_t
*ds
= dsda
->ds
;
992 dsl_dataset_t
*origin
= ds
->ds_prev
;
994 if (dsl_dataset_might_destroy_origin(origin
)) {
999 namelen
= dsl_dataset_namelen(origin
) + 1;
1000 name
= kmem_alloc(namelen
, KM_SLEEP
);
1001 dsl_dataset_name(origin
, name
);
1003 error
= zfs_unmount_snap(name
, NULL
);
1005 kmem_free(name
, namelen
);
1008 error
= dsl_dataset_zvol_cleanup(origin
, name
);
1010 kmem_free(name
, namelen
);
1014 error
= dsl_dataset_own(name
,
1015 DS_MODE_READONLY
| DS_MODE_INCONSISTENT
,
1017 kmem_free(name
, namelen
);
1020 dsda
->rm_origin
= origin
;
1021 dsl_dataset_make_exclusive(origin
, tag
);
1028 * ds must be opened as OWNER. On return (whether successful or not),
1029 * ds will be closed and caller can no longer dereference it.
1032 dsl_dataset_destroy(dsl_dataset_t
*ds
, void *tag
, boolean_t defer
)
1035 dsl_sync_task_group_t
*dstg
;
1039 struct dsl_ds_destroyarg dsda
= {0};
1043 if (dsl_dataset_is_snapshot(ds
)) {
1044 /* Destroying a snapshot is simpler */
1045 dsl_dataset_make_exclusive(ds
, tag
);
1047 if (ds
->ds_user_ptr
) {
1048 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
1049 ds
->ds_user_ptr
= NULL
;
1051 /* NOTE: defer is always B_FALSE for non-snapshots */
1053 err
= dsl_sync_task_do(ds
->ds_dir
->dd_pool
,
1054 dsl_dataset_destroy_check
, dsl_dataset_destroy_sync
,
1056 ASSERT3P(dsda
.rm_origin
, ==, NULL
);
1063 * Check for errors and mark this ds as inconsistent, in
1064 * case we crash while freeing the objects.
1066 err
= dsl_sync_task_do(dd
->dd_pool
, dsl_dataset_destroy_begin_check
,
1067 dsl_dataset_destroy_begin_sync
, ds
, NULL
, 0);
1071 err
= dmu_objset_open_ds(ds
, DMU_OST_ANY
, &os
);
1076 * remove the objects in open context, so that we won't
1077 * have too much to do in syncing context.
1079 for (obj
= 0; err
== 0; err
= dmu_object_next(os
, &obj
, FALSE
,
1080 ds
->ds_phys
->ds_prev_snap_txg
)) {
1082 * Ignore errors, if there is not enough disk space
1083 * we will deal with it in dsl_dataset_destroy_sync().
1085 (void) dmu_free_object(os
, obj
);
1089 * We need to sync out all in-flight IO before we try to evict
1090 * (the dataset evict func is trying to clear the cached entries
1091 * for this dataset in the ARC).
1093 txg_wait_synced(dd
->dd_pool
, 0);
1096 * If we managed to free all the objects in open
1097 * context, the user space accounting should be zero.
1099 if (ds
->ds_phys
->ds_bp
.blk_fill
== 0 &&
1100 dmu_objset_userused_enabled(os
->os
)) {
1103 ASSERT(zap_count(os
, DMU_USERUSED_OBJECT
, &count
) != 0 ||
1105 ASSERT(zap_count(os
, DMU_GROUPUSED_OBJECT
, &count
) != 0 ||
1109 dmu_objset_close(os
);
1113 rw_enter(&dd
->dd_pool
->dp_config_rwlock
, RW_READER
);
1114 err
= dsl_dir_open_obj(dd
->dd_pool
, dd
->dd_object
, NULL
, FTAG
, &dd
);
1115 rw_exit(&dd
->dd_pool
->dp_config_rwlock
);
1120 if (ds
->ds_user_ptr
) {
1122 * We need to sync out all in-flight IO before we try
1123 * to evict (the dataset evict func is trying to clear
1124 * the cached entries for this dataset in the ARC).
1126 txg_wait_synced(dd
->dd_pool
, 0);
1130 * Blow away the dsl_dir + head dataset.
1132 dsl_dataset_make_exclusive(ds
, tag
);
1133 if (ds
->ds_user_ptr
) {
1134 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
1135 ds
->ds_user_ptr
= NULL
;
1139 * If we're removing a clone, we might also need to remove its
1143 dsda
.need_prep
= B_FALSE
;
1144 if (dsl_dir_is_clone(dd
)) {
1145 err
= dsl_dataset_origin_rm_prep(&dsda
, tag
);
1147 dsl_dir_close(dd
, FTAG
);
1152 dstg
= dsl_sync_task_group_create(ds
->ds_dir
->dd_pool
);
1153 dsl_sync_task_create(dstg
, dsl_dataset_destroy_check
,
1154 dsl_dataset_destroy_sync
, &dsda
, tag
, 0);
1155 dsl_sync_task_create(dstg
, dsl_dir_destroy_check
,
1156 dsl_dir_destroy_sync
, dd
, FTAG
, 0);
1157 err
= dsl_sync_task_group_wait(dstg
);
1158 dsl_sync_task_group_destroy(dstg
);
1161 * We could be racing against 'zfs release' or 'zfs destroy -d'
1162 * on the origin snap, in which case we can get EBUSY if we
1163 * needed to destroy the origin snap but were not ready to
1166 if (dsda
.need_prep
) {
1167 ASSERT(err
== EBUSY
);
1168 ASSERT(dsl_dir_is_clone(dd
));
1169 ASSERT(dsda
.rm_origin
== NULL
);
1171 } while (dsda
.need_prep
);
1173 if (dsda
.rm_origin
!= NULL
)
1174 dsl_dataset_disown(dsda
.rm_origin
, tag
);
1176 /* if it is successful, dsl_dir_destroy_sync will close the dd */
1178 dsl_dir_close(dd
, FTAG
);
1180 dsl_dataset_disown(ds
, tag
);
1185 dsl_dataset_rollback(dsl_dataset_t
*ds
, dmu_objset_type_t ost
)
1189 ASSERT(ds
->ds_owner
);
1191 dsl_dataset_make_exclusive(ds
, ds
->ds_owner
);
1192 err
= dsl_sync_task_do(ds
->ds_dir
->dd_pool
,
1193 dsl_dataset_rollback_check
, dsl_dataset_rollback_sync
,
1195 /* drop exclusive access */
1196 mutex_enter(&ds
->ds_lock
);
1197 rw_exit(&ds
->ds_rwlock
);
1198 cv_broadcast(&ds
->ds_exclusive_cv
);
1199 mutex_exit(&ds
->ds_lock
);
1204 dsl_dataset_set_user_ptr(dsl_dataset_t
*ds
,
1205 void *p
, dsl_dataset_evict_func_t func
)
1209 mutex_enter(&ds
->ds_lock
);
1210 old
= ds
->ds_user_ptr
;
1212 ds
->ds_user_ptr
= p
;
1213 ds
->ds_user_evict_func
= func
;
1215 mutex_exit(&ds
->ds_lock
);
1220 dsl_dataset_get_user_ptr(dsl_dataset_t
*ds
)
1222 return (ds
->ds_user_ptr
);
1226 dsl_dataset_get_blkptr(dsl_dataset_t
*ds
)
1228 return (&ds
->ds_phys
->ds_bp
);
1232 dsl_dataset_set_blkptr(dsl_dataset_t
*ds
, blkptr_t
*bp
, dmu_tx_t
*tx
)
1234 ASSERT(dmu_tx_is_syncing(tx
));
1235 /* If it's the meta-objset, set dp_meta_rootbp */
1237 tx
->tx_pool
->dp_meta_rootbp
= *bp
;
1239 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
1240 ds
->ds_phys
->ds_bp
= *bp
;
1245 dsl_dataset_get_spa(dsl_dataset_t
*ds
)
1247 return (ds
->ds_dir
->dd_pool
->dp_spa
);
1251 dsl_dataset_dirty(dsl_dataset_t
*ds
, dmu_tx_t
*tx
)
1255 if (ds
== NULL
) /* this is the meta-objset */
1258 ASSERT(ds
->ds_user_ptr
!= NULL
);
1260 if (ds
->ds_phys
->ds_next_snap_obj
!= 0)
1261 panic("dirtying snapshot!");
1263 dp
= ds
->ds_dir
->dd_pool
;
1265 if (txg_list_add(&dp
->dp_dirty_datasets
, ds
, tx
->tx_txg
) == 0) {
1266 /* up the hold count until we can be written out */
1267 dmu_buf_add_ref(ds
->ds_dbuf
, ds
);
1272 * The unique space in the head dataset can be calculated by subtracting
1273 * the space used in the most recent snapshot, that is still being used
1274 * in this file system, from the space currently in use. To figure out
1275 * the space in the most recent snapshot still in use, we need to take
1276 * the total space used in the snapshot and subtract out the space that
1277 * has been freed up since the snapshot was taken.
1280 dsl_dataset_recalc_head_uniq(dsl_dataset_t
*ds
)
1283 uint64_t dlused
, dlcomp
, dluncomp
;
1285 ASSERT(ds
->ds_object
== ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
);
1287 if (ds
->ds_phys
->ds_prev_snap_obj
!= 0)
1288 mrs_used
= ds
->ds_prev
->ds_phys
->ds_used_bytes
;
1292 VERIFY(0 == bplist_space(&ds
->ds_deadlist
, &dlused
, &dlcomp
,
1295 ASSERT3U(dlused
, <=, mrs_used
);
1296 ds
->ds_phys
->ds_unique_bytes
=
1297 ds
->ds_phys
->ds_used_bytes
- (mrs_used
- dlused
);
1299 if (!DS_UNIQUE_IS_ACCURATE(ds
) &&
1300 spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) >=
1301 SPA_VERSION_UNIQUE_ACCURATE
)
1302 ds
->ds_phys
->ds_flags
|= DS_FLAG_UNIQUE_ACCURATE
;
1306 dsl_dataset_unique(dsl_dataset_t
*ds
)
1308 if (!DS_UNIQUE_IS_ACCURATE(ds
) && !dsl_dataset_is_snapshot(ds
))
1309 dsl_dataset_recalc_head_uniq(ds
);
1311 return (ds
->ds_phys
->ds_unique_bytes
);
1322 kill_blkptr(spa_t
*spa
, blkptr_t
*bp
, const zbookmark_t
*zb
,
1323 const dnode_phys_t
*dnp
, void *arg
)
1325 struct killarg
*ka
= arg
;
1330 if ((zb
->zb_level
== -1ULL && zb
->zb_blkid
!= 0) ||
1331 (zb
->zb_object
!= 0 && dnp
== NULL
)) {
1333 * It's a block in the intent log. It has no
1334 * accounting, so just free it.
1336 VERIFY3U(0, ==, dsl_free(ka
->zio
, ka
->tx
->tx_pool
,
1337 ka
->tx
->tx_txg
, bp
, NULL
, NULL
, ARC_NOWAIT
));
1339 ASSERT3U(bp
->blk_birth
, >, ka
->ds
->ds_phys
->ds_prev_snap_txg
);
1340 (void) dsl_dataset_block_kill(ka
->ds
, bp
, ka
->zio
, ka
->tx
);
1348 dsl_dataset_rollback_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
1350 dsl_dataset_t
*ds
= arg1
;
1351 dmu_objset_type_t
*ost
= arg2
;
1354 * We can only roll back to emptyness if it is a ZPL objset.
1356 if (*ost
!= DMU_OST_ZFS
&&
1357 ds
->ds_phys
->ds_prev_snap_txg
< TXG_INITIAL
)
1361 * This must not be a snapshot.
1363 if (ds
->ds_phys
->ds_next_snap_obj
!= 0)
1367 * If we made changes this txg, traverse_dataset won't find
1370 if (ds
->ds_phys
->ds_bp
.blk_birth
>= tx
->tx_txg
)
1378 dsl_dataset_rollback_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
1380 dsl_dataset_t
*ds
= arg1
;
1381 dmu_objset_type_t
*ost
= arg2
;
1382 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
1384 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
1386 if (ds
->ds_user_ptr
!= NULL
) {
1388 * We need to make sure that the objset_impl_t is reopened after
1389 * we do the rollback, otherwise it will have the wrong
1390 * objset_phys_t. Normally this would happen when this
1391 * dataset-open is closed, thus causing the
1392 * dataset to be immediately evicted. But when doing "zfs recv
1393 * -F", we reopen the objset before that, so that there is no
1394 * window where the dataset is closed and inconsistent.
1396 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
1397 ds
->ds_user_ptr
= NULL
;
1400 /* Transfer space that was freed since last snap back to the head. */
1404 VERIFY(0 == bplist_space_birthrange(&ds
->ds_deadlist
,
1405 ds
->ds_origin_txg
, UINT64_MAX
, &used
));
1406 dsl_dir_transfer_space(ds
->ds_dir
, used
,
1407 DD_USED_SNAP
, DD_USED_HEAD
, tx
);
1410 /* Zero out the deadlist. */
1411 bplist_close(&ds
->ds_deadlist
);
1412 bplist_destroy(mos
, ds
->ds_phys
->ds_deadlist_obj
, tx
);
1413 ds
->ds_phys
->ds_deadlist_obj
=
1414 bplist_create(mos
, DSL_DEADLIST_BLOCKSIZE
, tx
);
1415 VERIFY(0 == bplist_open(&ds
->ds_deadlist
, mos
,
1416 ds
->ds_phys
->ds_deadlist_obj
));
1420 * Free blkptrs that we gave birth to - this covers
1421 * claimed but not played log blocks too.
1426 zio
= zio_root(tx
->tx_pool
->dp_spa
, NULL
, NULL
,
1427 ZIO_FLAG_MUSTSUCCEED
);
1431 (void) traverse_dataset(ds
, ds
->ds_phys
->ds_prev_snap_txg
,
1432 TRAVERSE_POST
, kill_blkptr
, &ka
);
1433 (void) zio_wait(zio
);
1436 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds
) || ds
->ds_phys
->ds_unique_bytes
== 0);
1438 if (ds
->ds_prev
&& ds
->ds_prev
!= ds
->ds_dir
->dd_pool
->dp_origin_snap
) {
1439 /* Change our contents to that of the prev snapshot */
1441 ASSERT3U(ds
->ds_prev
->ds_object
, ==,
1442 ds
->ds_phys
->ds_prev_snap_obj
);
1443 ASSERT3U(ds
->ds_phys
->ds_used_bytes
, <=,
1444 ds
->ds_prev
->ds_phys
->ds_used_bytes
);
1446 ds
->ds_phys
->ds_bp
= ds
->ds_prev
->ds_phys
->ds_bp
;
1447 ds
->ds_phys
->ds_used_bytes
=
1448 ds
->ds_prev
->ds_phys
->ds_used_bytes
;
1449 ds
->ds_phys
->ds_compressed_bytes
=
1450 ds
->ds_prev
->ds_phys
->ds_compressed_bytes
;
1451 ds
->ds_phys
->ds_uncompressed_bytes
=
1452 ds
->ds_prev
->ds_phys
->ds_uncompressed_bytes
;
1453 ds
->ds_phys
->ds_flags
= ds
->ds_prev
->ds_phys
->ds_flags
;
1455 if (ds
->ds_prev
->ds_phys
->ds_next_snap_obj
== ds
->ds_object
) {
1456 dmu_buf_will_dirty(ds
->ds_prev
->ds_dbuf
, tx
);
1457 ds
->ds_prev
->ds_phys
->ds_unique_bytes
= 0;
1462 ASSERT(*ost
!= DMU_OST_ZVOL
);
1463 ASSERT3U(ds
->ds_phys
->ds_used_bytes
, ==, 0);
1464 ASSERT3U(ds
->ds_phys
->ds_compressed_bytes
, ==, 0);
1465 ASSERT3U(ds
->ds_phys
->ds_uncompressed_bytes
, ==, 0);
1467 bzero(&ds
->ds_phys
->ds_bp
, sizeof (blkptr_t
));
1468 ds
->ds_phys
->ds_flags
= 0;
1469 ds
->ds_phys
->ds_unique_bytes
= 0;
1470 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) >=
1471 SPA_VERSION_UNIQUE_ACCURATE
)
1472 ds
->ds_phys
->ds_flags
|= DS_FLAG_UNIQUE_ACCURATE
;
1474 osi
= dmu_objset_create_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
,
1475 &ds
->ds_phys
->ds_bp
, *ost
, tx
);
1477 zfs_create_fs(&osi
->os
, kcred
, NULL
, tx
);
1481 spa_history_internal_log(LOG_DS_ROLLBACK
, ds
->ds_dir
->dd_pool
->dp_spa
,
1482 tx
, cr
, "dataset = %llu", ds
->ds_object
);
1487 dsl_dataset_destroy_begin_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
1489 dsl_dataset_t
*ds
= arg1
;
1490 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
1495 * Can't delete a head dataset if there are snapshots of it.
1496 * (Except if the only snapshots are from the branch we cloned
1499 if (ds
->ds_prev
!= NULL
&&
1500 ds
->ds_prev
->ds_phys
->ds_next_snap_obj
== ds
->ds_object
)
1504 * This is really a dsl_dir thing, but check it here so that
1505 * we'll be less likely to leave this dataset inconsistent &
1508 err
= zap_count(mos
, ds
->ds_dir
->dd_phys
->dd_child_dir_zapobj
, &count
);
1519 dsl_dataset_destroy_begin_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
1521 dsl_dataset_t
*ds
= arg1
;
1522 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
1524 /* Mark it as inconsistent on-disk, in case we crash */
1525 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
1526 ds
->ds_phys
->ds_flags
|= DS_FLAG_INCONSISTENT
;
1528 spa_history_internal_log(LOG_DS_DESTROY_BEGIN
, dp
->dp_spa
, tx
,
1529 cr
, "dataset = %llu", ds
->ds_object
);
1533 dsl_dataset_origin_check(struct dsl_ds_destroyarg
*dsda
, void *tag
,
1536 dsl_dataset_t
*ds
= dsda
->ds
;
1537 dsl_dataset_t
*ds_prev
= ds
->ds_prev
;
1539 if (dsl_dataset_might_destroy_origin(ds_prev
)) {
1540 struct dsl_ds_destroyarg ndsda
= {0};
1543 * If we're not prepared to remove the origin, don't remove
1546 if (dsda
->rm_origin
== NULL
) {
1547 dsda
->need_prep
= B_TRUE
;
1552 ndsda
.is_origin_rm
= B_TRUE
;
1553 return (dsl_dataset_destroy_check(&ndsda
, tag
, tx
));
1557 * If we're not going to remove the origin after all,
1558 * undo the open context setup.
1560 if (dsda
->rm_origin
!= NULL
) {
1561 dsl_dataset_disown(dsda
->rm_origin
, tag
);
1562 dsda
->rm_origin
= NULL
;
1570 dsl_dataset_destroy_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
1572 struct dsl_ds_destroyarg
*dsda
= arg1
;
1573 dsl_dataset_t
*ds
= dsda
->ds
;
1575 /* we have an owner hold, so noone else can destroy us */
1576 ASSERT(!DSL_DATASET_IS_DESTROYED(ds
));
1579 * Only allow deferred destroy on pools that support it.
1580 * NOTE: deferred destroy is only supported on snapshots.
1583 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) <
1584 SPA_VERSION_USERREFS
)
1586 ASSERT(dsl_dataset_is_snapshot(ds
));
1591 * Can't delete a head dataset if there are snapshots of it.
1592 * (Except if the only snapshots are from the branch we cloned
1595 if (ds
->ds_prev
!= NULL
&&
1596 ds
->ds_prev
->ds_phys
->ds_next_snap_obj
== ds
->ds_object
)
1600 * If we made changes this txg, traverse_dsl_dataset won't find
1603 if (ds
->ds_phys
->ds_bp
.blk_birth
>= tx
->tx_txg
)
1606 if (dsl_dataset_is_snapshot(ds
)) {
1608 * If this snapshot has an elevated user reference count,
1609 * we can't destroy it yet.
1611 if (ds
->ds_userrefs
> 0 && !dsda
->releasing
)
1614 mutex_enter(&ds
->ds_lock
);
1616 * Can't delete a branch point. However, if we're destroying
1617 * a clone and removing its origin due to it having a user
1618 * hold count of 0 and having been marked for deferred destroy,
1619 * it's OK for the origin to have a single clone.
1621 if (ds
->ds_phys
->ds_num_children
>
1622 (dsda
->is_origin_rm
? 2 : 1)) {
1623 mutex_exit(&ds
->ds_lock
);
1626 mutex_exit(&ds
->ds_lock
);
1627 } else if (dsl_dir_is_clone(ds
->ds_dir
)) {
1628 return (dsl_dataset_origin_check(dsda
, arg2
, tx
));
1631 /* XXX we should do some i/o error checking... */
1643 dsl_dataset_refs_gone(dmu_buf_t
*db
, void *argv
)
1645 struct refsarg
*arg
= argv
;
1647 mutex_enter(&arg
->lock
);
1649 cv_signal(&arg
->cv
);
1650 mutex_exit(&arg
->lock
);
1654 dsl_dataset_drain_refs(dsl_dataset_t
*ds
, void *tag
)
1658 mutex_init(&arg
.lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1659 cv_init(&arg
.cv
, NULL
, CV_DEFAULT
, NULL
);
1661 (void) dmu_buf_update_user(ds
->ds_dbuf
, ds
, &arg
, &ds
->ds_phys
,
1662 dsl_dataset_refs_gone
);
1663 dmu_buf_rele(ds
->ds_dbuf
, tag
);
1664 mutex_enter(&arg
.lock
);
1666 cv_wait(&arg
.cv
, &arg
.lock
);
1668 mutex_exit(&arg
.lock
);
1671 mutex_destroy(&arg
.lock
);
1672 cv_destroy(&arg
.cv
);
1676 dsl_dataset_destroy_sync(void *arg1
, void *tag
, cred_t
*cr
, dmu_tx_t
*tx
)
1678 struct dsl_ds_destroyarg
*dsda
= arg1
;
1679 dsl_dataset_t
*ds
= dsda
->ds
;
1682 int after_branch_point
= FALSE
;
1683 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
1684 objset_t
*mos
= dp
->dp_meta_objset
;
1685 dsl_dataset_t
*ds_prev
= NULL
;
1688 ASSERT(ds
->ds_owner
);
1689 ASSERT(dsda
->defer
|| ds
->ds_phys
->ds_num_children
<= 1);
1690 ASSERT(ds
->ds_prev
== NULL
||
1691 ds
->ds_prev
->ds_phys
->ds_next_snap_obj
!= ds
->ds_object
);
1692 ASSERT3U(ds
->ds_phys
->ds_bp
.blk_birth
, <=, tx
->tx_txg
);
1695 ASSERT(spa_version(dp
->dp_spa
) >= SPA_VERSION_USERREFS
);
1696 if (ds
->ds_userrefs
> 0 || ds
->ds_phys
->ds_num_children
> 1) {
1697 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
1698 ds
->ds_phys
->ds_flags
|= DS_FLAG_DEFER_DESTROY
;
1703 /* signal any waiters that this dataset is going away */
1704 mutex_enter(&ds
->ds_lock
);
1705 ds
->ds_owner
= dsl_reaper
;
1706 cv_broadcast(&ds
->ds_exclusive_cv
);
1707 mutex_exit(&ds
->ds_lock
);
1709 /* Remove our reservation */
1710 if (ds
->ds_reserved
!= 0) {
1712 dsl_dataset_set_reservation_sync(ds
, &val
, cr
, tx
);
1713 ASSERT3U(ds
->ds_reserved
, ==, 0);
1716 ASSERT(RW_WRITE_HELD(&dp
->dp_config_rwlock
));
1718 dsl_pool_ds_destroyed(ds
, tx
);
1720 obj
= ds
->ds_object
;
1722 if (ds
->ds_phys
->ds_prev_snap_obj
!= 0) {
1724 ds_prev
= ds
->ds_prev
;
1726 VERIFY(0 == dsl_dataset_hold_obj(dp
,
1727 ds
->ds_phys
->ds_prev_snap_obj
, FTAG
, &ds_prev
));
1729 after_branch_point
=
1730 (ds_prev
->ds_phys
->ds_next_snap_obj
!= obj
);
1732 dmu_buf_will_dirty(ds_prev
->ds_dbuf
, tx
);
1733 if (after_branch_point
&&
1734 ds_prev
->ds_phys
->ds_next_clones_obj
!= 0) {
1735 VERIFY3U(0, ==, zap_remove_int(mos
,
1736 ds_prev
->ds_phys
->ds_next_clones_obj
, obj
, tx
));
1737 if (ds
->ds_phys
->ds_next_snap_obj
!= 0) {
1738 VERIFY(0 == zap_add_int(mos
,
1739 ds_prev
->ds_phys
->ds_next_clones_obj
,
1740 ds
->ds_phys
->ds_next_snap_obj
, tx
));
1743 if (after_branch_point
&&
1744 ds
->ds_phys
->ds_next_snap_obj
== 0) {
1745 /* This clone is toast. */
1746 ASSERT(ds_prev
->ds_phys
->ds_num_children
> 1);
1747 ds_prev
->ds_phys
->ds_num_children
--;
1750 * If the clone's origin has no other clones, no
1751 * user holds, and has been marked for deferred
1752 * deletion, then we should have done the necessary
1753 * destroy setup for it.
1755 if (ds_prev
->ds_phys
->ds_num_children
== 1 &&
1756 ds_prev
->ds_userrefs
== 0 &&
1757 DS_IS_DEFER_DESTROY(ds_prev
)) {
1758 ASSERT3P(dsda
->rm_origin
, !=, NULL
);
1760 ASSERT3P(dsda
->rm_origin
, ==, NULL
);
1762 } else if (!after_branch_point
) {
1763 ds_prev
->ds_phys
->ds_next_snap_obj
=
1764 ds
->ds_phys
->ds_next_snap_obj
;
1768 zio
= zio_root(dp
->dp_spa
, NULL
, NULL
, ZIO_FLAG_MUSTSUCCEED
);
1770 if (ds
->ds_phys
->ds_next_snap_obj
!= 0) {
1772 dsl_dataset_t
*ds_next
;
1774 uint64_t old_unique
;
1775 int64_t used
= 0, compressed
= 0, uncompressed
= 0;
1777 VERIFY(0 == dsl_dataset_hold_obj(dp
,
1778 ds
->ds_phys
->ds_next_snap_obj
, FTAG
, &ds_next
));
1779 ASSERT3U(ds_next
->ds_phys
->ds_prev_snap_obj
, ==, obj
);
1781 old_unique
= dsl_dataset_unique(ds_next
);
1783 dmu_buf_will_dirty(ds_next
->ds_dbuf
, tx
);
1784 ds_next
->ds_phys
->ds_prev_snap_obj
=
1785 ds
->ds_phys
->ds_prev_snap_obj
;
1786 ds_next
->ds_phys
->ds_prev_snap_txg
=
1787 ds
->ds_phys
->ds_prev_snap_txg
;
1788 ASSERT3U(ds
->ds_phys
->ds_prev_snap_txg
, ==,
1789 ds_prev
? ds_prev
->ds_phys
->ds_creation_txg
: 0);
1792 * Transfer to our deadlist (which will become next's
1793 * new deadlist) any entries from next's current
1794 * deadlist which were born before prev, and free the
1797 * XXX we're doing this long task with the config lock held
1799 while (bplist_iterate(&ds_next
->ds_deadlist
, &itor
, &bp
) == 0) {
1800 if (bp
.blk_birth
<= ds
->ds_phys
->ds_prev_snap_txg
) {
1801 VERIFY(0 == bplist_enqueue(&ds
->ds_deadlist
,
1803 if (ds_prev
&& !after_branch_point
&&
1805 ds_prev
->ds_phys
->ds_prev_snap_txg
) {
1806 ds_prev
->ds_phys
->ds_unique_bytes
+=
1807 bp_get_dasize(dp
->dp_spa
, &bp
);
1810 used
+= bp_get_dasize(dp
->dp_spa
, &bp
);
1811 compressed
+= BP_GET_PSIZE(&bp
);
1812 uncompressed
+= BP_GET_UCSIZE(&bp
);
1813 /* XXX check return value? */
1814 (void) dsl_free(zio
, dp
, tx
->tx_txg
,
1815 &bp
, NULL
, NULL
, ARC_NOWAIT
);
1819 ASSERT3U(used
, ==, ds
->ds_phys
->ds_unique_bytes
);
1821 /* change snapused */
1822 dsl_dir_diduse_space(ds
->ds_dir
, DD_USED_SNAP
,
1823 -used
, -compressed
, -uncompressed
, tx
);
1825 /* free next's deadlist */
1826 bplist_close(&ds_next
->ds_deadlist
);
1827 bplist_destroy(mos
, ds_next
->ds_phys
->ds_deadlist_obj
, tx
);
1829 /* set next's deadlist to our deadlist */
1830 bplist_close(&ds
->ds_deadlist
);
1831 ds_next
->ds_phys
->ds_deadlist_obj
=
1832 ds
->ds_phys
->ds_deadlist_obj
;
1833 VERIFY(0 == bplist_open(&ds_next
->ds_deadlist
, mos
,
1834 ds_next
->ds_phys
->ds_deadlist_obj
));
1835 ds
->ds_phys
->ds_deadlist_obj
= 0;
1837 if (ds_next
->ds_phys
->ds_next_snap_obj
!= 0) {
1839 * Update next's unique to include blocks which
1840 * were previously shared by only this snapshot
1841 * and it. Those blocks will be born after the
1842 * prev snap and before this snap, and will have
1843 * died after the next snap and before the one
1844 * after that (ie. be on the snap after next's
1847 * XXX we're doing this long task with the
1850 dsl_dataset_t
*ds_after_next
;
1853 VERIFY(0 == dsl_dataset_hold_obj(dp
,
1854 ds_next
->ds_phys
->ds_next_snap_obj
,
1855 FTAG
, &ds_after_next
));
1858 bplist_space_birthrange(&ds_after_next
->ds_deadlist
,
1859 ds
->ds_phys
->ds_prev_snap_txg
,
1860 ds
->ds_phys
->ds_creation_txg
, &space
));
1861 ds_next
->ds_phys
->ds_unique_bytes
+= space
;
1863 dsl_dataset_rele(ds_after_next
, FTAG
);
1864 ASSERT3P(ds_next
->ds_prev
, ==, NULL
);
1866 ASSERT3P(ds_next
->ds_prev
, ==, ds
);
1867 dsl_dataset_drop_ref(ds_next
->ds_prev
, ds_next
);
1868 ds_next
->ds_prev
= NULL
;
1870 VERIFY(0 == dsl_dataset_get_ref(dp
,
1871 ds
->ds_phys
->ds_prev_snap_obj
,
1872 ds_next
, &ds_next
->ds_prev
));
1875 dsl_dataset_recalc_head_uniq(ds_next
);
1878 * Reduce the amount of our unconsmed refreservation
1879 * being charged to our parent by the amount of
1880 * new unique data we have gained.
1882 if (old_unique
< ds_next
->ds_reserved
) {
1884 uint64_t new_unique
=
1885 ds_next
->ds_phys
->ds_unique_bytes
;
1887 ASSERT(old_unique
<= new_unique
);
1888 mrsdelta
= MIN(new_unique
- old_unique
,
1889 ds_next
->ds_reserved
- old_unique
);
1890 dsl_dir_diduse_space(ds
->ds_dir
,
1891 DD_USED_REFRSRV
, -mrsdelta
, 0, 0, tx
);
1894 dsl_dataset_rele(ds_next
, FTAG
);
1897 * There's no next snapshot, so this is a head dataset.
1898 * Destroy the deadlist. Unless it's a clone, the
1899 * deadlist should be empty. (If it's a clone, it's
1900 * safe to ignore the deadlist contents.)
1904 ASSERT(after_branch_point
|| bplist_empty(&ds
->ds_deadlist
));
1905 bplist_close(&ds
->ds_deadlist
);
1906 bplist_destroy(mos
, ds
->ds_phys
->ds_deadlist_obj
, tx
);
1907 ds
->ds_phys
->ds_deadlist_obj
= 0;
1910 * Free everything that we point to (that's born after
1911 * the previous snapshot, if we are a clone)
1913 * NB: this should be very quick, because we already
1914 * freed all the objects in open context.
1919 err
= traverse_dataset(ds
, ds
->ds_phys
->ds_prev_snap_txg
,
1920 TRAVERSE_POST
, kill_blkptr
, &ka
);
1921 ASSERT3U(err
, ==, 0);
1922 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds
) ||
1923 ds
->ds_phys
->ds_unique_bytes
== 0);
1926 err
= zio_wait(zio
);
1927 ASSERT3U(err
, ==, 0);
1929 if (ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
== ds
->ds_object
) {
1930 /* Erase the link in the dir */
1931 dmu_buf_will_dirty(ds
->ds_dir
->dd_dbuf
, tx
);
1932 ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
= 0;
1933 ASSERT(ds
->ds_phys
->ds_snapnames_zapobj
!= 0);
1934 err
= zap_destroy(mos
, ds
->ds_phys
->ds_snapnames_zapobj
, tx
);
1937 /* remove from snapshot namespace */
1938 dsl_dataset_t
*ds_head
;
1939 ASSERT(ds
->ds_phys
->ds_snapnames_zapobj
== 0);
1940 VERIFY(0 == dsl_dataset_hold_obj(dp
,
1941 ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
, FTAG
, &ds_head
));
1942 VERIFY(0 == dsl_dataset_get_snapname(ds
));
1947 err
= dsl_dataset_snap_lookup(ds_head
,
1948 ds
->ds_snapname
, &val
);
1949 ASSERT3U(err
, ==, 0);
1950 ASSERT3U(val
, ==, obj
);
1953 err
= dsl_dataset_snap_remove(ds_head
, ds
->ds_snapname
, tx
);
1955 dsl_dataset_rele(ds_head
, FTAG
);
1958 if (ds_prev
&& ds
->ds_prev
!= ds_prev
)
1959 dsl_dataset_rele(ds_prev
, FTAG
);
1961 spa_prop_clear_bootfs(dp
->dp_spa
, ds
->ds_object
, tx
);
1962 spa_history_internal_log(LOG_DS_DESTROY
, dp
->dp_spa
, tx
,
1963 cr
, "dataset = %llu", ds
->ds_object
);
1965 if (ds
->ds_phys
->ds_next_clones_obj
!= 0) {
1967 ASSERT(0 == zap_count(mos
,
1968 ds
->ds_phys
->ds_next_clones_obj
, &count
) && count
== 0);
1969 VERIFY(0 == dmu_object_free(mos
,
1970 ds
->ds_phys
->ds_next_clones_obj
, tx
));
1972 if (ds
->ds_phys
->ds_props_obj
!= 0)
1973 VERIFY(0 == zap_destroy(mos
, ds
->ds_phys
->ds_props_obj
, tx
));
1974 if (ds
->ds_phys
->ds_userrefs_obj
!= 0)
1975 VERIFY(0 == zap_destroy(mos
, ds
->ds_phys
->ds_userrefs_obj
, tx
));
1976 dsl_dir_close(ds
->ds_dir
, ds
);
1978 dsl_dataset_drain_refs(ds
, tag
);
1979 VERIFY(0 == dmu_object_free(mos
, obj
, tx
));
1981 if (dsda
->rm_origin
) {
1983 * Remove the origin of the clone we just destroyed.
1985 dsl_dataset_t
*origin
= ds
->ds_prev
;
1986 struct dsl_ds_destroyarg ndsda
= {0};
1988 ASSERT3P(origin
, ==, dsda
->rm_origin
);
1989 if (origin
->ds_user_ptr
) {
1990 origin
->ds_user_evict_func(origin
, origin
->ds_user_ptr
);
1991 origin
->ds_user_ptr
= NULL
;
1994 dsl_dataset_rele(origin
, tag
);
1998 dsl_dataset_destroy_sync(&ndsda
, tag
, cr
, tx
);
2003 dsl_dataset_snapshot_reserve_space(dsl_dataset_t
*ds
, dmu_tx_t
*tx
)
2007 if (!dmu_tx_is_syncing(tx
))
2011 * If there's an fs-only reservation, any blocks that might become
2012 * owned by the snapshot dataset must be accommodated by space
2013 * outside of the reservation.
2015 asize
= MIN(dsl_dataset_unique(ds
), ds
->ds_reserved
);
2016 if (asize
> dsl_dir_space_available(ds
->ds_dir
, NULL
, 0, FALSE
))
2020 * Propogate any reserved space for this snapshot to other
2021 * snapshot checks in this sync group.
2024 dsl_dir_willuse_space(ds
->ds_dir
, asize
, tx
);
2031 dsl_dataset_snapshot_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
2033 dsl_dataset_t
*ds
= arg1
;
2034 const char *snapname
= arg2
;
2039 * We don't allow multiple snapshots of the same txg. If there
2040 * is already one, try again.
2042 if (ds
->ds_phys
->ds_prev_snap_txg
>= tx
->tx_txg
)
2046 * Check for conflicting name snapshot name.
2048 err
= dsl_dataset_snap_lookup(ds
, snapname
, &value
);
2055 * Check that the dataset's name is not too long. Name consists
2056 * of the dataset's length + 1 for the @-sign + snapshot name's length
2058 if (dsl_dataset_namelen(ds
) + 1 + strlen(snapname
) >= MAXNAMELEN
)
2059 return (ENAMETOOLONG
);
2061 err
= dsl_dataset_snapshot_reserve_space(ds
, tx
);
2065 ds
->ds_trysnap_txg
= tx
->tx_txg
;
2070 dsl_dataset_snapshot_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
2072 dsl_dataset_t
*ds
= arg1
;
2073 const char *snapname
= arg2
;
2074 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
2076 dsl_dataset_phys_t
*dsphys
;
2077 uint64_t dsobj
, crtxg
;
2078 objset_t
*mos
= dp
->dp_meta_objset
;
2081 ASSERT(RW_WRITE_HELD(&dp
->dp_config_rwlock
));
2084 * The origin's ds_creation_txg has to be < TXG_INITIAL
2086 if (strcmp(snapname
, ORIGIN_DIR_NAME
) == 0)
2091 dsobj
= dmu_object_alloc(mos
, DMU_OT_DSL_DATASET
, 0,
2092 DMU_OT_DSL_DATASET
, sizeof (dsl_dataset_phys_t
), tx
);
2093 VERIFY(0 == dmu_bonus_hold(mos
, dsobj
, FTAG
, &dbuf
));
2094 dmu_buf_will_dirty(dbuf
, tx
);
2095 dsphys
= dbuf
->db_data
;
2096 bzero(dsphys
, sizeof (dsl_dataset_phys_t
));
2097 dsphys
->ds_dir_obj
= ds
->ds_dir
->dd_object
;
2098 dsphys
->ds_fsid_guid
= unique_create();
2099 (void) random_get_pseudo_bytes((void*)&dsphys
->ds_guid
,
2100 sizeof (dsphys
->ds_guid
));
2101 dsphys
->ds_prev_snap_obj
= ds
->ds_phys
->ds_prev_snap_obj
;
2102 dsphys
->ds_prev_snap_txg
= ds
->ds_phys
->ds_prev_snap_txg
;
2103 dsphys
->ds_next_snap_obj
= ds
->ds_object
;
2104 dsphys
->ds_num_children
= 1;
2105 dsphys
->ds_creation_time
= gethrestime_sec();
2106 dsphys
->ds_creation_txg
= crtxg
;
2107 dsphys
->ds_deadlist_obj
= ds
->ds_phys
->ds_deadlist_obj
;
2108 dsphys
->ds_used_bytes
= ds
->ds_phys
->ds_used_bytes
;
2109 dsphys
->ds_compressed_bytes
= ds
->ds_phys
->ds_compressed_bytes
;
2110 dsphys
->ds_uncompressed_bytes
= ds
->ds_phys
->ds_uncompressed_bytes
;
2111 dsphys
->ds_flags
= ds
->ds_phys
->ds_flags
;
2112 dsphys
->ds_bp
= ds
->ds_phys
->ds_bp
;
2113 dmu_buf_rele(dbuf
, FTAG
);
2115 ASSERT3U(ds
->ds_prev
!= 0, ==, ds
->ds_phys
->ds_prev_snap_obj
!= 0);
2117 uint64_t next_clones_obj
=
2118 ds
->ds_prev
->ds_phys
->ds_next_clones_obj
;
2119 ASSERT(ds
->ds_prev
->ds_phys
->ds_next_snap_obj
==
2121 ds
->ds_prev
->ds_phys
->ds_num_children
> 1);
2122 if (ds
->ds_prev
->ds_phys
->ds_next_snap_obj
== ds
->ds_object
) {
2123 dmu_buf_will_dirty(ds
->ds_prev
->ds_dbuf
, tx
);
2124 ASSERT3U(ds
->ds_phys
->ds_prev_snap_txg
, ==,
2125 ds
->ds_prev
->ds_phys
->ds_creation_txg
);
2126 ds
->ds_prev
->ds_phys
->ds_next_snap_obj
= dsobj
;
2127 } else if (next_clones_obj
!= 0) {
2128 VERIFY3U(0, ==, zap_remove_int(mos
,
2129 next_clones_obj
, dsphys
->ds_next_snap_obj
, tx
));
2130 VERIFY3U(0, ==, zap_add_int(mos
,
2131 next_clones_obj
, dsobj
, tx
));
2136 * If we have a reference-reservation on this dataset, we will
2137 * need to increase the amount of refreservation being charged
2138 * since our unique space is going to zero.
2140 if (ds
->ds_reserved
) {
2141 int64_t add
= MIN(dsl_dataset_unique(ds
), ds
->ds_reserved
);
2142 dsl_dir_diduse_space(ds
->ds_dir
, DD_USED_REFRSRV
,
2146 bplist_close(&ds
->ds_deadlist
);
2147 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
2148 ASSERT3U(ds
->ds_phys
->ds_prev_snap_txg
, <, tx
->tx_txg
);
2149 ds
->ds_phys
->ds_prev_snap_obj
= dsobj
;
2150 ds
->ds_phys
->ds_prev_snap_txg
= crtxg
;
2151 ds
->ds_phys
->ds_unique_bytes
= 0;
2152 if (spa_version(dp
->dp_spa
) >= SPA_VERSION_UNIQUE_ACCURATE
)
2153 ds
->ds_phys
->ds_flags
|= DS_FLAG_UNIQUE_ACCURATE
;
2154 ds
->ds_phys
->ds_deadlist_obj
=
2155 bplist_create(mos
, DSL_DEADLIST_BLOCKSIZE
, tx
);
2156 VERIFY(0 == bplist_open(&ds
->ds_deadlist
, mos
,
2157 ds
->ds_phys
->ds_deadlist_obj
));
2159 dprintf("snap '%s' -> obj %llu\n", snapname
, dsobj
);
2160 err
= zap_add(mos
, ds
->ds_phys
->ds_snapnames_zapobj
,
2161 snapname
, 8, 1, &dsobj
, tx
);
2165 dsl_dataset_drop_ref(ds
->ds_prev
, ds
);
2166 VERIFY(0 == dsl_dataset_get_ref(dp
,
2167 ds
->ds_phys
->ds_prev_snap_obj
, ds
, &ds
->ds_prev
));
2169 dsl_pool_ds_snapshotted(ds
, tx
);
2171 spa_history_internal_log(LOG_DS_SNAPSHOT
, dp
->dp_spa
, tx
, cr
,
2172 "dataset = %llu", dsobj
);
2176 dsl_dataset_sync(dsl_dataset_t
*ds
, zio_t
*zio
, dmu_tx_t
*tx
)
2178 ASSERT(dmu_tx_is_syncing(tx
));
2179 ASSERT(ds
->ds_user_ptr
!= NULL
);
2180 ASSERT(ds
->ds_phys
->ds_next_snap_obj
== 0);
2183 * in case we had to change ds_fsid_guid when we opened it,
2186 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
2187 ds
->ds_phys
->ds_fsid_guid
= ds
->ds_fsid_guid
;
2189 dsl_dir_dirty(ds
->ds_dir
, tx
);
2190 dmu_objset_sync(ds
->ds_user_ptr
, zio
, tx
);
2194 dsl_dataset_stats(dsl_dataset_t
*ds
, nvlist_t
*nv
)
2196 uint64_t refd
, avail
, uobjs
, aobjs
;
2198 dsl_dir_stats(ds
->ds_dir
, nv
);
2200 dsl_dataset_space(ds
, &refd
, &avail
, &uobjs
, &aobjs
);
2201 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_AVAILABLE
, avail
);
2202 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_REFERENCED
, refd
);
2204 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_CREATION
,
2205 ds
->ds_phys
->ds_creation_time
);
2206 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_CREATETXG
,
2207 ds
->ds_phys
->ds_creation_txg
);
2208 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_REFQUOTA
,
2210 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_REFRESERVATION
,
2212 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_GUID
,
2213 ds
->ds_phys
->ds_guid
);
2214 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_USERREFS
, ds
->ds_userrefs
);
2215 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_DEFER_DESTROY
,
2216 DS_IS_DEFER_DESTROY(ds
) ? 1 : 0);
2218 if (ds
->ds_phys
->ds_next_snap_obj
) {
2220 * This is a snapshot; override the dd's space used with
2221 * our unique space and compression ratio.
2223 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_USED
,
2224 ds
->ds_phys
->ds_unique_bytes
);
2225 dsl_prop_nvlist_add_uint64(nv
, ZFS_PROP_COMPRESSRATIO
,
2226 ds
->ds_phys
->ds_compressed_bytes
== 0 ? 100 :
2227 (ds
->ds_phys
->ds_uncompressed_bytes
* 100 /
2228 ds
->ds_phys
->ds_compressed_bytes
));
2233 dsl_dataset_fast_stat(dsl_dataset_t
*ds
, dmu_objset_stats_t
*stat
)
2235 stat
->dds_creation_txg
= ds
->ds_phys
->ds_creation_txg
;
2236 stat
->dds_inconsistent
= ds
->ds_phys
->ds_flags
& DS_FLAG_INCONSISTENT
;
2237 stat
->dds_guid
= ds
->ds_phys
->ds_guid
;
2238 if (ds
->ds_phys
->ds_next_snap_obj
) {
2239 stat
->dds_is_snapshot
= B_TRUE
;
2240 stat
->dds_num_clones
= ds
->ds_phys
->ds_num_children
- 1;
2242 stat
->dds_is_snapshot
= B_FALSE
;
2243 stat
->dds_num_clones
= 0;
2246 /* clone origin is really a dsl_dir thing... */
2247 rw_enter(&ds
->ds_dir
->dd_pool
->dp_config_rwlock
, RW_READER
);
2248 if (dsl_dir_is_clone(ds
->ds_dir
)) {
2251 VERIFY(0 == dsl_dataset_get_ref(ds
->ds_dir
->dd_pool
,
2252 ds
->ds_dir
->dd_phys
->dd_origin_obj
, FTAG
, &ods
));
2253 dsl_dataset_name(ods
, stat
->dds_origin
);
2254 dsl_dataset_drop_ref(ods
, FTAG
);
2256 stat
->dds_origin
[0] = '\0';
2258 rw_exit(&ds
->ds_dir
->dd_pool
->dp_config_rwlock
);
2262 dsl_dataset_fsid_guid(dsl_dataset_t
*ds
)
2264 return (ds
->ds_fsid_guid
);
2268 dsl_dataset_space(dsl_dataset_t
*ds
,
2269 uint64_t *refdbytesp
, uint64_t *availbytesp
,
2270 uint64_t *usedobjsp
, uint64_t *availobjsp
)
2272 *refdbytesp
= ds
->ds_phys
->ds_used_bytes
;
2273 *availbytesp
= dsl_dir_space_available(ds
->ds_dir
, NULL
, 0, TRUE
);
2274 if (ds
->ds_reserved
> ds
->ds_phys
->ds_unique_bytes
)
2275 *availbytesp
+= ds
->ds_reserved
- ds
->ds_phys
->ds_unique_bytes
;
2276 if (ds
->ds_quota
!= 0) {
2278 * Adjust available bytes according to refquota
2280 if (*refdbytesp
< ds
->ds_quota
)
2281 *availbytesp
= MIN(*availbytesp
,
2282 ds
->ds_quota
- *refdbytesp
);
2286 *usedobjsp
= ds
->ds_phys
->ds_bp
.blk_fill
;
2287 *availobjsp
= DN_MAX_OBJECT
- *usedobjsp
;
2291 dsl_dataset_modified_since_lastsnap(dsl_dataset_t
*ds
)
2293 dsl_pool_t
*dp
= ds
->ds_dir
->dd_pool
;
2295 ASSERT(RW_LOCK_HELD(&dp
->dp_config_rwlock
) ||
2296 dsl_pool_sync_context(dp
));
2297 if (ds
->ds_prev
== NULL
)
2299 if (ds
->ds_phys
->ds_bp
.blk_birth
>
2300 ds
->ds_prev
->ds_phys
->ds_creation_txg
)
2307 dsl_dataset_snapshot_rename_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
2309 dsl_dataset_t
*ds
= arg1
;
2310 char *newsnapname
= arg2
;
2311 dsl_dir_t
*dd
= ds
->ds_dir
;
2316 err
= dsl_dataset_hold_obj(dd
->dd_pool
,
2317 dd
->dd_phys
->dd_head_dataset_obj
, FTAG
, &hds
);
2321 /* new name better not be in use */
2322 err
= dsl_dataset_snap_lookup(hds
, newsnapname
, &val
);
2323 dsl_dataset_rele(hds
, FTAG
);
2327 else if (err
== ENOENT
)
2330 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2331 if (dsl_dir_namelen(ds
->ds_dir
) + 1 + strlen(newsnapname
) >= MAXNAMELEN
)
2338 dsl_dataset_snapshot_rename_sync(void *arg1
, void *arg2
,
2339 cred_t
*cr
, dmu_tx_t
*tx
)
2341 dsl_dataset_t
*ds
= arg1
;
2342 const char *newsnapname
= arg2
;
2343 dsl_dir_t
*dd
= ds
->ds_dir
;
2344 objset_t
*mos
= dd
->dd_pool
->dp_meta_objset
;
2348 ASSERT(ds
->ds_phys
->ds_next_snap_obj
!= 0);
2350 VERIFY(0 == dsl_dataset_hold_obj(dd
->dd_pool
,
2351 dd
->dd_phys
->dd_head_dataset_obj
, FTAG
, &hds
));
2353 VERIFY(0 == dsl_dataset_get_snapname(ds
));
2354 err
= dsl_dataset_snap_remove(hds
, ds
->ds_snapname
, tx
);
2355 ASSERT3U(err
, ==, 0);
2356 mutex_enter(&ds
->ds_lock
);
2357 (void) strcpy(ds
->ds_snapname
, newsnapname
);
2358 mutex_exit(&ds
->ds_lock
);
2359 err
= zap_add(mos
, hds
->ds_phys
->ds_snapnames_zapobj
,
2360 ds
->ds_snapname
, 8, 1, &ds
->ds_object
, tx
);
2361 ASSERT3U(err
, ==, 0);
2363 spa_history_internal_log(LOG_DS_RENAME
, dd
->dd_pool
->dp_spa
, tx
,
2364 cr
, "dataset = %llu", ds
->ds_object
);
2365 dsl_dataset_rele(hds
, FTAG
);
2368 struct renamesnaparg
{
2369 dsl_sync_task_group_t
*dstg
;
2370 char failed
[MAXPATHLEN
];
2376 dsl_snapshot_rename_one(char *name
, void *arg
)
2378 struct renamesnaparg
*ra
= arg
;
2379 dsl_dataset_t
*ds
= NULL
;
2383 cp
= name
+ strlen(name
);
2385 (void) strcpy(cp
+ 1, ra
->oldsnap
);
2388 * For recursive snapshot renames the parent won't be changing
2389 * so we just pass name for both the to/from argument.
2391 err
= zfs_secpolicy_rename_perms(name
, name
, CRED());
2392 if (err
== ENOENT
) {
2395 (void) strcpy(ra
->failed
, name
);
2401 * For all filesystems undergoing rename, we'll need to unmount it.
2403 (void) zfs_unmount_snap(name
, NULL
);
2405 err
= dsl_dataset_hold(name
, ra
->dstg
, &ds
);
2407 if (err
== ENOENT
) {
2410 (void) strcpy(ra
->failed
, name
);
2414 dsl_sync_task_create(ra
->dstg
, dsl_dataset_snapshot_rename_check
,
2415 dsl_dataset_snapshot_rename_sync
, ds
, ra
->newsnap
, 0);
2421 dsl_recursive_rename(char *oldname
, const char *newname
)
2424 struct renamesnaparg
*ra
;
2425 dsl_sync_task_t
*dst
;
2427 char *cp
, *fsname
= spa_strdup(oldname
);
2428 int len
= strlen(oldname
);
2430 /* truncate the snapshot name to get the fsname */
2431 cp
= strchr(fsname
, '@');
2434 err
= spa_open(fsname
, &spa
, FTAG
);
2436 kmem_free(fsname
, len
+ 1);
2439 ra
= kmem_alloc(sizeof (struct renamesnaparg
), KM_SLEEP
);
2440 ra
->dstg
= dsl_sync_task_group_create(spa_get_dsl(spa
));
2442 ra
->oldsnap
= strchr(oldname
, '@') + 1;
2443 ra
->newsnap
= strchr(newname
, '@') + 1;
2446 err
= dmu_objset_find(fsname
, dsl_snapshot_rename_one
, ra
,
2448 kmem_free(fsname
, len
+ 1);
2451 err
= dsl_sync_task_group_wait(ra
->dstg
);
2454 for (dst
= list_head(&ra
->dstg
->dstg_tasks
); dst
;
2455 dst
= list_next(&ra
->dstg
->dstg_tasks
, dst
)) {
2456 dsl_dataset_t
*ds
= dst
->dst_arg1
;
2458 dsl_dir_name(ds
->ds_dir
, ra
->failed
);
2459 (void) strcat(ra
->failed
, "@");
2460 (void) strcat(ra
->failed
, ra
->newsnap
);
2462 dsl_dataset_rele(ds
, ra
->dstg
);
2466 (void) strcpy(oldname
, ra
->failed
);
2468 dsl_sync_task_group_destroy(ra
->dstg
);
2469 kmem_free(ra
, sizeof (struct renamesnaparg
));
2470 spa_close(spa
, FTAG
);
2475 dsl_valid_rename(char *oldname
, void *arg
)
2477 int delta
= *(int *)arg
;
2479 if (strlen(oldname
) + delta
>= MAXNAMELEN
)
2480 return (ENAMETOOLONG
);
2485 #pragma weak dmu_objset_rename = dsl_dataset_rename
2487 dsl_dataset_rename(char *oldname
, const char *newname
, boolean_t recursive
)
2494 err
= dsl_dir_open(oldname
, FTAG
, &dd
, &tail
);
2498 * If there are more than 2 references there may be holds
2499 * hanging around that haven't been cleared out yet.
2501 if (dmu_buf_refcount(dd
->dd_dbuf
) > 2)
2502 txg_wait_synced(dd
->dd_pool
, 0);
2504 int delta
= strlen(newname
) - strlen(oldname
);
2506 /* if we're growing, validate child name lengths */
2508 err
= dmu_objset_find(oldname
, dsl_valid_rename
,
2509 &delta
, DS_FIND_CHILDREN
| DS_FIND_SNAPSHOTS
);
2512 err
= dsl_dir_rename(dd
, newname
);
2513 dsl_dir_close(dd
, FTAG
);
2516 if (tail
[0] != '@') {
2517 /* the name ended in a nonexistant component */
2518 dsl_dir_close(dd
, FTAG
);
2522 dsl_dir_close(dd
, FTAG
);
2524 /* new name must be snapshot in same filesystem */
2525 tail
= strchr(newname
, '@');
2529 if (strncmp(oldname
, newname
, tail
- newname
) != 0)
2533 err
= dsl_recursive_rename(oldname
, newname
);
2535 err
= dsl_dataset_hold(oldname
, FTAG
, &ds
);
2539 err
= dsl_sync_task_do(ds
->ds_dir
->dd_pool
,
2540 dsl_dataset_snapshot_rename_check
,
2541 dsl_dataset_snapshot_rename_sync
, ds
, (char *)tail
, 1);
2543 dsl_dataset_rele(ds
, FTAG
);
2549 struct promotenode
{
2555 list_t shared_snaps
, origin_snaps
, clone_snaps
;
2556 dsl_dataset_t
*origin_origin
, *origin_head
;
2557 uint64_t used
, comp
, uncomp
, unique
, cloneusedsnap
, originusedsnap
;
2560 static int snaplist_space(list_t
*l
, uint64_t mintxg
, uint64_t *spacep
);
2564 dsl_dataset_promote_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
2566 dsl_dataset_t
*hds
= arg1
;
2567 struct promotearg
*pa
= arg2
;
2568 struct promotenode
*snap
= list_head(&pa
->shared_snaps
);
2569 dsl_dataset_t
*origin_ds
= snap
->ds
;
2572 /* Check that it is a real clone */
2573 if (!dsl_dir_is_clone(hds
->ds_dir
))
2576 /* Since this is so expensive, don't do the preliminary check */
2577 if (!dmu_tx_is_syncing(tx
))
2580 if (hds
->ds_phys
->ds_flags
& DS_FLAG_NOPROMOTE
)
2583 /* compute origin's new unique space */
2584 snap
= list_tail(&pa
->clone_snaps
);
2585 ASSERT3U(snap
->ds
->ds_phys
->ds_prev_snap_obj
, ==, origin_ds
->ds_object
);
2586 err
= bplist_space_birthrange(&snap
->ds
->ds_deadlist
,
2587 origin_ds
->ds_phys
->ds_prev_snap_txg
, UINT64_MAX
, &pa
->unique
);
2592 * Walk the snapshots that we are moving
2594 * Compute space to transfer. Consider the incremental changes
2595 * to used for each snapshot:
2596 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2597 * So each snapshot gave birth to:
2598 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2599 * So a sequence would look like:
2600 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2601 * Which simplifies to:
2602 * uN + kN + kN-1 + ... + k1 + k0
2603 * Note however, if we stop before we reach the ORIGIN we get:
2604 * uN + kN + kN-1 + ... + kM - uM-1
2606 pa
->used
= origin_ds
->ds_phys
->ds_used_bytes
;
2607 pa
->comp
= origin_ds
->ds_phys
->ds_compressed_bytes
;
2608 pa
->uncomp
= origin_ds
->ds_phys
->ds_uncompressed_bytes
;
2609 for (snap
= list_head(&pa
->shared_snaps
); snap
;
2610 snap
= list_next(&pa
->shared_snaps
, snap
)) {
2611 uint64_t val
, dlused
, dlcomp
, dluncomp
;
2612 dsl_dataset_t
*ds
= snap
->ds
;
2614 /* Check that the snapshot name does not conflict */
2615 VERIFY(0 == dsl_dataset_get_snapname(ds
));
2616 err
= dsl_dataset_snap_lookup(hds
, ds
->ds_snapname
, &val
);
2622 /* The very first snapshot does not have a deadlist */
2623 if (ds
->ds_phys
->ds_prev_snap_obj
== 0)
2626 if (err
= bplist_space(&ds
->ds_deadlist
,
2627 &dlused
, &dlcomp
, &dluncomp
))
2631 pa
->uncomp
+= dluncomp
;
2635 * If we are a clone of a clone then we never reached ORIGIN,
2636 * so we need to subtract out the clone origin's used space.
2638 if (pa
->origin_origin
) {
2639 pa
->used
-= pa
->origin_origin
->ds_phys
->ds_used_bytes
;
2640 pa
->comp
-= pa
->origin_origin
->ds_phys
->ds_compressed_bytes
;
2641 pa
->uncomp
-= pa
->origin_origin
->ds_phys
->ds_uncompressed_bytes
;
2644 /* Check that there is enough space here */
2645 err
= dsl_dir_transfer_possible(origin_ds
->ds_dir
, hds
->ds_dir
,
2651 * Compute the amounts of space that will be used by snapshots
2652 * after the promotion (for both origin and clone). For each,
2653 * it is the amount of space that will be on all of their
2654 * deadlists (that was not born before their new origin).
2656 if (hds
->ds_dir
->dd_phys
->dd_flags
& DD_FLAG_USED_BREAKDOWN
) {
2660 * Note, typically this will not be a clone of a clone,
2661 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
2662 * these snaplist_space() -> bplist_space_birthrange()
2663 * calls will be fast because they do not have to
2664 * iterate over all bps.
2666 snap
= list_head(&pa
->origin_snaps
);
2667 err
= snaplist_space(&pa
->shared_snaps
,
2668 snap
->ds
->ds_origin_txg
, &pa
->cloneusedsnap
);
2672 err
= snaplist_space(&pa
->clone_snaps
,
2673 snap
->ds
->ds_origin_txg
, &space
);
2676 pa
->cloneusedsnap
+= space
;
2678 if (origin_ds
->ds_dir
->dd_phys
->dd_flags
& DD_FLAG_USED_BREAKDOWN
) {
2679 err
= snaplist_space(&pa
->origin_snaps
,
2680 origin_ds
->ds_phys
->ds_creation_txg
, &pa
->originusedsnap
);
2689 dsl_dataset_promote_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
2691 dsl_dataset_t
*hds
= arg1
;
2692 struct promotearg
*pa
= arg2
;
2693 struct promotenode
*snap
= list_head(&pa
->shared_snaps
);
2694 dsl_dataset_t
*origin_ds
= snap
->ds
;
2695 dsl_dataset_t
*origin_head
;
2696 dsl_dir_t
*dd
= hds
->ds_dir
;
2697 dsl_pool_t
*dp
= hds
->ds_dir
->dd_pool
;
2698 dsl_dir_t
*odd
= NULL
;
2699 uint64_t oldnext_obj
;
2702 ASSERT(0 == (hds
->ds_phys
->ds_flags
& DS_FLAG_NOPROMOTE
));
2704 snap
= list_head(&pa
->origin_snaps
);
2705 origin_head
= snap
->ds
;
2708 * We need to explicitly open odd, since origin_ds's dd will be
2711 VERIFY(0 == dsl_dir_open_obj(dp
, origin_ds
->ds_dir
->dd_object
,
2714 /* change origin's next snap */
2715 dmu_buf_will_dirty(origin_ds
->ds_dbuf
, tx
);
2716 oldnext_obj
= origin_ds
->ds_phys
->ds_next_snap_obj
;
2717 snap
= list_tail(&pa
->clone_snaps
);
2718 ASSERT3U(snap
->ds
->ds_phys
->ds_prev_snap_obj
, ==, origin_ds
->ds_object
);
2719 origin_ds
->ds_phys
->ds_next_snap_obj
= snap
->ds
->ds_object
;
2721 /* change the origin's next clone */
2722 if (origin_ds
->ds_phys
->ds_next_clones_obj
) {
2723 VERIFY3U(0, ==, zap_remove_int(dp
->dp_meta_objset
,
2724 origin_ds
->ds_phys
->ds_next_clones_obj
,
2725 origin_ds
->ds_phys
->ds_next_snap_obj
, tx
));
2726 VERIFY3U(0, ==, zap_add_int(dp
->dp_meta_objset
,
2727 origin_ds
->ds_phys
->ds_next_clones_obj
,
2732 dmu_buf_will_dirty(dd
->dd_dbuf
, tx
);
2733 ASSERT3U(dd
->dd_phys
->dd_origin_obj
, ==, origin_ds
->ds_object
);
2734 dd
->dd_phys
->dd_origin_obj
= odd
->dd_phys
->dd_origin_obj
;
2735 hds
->ds_origin_txg
= origin_head
->ds_origin_txg
;
2736 dmu_buf_will_dirty(odd
->dd_dbuf
, tx
);
2737 odd
->dd_phys
->dd_origin_obj
= origin_ds
->ds_object
;
2738 origin_head
->ds_origin_txg
= origin_ds
->ds_phys
->ds_creation_txg
;
2740 /* move snapshots to this dir */
2741 for (snap
= list_head(&pa
->shared_snaps
); snap
;
2742 snap
= list_next(&pa
->shared_snaps
, snap
)) {
2743 dsl_dataset_t
*ds
= snap
->ds
;
2745 /* unregister props as dsl_dir is changing */
2746 if (ds
->ds_user_ptr
) {
2747 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
2748 ds
->ds_user_ptr
= NULL
;
2750 /* move snap name entry */
2751 VERIFY(0 == dsl_dataset_get_snapname(ds
));
2752 VERIFY(0 == dsl_dataset_snap_remove(origin_head
,
2753 ds
->ds_snapname
, tx
));
2754 VERIFY(0 == zap_add(dp
->dp_meta_objset
,
2755 hds
->ds_phys
->ds_snapnames_zapobj
, ds
->ds_snapname
,
2756 8, 1, &ds
->ds_object
, tx
));
2757 /* change containing dsl_dir */
2758 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
2759 ASSERT3U(ds
->ds_phys
->ds_dir_obj
, ==, odd
->dd_object
);
2760 ds
->ds_phys
->ds_dir_obj
= dd
->dd_object
;
2761 ASSERT3P(ds
->ds_dir
, ==, odd
);
2762 dsl_dir_close(ds
->ds_dir
, ds
);
2763 VERIFY(0 == dsl_dir_open_obj(dp
, dd
->dd_object
,
2764 NULL
, ds
, &ds
->ds_dir
));
2766 ASSERT3U(dsl_prop_numcb(ds
), ==, 0);
2770 * Change space accounting.
2771 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2772 * both be valid, or both be 0 (resulting in delta == 0). This
2773 * is true for each of {clone,origin} independently.
2776 delta
= pa
->cloneusedsnap
-
2777 dd
->dd_phys
->dd_used_breakdown
[DD_USED_SNAP
];
2778 ASSERT3S(delta
, >=, 0);
2779 ASSERT3U(pa
->used
, >=, delta
);
2780 dsl_dir_diduse_space(dd
, DD_USED_SNAP
, delta
, 0, 0, tx
);
2781 dsl_dir_diduse_space(dd
, DD_USED_HEAD
,
2782 pa
->used
- delta
, pa
->comp
, pa
->uncomp
, tx
);
2784 delta
= pa
->originusedsnap
-
2785 odd
->dd_phys
->dd_used_breakdown
[DD_USED_SNAP
];
2786 ASSERT3S(delta
, <=, 0);
2787 ASSERT3U(pa
->used
, >=, -delta
);
2788 dsl_dir_diduse_space(odd
, DD_USED_SNAP
, delta
, 0, 0, tx
);
2789 dsl_dir_diduse_space(odd
, DD_USED_HEAD
,
2790 -pa
->used
- delta
, -pa
->comp
, -pa
->uncomp
, tx
);
2792 origin_ds
->ds_phys
->ds_unique_bytes
= pa
->unique
;
2794 /* log history record */
2795 spa_history_internal_log(LOG_DS_PROMOTE
, dd
->dd_pool
->dp_spa
, tx
,
2796 cr
, "dataset = %llu", hds
->ds_object
);
2798 dsl_dir_close(odd
, FTAG
);
2801 static char *snaplist_tag
= "snaplist";
2803 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2804 * (exclusive) and last_obj (inclusive). The list will be in reverse
2805 * order (last_obj will be the list_head()). If first_obj == 0, do all
2806 * snapshots back to this dataset's origin.
2809 snaplist_make(dsl_pool_t
*dp
, boolean_t own
,
2810 uint64_t first_obj
, uint64_t last_obj
, list_t
*l
)
2812 uint64_t obj
= last_obj
;
2814 ASSERT(RW_LOCK_HELD(&dp
->dp_config_rwlock
));
2816 list_create(l
, sizeof (struct promotenode
),
2817 offsetof(struct promotenode
, link
));
2819 while (obj
!= first_obj
) {
2821 struct promotenode
*snap
;
2825 err
= dsl_dataset_own_obj(dp
, obj
,
2826 0, snaplist_tag
, &ds
);
2828 dsl_dataset_make_exclusive(ds
, snaplist_tag
);
2830 err
= dsl_dataset_hold_obj(dp
, obj
, snaplist_tag
, &ds
);
2832 if (err
== ENOENT
) {
2833 /* lost race with snapshot destroy */
2834 struct promotenode
*last
= list_tail(l
);
2835 ASSERT(obj
!= last
->ds
->ds_phys
->ds_prev_snap_obj
);
2836 obj
= last
->ds
->ds_phys
->ds_prev_snap_obj
;
2843 first_obj
= ds
->ds_dir
->dd_phys
->dd_origin_obj
;
2845 snap
= kmem_alloc(sizeof (struct promotenode
), KM_SLEEP
);
2847 list_insert_tail(l
, snap
);
2848 obj
= ds
->ds_phys
->ds_prev_snap_obj
;
2855 snaplist_space(list_t
*l
, uint64_t mintxg
, uint64_t *spacep
)
2857 struct promotenode
*snap
;
2860 for (snap
= list_head(l
); snap
; snap
= list_next(l
, snap
)) {
2862 int err
= bplist_space_birthrange(&snap
->ds
->ds_deadlist
,
2863 mintxg
, UINT64_MAX
, &used
);
2872 snaplist_destroy(list_t
*l
, boolean_t own
)
2874 struct promotenode
*snap
;
2876 if (!l
|| !list_link_active(&l
->list_head
))
2879 while ((snap
= list_tail(l
)) != NULL
) {
2880 list_remove(l
, snap
);
2882 dsl_dataset_disown(snap
->ds
, snaplist_tag
);
2884 dsl_dataset_rele(snap
->ds
, snaplist_tag
);
2885 kmem_free(snap
, sizeof (struct promotenode
));
2891 * Promote a clone. Nomenclature note:
2892 * "clone" or "cds": the original clone which is being promoted
2893 * "origin" or "ods": the snapshot which is originally clone's origin
2894 * "origin head" or "ohds": the dataset which is the head
2895 * (filesystem/volume) for the origin
2896 * "origin origin": the origin of the origin's filesystem (typically
2897 * NULL, indicating that the clone is not a clone of a clone).
2900 dsl_dataset_promote(const char *name
)
2905 dmu_object_info_t doi
;
2906 struct promotearg pa
= { 0 };
2907 struct promotenode
*snap
;
2910 err
= dsl_dataset_hold(name
, FTAG
, &ds
);
2916 err
= dmu_object_info(dp
->dp_meta_objset
,
2917 ds
->ds_phys
->ds_snapnames_zapobj
, &doi
);
2919 dsl_dataset_rele(ds
, FTAG
);
2923 if (dsl_dataset_is_snapshot(ds
) || dd
->dd_phys
->dd_origin_obj
== 0) {
2924 dsl_dataset_rele(ds
, FTAG
);
2929 * We are going to inherit all the snapshots taken before our
2930 * origin (i.e., our new origin will be our parent's origin).
2931 * Take ownership of them so that we can rename them into our
2934 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
2936 err
= snaplist_make(dp
, B_TRUE
, 0, dd
->dd_phys
->dd_origin_obj
,
2941 err
= snaplist_make(dp
, B_FALSE
, 0, ds
->ds_object
, &pa
.clone_snaps
);
2945 snap
= list_head(&pa
.shared_snaps
);
2946 ASSERT3U(snap
->ds
->ds_object
, ==, dd
->dd_phys
->dd_origin_obj
);
2947 err
= snaplist_make(dp
, B_FALSE
, dd
->dd_phys
->dd_origin_obj
,
2948 snap
->ds
->ds_dir
->dd_phys
->dd_head_dataset_obj
, &pa
.origin_snaps
);
2952 if (dsl_dir_is_clone(snap
->ds
->ds_dir
)) {
2953 err
= dsl_dataset_own_obj(dp
,
2954 snap
->ds
->ds_dir
->dd_phys
->dd_origin_obj
,
2955 0, FTAG
, &pa
.origin_origin
);
2961 rw_exit(&dp
->dp_config_rwlock
);
2964 * Add in 128x the snapnames zapobj size, since we will be moving
2965 * a bunch of snapnames to the promoted ds, and dirtying their
2969 err
= dsl_sync_task_do(dp
, dsl_dataset_promote_check
,
2970 dsl_dataset_promote_sync
, ds
, &pa
,
2971 2 + 2 * doi
.doi_physical_blks
);
2974 snaplist_destroy(&pa
.shared_snaps
, B_TRUE
);
2975 snaplist_destroy(&pa
.clone_snaps
, B_FALSE
);
2976 snaplist_destroy(&pa
.origin_snaps
, B_FALSE
);
2977 if (pa
.origin_origin
)
2978 dsl_dataset_disown(pa
.origin_origin
, FTAG
);
2979 dsl_dataset_rele(ds
, FTAG
);
2983 struct cloneswaparg
{
2984 dsl_dataset_t
*cds
; /* clone dataset */
2985 dsl_dataset_t
*ohds
; /* origin's head dataset */
2987 int64_t unused_refres_delta
; /* change in unconsumed refreservation */
2992 dsl_dataset_clone_swap_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
2994 struct cloneswaparg
*csa
= arg1
;
2996 /* they should both be heads */
2997 if (dsl_dataset_is_snapshot(csa
->cds
) ||
2998 dsl_dataset_is_snapshot(csa
->ohds
))
3001 /* the branch point should be just before them */
3002 if (csa
->cds
->ds_prev
!= csa
->ohds
->ds_prev
)
3005 /* cds should be the clone */
3006 if (csa
->cds
->ds_prev
->ds_phys
->ds_next_snap_obj
!=
3007 csa
->ohds
->ds_object
)
3010 /* the clone should be a child of the origin */
3011 if (csa
->cds
->ds_dir
->dd_parent
!= csa
->ohds
->ds_dir
)
3014 /* ohds shouldn't be modified unless 'force' */
3015 if (!csa
->force
&& dsl_dataset_modified_since_lastsnap(csa
->ohds
))
3018 /* adjust amount of any unconsumed refreservation */
3019 csa
->unused_refres_delta
=
3020 (int64_t)MIN(csa
->ohds
->ds_reserved
,
3021 csa
->ohds
->ds_phys
->ds_unique_bytes
) -
3022 (int64_t)MIN(csa
->ohds
->ds_reserved
,
3023 csa
->cds
->ds_phys
->ds_unique_bytes
);
3025 if (csa
->unused_refres_delta
> 0 &&
3026 csa
->unused_refres_delta
>
3027 dsl_dir_space_available(csa
->ohds
->ds_dir
, NULL
, 0, TRUE
))
3035 dsl_dataset_clone_swap_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
3037 struct cloneswaparg
*csa
= arg1
;
3038 dsl_pool_t
*dp
= csa
->cds
->ds_dir
->dd_pool
;
3040 ASSERT(csa
->cds
->ds_reserved
== 0);
3041 ASSERT(csa
->cds
->ds_quota
== csa
->ohds
->ds_quota
);
3043 dmu_buf_will_dirty(csa
->cds
->ds_dbuf
, tx
);
3044 dmu_buf_will_dirty(csa
->ohds
->ds_dbuf
, tx
);
3045 dmu_buf_will_dirty(csa
->cds
->ds_prev
->ds_dbuf
, tx
);
3047 if (csa
->cds
->ds_user_ptr
!= NULL
) {
3048 csa
->cds
->ds_user_evict_func(csa
->cds
, csa
->cds
->ds_user_ptr
);
3049 csa
->cds
->ds_user_ptr
= NULL
;
3052 if (csa
->ohds
->ds_user_ptr
!= NULL
) {
3053 csa
->ohds
->ds_user_evict_func(csa
->ohds
,
3054 csa
->ohds
->ds_user_ptr
);
3055 csa
->ohds
->ds_user_ptr
= NULL
;
3058 /* reset origin's unique bytes */
3059 VERIFY(0 == bplist_space_birthrange(&csa
->cds
->ds_deadlist
,
3060 csa
->cds
->ds_prev
->ds_phys
->ds_prev_snap_txg
, UINT64_MAX
,
3061 &csa
->cds
->ds_prev
->ds_phys
->ds_unique_bytes
));
3066 tmp
= csa
->ohds
->ds_phys
->ds_bp
;
3067 csa
->ohds
->ds_phys
->ds_bp
= csa
->cds
->ds_phys
->ds_bp
;
3068 csa
->cds
->ds_phys
->ds_bp
= tmp
;
3071 /* set dd_*_bytes */
3073 int64_t dused
, dcomp
, duncomp
;
3074 uint64_t cdl_used
, cdl_comp
, cdl_uncomp
;
3075 uint64_t odl_used
, odl_comp
, odl_uncomp
;
3077 ASSERT3U(csa
->cds
->ds_dir
->dd_phys
->
3078 dd_used_breakdown
[DD_USED_SNAP
], ==, 0);
3080 VERIFY(0 == bplist_space(&csa
->cds
->ds_deadlist
, &cdl_used
,
3081 &cdl_comp
, &cdl_uncomp
));
3082 VERIFY(0 == bplist_space(&csa
->ohds
->ds_deadlist
, &odl_used
,
3083 &odl_comp
, &odl_uncomp
));
3085 dused
= csa
->cds
->ds_phys
->ds_used_bytes
+ cdl_used
-
3086 (csa
->ohds
->ds_phys
->ds_used_bytes
+ odl_used
);
3087 dcomp
= csa
->cds
->ds_phys
->ds_compressed_bytes
+ cdl_comp
-
3088 (csa
->ohds
->ds_phys
->ds_compressed_bytes
+ odl_comp
);
3089 duncomp
= csa
->cds
->ds_phys
->ds_uncompressed_bytes
+
3091 (csa
->ohds
->ds_phys
->ds_uncompressed_bytes
+ odl_uncomp
);
3093 dsl_dir_diduse_space(csa
->ohds
->ds_dir
, DD_USED_HEAD
,
3094 dused
, dcomp
, duncomp
, tx
);
3095 dsl_dir_diduse_space(csa
->cds
->ds_dir
, DD_USED_HEAD
,
3096 -dused
, -dcomp
, -duncomp
, tx
);
3099 * The difference in the space used by snapshots is the
3100 * difference in snapshot space due to the head's
3101 * deadlist (since that's the only thing that's
3102 * changing that affects the snapused).
3104 VERIFY(0 == bplist_space_birthrange(&csa
->cds
->ds_deadlist
,
3105 csa
->ohds
->ds_origin_txg
, UINT64_MAX
, &cdl_used
));
3106 VERIFY(0 == bplist_space_birthrange(&csa
->ohds
->ds_deadlist
,
3107 csa
->ohds
->ds_origin_txg
, UINT64_MAX
, &odl_used
));
3108 dsl_dir_transfer_space(csa
->ohds
->ds_dir
, cdl_used
- odl_used
,
3109 DD_USED_HEAD
, DD_USED_SNAP
, tx
);
3112 #define SWITCH64(x, y) \
3114 uint64_t __tmp = (x); \
3119 /* swap ds_*_bytes */
3120 SWITCH64(csa
->ohds
->ds_phys
->ds_used_bytes
,
3121 csa
->cds
->ds_phys
->ds_used_bytes
);
3122 SWITCH64(csa
->ohds
->ds_phys
->ds_compressed_bytes
,
3123 csa
->cds
->ds_phys
->ds_compressed_bytes
);
3124 SWITCH64(csa
->ohds
->ds_phys
->ds_uncompressed_bytes
,
3125 csa
->cds
->ds_phys
->ds_uncompressed_bytes
);
3126 SWITCH64(csa
->ohds
->ds_phys
->ds_unique_bytes
,
3127 csa
->cds
->ds_phys
->ds_unique_bytes
);
3129 /* apply any parent delta for change in unconsumed refreservation */
3130 dsl_dir_diduse_space(csa
->ohds
->ds_dir
, DD_USED_REFRSRV
,
3131 csa
->unused_refres_delta
, 0, 0, tx
);
3133 /* swap deadlists */
3134 bplist_close(&csa
->cds
->ds_deadlist
);
3135 bplist_close(&csa
->ohds
->ds_deadlist
);
3136 SWITCH64(csa
->ohds
->ds_phys
->ds_deadlist_obj
,
3137 csa
->cds
->ds_phys
->ds_deadlist_obj
);
3138 VERIFY(0 == bplist_open(&csa
->cds
->ds_deadlist
, dp
->dp_meta_objset
,
3139 csa
->cds
->ds_phys
->ds_deadlist_obj
));
3140 VERIFY(0 == bplist_open(&csa
->ohds
->ds_deadlist
, dp
->dp_meta_objset
,
3141 csa
->ohds
->ds_phys
->ds_deadlist_obj
));
3143 dsl_pool_ds_clone_swapped(csa
->ohds
, csa
->cds
, tx
);
3147 * Swap 'clone' with its origin head file system. Used at the end
3148 * of "online recv" to swizzle the file system to the new version.
3151 dsl_dataset_clone_swap(dsl_dataset_t
*clone
, dsl_dataset_t
*origin_head
,
3154 struct cloneswaparg csa
;
3157 ASSERT(clone
->ds_owner
);
3158 ASSERT(origin_head
->ds_owner
);
3160 /* Need exclusive access for the swap */
3161 rw_enter(&clone
->ds_rwlock
, RW_WRITER
);
3162 if (!rw_tryenter(&origin_head
->ds_rwlock
, RW_WRITER
)) {
3163 rw_exit(&clone
->ds_rwlock
);
3164 rw_enter(&origin_head
->ds_rwlock
, RW_WRITER
);
3165 if (!rw_tryenter(&clone
->ds_rwlock
, RW_WRITER
)) {
3166 rw_exit(&origin_head
->ds_rwlock
);
3171 csa
.ohds
= origin_head
;
3173 error
= dsl_sync_task_do(clone
->ds_dir
->dd_pool
,
3174 dsl_dataset_clone_swap_check
,
3175 dsl_dataset_clone_swap_sync
, &csa
, NULL
, 9);
3180 * Given a pool name and a dataset object number in that pool,
3181 * return the name of that dataset.
3184 dsl_dsobj_to_dsname(char *pname
, uint64_t obj
, char *buf
)
3191 if ((error
= spa_open(pname
, &spa
, FTAG
)) != 0)
3193 dp
= spa_get_dsl(spa
);
3194 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
3195 if ((error
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
)) == 0) {
3196 dsl_dataset_name(ds
, buf
);
3197 dsl_dataset_rele(ds
, FTAG
);
3199 rw_exit(&dp
->dp_config_rwlock
);
3200 spa_close(spa
, FTAG
);
3206 dsl_dataset_check_quota(dsl_dataset_t
*ds
, boolean_t check_quota
,
3207 uint64_t asize
, uint64_t inflight
, uint64_t *used
, uint64_t *ref_rsrv
)
3211 ASSERT3S(asize
, >, 0);
3214 * *ref_rsrv is the portion of asize that will come from any
3215 * unconsumed refreservation space.
3219 mutex_enter(&ds
->ds_lock
);
3221 * Make a space adjustment for reserved bytes.
3223 if (ds
->ds_reserved
> ds
->ds_phys
->ds_unique_bytes
) {
3225 ds
->ds_reserved
- ds
->ds_phys
->ds_unique_bytes
);
3226 *used
-= (ds
->ds_reserved
- ds
->ds_phys
->ds_unique_bytes
);
3228 asize
- MIN(asize
, parent_delta(ds
, asize
+ inflight
));
3231 if (!check_quota
|| ds
->ds_quota
== 0) {
3232 mutex_exit(&ds
->ds_lock
);
3236 * If they are requesting more space, and our current estimate
3237 * is over quota, they get to try again unless the actual
3238 * on-disk is over quota and there are no pending changes (which
3239 * may free up space for us).
3241 if (ds
->ds_phys
->ds_used_bytes
+ inflight
>= ds
->ds_quota
) {
3242 if (inflight
> 0 || ds
->ds_phys
->ds_used_bytes
< ds
->ds_quota
)
3247 mutex_exit(&ds
->ds_lock
);
3254 dsl_dataset_set_quota_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
3256 dsl_dataset_t
*ds
= arg1
;
3257 uint64_t *quotap
= arg2
;
3258 uint64_t new_quota
= *quotap
;
3260 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) < SPA_VERSION_REFQUOTA
)
3266 if (new_quota
< ds
->ds_phys
->ds_used_bytes
||
3267 new_quota
< ds
->ds_reserved
)
3275 dsl_dataset_set_quota_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
3277 dsl_dataset_t
*ds
= arg1
;
3278 uint64_t *quotap
= arg2
;
3279 uint64_t new_quota
= *quotap
;
3281 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
3283 ds
->ds_quota
= new_quota
;
3285 dsl_dir_prop_set_uint64_sync(ds
->ds_dir
, "refquota", new_quota
, cr
, tx
);
3287 spa_history_internal_log(LOG_DS_REFQUOTA
, ds
->ds_dir
->dd_pool
->dp_spa
,
3288 tx
, cr
, "%lld dataset = %llu ",
3289 (longlong_t
)new_quota
, ds
->ds_object
);
3293 dsl_dataset_set_quota(const char *dsname
, uint64_t quota
)
3298 err
= dsl_dataset_hold(dsname
, FTAG
, &ds
);
3302 if (quota
!= ds
->ds_quota
) {
3304 * If someone removes a file, then tries to set the quota, we
3305 * want to make sure the file freeing takes effect.
3307 txg_wait_open(ds
->ds_dir
->dd_pool
, 0);
3309 err
= dsl_sync_task_do(ds
->ds_dir
->dd_pool
,
3310 dsl_dataset_set_quota_check
, dsl_dataset_set_quota_sync
,
3313 dsl_dataset_rele(ds
, FTAG
);
3318 dsl_dataset_set_reservation_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
3320 dsl_dataset_t
*ds
= arg1
;
3321 uint64_t *reservationp
= arg2
;
3322 uint64_t new_reservation
= *reservationp
;
3325 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) <
3326 SPA_VERSION_REFRESERVATION
)
3329 if (dsl_dataset_is_snapshot(ds
))
3333 * If we are doing the preliminary check in open context, the
3334 * space estimates may be inaccurate.
3336 if (!dmu_tx_is_syncing(tx
))
3339 mutex_enter(&ds
->ds_lock
);
3340 unique
= dsl_dataset_unique(ds
);
3341 mutex_exit(&ds
->ds_lock
);
3343 if (MAX(unique
, new_reservation
) > MAX(unique
, ds
->ds_reserved
)) {
3344 uint64_t delta
= MAX(unique
, new_reservation
) -
3345 MAX(unique
, ds
->ds_reserved
);
3347 if (delta
> dsl_dir_space_available(ds
->ds_dir
, NULL
, 0, TRUE
))
3349 if (ds
->ds_quota
> 0 &&
3350 new_reservation
> ds
->ds_quota
)
3359 dsl_dataset_set_reservation_sync(void *arg1
, void *arg2
, cred_t
*cr
,
3362 dsl_dataset_t
*ds
= arg1
;
3363 uint64_t *reservationp
= arg2
;
3364 uint64_t new_reservation
= *reservationp
;
3368 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
3370 mutex_enter(&ds
->ds_dir
->dd_lock
);
3371 mutex_enter(&ds
->ds_lock
);
3372 unique
= dsl_dataset_unique(ds
);
3373 delta
= MAX(0, (int64_t)(new_reservation
- unique
)) -
3374 MAX(0, (int64_t)(ds
->ds_reserved
- unique
));
3375 ds
->ds_reserved
= new_reservation
;
3376 mutex_exit(&ds
->ds_lock
);
3378 dsl_dir_diduse_space(ds
->ds_dir
, DD_USED_REFRSRV
, delta
, 0, 0, tx
);
3379 mutex_exit(&ds
->ds_dir
->dd_lock
);
3380 dsl_dir_prop_set_uint64_sync(ds
->ds_dir
, "refreservation",
3381 new_reservation
, cr
, tx
);
3383 spa_history_internal_log(LOG_DS_REFRESERV
,
3384 ds
->ds_dir
->dd_pool
->dp_spa
, tx
, cr
, "%lld dataset = %llu",
3385 (longlong_t
)new_reservation
, ds
->ds_object
);
3389 dsl_dataset_set_reservation(const char *dsname
, uint64_t reservation
)
3394 err
= dsl_dataset_hold(dsname
, FTAG
, &ds
);
3398 err
= dsl_sync_task_do(ds
->ds_dir
->dd_pool
,
3399 dsl_dataset_set_reservation_check
,
3400 dsl_dataset_set_reservation_sync
, ds
, &reservation
, 0);
3401 dsl_dataset_rele(ds
, FTAG
);
3406 dsl_dataset_user_hold_check(void *arg1
, void *arg2
, dmu_tx_t
*tx
)
3408 dsl_dataset_t
*ds
= arg1
;
3410 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
3413 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) < SPA_VERSION_USERREFS
)
3416 if (!dsl_dataset_is_snapshot(ds
))
3419 if (strlen(htag
) >= ZAP_MAXNAMELEN
)
3420 return (ENAMETOOLONG
);
3422 /* tags must be unique */
3423 mutex_enter(&ds
->ds_lock
);
3424 if (ds
->ds_phys
->ds_userrefs_obj
) {
3425 error
= zap_lookup(mos
, ds
->ds_phys
->ds_userrefs_obj
, htag
,
3429 else if (error
== ENOENT
)
3432 mutex_exit(&ds
->ds_lock
);
3438 dsl_dataset_user_hold_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
3440 dsl_dataset_t
*ds
= arg1
;
3442 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
3443 time_t now
= gethrestime_sec();
3446 mutex_enter(&ds
->ds_lock
);
3447 if (ds
->ds_phys
->ds_userrefs_obj
== 0) {
3449 * This is the first user hold for this dataset. Create
3450 * the userrefs zap object.
3452 dmu_buf_will_dirty(ds
->ds_dbuf
, tx
);
3453 zapobj
= ds
->ds_phys
->ds_userrefs_obj
=
3454 zap_create(mos
, DMU_OT_USERREFS
, DMU_OT_NONE
, 0, tx
);
3456 zapobj
= ds
->ds_phys
->ds_userrefs_obj
;
3459 mutex_exit(&ds
->ds_lock
);
3461 VERIFY(0 == zap_add(mos
, zapobj
, htag
, 8, 1, &now
, tx
));
3463 spa_history_internal_log(LOG_DS_USER_HOLD
,
3464 ds
->ds_dir
->dd_pool
->dp_spa
, tx
, cr
, "<%s> dataset = %llu",
3465 htag
, ds
->ds_object
);
3468 struct dsl_ds_holdarg
{
3469 dsl_sync_task_group_t
*dstg
;
3472 boolean_t recursive
;
3473 char failed
[MAXPATHLEN
];
3477 dsl_dataset_user_hold_one(char *dsname
, void *arg
)
3479 struct dsl_ds_holdarg
*ha
= arg
;
3485 /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3486 buflen
= strlen(dsname
) + strlen(ha
->snapname
) + 2;
3487 name
= kmem_alloc(buflen
, KM_SLEEP
);
3488 (void) snprintf(name
, buflen
, "%s@%s", dsname
, ha
->snapname
);
3489 error
= dsl_dataset_hold(name
, ha
->dstg
, &ds
);
3490 kmem_free(name
, buflen
);
3492 dsl_sync_task_create(ha
->dstg
, dsl_dataset_user_hold_check
,
3493 dsl_dataset_user_hold_sync
, ds
, ha
->htag
, 0);
3494 } else if (error
== ENOENT
&& ha
->recursive
) {
3497 (void) strcpy(ha
->failed
, dsname
);
3503 dsl_dataset_user_hold(char *dsname
, char *snapname
, char *htag
,
3504 boolean_t recursive
)
3506 struct dsl_ds_holdarg
*ha
;
3507 dsl_sync_task_t
*dst
;
3511 ha
= kmem_zalloc(sizeof (struct dsl_ds_holdarg
), KM_SLEEP
);
3513 (void) strlcpy(ha
->failed
, dsname
, sizeof (ha
->failed
));
3515 error
= spa_open(dsname
, &spa
, FTAG
);
3517 kmem_free(ha
, sizeof (struct dsl_ds_holdarg
));
3521 ha
->dstg
= dsl_sync_task_group_create(spa_get_dsl(spa
));
3523 ha
->snapname
= snapname
;
3524 ha
->recursive
= recursive
;
3526 error
= dmu_objset_find(dsname
, dsl_dataset_user_hold_one
,
3527 ha
, DS_FIND_CHILDREN
);
3529 error
= dsl_dataset_user_hold_one(dsname
, ha
);
3532 error
= dsl_sync_task_group_wait(ha
->dstg
);
3534 for (dst
= list_head(&ha
->dstg
->dstg_tasks
); dst
;
3535 dst
= list_next(&ha
->dstg
->dstg_tasks
, dst
)) {
3536 dsl_dataset_t
*ds
= dst
->dst_arg1
;
3539 dsl_dataset_name(ds
, ha
->failed
);
3540 *strchr(ha
->failed
, '@') = '\0';
3542 dsl_dataset_rele(ds
, ha
->dstg
);
3546 (void) strcpy(dsname
, ha
->failed
);
3548 dsl_sync_task_group_destroy(ha
->dstg
);
3549 kmem_free(ha
, sizeof (struct dsl_ds_holdarg
));
3550 spa_close(spa
, FTAG
);
3554 struct dsl_ds_releasearg
{
3557 boolean_t own
; /* do we own or just hold ds? */
3561 dsl_dataset_release_might_destroy(dsl_dataset_t
*ds
, const char *htag
,
3562 boolean_t
*might_destroy
)
3564 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
3569 *might_destroy
= B_FALSE
;
3571 mutex_enter(&ds
->ds_lock
);
3572 zapobj
= ds
->ds_phys
->ds_userrefs_obj
;
3574 /* The tag can't possibly exist */
3575 mutex_exit(&ds
->ds_lock
);
3579 /* Make sure the tag exists */
3580 error
= zap_lookup(mos
, zapobj
, htag
, 8, 1, &tmp
);
3582 mutex_exit(&ds
->ds_lock
);
3583 if (error
== ENOENT
)
3588 if (ds
->ds_userrefs
== 1 && ds
->ds_phys
->ds_num_children
== 1 &&
3589 DS_IS_DEFER_DESTROY(ds
))
3590 *might_destroy
= B_TRUE
;
3592 mutex_exit(&ds
->ds_lock
);
3597 dsl_dataset_user_release_check(void *arg1
, void *tag
, dmu_tx_t
*tx
)
3599 struct dsl_ds_releasearg
*ra
= arg1
;
3600 dsl_dataset_t
*ds
= ra
->ds
;
3601 boolean_t might_destroy
;
3604 if (spa_version(ds
->ds_dir
->dd_pool
->dp_spa
) < SPA_VERSION_USERREFS
)
3607 error
= dsl_dataset_release_might_destroy(ds
, ra
->htag
, &might_destroy
);
3611 if (might_destroy
) {
3612 struct dsl_ds_destroyarg dsda
= {0};
3614 if (dmu_tx_is_syncing(tx
)) {
3616 * If we're not prepared to remove the snapshot,
3617 * we can't allow the release to happen right now.
3621 if (ds
->ds_user_ptr
) {
3622 ds
->ds_user_evict_func(ds
, ds
->ds_user_ptr
);
3623 ds
->ds_user_ptr
= NULL
;
3627 dsda
.releasing
= B_TRUE
;
3628 return (dsl_dataset_destroy_check(&dsda
, tag
, tx
));
3635 dsl_dataset_user_release_sync(void *arg1
, void *tag
, cred_t
*cr
, dmu_tx_t
*tx
)
3637 struct dsl_ds_releasearg
*ra
= arg1
;
3638 dsl_dataset_t
*ds
= ra
->ds
;
3639 spa_t
*spa
= ds
->ds_dir
->dd_pool
->dp_spa
;
3640 objset_t
*mos
= ds
->ds_dir
->dd_pool
->dp_meta_objset
;
3642 uint64_t dsobj
= ds
->ds_object
;
3645 mutex_enter(&ds
->ds_lock
);
3647 refs
= ds
->ds_userrefs
;
3648 mutex_exit(&ds
->ds_lock
);
3649 zapobj
= ds
->ds_phys
->ds_userrefs_obj
;
3650 VERIFY(0 == zap_remove(mos
, zapobj
, ra
->htag
, tx
));
3651 if (ds
->ds_userrefs
== 0 && ds
->ds_phys
->ds_num_children
== 1 &&
3652 DS_IS_DEFER_DESTROY(ds
)) {
3653 struct dsl_ds_destroyarg dsda
= {0};
3657 dsda
.releasing
= B_TRUE
;
3658 /* We already did the destroy_check */
3659 dsl_dataset_destroy_sync(&dsda
, tag
, cr
, tx
);
3662 spa_history_internal_log(LOG_DS_USER_RELEASE
,
3663 spa
, tx
, cr
, "<%s> %lld dataset = %llu",
3664 ra
->htag
, (longlong_t
)refs
, dsobj
);
3668 dsl_dataset_user_release_one(char *dsname
, void *arg
)
3670 struct dsl_ds_holdarg
*ha
= arg
;
3671 struct dsl_ds_releasearg
*ra
;
3674 void *dtag
= ha
->dstg
;
3677 boolean_t own
= B_FALSE
;
3678 boolean_t might_destroy
;
3680 if (strlen(ha
->htag
) >= ZAP_MAXNAMELEN
)
3681 return (ENAMETOOLONG
);
3683 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3684 buflen
= strlen(dsname
) + strlen(ha
->snapname
) + 2;
3685 name
= kmem_alloc(buflen
, KM_SLEEP
);
3686 (void) snprintf(name
, buflen
, "%s@%s", dsname
, ha
->snapname
);
3687 error
= dsl_dataset_hold(name
, dtag
, &ds
);
3688 kmem_free(name
, buflen
);
3689 if (error
== ENOENT
&& ha
->recursive
)
3691 (void) strcpy(ha
->failed
, dsname
);
3695 ASSERT(dsl_dataset_is_snapshot(ds
));
3697 error
= dsl_dataset_release_might_destroy(ds
, ha
->htag
, &might_destroy
);
3699 dsl_dataset_rele(ds
, dtag
);
3703 if (might_destroy
) {
3705 error
= zfs_unmount_snap(name
, NULL
);
3707 dsl_dataset_rele(ds
, dtag
);
3710 error
= dsl_dataset_zvol_cleanup(ds
, name
);
3712 dsl_dataset_rele(ds
, dtag
);
3716 if (!dsl_dataset_tryown(ds
,
3717 DS_MODE_READONLY
| DS_MODE_INCONSISTENT
, dtag
)) {
3718 dsl_dataset_rele(ds
, dtag
);
3722 dsl_dataset_make_exclusive(ds
, dtag
);
3726 ra
= kmem_alloc(sizeof (struct dsl_ds_releasearg
), KM_SLEEP
);
3728 ra
->htag
= ha
->htag
;
3730 dsl_sync_task_create(ha
->dstg
, dsl_dataset_user_release_check
,
3731 dsl_dataset_user_release_sync
, ra
, dtag
, 0);
3737 dsl_dataset_user_release(char *dsname
, char *snapname
, char *htag
,
3738 boolean_t recursive
)
3740 struct dsl_ds_holdarg
*ha
;
3741 dsl_sync_task_t
*dst
;
3745 ha
= kmem_zalloc(sizeof (struct dsl_ds_holdarg
), KM_SLEEP
);
3747 (void) strlcpy(ha
->failed
, dsname
, sizeof (ha
->failed
));
3749 error
= spa_open(dsname
, &spa
, FTAG
);
3751 kmem_free(ha
, sizeof (struct dsl_ds_holdarg
));
3755 ha
->dstg
= dsl_sync_task_group_create(spa_get_dsl(spa
));
3757 ha
->snapname
= snapname
;
3758 ha
->recursive
= recursive
;
3760 error
= dmu_objset_find(dsname
, dsl_dataset_user_release_one
,
3761 ha
, DS_FIND_CHILDREN
);
3763 error
= dsl_dataset_user_release_one(dsname
, ha
);
3766 error
= dsl_sync_task_group_wait(ha
->dstg
);
3768 for (dst
= list_head(&ha
->dstg
->dstg_tasks
); dst
;
3769 dst
= list_next(&ha
->dstg
->dstg_tasks
, dst
)) {
3770 struct dsl_ds_releasearg
*ra
= dst
->dst_arg1
;
3771 dsl_dataset_t
*ds
= ra
->ds
;
3774 dsl_dataset_name(ds
, ha
->failed
);
3777 dsl_dataset_disown(ds
, ha
->dstg
);
3779 dsl_dataset_rele(ds
, ha
->dstg
);
3781 kmem_free(ra
, sizeof (struct dsl_ds_releasearg
));
3785 (void) strcpy(dsname
, ha
->failed
);
3787 dsl_sync_task_group_destroy(ha
->dstg
);
3788 kmem_free(ha
, sizeof (struct dsl_ds_holdarg
));
3789 spa_close(spa
, FTAG
);
3794 dsl_dataset_get_holds(const char *dsname
, nvlist_t
**nvp
)
3799 err
= dsl_dataset_hold(dsname
, FTAG
, &ds
);
3803 VERIFY(0 == nvlist_alloc(nvp
, NV_UNIQUE_NAME
, KM_SLEEP
));
3804 if (ds
->ds_phys
->ds_userrefs_obj
!= 0) {
3805 zap_attribute_t
*za
;
3808 za
= kmem_alloc(sizeof (zap_attribute_t
), KM_SLEEP
);
3809 for (zap_cursor_init(&zc
, ds
->ds_dir
->dd_pool
->dp_meta_objset
,
3810 ds
->ds_phys
->ds_userrefs_obj
);
3811 zap_cursor_retrieve(&zc
, za
) == 0;
3812 zap_cursor_advance(&zc
)) {
3813 VERIFY(0 == nvlist_add_uint64(*nvp
, za
->za_name
,
3814 za
->za_first_integer
));
3816 zap_cursor_fini(&zc
);
3817 kmem_free(za
, sizeof (zap_attribute_t
));
3819 dsl_dataset_rele(ds
, FTAG
);